git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2016 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	/*
	29	* @OSF_FREE_COPYRIGHT@
	30	*/
	31	/*
	32	* Mach Operating System
	33	* Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
	34	* All Rights Reserved.
	35	*
	36	* Permission to use, copy, modify and distribute this software and its
	37	* documentation is hereby granted, provided that both the copyright
	38	* notice and this permission notice appear in all copies of the
	39	* software, derivative works or modified versions, and any portions
	40	* thereof, and that both notices appear in supporting documentation.
	41	*
	42	* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
	43	* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
	44	* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
	45	*
	46	* Carnegie Mellon requests users of this software to return to
	47	*
	48	* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
	49	* School of Computer Science
	50	* Carnegie Mellon University
	51	* Pittsburgh PA 15213-3890
	52	*
	53	* any improvements or extensions that they make and grant Carnegie Mellon
	54	* the rights to redistribute these changes.
	55	*/
	56	/*
	57	*/
	58	/*
	59	* File: sched_prim.c
	60	* Author: Avadis Tevanian, Jr.
	61	* Date: 1986
	62	*
	63	* Scheduling primitives
	64	*
	65	*/
	66
	67	#include <debug.h>
	68
	69	#include <mach/mach_types.h>
	70	#include <mach/machine.h>
	71	#include <mach/policy.h>
	72	#include <mach/sync_policy.h>
	73	#include <mach/thread_act.h>
	74
	75	#include <machine/machine_routines.h>
	76	#include <machine/sched_param.h>
	77	#include <machine/machine_cpu.h>
	78	#include <machine/machlimits.h>
	79	#include <machine/atomic.h>
	80
	81	#ifdef CONFIG_MACH_APPROXIMATE_TIME
	82	#include <machine/commpage.h>
	83	#endif
	84
	85	#include <kern/kern_types.h>
	86	#include <kern/backtrace.h>
	87	#include <kern/clock.h>
	88	#include <kern/counters.h>
	89	#include <kern/cpu_number.h>
	90	#include <kern/cpu_data.h>
	91	#include <kern/smp.h>
	92	#include <kern/debug.h>
	93	#include <kern/macro_help.h>
	94	#include <kern/machine.h>
	95	#include <kern/misc_protos.h>
	96	#if MONOTONIC
	97	#include <kern/monotonic.h>
	98	#endif /* MONOTONIC */
	99	#include <kern/processor.h>
	100	#include <kern/queue.h>
	101	#include <kern/sched.h>
	102	#include <kern/sched_prim.h>
	103	#include <kern/sfi.h>
	104	#include <kern/syscall_subr.h>
	105	#include <kern/task.h>
	106	#include <kern/thread.h>
	107	#include <kern/ledger.h>
	108	#include <kern/timer_queue.h>
	109	#include <kern/waitq.h>
	110	#include <kern/policy_internal.h>
	111
	112	#include <vm/pmap.h>
	113	#include <vm/vm_kern.h>
	114	#include <vm/vm_map.h>
	115	#include <vm/vm_pageout.h>
	116
	117	#include <mach/sdt.h>
	118	#include <mach/mach_host.h>
	119	#include <mach/host_info.h>
	120
	121	#include <sys/kdebug.h>
	122	#include <kperf/kperf.h>
	123	#include <kern/kpc.h>
	124	#include <san/kasan.h>
	125	#include <kern/pms.h>
	126	#include <kern/host.h>
	127	#include <stdatomic.h>
	128
	129	int rt_runq_count(processor_set_t pset)
	130	{
	131	return atomic_load_explicit(&SCHED(rt_runq)(pset)->count, memory_order_relaxed);
	132	}
	133
	134	void rt_runq_count_incr(processor_set_t pset)
	135	{
	136	atomic_fetch_add_explicit(&SCHED(rt_runq)(pset)->count, 1, memory_order_relaxed);
	137	}
	138
	139	void rt_runq_count_decr(processor_set_t pset)
	140	{
	141	atomic_fetch_sub_explicit(&SCHED(rt_runq)(pset)->count, 1, memory_order_relaxed);
	142	}
	143
	/*
	 * Scheduler tunables.  Each compile-time default below is paired with a
	 * global variable that is initialised from it.
	 */

	/* Preemption rate for standard threads, in preemptions per second. */
	#define DEFAULT_PREEMPTION_RATE		100
	int		default_preemption_rate = DEFAULT_PREEMPTION_RATE;

	/* Preemption rate for background threads, in preemptions per second. */
	#define DEFAULT_BG_PREEMPTION_RATE	400
	int		default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;

	/* Multiplied by std_quantum to derive max_unsafe_computation. */
	#define MAX_UNSAFE_QUANTA		800
	int		max_unsafe_quanta = MAX_UNSAFE_QUANTA;

	/* Multiplied by std_quantum to derive max_poll_computation. */
	#define MAX_POLL_QUANTA			2
	int		max_poll_quanta = MAX_POLL_QUANTA;

	/* Shift applied for poll yields: a shift of 4 corresponds to 1/16. */
	#define SCHED_POLL_YIELD_SHIFT		4
	int		sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;

	/* Derived limits; filled in by sched_timeshare_timebase_init(). */
	uint64_t	max_poll_computation;

	uint64_t	max_unsafe_computation;
	uint64_t	sched_safe_duration;
	163
	#if defined(CONFIG_SCHED_TIMESHARE_CORE)

	/*
	 * Timeshare quanta.  The *_us values are computed in microseconds by
	 * sched_timeshare_init(); the others are the absolute-time equivalents
	 * computed by sched_timeshare_timebase_init().
	 */
	uint32_t	std_quantum;		/* standard timeslicing quantum */
	uint32_t	min_std_quantum;	/* smallest remaining quantum */
	uint32_t	bg_quantum;		/* quantum for background tasks */

	uint32_t	std_quantum_us;
	uint32_t	bg_quantum_us;

	#endif /* CONFIG_SCHED_TIMESHARE_CORE */

	/* Set from std_quantum by sched_timeshare_timebase_init(). */
	uint32_t	thread_depress_time;
	uint32_t	default_timeshare_computation;
	uint32_t	default_timeshare_constraint;

	/* Realtime quantum bounds; set by sched_realtime_timebase_init(). */
	uint32_t	max_rt_quantum;
	uint32_t	min_rt_quantum;
	181
	#if defined(CONFIG_SCHED_TIMESHARE_CORE)

	/* Scheduler tick counter (reset to 0 by sched_timeshare_init()). */
	unsigned	sched_tick;
	/* Scheduler tick interval, in absolute-time units. */
	uint32_t	sched_tick_interval;

	/* Per-bucket priority shifts and the fixed usage-to-priority shift. */
	uint32_t	sched_pri_shifts[TH_BUCKET_MAX];
	uint32_t	sched_fixed_shift;

	/* accelerate 5/8^n usage aging */
	uint32_t	sched_decay_usage_age_factor = 1;

	/* Allow foreground to decay past default to resolve inversions */
	#define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
	int		sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;

	/* Defaults for timer deadline profiling */
	/* Timers with deadlines <= 2ms */
	#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000
	/* Timers with deadlines <= 5ms */
	#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000

	uint64_t	timer_deadline_tracking_bin_1;
	uint64_t	timer_deadline_tracking_bin_2;

	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	206
	207	thread_t sched_maintenance_thread;
	208
	209	#if __arm__ \|\| __arm64__
	210	/* interrupts disabled lock to guard recommended cores state */
	211	decl_simple_lock_data(static,sched_recommended_cores_lock);
	212	static void sched_recommended_cores_maintenance(void);
	213	static void sched_update_recommended_cores(uint32_t recommended_cores);
	214
	215	uint64_t perfcontrol_failsafe_starvation_threshold;
	216	extern char proc_name_address(struct proc p);
	217
	218	#endif /* __arm__ \|\| __arm64__ */
	219
	220	uint64_t sched_one_second_interval;
	221
	222	/* Forwards */
	223
	224	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	225
	226	static void load_shift_init(void);
	227	static void preempt_pri_init(void);
	228
	229	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	230
	231	#if CONFIG_SCHED_IDLE_IN_PLACE
	232	static thread_t thread_select_idle(
	233	thread_t thread,
	234	processor_t processor);
	235	#endif
	236
	237	thread_t processor_idle(
	238	thread_t thread,
	239	processor_t processor);
	240
	241	ast_t
	242	csw_check_locked( processor_t processor,
	243	processor_set_t pset,
	244	ast_t check_reason);
	245
	246	static void processor_setrun(
	247	processor_t processor,
	248	thread_t thread,
	249	integer_t options);
	250
	251	static void
	252	sched_realtime_timebase_init(void);
	253
	254	static void
	255	sched_timer_deadline_tracking_init(void);
	256
	/*
	 * TLOG(a, fmt, ...): on DEBUG kernels, kprintf() the message when any bit
	 * of `a` is set in debug_task; otherwise expand to an empty statement.
	 *
	 * Both variants are wrapped in do { } while (0) so the macro is a single
	 * statement and cannot capture a following `else`.  The mask is
	 * parenthesised so that an argument such as (X | Y) is tested as a whole.
	 */
	#if DEBUG
	extern int debug_task;
	#define TLOG(a, fmt, args...) \
		do { \
			if (debug_task & (a)) { \
				kprintf(fmt, ## args); \
			} \
		} while (0)
	#else
	#define TLOG(a, fmt, args...) do {} while (0)
	#endif
	263
	264	static processor_t
	265	thread_bind_internal(
	266	thread_t thread,
	267	processor_t processor);
	268
	269	static void
	270	sched_vm_group_maintenance(void);
	271
	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	/* Load-shift table filled in by load_shift_init(). */
	int8_t		sched_load_shifts[NRQS];
	/* Bitmap with one bit per priority, filled in by preempt_pri_init(). */
	bitmap_t	sched_preempt_pri[BITMAP_LEN(NRQS)];
	#endif /* CONFIG_SCHED_TIMESHARE_CORE */

	/* Dispatch table of the scheduler in use; NULL until one is selected. */
	const struct sched_dispatch_table *sched_current_dispatch = NULL;
	278
	279	/*
	280	* Statically allocate a buffer to hold the longest possible
	281	* scheduler description string, as currently implemented.
	282	* bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
	283	* to export to userspace via sysctl(3). If either version
	284	* changes, update the other.
	285	*
	286	* Note that in addition to being an upper bound on the strings
	287	* in the kernel, it's also an exact parameter to PE_get_default(),
	288	* which interrogates the device tree on some platforms. That
	289	* API requires the caller know the exact size of the device tree
	290	* property, so we need both a legacy size (32) and the current size
	291	* (48) to deal with old and new device trees. The device tree property
	292	* is similarly padded to a fixed size so that the same kernel image
	293	* can run on multiple devices with different schedulers configured
	294	* in the device tree.
	295	*/
	296	char sched_string[SCHED_STRING_MAX_LENGTH];
	297
	298	uint32_t sched_debug_flags = SCHED_DEBUG_FLAG_CHOOSE_PROCESSOR_TRACEPOINTS;
	299
	300	/* Global flag which indicates whether Background Stepper Context is enabled */
	301	static int cpu_throttle_enabled = 1;
	302
	#if DEBUG

	/*
	 * sched_init_override:
	 *
	 * Select the scheduler dispatch table, honouring the "sched" boot-arg
	 * when one is supplied.  An unrecognised name falls back to the
	 * traditional pset-runqueue scheduler when CONFIG_SCHED_TRADITIONAL is
	 * built in, and panics otherwise.  With no boot-arg, the compile-time
	 * default is used.
	 *
	 * Because going through the indirect dispatch table costs context-switch
	 * performance, this mechanism is only compiled into DEBUG kernels.
	 */
	static void
	sched_init_override(void)
	{
		char requested[SCHED_STRING_MAX_LENGTH] = { '\0' };

		/* A boot-arg that is absent is handled like an empty one. */
		if (!PE_parse_boot_argn("sched", requested, sizeof (requested))) {
			requested[0] = '\0';
		}

		if (requested[0] == '\0') {
			/* No runtime selection: use the built-in default. */
	#if defined(CONFIG_SCHED_MULTIQ)
			sched_current_dispatch = &sched_multiq_dispatch;
	#elif defined(CONFIG_SCHED_TRADITIONAL)
			sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
	#else
	#error No default scheduler implementation
	#endif
			kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
			return;
		}

		if (0) {
			/* Anchor so every configured candidate can be an "else if". */
	#if defined(CONFIG_SCHED_TRADITIONAL)
		} else if (strcmp(requested, sched_traditional_dispatch.sched_name) == 0) {
			sched_current_dispatch = &sched_traditional_dispatch;
		} else if (strcmp(requested, sched_traditional_with_pset_runqueue_dispatch.sched_name) == 0) {
			sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
	#endif
	#if defined(CONFIG_SCHED_MULTIQ)
		} else if (strcmp(requested, sched_multiq_dispatch.sched_name) == 0) {
			sched_current_dispatch = &sched_multiq_dispatch;
		} else if (strcmp(requested, sched_dualq_dispatch.sched_name) == 0) {
			sched_current_dispatch = &sched_dualq_dispatch;
	#endif
		} else {
	#if defined(CONFIG_SCHED_TRADITIONAL)
			printf("Unrecognized scheduler algorithm: %s\n", requested);
			printf("Scheduler: Using instead: %s\n", sched_traditional_with_pset_runqueue_dispatch.sched_name);
			sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
	#else
			panic("Unrecognized scheduler algorithm: %s", requested);
	#endif
		}

		kprintf("Scheduler: Runtime selection of %s\n", SCHED(sched_name));
	}

	#endif /* DEBUG */
	355
	356	void
	357	sched_init(void)
	358	{
	359	#if DEBUG
	360	sched_init_override();
	361	#else /* DEBUG */
	362	kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
	363	#endif /* DEBUG */
	364
	365	if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
	366	/* No boot-args, check in device tree */
	367	if (!PE_get_default("kern.sched_pri_decay_limit",
	368	&sched_pri_decay_band_limit,
	369	sizeof(sched_pri_decay_band_limit))) {
	370	/* Allow decay all the way to normal limits */
	371	sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
	372	}
	373	}
	374
	375	kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);
	376
	377	if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
	378	kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
	379	}
	380	strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));
	381
	382	SCHED(init)();
	383	SCHED(rt_init)(&pset0);
	384	sched_timer_deadline_tracking_init();
	385
	386	SCHED(pset_init)(&pset0);
	387	SCHED(processor_init)(master_processor);
	388	}
	389
	390	void
	391	sched_timebase_init(void)
	392	{
	393	uint64_t abstime;
	394
	395	clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
	396	sched_one_second_interval = abstime;
	397
	398	SCHED(timebase_init)();
	399	sched_realtime_timebase_init();
	400	}
	401
	402	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	403
	404	void
	405	sched_timeshare_init(void)
	406	{
	407	/*
	408	* Calculate the timeslicing quantum
	409	* in us.
	410	*/
	411	if (default_preemption_rate < 1)
	412	default_preemption_rate = DEFAULT_PREEMPTION_RATE;
	413	std_quantum_us = (1000 * 1000) / default_preemption_rate;
	414
	415	printf("standard timeslicing quantum is %d us\n", std_quantum_us);
	416
	417	if (default_bg_preemption_rate < 1)
	418	default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
	419	bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
	420
	421	printf("standard background quantum is %d us\n", bg_quantum_us);
	422
	423	load_shift_init();
	424	preempt_pri_init();
	425	sched_tick = 0;
	426	}
	427
	428	void
	429	sched_timeshare_timebase_init(void)
	430	{
	431	uint64_t abstime;
	432	uint32_t shift;
	433
	434	/* standard timeslicing quantum */
	435	clock_interval_to_absolutetime_interval(
	436	std_quantum_us, NSEC_PER_USEC, &abstime);
	437	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	438	std_quantum = (uint32_t)abstime;
	439
	440	/* smallest remaining quantum (250 us) */
	441	clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
	442	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	443	min_std_quantum = (uint32_t)abstime;
	444
	445	/* quantum for background tasks */
	446	clock_interval_to_absolutetime_interval(
	447	bg_quantum_us, NSEC_PER_USEC, &abstime);
	448	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	449	bg_quantum = (uint32_t)abstime;
	450
	451	/* scheduler tick interval */
	452	clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
	453	NSEC_PER_USEC, &abstime);
	454	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	455	sched_tick_interval = (uint32_t)abstime;
	456
	457	/*
	458	* Compute conversion factor from usage to
	459	* timesharing priorities with 5/8 ** n aging.
	460	*/
	461	abstime = (abstime * 5) / 3;
	462	for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift)
	463	abstime >>= 1;
	464	sched_fixed_shift = shift;
	465
	466	for (uint32_t i = 0 ; i < TH_BUCKET_MAX ; i++)
	467	sched_pri_shifts[i] = INT8_MAX;
	468
	469	max_unsafe_computation = ((uint64_t)max_unsafe_quanta) * std_quantum;
	470	sched_safe_duration = 2 * ((uint64_t)max_unsafe_quanta) * std_quantum;
	471
	472	max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
	473	thread_depress_time = 1 * std_quantum;
	474	default_timeshare_computation = std_quantum / 2;
	475	default_timeshare_constraint = std_quantum;
	476
	477	#if __arm__ \|\| __arm64__
	478	perfcontrol_failsafe_starvation_threshold = (2 * sched_tick_interval);
	479	#endif /* __arm__ \|\| __arm64__ */
	480	}
	481
	482	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	483
	484	void
	485	pset_rt_init(processor_set_t pset)
	486	{
	487	rt_lock_init(pset);
	488
	489	pset->rt_runq.count = 0;
	490	queue_init(&pset->rt_runq.queue);
	491	memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats);
	492	}
	493
	494	rt_queue_t
	495	sched_rtglobal_runq(processor_set_t pset)
	496	{
	497	(void)pset;
	498
	499	return &pset0.rt_runq;
	500	}
	501
	502	void
	503	sched_rtglobal_init(processor_set_t pset)
	504	{
	505	if (pset == &pset0) {
	506	return pset_rt_init(pset);
	507	}
	508
	509	/* Only pset0 rt_runq is used, so make it easy to detect
	510	* buggy accesses to others.
	511	*/
	512	memset(&pset->rt_runq, 0xfd, sizeof pset->rt_runq);
	513	}
	514
	515	void
	516	sched_rtglobal_queue_shutdown(processor_t processor)
	517	{
	518	(void)processor;
	519	}
	520
	521	static void
	522	sched_realtime_timebase_init(void)
	523	{
	524	uint64_t abstime;
	525
	526	/* smallest rt computaton (50 us) */
	527	clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
	528	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	529	min_rt_quantum = (uint32_t)abstime;
	530
	531	/* maximum rt computation (50 ms) */
	532	clock_interval_to_absolutetime_interval(
	533	50, 1000*NSEC_PER_USEC, &abstime);
	534	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	535	max_rt_quantum = (uint32_t)abstime;
	536
	537	}
	538
	539	void
	540	sched_check_spill(processor_set_t pset, thread_t thread)
	541	{
	542	(void)pset;
	543	(void)thread;
	544
	545	return;
	546	}
	547
	548	bool
	549	sched_thread_should_yield(processor_t processor, thread_t thread)
	550	{
	551	(void)thread;
	552
	553	return (!SCHED(processor_queue_empty)(processor) \|\| rt_runq_count(processor->processor_set) > 0);
	554	}
	555
	556	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	557
	558	/*
	559	* Set up values for timeshare
	560	* loading factors.
	561	*/
	562	static void
	563	load_shift_init(void)
	564	{
	565	int8_t k, *p = sched_load_shifts;
	566	uint32_t i, j;
	567
	568	uint32_t sched_decay_penalty = 1;
	569
	570	if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof (sched_decay_penalty))) {
	571	kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
	572	}
	573
	574	if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof (sched_decay_usage_age_factor))) {
	575	kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
	576	}
	577
	578	if (sched_decay_penalty == 0) {
	579	/*
	580	* There is no penalty for timeshare threads for using too much
	581	* CPU, so set all load shifts to INT8_MIN. Even under high load,
	582	* sched_pri_shift will be >INT8_MAX, and there will be no
	583	* penalty applied to threads (nor will sched_usage be updated per
	584	* thread).
	585	*/
	586	for (i = 0; i < NRQS; i++) {
	587	sched_load_shifts[i] = INT8_MIN;
	588	}
	589
	590	return;
	591	}
	592
	593	p++ = INT8_MIN; p++ = 0;
	594
	595	/*
	596	* For a given system load "i", the per-thread priority
	597	* penalty per quantum of CPU usage is ~2^k priority
	598	* levels. "sched_decay_penalty" can cause more
	599	* array entries to be filled with smaller "k" values
	600	*/
	601	for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
	602	for (j <<= 1; (i < j) && (i < NRQS); ++i)
	603	*p++ = k;
	604	}
	605	}
	606
	607	static void
	608	preempt_pri_init(void)
	609	{
	610	bitmap_t *p = sched_preempt_pri;
	611
	612	for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i)
	613	bitmap_set(p, i);
	614
	615	for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i)
	616	bitmap_set(p, i);
	617	}
	618
	619	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	620
	621	/*
	622	* Thread wait timer expiration.
	623	*/
	624	void
	625	thread_timer_expire(
	626	void *p0,
	627	__unused void *p1)
	628	{
	629	thread_t thread = p0;
	630	spl_t s;
	631
	632	assert_thread_magic(thread);
	633
	634	s = splsched();
	635	thread_lock(thread);
	636	if (--thread->wait_timer_active == 0) {
	637	if (thread->wait_timer_is_set) {
	638	thread->wait_timer_is_set = FALSE;
	639	clear_wait_internal(thread, THREAD_TIMED_OUT);
	640	}
	641	}
	642	thread_unlock(thread);
	643	splx(s);
	644	}
	645
	646	/*
	647	* thread_unblock:
	648	*
	649	* Unblock thread on wake up.
	650	*
	651	* Returns TRUE if the thread should now be placed on the runqueue.
	652	*
	653	* Thread must be locked.
	654	*
	655	* Called at splsched().
	656	*/
	657	boolean_t
	658	thread_unblock(
	659	thread_t thread,
	660	wait_result_t wresult)
	661	{
	662	boolean_t ready_for_runq = FALSE;
	663	thread_t cthread = current_thread();
	664	uint32_t new_run_count;
	665
	666	/*
	667	* Set wait_result.
	668	*/
	669	thread->wait_result = wresult;
	670
	671	/*
	672	* Cancel pending wait timer.
	673	*/
	674	if (thread->wait_timer_is_set) {
	675	if (timer_call_cancel(&thread->wait_timer))
	676	thread->wait_timer_active--;
	677	thread->wait_timer_is_set = FALSE;
	678	}
	679
	680	/*
	681	* Update scheduling state: not waiting,
	682	* set running.
	683	*/
	684	thread->state &= ~(TH_WAIT\|TH_UNINT);
	685
	686	if (!(thread->state & TH_RUN)) {
	687	thread->state \|= TH_RUN;
	688	thread->last_made_runnable_time = thread->last_basepri_change_time = mach_approximate_time();
	689
	690	ready_for_runq = TRUE;
	691
	692	(*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
	693
	694	/* Update the runnable thread count */
	695	new_run_count = sched_run_incr(thread);
	696	} else {
	697	/*
	698	* Either the thread is idling in place on another processor,
	699	* or it hasn't finished context switching yet.
	700	*/
	701	#if CONFIG_SCHED_IDLE_IN_PLACE
	702	if (thread->state & TH_IDLE) {
	703	processor_t processor = thread->last_processor;
	704
	705	if (processor != current_processor())
	706	machine_signal_idle(processor);
	707	}
	708	#else
	709	assert((thread->state & TH_IDLE) == 0);
	710	#endif
	711	/*
	712	* The run count is only dropped after the context switch completes
	713	* and the thread is still waiting, so we should not run_incr here
	714	*/
	715	new_run_count = sched_run_buckets[TH_BUCKET_RUN];
	716	}
	717
	718
	719	/*
	720	* Calculate deadline for real-time threads.
	721	*/
	722	if (thread->sched_mode == TH_MODE_REALTIME) {
	723	uint64_t ctime;
	724
	725	ctime = mach_absolute_time();
	726	thread->realtime.deadline = thread->realtime.constraint + ctime;
	727	}
	728
	729	/*
	730	* Clear old quantum, fail-safe computation, etc.
	731	*/
	732	thread->quantum_remaining = 0;
	733	thread->computation_metered = 0;
	734	thread->reason = AST_NONE;
	735	thread->block_hint = kThreadWaitNone;
	736
	737	/* Obtain power-relevant interrupt and "platform-idle exit" statistics.
	738	* We also account for "double hop" thread signaling via
	739	* the thread callout infrastructure.
	740	* DRK: consider removing the callout wakeup counters in the future
	741	* they're present for verification at the moment.
	742	*/
	743	boolean_t aticontext, pidle;
	744	ml_get_power_state(&aticontext, &pidle);
	745
	746	if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
	747	DTRACE_SCHED2(iwakeup, struct thread , thread, struct proc , thread->task->bsd_info);
	748
	749	uint64_t ttd = PROCESSOR_DATA(current_processor(), timer_call_ttd);
	750
	751	if (ttd) {
	752	if (ttd <= timer_deadline_tracking_bin_1)
	753	thread->thread_timer_wakeups_bin_1++;
	754	else
	755	if (ttd <= timer_deadline_tracking_bin_2)
	756	thread->thread_timer_wakeups_bin_2++;
	757	}
	758
	759	ledger_credit_thread(thread, thread->t_ledger,
	760	task_ledgers.interrupt_wakeups, 1);
	761	if (pidle) {
	762	ledger_credit_thread(thread, thread->t_ledger,
	763	task_ledgers.platform_idle_wakeups, 1);
	764	}
	765
	766	} else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
	767	/* TODO: what about an interrupt that does a wake taken on a callout thread? */
	768	if (cthread->callout_woken_from_icontext) {
	769	ledger_credit_thread(thread, thread->t_ledger,
	770	task_ledgers.interrupt_wakeups, 1);
	771	thread->thread_callout_interrupt_wakeups++;
	772
	773	if (cthread->callout_woken_from_platform_idle) {
	774	ledger_credit_thread(thread, thread->t_ledger,
	775	task_ledgers.platform_idle_wakeups, 1);
	776	thread->thread_callout_platform_idle_wakeups++;
	777	}
	778
	779	cthread->callout_woke_thread = TRUE;
	780	}
	781	}
	782
	783	if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
	784	thread->callout_woken_from_icontext = aticontext;
	785	thread->callout_woken_from_platform_idle = pidle;
	786	thread->callout_woke_thread = FALSE;
	787	}
	788
	789	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	790	MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) \| DBG_FUNC_NONE,
	791	(uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
	792	sched_run_buckets[TH_BUCKET_RUN], 0);
	793
	794	DTRACE_SCHED2(wakeup, struct thread , thread, struct proc , thread->task->bsd_info);
	795
	796	return (ready_for_runq);
	797	}
	798
	799	/*
	800	* Routine: thread_go
	801	* Purpose:
	802	* Unblock and dispatch thread.
	803	* Conditions:
	804	* thread lock held, IPC locks may be held.
	805	* thread must have been pulled from wait queue under same lock hold.
	806	* thread must have been waiting
	807	* Returns:
	808	* KERN_SUCCESS - Thread was set running
	809	*
	810	* TODO: This should return void
	811	*/
	812	kern_return_t
	813	thread_go(
	814	thread_t thread,
	815	wait_result_t wresult)
	816	{
	817	assert_thread_magic(thread);
	818
	819	assert(thread->at_safe_point == FALSE);
	820	assert(thread->wait_event == NO_EVENT64);
	821	assert(thread->waitq == NULL);
	822
	823	assert(!(thread->state & (TH_TERMINATE\|TH_TERMINATE2)));
	824	assert(thread->state & TH_WAIT);
	825
	826
	827	if (thread_unblock(thread, wresult)) {
	828	#if SCHED_TRACE_THREAD_WAKEUPS
	829	backtrace(&thread->thread_wakeup_bt[0],
	830	(sizeof(thread->thread_wakeup_bt)/sizeof(uintptr_t)));
	831	#endif
	832	thread_setrun(thread, SCHED_PREEMPT \| SCHED_TAILQ);
	833	}
	834
	835	return (KERN_SUCCESS);
	836	}
	837
	838	/*
	839	* Routine: thread_mark_wait_locked
	840	* Purpose:
	841	* Mark a thread as waiting. If, given the circumstances,
	842	* it doesn't want to wait (i.e. already aborted), then
	843	* indicate that in the return value.
	844	* Conditions:
	845	* at splsched() and thread is locked.
	846	*/
	847	__private_extern__
	848	wait_result_t
	849	thread_mark_wait_locked(
	850	thread_t thread,
	851	wait_interrupt_t interruptible)
	852	{
	853	boolean_t at_safe_point;
	854
	855	assert(!(thread->state & (TH_WAIT\|TH_IDLE\|TH_UNINT\|TH_TERMINATE2)));
	856
	857	/*
	858	* The thread may have certain types of interrupts/aborts masked
	859	* off. Even if the wait location says these types of interrupts
	860	* are OK, we have to honor mask settings (outer-scoped code may
	861	* not be able to handle aborts at the moment).
	862	*/
	863	if (interruptible > (thread->options & TH_OPT_INTMASK))
	864	interruptible = thread->options & TH_OPT_INTMASK;
	865
	866	at_safe_point = (interruptible == THREAD_ABORTSAFE);
	867
	868	if ( interruptible == THREAD_UNINT \|\|
	869	!(thread->sched_flags & TH_SFLAG_ABORT) \|\|
	870	(!at_safe_point &&
	871	(thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
	872
	873	if ( !(thread->state & TH_TERMINATE))
	874	DTRACE_SCHED(sleep);
	875
	876	thread->state \|= (interruptible) ? TH_WAIT : (TH_WAIT \| TH_UNINT);
	877	thread->at_safe_point = at_safe_point;
	878
	879	/* TODO: pass this through assert_wait instead, have
	880	* assert_wait just take a struct as an argument */
	881	assert(!thread->block_hint);
	882	thread->block_hint = thread->pending_block_hint;
	883	thread->pending_block_hint = kThreadWaitNone;
	884
	885	return (thread->wait_result = THREAD_WAITING);
	886	}
	887	else
	888	if (thread->sched_flags & TH_SFLAG_ABORTSAFELY)
	889	thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
	890	thread->pending_block_hint = kThreadWaitNone;
	891
	892	return (thread->wait_result = THREAD_INTERRUPTED);
	893	}
	894
	895	/*
	896	* Routine: thread_interrupt_level
	897	* Purpose:
	898	* Set the maximum interruptible state for the
	899	* current thread. The effective value of any
	900	* interruptible flag passed into assert_wait
	901	* will never exceed this.
	902	*
	903	* Useful for code that must not be interrupted,
	904	* but which calls code that doesn't know that.
	905	* Returns:
	906	* The old interrupt level for the thread.
	907	*/
	908	__private_extern__
	909	wait_interrupt_t
	910	thread_interrupt_level(
	911	wait_interrupt_t new_level)
	912	{
	913	thread_t thread = current_thread();
	914	wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
	915
	916	thread->options = (thread->options & ~TH_OPT_INTMASK) \| (new_level & TH_OPT_INTMASK);
	917
	918	return result;
	919	}
	920
	921	/*
	922	* assert_wait:
	923	*
	924	* Assert that the current thread is about to go to
	925	* sleep until the specified event occurs.
	926	*/
	927	wait_result_t
	928	assert_wait(
	929	event_t event,
	930	wait_interrupt_t interruptible)
	931	{
	932	if (__improbable(event == NO_EVENT))
	933	panic("%s() called with NO_EVENT", __func__);
	934
	935	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	936	MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)\|DBG_FUNC_NONE,
	937	VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
	938
	939	struct waitq *waitq;
	940	waitq = global_eventq(event);
	941	return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
	942	}
	943
	944	/*
	945	* assert_wait_queue:
	946	*
	947	* Return the global waitq for the specified event
	948	*/
	949	struct waitq *
	950	assert_wait_queue(
	951	event_t event)
	952	{
	953	return global_eventq(event);
	954	}
	955
	956	wait_result_t
	957	assert_wait_timeout(
	958	event_t event,
	959	wait_interrupt_t interruptible,
	960	uint32_t interval,
	961	uint32_t scale_factor)
	962	{
	963	thread_t thread = current_thread();
	964	wait_result_t wresult;
	965	uint64_t deadline;
	966	spl_t s;
	967
	968	if (__improbable(event == NO_EVENT))
	969	panic("%s() called with NO_EVENT", __func__);
	970
	971	struct waitq *waitq;
	972	waitq = global_eventq(event);
	973
	974	s = splsched();
	975	waitq_lock(waitq);
	976
	977	clock_interval_to_deadline(interval, scale_factor, &deadline);
	978
	979	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	980	MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)\|DBG_FUNC_NONE,
	981	VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
	982
	983	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	984	interruptible,
	985	TIMEOUT_URGENCY_SYS_NORMAL,
	986	deadline, TIMEOUT_NO_LEEWAY,
	987	thread);
	988
	989	waitq_unlock(waitq);
	990	splx(s);
	991	return wresult;
	992	}
	993
	994	wait_result_t
	995	assert_wait_timeout_with_leeway(
	996	event_t event,
	997	wait_interrupt_t interruptible,
	998	wait_timeout_urgency_t urgency,
	999	uint32_t interval,
	1000	uint32_t leeway,
	1001	uint32_t scale_factor)
	1002	{
	1003	thread_t thread = current_thread();
	1004	wait_result_t wresult;
	1005	uint64_t deadline;
	1006	uint64_t abstime;
	1007	uint64_t slop;
	1008	uint64_t now;
	1009	spl_t s;
	1010
	1011	if (__improbable(event == NO_EVENT))
	1012	panic("%s() called with NO_EVENT", __func__);
	1013
	1014	now = mach_absolute_time();
	1015	clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
	1016	deadline = now + abstime;
	1017
	1018	clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
	1019
	1020	struct waitq *waitq;
	1021	waitq = global_eventq(event);
	1022
	1023	s = splsched();
	1024	waitq_lock(waitq);
	1025
	1026	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	1027	MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)\|DBG_FUNC_NONE,
	1028	VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
	1029
	1030	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	1031	interruptible,
	1032	urgency, deadline, slop,
	1033	thread);
	1034
	1035	waitq_unlock(waitq);
	1036	splx(s);
	1037	return wresult;
	1038	}
	1039
	1040	wait_result_t
	1041	assert_wait_deadline(
	1042	event_t event,
	1043	wait_interrupt_t interruptible,
	1044	uint64_t deadline)
	1045	{
	1046	thread_t thread = current_thread();
	1047	wait_result_t wresult;
	1048	spl_t s;
	1049
	1050	if (__improbable(event == NO_EVENT))
	1051	panic("%s() called with NO_EVENT", __func__);
	1052
	1053	struct waitq *waitq;
	1054	waitq = global_eventq(event);
	1055
	1056	s = splsched();
	1057	waitq_lock(waitq);
	1058
	1059	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	1060	MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)\|DBG_FUNC_NONE,
	1061	VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
	1062
	1063	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	1064	interruptible,
	1065	TIMEOUT_URGENCY_SYS_NORMAL, deadline,
	1066	TIMEOUT_NO_LEEWAY, thread);
	1067	waitq_unlock(waitq);
	1068	splx(s);
	1069	return wresult;
	1070	}
	1071
	1072	wait_result_t
	1073	assert_wait_deadline_with_leeway(
	1074	event_t event,
	1075	wait_interrupt_t interruptible,
	1076	wait_timeout_urgency_t urgency,
	1077	uint64_t deadline,
	1078	uint64_t leeway)
	1079	{
	1080	thread_t thread = current_thread();
	1081	wait_result_t wresult;
	1082	spl_t s;
	1083
	1084	if (__improbable(event == NO_EVENT))
	1085	panic("%s() called with NO_EVENT", __func__);
	1086
	1087	struct waitq *waitq;
	1088	waitq = global_eventq(event);
	1089
	1090	s = splsched();
	1091	waitq_lock(waitq);
	1092
	1093	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	1094	MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)\|DBG_FUNC_NONE,
	1095	VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
	1096
	1097	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	1098	interruptible,
	1099	urgency, deadline, leeway,
	1100	thread);
	1101	waitq_unlock(waitq);
	1102	splx(s);
	1103	return wresult;
	1104	}
	1105
	1106	/*
	1107	* thread_isoncpu:
	1108	*
	1109	* Return TRUE if a thread is running on a processor such that an AST
	1110	* is needed to pull it out of userspace execution, or if executing in
	1111	* the kernel, bring to a context switch boundary that would cause
	1112	* thread state to be serialized in the thread PCB.
	1113	*
	1114	* Thread locked, returns the same way. While locked, fields
	1115	* like "state" cannot change. "runq" can change only from set to unset.
	1116	*/
	1117	static inline boolean_t
	1118	thread_isoncpu(thread_t thread)
	1119	{
	1120	/* Not running or runnable */
	1121	if (!(thread->state & TH_RUN))
	1122	return (FALSE);
	1123
	1124	/* Waiting on a runqueue, not currently running */
	1125	/* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
	1126	if (thread->runq != PROCESSOR_NULL)
	1127	return (FALSE);
	1128
	1129	/*
	1130	* Thread does not have a stack yet
	1131	* It could be on the stack alloc queue or preparing to be invoked
	1132	*/
	1133	if (!thread->kernel_stack)
	1134	return (FALSE);
	1135
	1136	/*
	1137	* Thread must be running on a processor, or
	1138	* about to run, or just did run. In all these
	1139	* cases, an AST to the processor is needed
	1140	* to guarantee that the thread is kicked out
	1141	* of userspace and the processor has
	1142	* context switched (and saved register state).
	1143	*/
	1144	return (TRUE);
	1145	}
	1146
	1147	/*
	1148	* thread_stop:
	1149	*
	1150	* Force a preemption point for a thread and wait
	1151	* for it to stop running on a CPU. If a stronger
	1152	* guarantee is requested, wait until no longer
	1153	* runnable. Arbitrates access among
	1154	* multiple stop requests. (released by unstop)
	1155	*
	1156	* The thread must enter a wait state and stop via a
	1157	* separate means.
	1158	*
	1159	* Returns FALSE if interrupted.
	1160	*/
	1161	boolean_t
	1162	thread_stop(
	1163	thread_t thread,
	1164	boolean_t until_not_runnable)
	1165	{
	1166	wait_result_t wresult;
	1167	spl_t s = splsched();
	1168	boolean_t oncpu;
	1169
	1170	wake_lock(thread);
	1171	thread_lock(thread);
	1172
	1173	while (thread->state & TH_SUSP) {
	1174	thread->wake_active = TRUE;
	1175	thread_unlock(thread);
	1176
	1177	wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
	1178	wake_unlock(thread);
	1179	splx(s);
	1180
	1181	if (wresult == THREAD_WAITING)
	1182	wresult = thread_block(THREAD_CONTINUE_NULL);
	1183
	1184	if (wresult != THREAD_AWAKENED)
	1185	return (FALSE);
	1186
	1187	s = splsched();
	1188	wake_lock(thread);
	1189	thread_lock(thread);
	1190	}
	1191
	1192	thread->state \|= TH_SUSP;
	1193
	1194	while ((oncpu = thread_isoncpu(thread)) \|\|
	1195	(until_not_runnable && (thread->state & TH_RUN))) {
	1196	processor_t processor;
	1197
	1198	if (oncpu) {
	1199	assert(thread->state & TH_RUN);
	1200	processor = thread->chosen_processor;
	1201	cause_ast_check(processor);
	1202	}
	1203
	1204	thread->wake_active = TRUE;
	1205	thread_unlock(thread);
	1206
	1207	wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
	1208	wake_unlock(thread);
	1209	splx(s);
	1210
	1211	if (wresult == THREAD_WAITING)
	1212	wresult = thread_block(THREAD_CONTINUE_NULL);
	1213
	1214	if (wresult != THREAD_AWAKENED) {
	1215	thread_unstop(thread);
	1216	return (FALSE);
	1217	}
	1218
	1219	s = splsched();
	1220	wake_lock(thread);
	1221	thread_lock(thread);
	1222	}
	1223
	1224	thread_unlock(thread);
	1225	wake_unlock(thread);
	1226	splx(s);
	1227
	1228	/*
	1229	* We return with the thread unlocked. To prevent it from
	1230	* transitioning to a runnable state (or from TH_RUN to
	1231	* being on the CPU), the caller must ensure the thread
	1232	* is stopped via an external means (such as an AST)
	1233	*/
	1234
	1235	return (TRUE);
	1236	}
	1237
	1238	/*
	1239	* thread_unstop:
	1240	*
	1241	* Release a previous stop request and set
	1242	* the thread running if appropriate.
	1243	*
	1244	* Use only after a successful stop operation.
	1245	*/
	1246	void
	1247	thread_unstop(
	1248	thread_t thread)
	1249	{
	1250	spl_t s = splsched();
	1251
	1252	wake_lock(thread);
	1253	thread_lock(thread);
	1254
	1255	assert((thread->state & (TH_RUN\|TH_WAIT\|TH_SUSP)) != TH_SUSP);
	1256
	1257	if (thread->state & TH_SUSP) {
	1258	thread->state &= ~TH_SUSP;
	1259
	1260	if (thread->wake_active) {
	1261	thread->wake_active = FALSE;
	1262	thread_unlock(thread);
	1263
	1264	thread_wakeup(&thread->wake_active);
	1265	wake_unlock(thread);
	1266	splx(s);
	1267
	1268	return;
	1269	}
	1270	}
	1271
	1272	thread_unlock(thread);
	1273	wake_unlock(thread);
	1274	splx(s);
	1275	}
	1276
	1277	/*
	1278	* thread_wait:
	1279	*
	1280	* Wait for a thread to stop running. (non-interruptible)
	1281	*
	1282	*/
	1283	void
	1284	thread_wait(
	1285	thread_t thread,
	1286	boolean_t until_not_runnable)
	1287	{
	1288	wait_result_t wresult;
	1289	boolean_t oncpu;
	1290	processor_t processor;
	1291	spl_t s = splsched();
	1292
	1293	wake_lock(thread);
	1294	thread_lock(thread);
	1295
	1296	/*
	1297	* Wait until not running on a CPU. If stronger requirement
	1298	* desired, wait until not runnable. Assumption: if thread is
	1299	* on CPU, then TH_RUN is set, so we're not waiting in any case
	1300	* where the original, pure "TH_RUN" check would have let us
	1301	* finish.
	1302	*/
	1303	while ((oncpu = thread_isoncpu(thread)) \|\|
	1304	(until_not_runnable && (thread->state & TH_RUN))) {
	1305
	1306	if (oncpu) {
	1307	assert(thread->state & TH_RUN);
	1308	processor = thread->chosen_processor;
	1309	cause_ast_check(processor);
	1310	}
	1311
	1312	thread->wake_active = TRUE;
	1313	thread_unlock(thread);
	1314
	1315	wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
	1316	wake_unlock(thread);
	1317	splx(s);
	1318
	1319	if (wresult == THREAD_WAITING)
	1320	thread_block(THREAD_CONTINUE_NULL);
	1321
	1322	s = splsched();
	1323	wake_lock(thread);
	1324	thread_lock(thread);
	1325	}
	1326
	1327	thread_unlock(thread);
	1328	wake_unlock(thread);
	1329	splx(s);
	1330	}
	1331
	1332	/*
	1333	* Routine: clear_wait_internal
	1334	*
	1335	* Clear the wait condition for the specified thread.
	1336	* Start the thread executing if that is appropriate.
	1337	* Arguments:
	1338	* thread thread to awaken
	1339	* result Wakeup result the thread should see
	1340	* Conditions:
	1341	* At splsched
	1342	* the thread is locked.
	1343	* Returns:
	1344	* KERN_SUCCESS thread was rousted out a wait
	1345	* KERN_FAILURE thread was waiting but could not be rousted
	1346	* KERN_NOT_WAITING thread was not waiting
	1347	*/
	1348	__private_extern__ kern_return_t
	1349	clear_wait_internal(
	1350	thread_t thread,
	1351	wait_result_t wresult)
	1352	{
	1353	uint32_t i = LockTimeOutUsec;
	1354	struct waitq *waitq = thread->waitq;
	1355
	1356	do {
	1357	if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT))
	1358	return (KERN_FAILURE);
	1359
	1360	if (waitq != NULL) {
	1361	if (!waitq_pull_thread_locked(waitq, thread)) {
	1362	thread_unlock(thread);
	1363	delay(1);
	1364	if (i > 0 && !machine_timeout_suspended())
	1365	i--;
	1366	thread_lock(thread);
	1367	if (waitq != thread->waitq)
	1368	return KERN_NOT_WAITING;
	1369	continue;
	1370	}
	1371	}
	1372
	1373	/* TODO: Can we instead assert TH_TERMINATE is not set? */
	1374	if ((thread->state & (TH_WAIT\|TH_TERMINATE)) == TH_WAIT)
	1375	return (thread_go(thread, wresult));
	1376	else
	1377	return (KERN_NOT_WAITING);
	1378	} while (i > 0);
	1379
	1380	panic("clear_wait_internal: deadlock: thread=%p, wq=%p, cpu=%d\n",
	1381	thread, waitq, cpu_number());
	1382
	1383	return (KERN_FAILURE);
	1384	}
	1385
	1386
	1387	/*
	1388	* clear_wait:
	1389	*
	1390	* Clear the wait condition for the specified thread. Start the thread
	1391	* executing if that is appropriate.
	1392	*
	1393	* parameters:
	1394	* thread thread to awaken
	1395	* result Wakeup result the thread should see
	1396	*/
	1397	kern_return_t
	1398	clear_wait(
	1399	thread_t thread,
	1400	wait_result_t result)
	1401	{
	1402	kern_return_t ret;
	1403	spl_t s;
	1404
	1405	s = splsched();
	1406	thread_lock(thread);
	1407	ret = clear_wait_internal(thread, result);
	1408	thread_unlock(thread);
	1409	splx(s);
	1410	return ret;
	1411	}
	1412
	1413
	1414	/*
	1415	* thread_wakeup_prim:
	1416	*
	1417	* Common routine for thread_wakeup, thread_wakeup_with_result,
	1418	* and thread_wakeup_one.
	1419	*
	1420	*/
	1421	kern_return_t
	1422	thread_wakeup_prim(
	1423	event_t event,
	1424	boolean_t one_thread,
	1425	wait_result_t result)
	1426	{
	1427	if (__improbable(event == NO_EVENT))
	1428	panic("%s() called with NO_EVENT", __func__);
	1429
	1430	struct waitq *wq = global_eventq(event);
	1431
	1432	if (one_thread)
	1433	return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
	1434	else
	1435	return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
	1436	}
	1437
	1438	/*
	1439	* Wakeup a specified thread if and only if it's waiting for this event
	1440	*/
	1441	kern_return_t
	1442	thread_wakeup_thread(
	1443	event_t event,
	1444	thread_t thread)
	1445	{
	1446	if (__improbable(event == NO_EVENT))
	1447	panic("%s() called with NO_EVENT", __func__);
	1448
	1449	if (__improbable(thread == THREAD_NULL))
	1450	panic("%s() called with THREAD_NULL", __func__);
	1451
	1452	struct waitq *wq = global_eventq(event);
	1453
	1454	return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
	1455	}
	1456
	1457	/*
	1458	* Wakeup a thread waiting on an event and promote it to a priority.
	1459	*
	1460	* Requires woken thread to un-promote itself when done.
	1461	*/
	1462	kern_return_t
	1463	thread_wakeup_one_with_pri(
	1464	event_t event,
	1465	int priority)
	1466	{
	1467	if (__improbable(event == NO_EVENT))
	1468	panic("%s() called with NO_EVENT", __func__);
	1469
	1470	struct waitq *wq = global_eventq(event);
	1471
	1472	return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
	1473	}
	1474
	1475	/*
	1476	* Wakeup a thread waiting on an event,
	1477	* promote it to a priority,
	1478	* and return a reference to the woken thread.
	1479	*
	1480	* Requires woken thread to un-promote itself when done.
	1481	*/
	1482	thread_t
	1483	thread_wakeup_identify(event_t event,
	1484	int priority)
	1485	{
	1486	if (__improbable(event == NO_EVENT))
	1487	panic("%s() called with NO_EVENT", __func__);
	1488
	1489	struct waitq *wq = global_eventq(event);
	1490
	1491	return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
	1492	}
	1493
	1494	/*
	1495	* thread_bind:
	1496	*
	1497	* Force the current thread to execute on the specified processor.
	1498	* Takes effect after the next thread_block().
	1499	*
	1500	* Returns the previous binding. PROCESSOR_NULL means
	1501	* not bound.
	1502	*
	1503	* XXX - DO NOT export this to users - XXX
	1504	*/
	1505	processor_t
	1506	thread_bind(
	1507	processor_t processor)
	1508	{
	1509	thread_t self = current_thread();
	1510	processor_t prev;
	1511	spl_t s;
	1512
	1513	s = splsched();
	1514	thread_lock(self);
	1515
	1516	prev = thread_bind_internal(self, processor);
	1517
	1518	thread_unlock(self);
	1519	splx(s);
	1520
	1521	return (prev);
	1522	}
	1523
	1524	/*
	1525	* thread_bind_internal:
	1526	*
	1527	* If the specified thread is not the current thread, and it is currently
	1528	* running on another CPU, a remote AST must be sent to that CPU to cause
	1529	* the thread to migrate to its bound processor. Otherwise, the migration
	1530	* will occur at the next quantum expiration or blocking point.
	1531	*
	1532	* When the thread is the current thread, and explicit thread_block() should
	1533	* be used to force the current processor to context switch away and
	1534	* let the thread migrate to the bound processor.
	1535	*
	1536	* Thread must be locked, and at splsched.
	1537	*/
	1538
	1539	static processor_t
	1540	thread_bind_internal(
	1541	thread_t thread,
	1542	processor_t processor)
	1543	{
	1544	processor_t prev;
	1545
	1546	/* <rdar://problem/15102234> */
	1547	assert(thread->sched_pri < BASEPRI_RTQUEUES);
	1548	/* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
	1549	assert(thread->runq == PROCESSOR_NULL);
	1550
	1551	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND), thread_tid(thread), processor ? (uintptr_t)processor->cpu_id : (uintptr_t)-1, 0, 0, 0);
	1552
	1553	prev = thread->bound_processor;
	1554	thread->bound_processor = processor;
	1555
	1556	return (prev);
	1557	}
	1558
	1559	/*
	1560	* thread_vm_bind_group_add:
	1561	*
	1562	* The "VM bind group" is a special mechanism to mark a collection
	1563	* of threads from the VM subsystem that, in general, should be scheduled
	1564	* with only one CPU of parallelism. To accomplish this, we initially
	1565	* bind all the threads to the master processor, which has the effect
	1566	* that only one of the threads in the group can execute at once, including
	1567	* preempting threads in the group that are a lower priority. Future
	1568	* mechanisms may use more dynamic mechanisms to prevent the collection
	1569	* of VM threads from using more CPU time than desired.
	1570	*
	1571	* The current implementation can result in priority inversions where
	1572	* compute-bound priority 95 or realtime threads that happen to have
	1573	* landed on the master processor prevent the VM threads from running.
	1574	* When this situation is detected, we unbind the threads for one
	1575	* scheduler tick to allow the scheduler to run the threads on
	1576	* additional CPUs, before restoring the binding (assuming high latency
	1577	* is no longer a problem).
	1578	*/
	1579
	1580	/*
	1581	* The current max is provisioned for:
	1582	* vm_compressor_swap_trigger_thread (92)
	1583	* 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
	1584	* vm_pageout_continue (92)
	1585	* memorystatus_thread (95)
		*
		* That is five threads in total, matching MAX_VM_BIND_GROUP_COUNT.
		* NOTE(review): the parenthesized numbers look like scheduling
		* priorities - confirm.
	1586	*/
	1587	#define MAX_VM_BIND_GROUP_COUNT (5)
		/*
		* Held by thread_vm_bind_group_add() while appending to the list and by
		* sched_vm_group_maintenance() for the whole time it enumerates it.
		*/
	1588	decl_simple_lock_data(static,sched_vm_group_list_lock);
		/* Members of the VM bind group; each entry is a thread that took a reference on itself when added. */
	1589	static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
		/* Number of entries in use at the front of sched_vm_group_thread_list. */
	1590	static int sched_vm_group_thread_count;
		/*
		* TRUE while sched_vm_group_maintenance() has unbound the group (bind
		* target PROCESSOR_NULL) and has not yet completed rebinding it to
		* master_processor.
		*/
	1591	static boolean_t sched_vm_group_temporarily_unbound = FALSE;
	1592
	1593	void
	1594	thread_vm_bind_group_add(void)
	1595	{
	1596	thread_t self = current_thread();
	1597
	1598	thread_reference_internal(self);
	1599	self->options \|= TH_OPT_SCHED_VM_GROUP;
	1600
	1601	simple_lock(&sched_vm_group_list_lock);
	1602	assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
	1603	sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
	1604	simple_unlock(&sched_vm_group_list_lock);
	1605
	1606	thread_bind(master_processor);
	1607
	1608	/* Switch to bound processor if not already there */
	1609	thread_block(THREAD_CONTINUE_NULL);
	1610	}
	1611
	1612	static void
	1613	sched_vm_group_maintenance(void)
	1614	{
	1615	uint64_t ctime = mach_absolute_time();
	1616	uint64_t longtime = ctime - sched_tick_interval;
	1617	int i;
	1618	spl_t s;
	1619	boolean_t high_latency_observed = FALSE;
	1620	boolean_t runnable_and_not_on_runq_observed = FALSE;
	1621	boolean_t bind_target_changed = FALSE;
	1622	processor_t bind_target = PROCESSOR_NULL;
	1623
	1624	/* Make sure nobody attempts to add new threads while we are enumerating them */
	1625	simple_lock(&sched_vm_group_list_lock);
	1626
	1627	s = splsched();
	1628
	1629	for (i=0; i < sched_vm_group_thread_count; i++) {
	1630	thread_t thread = sched_vm_group_thread_list[i];
	1631	assert(thread != THREAD_NULL);
	1632	thread_lock(thread);
	1633	if ((thread->state & (TH_RUN\|TH_WAIT)) == TH_RUN) {
	1634	if (thread->runq != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
	1635	high_latency_observed = TRUE;
	1636	} else if (thread->runq == PROCESSOR_NULL) {
	1637	/* There are some cases where a thread be transitiong that also fall into this case */
	1638	runnable_and_not_on_runq_observed = TRUE;
	1639	}
	1640	}
	1641	thread_unlock(thread);
	1642
	1643	if (high_latency_observed && runnable_and_not_on_runq_observed) {
	1644	/* All the things we are looking for are true, stop looking */
	1645	break;
	1646	}
	1647	}
	1648
	1649	splx(s);
	1650
	1651	if (sched_vm_group_temporarily_unbound) {
	1652	/* If we turned off binding, make sure everything is OK before rebinding */
	1653	if (!high_latency_observed) {
	1654	/* rebind */
	1655	bind_target_changed = TRUE;
	1656	bind_target = master_processor;
	1657	sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
	1658	}
	1659	} else {
	1660	/*
	1661	* Check if we're in a bad state, which is defined by high
	1662	* latency with no core currently executing a thread. If a
	1663	* single thread is making progress on a CPU, that means the
	1664	* binding concept to reduce parallelism is working as
	1665	* designed.
	1666	*/
	1667	if (high_latency_observed && !runnable_and_not_on_runq_observed) {
	1668	/* unbind */
	1669	bind_target_changed = TRUE;
	1670	bind_target = PROCESSOR_NULL;
	1671	sched_vm_group_temporarily_unbound = TRUE;
	1672	}
	1673	}
	1674
	1675	if (bind_target_changed) {
	1676	s = splsched();
	1677	for (i=0; i < sched_vm_group_thread_count; i++) {
	1678	thread_t thread = sched_vm_group_thread_list[i];
	1679	boolean_t removed;
	1680	assert(thread != THREAD_NULL);
	1681
	1682	thread_lock(thread);
	1683	removed = thread_run_queue_remove(thread);
	1684	if (removed \|\| ((thread->state & (TH_RUN \| TH_WAIT)) == TH_WAIT)) {
	1685	thread_bind_internal(thread, bind_target);
	1686	} else {
	1687	/*
	1688	* Thread was in the middle of being context-switched-to,
	1689	* or was in the process of blocking. To avoid switching the bind
	1690	* state out mid-flight, defer the change if possible.
	1691	*/
	1692	if (bind_target == PROCESSOR_NULL) {
	1693	thread_bind_internal(thread, bind_target);
	1694	} else {
	1695	sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
	1696	}
	1697	}
	1698
	1699	if (removed) {
	1700	thread_run_queue_reinsert(thread, SCHED_PREEMPT \| SCHED_TAILQ);
	1701	}
	1702	thread_unlock(thread);
	1703	}
	1704	splx(s);
	1705	}
	1706
	1707	simple_unlock(&sched_vm_group_list_lock);
	1708	}
	1709
	1710	/* Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
	1711	* rebalancing opportunity exists when a core is (instantaneously) idle, but
	1712	* other SMT-capable cores may be over-committed. TODO: some possible negatives:
	1713	* IPI thrash if this core does not remain idle following the load balancing ASTs
	1714	* Idle "thrash", when IPI issue is followed by idle entry/core power down
	1715	* followed by a wakeup shortly thereafter.
	1716	*/
	1717
	1718	#if (DEVELOPMENT \|\| DEBUG)
	1719	int sched_smt_balance = 1;
	1720	#endif
	1721
	1722	#if __SMP__
	1723	/* Invoked with pset locked, returns with pset unlocked */
	1724	void
	1725	sched_SMT_balance(processor_t cprocessor, processor_set_t cpset) {
	1726	processor_t ast_processor = NULL;
	1727
	1728	#if (DEVELOPMENT \|\| DEBUG)
	1729	if (__improbable(sched_smt_balance == 0))
	1730	goto smt_balance_exit;
	1731	#endif
	1732
	1733	assert(cprocessor == current_processor());
	1734	if (cprocessor->is_SMT == FALSE)
	1735	goto smt_balance_exit;
	1736
	1737	processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;
	1738
	1739	/* Determine if both this processor and its sibling are idle,
	1740	* indicating an SMT rebalancing opportunity.
	1741	*/
	1742	if (sib_processor->state != PROCESSOR_IDLE)
	1743	goto smt_balance_exit;
	1744
	1745	processor_t sprocessor;
	1746
	1747	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	1748	qe_foreach_element(sprocessor, &cpset->active_queue, processor_queue) {
	1749	if ((sprocessor->state == PROCESSOR_RUNNING) &&
	1750	(sprocessor->processor_primary != sprocessor) &&
	1751	(sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
	1752	(sprocessor->current_pri < BASEPRI_RTQUEUES)) {
	1753
	1754	ipi_type = sched_ipi_action(sprocessor, NULL, false, SCHED_IPI_EVENT_SMT_REBAL);
	1755	if (ipi_type != SCHED_IPI_NONE) {
	1756	assert(sprocessor != cprocessor);
	1757	ast_processor = sprocessor;
	1758	break;
	1759	}
	1760	}
	1761	}
	1762
	1763	smt_balance_exit:
	1764	pset_unlock(cpset);
	1765
	1766	if (ast_processor) {
	1767	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
	1768	sched_ipi_perform(ast_processor, ipi_type);
	1769	}
	1770	}
	1771	#else
	1772	/* Invoked with pset locked, returns with pset unlocked */
	1773	void
	1774	sched_SMT_balance(__unused processor_t cprocessor, processor_set_t cpset)
	1775	{
	1776	pset_unlock(cpset);
	1777	}
	1778	#endif /* __SMP__ */
	1779
		/* Forward declarations of file-local helpers that thread_select() calls below. */
	1780	static processor_t choose_processor_for_realtime_thread(processor_set_t pset);
	1781	static bool all_available_primaries_are_running_realtime_threads(processor_set_t pset);
		/*
		* When nonzero, thread_select() lets a secondary SMT processor whose
		* primary is at realtime priority look for work, provided the RT
		* runqueue is non-empty and
		* all_available_primaries_are_running_realtime_threads() is true.
		*/
	1782	int sched_allow_rt_smt = 1;
	1783
	1784	/*
	1785	* thread_select:
	1786	*
	1787	* Select a new thread for the current processor to execute.
	1788	*
	1789	* May select the current thread, which must be locked.
		*
		* Parameters:
		* thread the thread to consider keeping; asserted to have TH_RUN set
		* and TH_TERMINATE2 clear
		* processor asserted to be current_processor()
		* reason in/out AST flags; AST_REBALANCE is OR'd in when the thread
		* has to leave this processor because of SMT rebalance, affinity
		* mismatch, binding elsewhere, or processor avoidance
		*
		* Returns "thread" itself, a thread taken from the RT runqueue, a thread
		* from SCHED(choose_thread) or SCHED(steal_thread), or the processor's
		* idle thread. Loops until one of these is found.
		*
		* The pset lock is taken inside the loop and is released on every
		* return path.
	1790	*/
	1791	static thread_t
	1792	thread_select(thread_t thread,
	1793	processor_t processor,
	1794	ast_t *reason)
	1795	{
	1796	processor_set_t pset = processor->processor_set;
	1797	thread_t new_thread = THREAD_NULL;
	1798
	1799	assert(processor == current_processor());
	1800	assert((thread->state & (TH_RUN\|TH_TERMINATE2)) == TH_RUN);
	1801
	1802	do {
	1803	/*
	1804	* Update the priority.
	1805	*/
	1806	if (SCHED(can_update_priority)(thread))
	1807	SCHED(update_priority)(thread);
	1808
	1809	processor_state_update_from_thread(processor, thread);
	1810
	1811	pset_lock(pset);
	1812
	1813	assert(processor->state != PROCESSOR_OFF_LINE);
	1814
	1815	if (!processor->is_recommended) {
	1816	/*
	1817	* The performance controller has provided a hint to not dispatch more threads,
	1818	* unless they are bound to us (and thus we are the only option).
	1819	*/
	1820	if (!SCHED(processor_bound_count)(processor)) {
	1821	goto idle;
	1822	}
	1823	} else if (processor->processor_primary != processor) {
	1824	/*
	1825	* Should this secondary SMT processor attempt to find work? For pset runqueue systems,
	1826	* we should look for work only under the same conditions that choose_processor()
	1827	* would have assigned work, which is when all primary processors have been assigned work.
	1828	*
	1829	* An exception is that bound threads are dispatched to a processor without going through
	1830	* choose_processor(), so in those cases we should continue trying to dequeue work.
	1831	*/
	1832	if (!SCHED(processor_bound_count)(processor)) {
	1833	if (!queue_empty(&pset->idle_queue)) {
	1834	goto idle;
	1835	}
	1836
	1837	/* There are no idle primaries */
	1838
	1839	if (processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) {
	1840	bool secondary_can_run_realtime_thread = sched_allow_rt_smt && rt_runq_count(pset) && all_available_primaries_are_running_realtime_threads(pset);
	1841	if (!secondary_can_run_realtime_thread) {
	1842	goto idle;
	1843	}
	1844	}
	1845	}
	1846	}
	1847
	1848	/*
	1849	* Test to see if the current thread should continue
	1850	* to run on this processor. Must not be attempting to wait, and not
	1851	* bound to a different processor, nor be in the wrong
	1852	* processor set, nor be forced to context switch by TH_SUSP.
	1853	*
	1854	* Note that there are never any RT threads in the regular runqueue.
	1855	*
	1856	* This code is very insanely tricky.
	1857	*/
	1858
	1859	/* i.e. not waiting, not TH_SUSP'ed */
	1860	boolean_t still_running = ((thread->state & (TH_TERMINATE\|TH_IDLE\|TH_WAIT\|TH_RUN\|TH_SUSP)) == TH_RUN);
	1861
	1862	/*
	1863	* Threads running on SMT processors are forced to context switch. Don't rebalance realtime threads.
	1864	* TODO: This should check if it's worth it to rebalance, i.e. 'are there any idle primary processors'
	1865	*/
	1866	boolean_t needs_smt_rebalance = (thread->sched_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor);
	1867
	1868	boolean_t affinity_mismatch = (thread->affinity_set != AFFINITY_SET_NULL && thread->affinity_set->aset_pset != pset);
	1869
	1870	boolean_t bound_elsewhere = (thread->bound_processor != PROCESSOR_NULL && thread->bound_processor != processor);
	1871
	1872	boolean_t avoid_processor = (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread));
	1873
	1874	if (still_running && !needs_smt_rebalance && !affinity_mismatch && !bound_elsewhere && !avoid_processor) {
	1875	/*
	1876	* This thread is eligible to keep running on this processor.
	1877	*
	1878	* RT threads with un-expired quantum stay on processor,
	1879	* unless there's a valid RT thread with an earlier deadline.
	1880	*/
	1881	if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
	1882	if (rt_runq_count(pset) > 0) {
	1883
	1884	rt_lock_lock(pset);
	1885
		/* Re-check the count now that the RT lock is held. */
	1886	if (rt_runq_count(pset) > 0) {
	1887
	1888	thread_t next_rt = qe_queue_first(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links);
	1889
	1890	if (next_rt->realtime.deadline < processor->deadline &&
	1891	(next_rt->bound_processor == PROCESSOR_NULL \|\|
	1892	next_rt->bound_processor == processor)) {
	1893	/* The next RT thread is better, so pick it off the runqueue. */
		/* Jumps into the RT dequeue path below with the RT lock still held. */
	1894	goto pick_new_rt_thread;
	1895	}
	1896	}
	1897
	1898	rt_lock_unlock(pset);
	1899	}
	1900
	1901	/* This is still the best RT thread to run. */
	1902	processor->deadline = thread->realtime.deadline;
	1903
	1904	sched_update_pset_load_average(pset);
	1905
	1906	processor_t next_rt_processor = PROCESSOR_NULL;
	1907	sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;
	1908
		/* RT work is still queued: pick a processor for it and work out which IPI to send. */
	1909	if (rt_runq_count(pset) > 0) {
	1910	next_rt_processor = choose_processor_for_realtime_thread(pset);
	1911	if (next_rt_processor) {
	1912	next_rt_ipi_type = sched_ipi_action(next_rt_processor, NULL, false, SCHED_IPI_EVENT_PREEMPT);
	1913	}
	1914	}
	1915	pset_unlock(pset);
	1916
		/* The IPI is performed only after the pset lock has been dropped. */
	1917	if (next_rt_processor) {
	1918	sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
	1919	}
	1920
	1921	return (thread);
	1922	}
	1923
	1924	if ((rt_runq_count(pset) == 0) &&
	1925	SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
	1926	/* This thread is still the highest priority runnable (non-idle) thread */
	1927	processor->deadline = UINT64_MAX;
	1928
	1929	sched_update_pset_load_average(pset);
	1930	pset_unlock(pset);
	1931
	1932	return (thread);
	1933	}
	1934	} else {
	1935	/*
	1936	* This processor must context switch.
	1937	* If it's due to a rebalance, we should aggressively find this thread a new home.
	1938	*/
	1939	if (needs_smt_rebalance \|\| affinity_mismatch \|\| bound_elsewhere \|\| avoid_processor)
	1940	*reason \|= AST_REBALANCE;
	1941	}
	1942
	1943	/* OK, so we're not going to run the current thread. Look at the RT queue. */
	1944	if (rt_runq_count(pset) > 0) {
	1945
	1946	rt_lock_lock(pset);
	1947
	1948	if (rt_runq_count(pset) > 0) {
	1949	thread_t next_rt = qe_queue_first(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links);
	1950
	1951	if (__probable((next_rt->bound_processor == PROCESSOR_NULL \|\|
	1952	(next_rt->bound_processor == processor)))) {
		/* Also the target of the goto in the "keep running" RT path above. */
	1953	pick_new_rt_thread:
	1954	new_thread = qe_dequeue_head(&SCHED(rt_runq)(pset)->queue, struct thread, runq_links);
	1955
	1956	new_thread->runq = PROCESSOR_NULL;
	1957	SCHED_STATS_RUNQ_CHANGE(&SCHED(rt_runq)(pset)->runq_stats, rt_runq_count(pset));
	1958	rt_runq_count_decr(pset);
	1959
	1960	processor->deadline = new_thread->realtime.deadline;
	1961	processor_state_update_from_thread(processor, new_thread);
	1962
	1963	rt_lock_unlock(pset);
	1964	sched_update_pset_load_average(pset);
	1965
	1966	processor_t ast_processor = PROCESSOR_NULL;
	1967	processor_t next_rt_processor = PROCESSOR_NULL;
	1968	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	1969	sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE;
	1970
		/* If this processor has a running or dispatching secondary, prepare an SMT rebalance IPI for it. */
	1971	if (processor->processor_secondary != NULL) {
	1972	processor_t sprocessor = processor->processor_secondary;
	1973	if ((sprocessor->state == PROCESSOR_RUNNING) \|\| (sprocessor->state == PROCESSOR_DISPATCHING)) {
	1974	ipi_type = sched_ipi_action(sprocessor, NULL, false, SCHED_IPI_EVENT_SMT_REBAL);
	1975	ast_processor = sprocessor;
	1976	}
	1977	}
	1978	if (rt_runq_count(pset) > 0) {
	1979	next_rt_processor = choose_processor_for_realtime_thread(pset);
	1980	if (next_rt_processor) {
	1981	next_rt_ipi_type = sched_ipi_action(next_rt_processor, NULL, false, SCHED_IPI_EVENT_PREEMPT);
	1982	}
	1983	}
	1984	pset_unlock(pset);
	1985
	1986	if (ast_processor) {
	1987	sched_ipi_perform(ast_processor, ipi_type);
	1988	}
	1989
	1990	if (next_rt_processor) {
	1991	sched_ipi_perform(next_rt_processor, next_rt_ipi_type);
	1992	}
	1993
	1994	return (new_thread);
	1995	}
	1996	}
	1997
	1998	rt_lock_unlock(pset);
	1999	}
	2000
	2001	processor->deadline = UINT64_MAX;
	2002
	2003	/* No RT threads, so let's look at the regular threads. */
	2004	if ((new_thread = SCHED(choose_thread)(processor, MINPRI, *reason)) != THREAD_NULL) {
	2005	sched_update_pset_load_average(pset);
	2006	processor_state_update_from_thread(processor, new_thread);
	2007	pset_unlock(pset);
	2008	return (new_thread);
	2009	}
	2010
	2011	#if __SMP__
	2012	if (SCHED(steal_thread_enabled)) {
	2013	/*
	2014	* No runnable threads, attempt to steal
	2015	* from other processors. Returns with pset lock dropped.
	2016	*/
	2017
	2018	if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
	2019	return (new_thread);
	2020	}
	2021
	2022	/*
	2023	* If other threads have appeared, shortcut
	2024	* around again.
	2025	*/
		/* new_thread is THREAD_NULL here, so "continue" re-enters the loop; the pset lock is retaken at the top. */
	2026	if (!SCHED(processor_queue_empty)(processor) \|\| rt_runq_count(pset) > 0)
	2027	continue;
	2028
	2029	pset_lock(pset);
	2030	}
	2031	#endif
	2032
	2033	idle:
	2034	/*
	2035	* Nothing is runnable, so set this processor idle if it
	2036	* was running.
	2037	*/
	2038	if (processor->state == PROCESSOR_RUNNING) {
	2039	processor->state = PROCESSOR_IDLE;
	2040
		/* Queue choice: unused_queue if not recommended, idle_queue for a primary, idle_secondary_queue otherwise. */
	2041	if (!processor->is_recommended) {
	2042	re_queue_head(&pset->unused_queue, &processor->processor_queue);
	2043	} else if (processor->processor_primary == processor) {
	2044	re_queue_head(&pset->idle_queue, &processor->processor_queue);
	2045	} else {
	2046	re_queue_head(&pset->idle_secondary_queue, &processor->processor_queue);
	2047	}
	2048
	2049	pset->active_processor_count--;
	2050	sched_update_pset_load_average(pset);
	2051	}
	2052
	2053	#if __SMP__
	2054	/* Invoked with pset locked, returns with pset unlocked */
	2055	SCHED(processor_balance)(processor, pset);
	2056	#else
	2057	pset_unlock(pset);
	2058	#endif
	2059
	2060	#if CONFIG_SCHED_IDLE_IN_PLACE
	2061	/*
	2062	* Choose idle thread if fast idle is not possible.
	2063	*/
	2064	if (processor->processor_primary != processor)
	2065	return (processor->idle_thread);
	2066
	2067	if ((thread->state & (TH_IDLE\|TH_TERMINATE\|TH_SUSP)) \|\| !(thread->state & TH_WAIT) \|\| thread->wake_active \|\| thread->sched_pri >= BASEPRI_RTQUEUES)
	2068	return (processor->idle_thread);
	2069
	2070	/*
	2071	* Perform idling activities directly without a
	2072	* context switch. Return dispatched thread,
	2073	* else check again for a runnable thread.
	2074	*/
	2075	new_thread = thread_select_idle(thread, processor);
	2076
	2077	#else /* !CONFIG_SCHED_IDLE_IN_PLACE */
	2078
	2079	/*
	2080	* Do a full context switch to idle so that the current
	2081	* thread can start running on another processor without
	2082	* waiting for the fast-idled processor to wake up.
	2083	*/
	2084	new_thread = processor->idle_thread;
	2085
	2086	#endif /* !CONFIG_SCHED_IDLE_IN_PLACE */
	2087
	2088	} while (new_thread == THREAD_NULL);
	2089
	2090	return (new_thread);
	2091	}
	2092
	2093	#if CONFIG_SCHED_IDLE_IN_PLACE
	2094	/*
	2095	* thread_select_idle:
	2096	*
	2097	* Idle the processor using the current thread context.
	2098	*
	2099	* Called with thread locked, then dropped and relocked.
		*
		* "thread" is the current thread, which is marked TH_IDLE for the
		* duration; "processor" is the processor being idled. Timing is
		* switched to the processor's idle thread and the quantum timer is
		* cancelled before processor_idle() is entered with interrupts enabled.
		*
		* Returns the value returned by processor_idle(). If "thread" is no
		* longer TH_WAIT at that point, its system timer and a new quantum
		* are started before returning.
	2100	*/
	2101	static thread_t
	2102	thread_select_idle(
	2103	thread_t thread,
	2104	processor_t processor)
	2105	{
	2106	thread_t new_thread;
	2107	uint64_t arg1, arg2;
	2108	int urgency;
	2109
	2110	sched_run_decr(thread);
	2111
	2112	thread->state \|= TH_IDLE;
		/* Must name the "processor" argument; no other processor variable is in scope. */
	2113	processor_state_update_idle(processor);
	2114
	2115	/* Reload precise timing global policy to thread-local policy */
	2116	thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
	2117
	2118	thread_unlock(thread);
	2119
	2120	/*
	2121	* Switch execution timing to processor idle thread.
	2122	*/
	2123	processor->last_dispatch = mach_absolute_time();
	2124
	2125	#ifdef CONFIG_MACH_APPROXIMATE_TIME
	2126	commpage_update_mach_approximate_time(processor->last_dispatch);
	2127	#endif
	2128
	2129	thread->last_run_time = processor->last_dispatch;
	2130	thread_timer_event(processor->last_dispatch, &processor->idle_thread->system_timer);
	2131	PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer;
	2132
	2133
	2134	/*
	2135	* Cancel the quantum timer while idling.
	2136	*/
	2137	timer_call_quantum_timer_cancel(&processor->quantum_timer);
	2138	processor->first_timeslice = FALSE;
	2139
	2140	(*thread->sched_call)(SCHED_CALL_BLOCK, thread);
	2141
	2142	thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, NULL);
	2143
	2144	/*
	2145	* Enable interrupts and perform idling activities. No
	2146	* preemption due to TH_IDLE being set.
	2147	*/
	2148	spllo(); new_thread = processor_idle(thread, processor);
	2149
	2150	/*
	2151	* Return at splsched.
	2152	*/
	2153	(*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
	2154
	2155	thread_lock(thread);
	2156
	2157	/*
	2158	* If awakened, switch to thread timer and start a new quantum.
	2159	* Otherwise skip; we will context switch to another thread or return here.
	2160	*/
	2161	if (!(thread->state & TH_WAIT)) {
	2162	processor->last_dispatch = mach_absolute_time();
	2163	thread_timer_event(processor->last_dispatch, &thread->system_timer);
	2164	PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
	2165	thread_quantum_init(thread);
	2166	processor->quantum_end = processor->last_dispatch + thread->quantum_remaining;
	2167	timer_call_quantum_timer_enter(&processor->quantum_timer,
	2168	thread, processor->quantum_end, processor->last_dispatch);
	2169	processor->first_timeslice = TRUE;
	2170
	2171	thread->computation_epoch = processor->last_dispatch;
	2172	}
	2173
	2174	thread->state &= ~TH_IDLE;
	2175
	2176	urgency = thread_get_urgency(thread, &arg1, &arg2);
	2177
	2178	thread_tell_urgency(urgency, arg1, arg2, 0, new_thread);
	2179
		/* Balances the sched_run_decr() done on entry. */
	2180	sched_run_incr(thread);
	2181
	2182	return (new_thread);
	2183	}
	2184	#endif /* CONFIG_SCHED_IDLE_IN_PLACE */
	2185
	2186	/*
	2187	* thread_invoke
	2188	*
	2189	* Called at splsched with neither thread locked.
	2190	*
	2191	* Perform a context switch and start executing the new thread.
	2192	*
	2193	* Returns FALSE when the context switch didn't happen.
	2194	* The reference to the new thread is still consumed.
	2195	*
	2196	* "self" is what is currently running on the processor,
	2197	* "thread" is the new thread to context switch to
	2198	* (which may be the same thread in some cases)
		*
		* "reason" is stored into self->reason on the paths that switch to a
		* different thread.
		*
		* Paths taken:
		* - "self" has a continuation and "thread" has no kernel stack: the
		* stack is handed off with stack_handoff() and the continuation of
		* "thread" is called (does not return), unless "self" is running on
		* its reserved stack and "thread" has none, in which case a stack
		* is allocated for "thread" instead.
		* - "thread" is "self": with a continuation the continuation is called
		* (does not return); without one TRUE is returned right away.
		* - otherwise: full switch through machine_switch_context().
		*
		* FALSE is returned only when stack_alloc_try() fails for "thread";
		* the thread is then handed to thread_stack_enqueue().
		*
		* Panics if the preemption level is not zero on entry.
	2199	*/
	2200	static boolean_t
	2201	thread_invoke(
	2202	thread_t self,
	2203	thread_t thread,
	2204	ast_t reason)
	2205	{
	2206	if (__improbable(get_preemption_level() != 0)) {
	2207	int pl = get_preemption_level();
	2208	panic("thread_invoke: preemption_level %d, possible cause: %s",
	2209	pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
	2210	"blocking while holding a spinlock, or within interrupt context"));
	2211	}
	2212
		/*
		* Start from self's continuation; the stack handoff path below
		* replaces both locals with the values from "thread".
		*/
	2213	thread_continue_t continuation = self->continuation;
	2214	void *parameter = self->parameter;
	2215	processor_t processor;
	2216
		/* Single timestamp used for all accounting in this switch. */
	2217	uint64_t ctime = mach_absolute_time();
	2218
	2219	#ifdef CONFIG_MACH_APPROXIMATE_TIME
	2220	commpage_update_mach_approximate_time(ctime);
	2221	#endif
	2222
	2223	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	2224	if ((thread->state & TH_IDLE) == 0)
	2225	sched_timeshare_consider_maintenance(ctime);
	2226	#endif
	2227
	2228	#if MONOTONIC
	2229	mt_sched_update(self);
	2230	#endif /* MONOTONIC */
	2231
	2232	assert_thread_magic(self);
	2233	assert(self == current_thread());
	2234	assert(self->runq == PROCESSOR_NULL);
	2235	assert((self->state & (TH_RUN\|TH_TERMINATE2)) == TH_RUN);
	2236
	2237	thread_lock(thread);
	2238
	2239	assert_thread_magic(thread);
	2240	assert((thread->state & (TH_RUN\|TH_WAIT\|TH_UNINT\|TH_TERMINATE\|TH_TERMINATE2)) == TH_RUN);
	2241	assert(thread->bound_processor == PROCESSOR_NULL \|\| thread->bound_processor == current_processor());
	2242	assert(thread->runq == PROCESSOR_NULL);
	2243
	2244	/* Reload precise timing global policy to thread-local policy */
	2245	thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
	2246
	2247	/* Update SFI class based on other factors */
	2248	thread->sfi_class = sfi_thread_classify(thread);
	2249
	2250	/* Update the same_pri_latency for the thread (used by perfcontrol callouts) */
	2251	thread->same_pri_latency = ctime - thread->last_basepri_change_time;
	2252	/*
	2253	* In case a base_pri update happened between the timestamp and
	2254	* taking the thread lock
	2255	*/
	2256	if (ctime <= thread->last_basepri_change_time)
	2257	thread->same_pri_latency = ctime - thread->last_made_runnable_time;
	2258
	2259	/* Allow realtime threads to hang onto a stack. */
	2260	if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack)
	2261	self->reserved_stack = self->kernel_stack;
	2262
	2263	/* Prepare for spin debugging */
	2264	#if INTERRUPT_MASKED_DEBUG
	2265	ml_spin_debug_clear(thread);
	2266	#endif
	2267
	2268	if (continuation != NULL) {
	2269	if (!thread->kernel_stack) {
	2270	/*
	2271	* If we are using a privileged stack,
	2272	* check to see whether we can exchange it with
	2273	* that of the other thread.
	2274	*/
	2275	if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack)
	2276	goto need_stack;
	2277
	2278	/*
	2279	* Context switch by performing a stack handoff.
	2280	*/
	2281	continuation = thread->continuation;
	2282	parameter = thread->parameter;
	2283
	2284	processor = current_processor();
	2285	processor->active_thread = thread;
	2286	processor_state_update_from_thread(processor, thread);
	2287
		/* Count processor and processor-set migrations for "thread". */
	2288	if (thread->last_processor != processor && thread->last_processor != NULL) {
	2289	if (thread->last_processor->processor_set != processor->processor_set)
	2290	thread->ps_switch++;
	2291	thread->p_switch++;
	2292	}
	2293	thread->last_processor = processor;
	2294	thread->c_switch++;
	2295	ast_context(thread);
	2296
	2297	thread_unlock(thread);
	2298
	2299	self->reason = reason;
	2300
	2301	processor->last_dispatch = ctime;
	2302	self->last_run_time = ctime;
	2303	thread_timer_event(ctime, &thread->system_timer);
	2304	PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
	2305
	2306	/*
	2307	* Since non-precise user/kernel time doesn't update the state timer
	2308	* during privilege transitions, synthesize an event now.
	2309	*/
	2310	if (!thread->precise_user_kernel_time) {
	2311	timer_switch(PROCESSOR_DATA(processor, current_state),
	2312	ctime,
	2313	PROCESSOR_DATA(processor, current_state));
	2314	}
	2315
	2316	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2317	MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)\|DBG_FUNC_NONE,
	2318	self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
	2319
	2320	if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
	2321	SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)\|DBG_FUNC_NONE,
	2322	(uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
	2323	}
	2324
	2325	DTRACE_SCHED2(off__cpu, struct thread , thread, struct proc , thread->task->bsd_info);
	2326
	2327	SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
	2328
	2329	TLOG(1, "thread_invoke: calling stack_handoff\n");
	2330	stack_handoff(self, thread);
	2331
	2332	/* 'self' is now off core */
	2333	assert(thread == current_thread());
	2334
	2335	DTRACE_SCHED(on__cpu);
	2336
	2337	#if KPERF
	2338	kperf_on_cpu(thread, continuation, NULL);
	2339	#endif /* KPERF */
	2340
	2341	#if KASAN
	2342	kasan_unpoison_fakestack(self);
	2343	kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
	2344	#endif
	2345
		/* Here "self" is the thread we left and "thread" is now running. */
	2346	thread_dispatch(self, thread);
	2347
	2348	thread->continuation = thread->parameter = NULL;
	2349
	2350	counter(c_thread_invoke_hits++);
	2351
	2352	(void) spllo();
	2353
	2354	assert(continuation);
	2355	call_continuation(continuation, parameter, thread->wait_result);
	2356	/NOTREACHED/
	2357	}
	2358	else if (thread == self) {
	2359	/* same thread but with continuation */
	2360	ast_context(self);
	2361	counter(++c_thread_invoke_same);
	2362
	2363	thread_unlock(self);
	2364
	2365	#if KPERF
	2366	kperf_on_cpu(thread, continuation, NULL);
	2367	#endif /* KPERF */
	2368
	2369	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2370	MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) \| DBG_FUNC_NONE,
	2371	self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
	2372
	2373	#if KASAN
	2374	kasan_unpoison_fakestack(self);
	2375	kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
	2376	#endif
	2377
	2378	self->continuation = self->parameter = NULL;
	2379
	2380	(void) spllo();
	2381
	2382	call_continuation(continuation, parameter, self->wait_result);
	2383	/NOTREACHED/
	2384	}
	2385	} else {
	2386	/*
	2387	* Check that the other thread has a stack
	2388	*/
	2389	if (!thread->kernel_stack) {
	2390	need_stack:
		/*
		* stack_alloc_try() failed: queue the thread with
		* thread_stack_enqueue() and report that no switch happened.
		*/
	2391	if (!stack_alloc_try(thread)) {
	2392	counter(c_thread_invoke_misses++);
	2393	thread_unlock(thread);
	2394	thread_stack_enqueue(thread);
	2395	return (FALSE);
	2396	}
	2397	} else if (thread == self) {
		/* Same thread, no continuation: nothing to switch. */
	2398	ast_context(self);
	2399	counter(++c_thread_invoke_same);
	2400	thread_unlock(self);
	2401
	2402	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2403	MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) \| DBG_FUNC_NONE,
	2404	self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
	2405
	2406	return (TRUE);
	2407	}
	2408	}
	2409
	2410	/*
	2411	* Context switch by full context save.
	2412	*/
	2413	processor = current_processor();
	2414	processor->active_thread = thread;
	2415	processor_state_update_from_thread(processor, thread);
	2416
		/* Count processor and processor-set migrations for "thread". */
	2417	if (thread->last_processor != processor && thread->last_processor != NULL) {
	2418	if (thread->last_processor->processor_set != processor->processor_set)
	2419	thread->ps_switch++;
	2420	thread->p_switch++;
	2421	}
	2422	thread->last_processor = processor;
	2423	thread->c_switch++;
	2424	ast_context(thread);
	2425
	2426	thread_unlock(thread);
	2427
	2428	counter(c_thread_invoke_csw++);
	2429
	2430	self->reason = reason;
	2431
	2432	processor->last_dispatch = ctime;
	2433	self->last_run_time = ctime;
	2434	thread_timer_event(ctime, &thread->system_timer);
	2435	PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
	2436
	2437	/*
	2438	* Since non-precise user/kernel time doesn't update the state timer
	2439	* during privilege transitions, synthesize an event now.
	2440	*/
	2441	if (!thread->precise_user_kernel_time) {
	2442	timer_switch(PROCESSOR_DATA(processor, current_state),
	2443	ctime,
	2444	PROCESSOR_DATA(processor, current_state));
	2445	}
	2446
	2447	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2448	MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) \| DBG_FUNC_NONE,
	2449	self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
	2450
	2451	if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
	2452	SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)\|DBG_FUNC_NONE,
	2453	(uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
	2454	}
	2455
	2456	DTRACE_SCHED2(off__cpu, struct thread , thread, struct proc , thread->task->bsd_info);
	2457
	2458	SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
	2459
	2460	/*
	2461	* This is where we actually switch register context,
	2462	* and address space if required. We will next run
	2463	* as a result of a subsequent context switch.
	2464	*
	2465	* Once registers are switched and the processor is running "thread",
	2466	* the stack variables and non-volatile registers will contain whatever
	2467	* was there the last time that thread blocked. No local variables should
	2468	* be used after this point, except for the special case of "thread", which
	2469	* the platform layer returns as the previous thread running on the processor
	2470	* via the function call ABI as a return register, and "self", which may have
	2471	* been stored on the stack or a non-volatile register, but a stale idea of
	2472	* what was on the CPU is newly-accurate because that thread is again
	2473	* running on the CPU.
	2474	*/
	2475	assert(continuation == self->continuation);
	2476	thread = machine_switch_context(self, continuation, thread);
	2477	assert(self == current_thread());
	2478	TLOG(1,"thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
	2479
	2480	DTRACE_SCHED(on__cpu);
	2481
	2482	#if KPERF
	2483	kperf_on_cpu(self, NULL, __builtin_frame_address(0));
	2484	#endif /* KPERF */
	2485
	2486	/*
	2487	* We have been resumed and are set to run.
	2488	*/
	2489	thread_dispatch(thread, self);
	2490
	2491	if (continuation) {
	2492	self->continuation = self->parameter = NULL;
	2493
	2494	(void) spllo();
	2495
	2496	call_continuation(continuation, parameter, self->wait_result);
	2497	/NOTREACHED/
	2498	}
	2499
	2500	return (TRUE);
	2501	}
	2502
	2503	#if defined(CONFIG_SCHED_DEFERRED_AST)
	2504	/*
	2505	* pset_cancel_deferred_dispatch:
	2506	*
	2507	* Cancels all ASTs that we can cancel for the given processor set
	2508	* if the current processor is running the last runnable thread in the
	2509	* system.
	2510	*
	2511	* This function assumes the current thread is runnable. This must
	2512	* be called with the pset unlocked.
		*
		* "pset" is the processor set whose active_queue is scanned; its lock
		* is taken and released inside this function. "processor" is the
		* caller's processor and is never rolled back.
	2513	*/
	2514	static void
	2515	pset_cancel_deferred_dispatch(
	2516	processor_set_t pset,
	2517	processor_t processor)
	2518	{
	2519	processor_t active_processor = NULL;
	2520	uint32_t sampled_sched_run_count;
	2521
	2522	pset_lock(pset);
		/* Sample the run count once, after the pset lock is held. */
	2523	sampled_sched_run_count = (volatile uint32_t) sched_run_buckets[TH_BUCKET_RUN];
	2524
	2525	/*
	2526	* If we have emptied the run queue, and our current thread is runnable, we
	2527	* should tell any processors that are still DISPATCHING that they will
	2528	* probably not have any work to do. In the event that there are no
	2529	* pending signals that we can cancel, this is also uninteresting.
	2530	*
	2531	* In the unlikely event that another thread becomes runnable while we are
	2532	* doing this (sched_run_count is atomically updated, not guarded), the
	2533	* codepath making it runnable SHOULD (a dangerous word) need the pset lock
	2534	* in order to dispatch it to a processor in our pset. So, the other
	2535	* codepath will wait while we squash all cancelable ASTs, get the pset
	2536	* lock, and then dispatch the freshly runnable thread. So this should be
	2537	* correct (we won't accidentally have a runnable thread that hasn't been
	2538	* dispatched to an idle processor), if not ideal (we may be restarting the
	2539	* dispatch process, which could have some overhead).
	2540	*
	2541	*/
	2542	if ((sampled_sched_run_count == 1) &&
	2543	(pset->pending_deferred_AST_cpu_mask)) {
	2544	qe_foreach_element_safe(active_processor, &pset->active_queue, processor_queue) {
	2545	/*
	2546	* If a processor is DISPATCHING, it could be because of
	2547	* a cancelable signal.
	2548	*
	2549	* IF the processor is not our
	2550	* current processor (the current processor should not
	2551	* be DISPATCHING, so this is a bit paranoid), AND there
	2552	* is a cancelable signal pending on the processor, AND
	2553	* there is no non-cancelable signal pending (as there is
	2554	* no point trying to backtrack on bringing the processor
	2555	* up if a signal we cannot cancel is outstanding), THEN
	2556	* it should make sense to roll back the processor state
	2557	* to the IDLE state.
	2558	*
	2559	* If the racy nature of this approach (as the signal
	2560	* will be arbitrated by hardware, and can fire as we
	2561	* roll back state) results in the core responding
	2562	* despite being pushed back to the IDLE state, it
	2563	* should be no different than if the core took some
	2564	* interrupt while IDLE.
	2565	*/
	2566	if ((active_processor->state == PROCESSOR_DISPATCHING) &&
	2567	(bit_test(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id)) &&
	2568	(!bit_test(pset->pending_AST_cpu_mask, active_processor->cpu_id)) &&
	2569	(active_processor != processor)) {
	2570	/*
	2571	* Squash all of the processor state back to some
	2572	* reasonable facsimile of PROCESSOR_IDLE.
	2573	*
	2574	* TODO: What queue policy do we actually want here?
	2575	* We want to promote selection of a good processor
	2576	* to run on. Do we want to enqueue at the head?
	2577	* The tail? At the (relative) old position in the
	2578	* queue? Or something else entirely?
	2579	*/
		/* Move to the head of the unused, idle or idle-secondary queue. */
	2580	if (!active_processor->is_recommended) {
	2581	re_queue_head(&pset->unused_queue, &active_processor->processor_queue);
	2582	} else if (active_processor->processor_primary == active_processor) {
	2583	re_queue_head(&pset->idle_queue, &active_processor->processor_queue);
	2584	} else {
	2585	re_queue_head(&pset->idle_secondary_queue, &active_processor->processor_queue);
	2586	}
	2587
	2588	pset->active_processor_count--;
	2589	sched_update_pset_load_average(pset);
	2590
	2591	assert(active_processor->next_thread == THREAD_NULL);
	2592	processor_state_update_idle(active_processor);
	2593	active_processor->deadline = UINT64_MAX;
	2594	active_processor->state = PROCESSOR_IDLE;
		/* Drop the pending deferred bit, then ask the machine layer to cancel the signal. */
	2595	bit_clear(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id);
	2596	machine_signal_idle_cancel(active_processor);
	2597	}
	2598
	2599	}
	2600	}
	2601
	2602	pset_unlock(pset);
	2603	}
	2604	#else
	2605	/* We don't support deferred ASTs; everything is candycanes and sunshine. */
	2606	#endif
	2607
	2608	static void
	2609	thread_csw_callout(
	2610	thread_t old,
	2611	thread_t new,
	2612	uint64_t timestamp)
	2613	{
		/*
		* Tell the perfcontrol layer about a switch from "old" to "new"
		* at "timestamp". Switching to an idle thread is reported as IDLE
		* with a same-priority latency of zero; anything else is reported
		* as CONTEXT_SWITCH with the latency recorded on "new".
		*/
		perfcontrol_event csw_event;
		uint64_t csw_latency;

		if (new->state & TH_IDLE) {
		csw_event = IDLE;
		csw_latency = 0;
		} else {
		csw_event = CONTEXT_SWITCH;
		csw_latency = new->same_pri_latency;
		}

		machine_switch_perfcontrol_context(csw_event, timestamp, 0,
		csw_latency, old, new);
	2618	}
	2619
	2620
	2621	/*
	2622	* thread_dispatch:
	2623	*
	2624	* Handle threads at context switch. Re-dispatch other thread
	2625	* if still running, otherwise update run state and perform
	2626	* special actions. Update quantum for other thread and begin
	2627	* the quantum for ourselves.
	2628	*
	2629	* "thread" is the old thread that we have switched away from.
	2630	* "self" is the new current thread that we have context switched to
	2631	*
		* "thread" may be THREAD_NULL, in which case only the bookkeeping
		* for "self" is done.
		*
	2632	* Called at splsched.
	2633	*/
	2634	void
	2635	thread_dispatch(
	2636	thread_t thread,
	2637	thread_t self)
	2638	{
	2639	processor_t processor = self->last_processor;
	2640
	2641	assert(processor == current_processor());
	2642	assert(self == current_thread());
	2643	assert(thread != self);
	2644
	2645	if (thread != THREAD_NULL) {
	2646	/*
	2647	* Do the perfcontrol callout for context switch.
	2648	* The reason we do this here is:
	2649	* - thread_dispatch() is called from various places that are not
	2650	* the direct context switch path for eg. processor shutdown etc.
	2651	* So adding the callout here covers all those cases.
	2652	* - We want this callout as early as possible to be close
	2653	* to the timestamp taken in thread_invoke()
	2654	* - We want to avoid holding the thread lock while doing the
	2655	* callout
	2656	* - We do not want to callout if "thread" is NULL.
	2657	*/
	2658	thread_csw_callout(thread, self, processor->last_dispatch);
	2659
	2660	/*
	2661	* If blocked at a continuation, discard
	2662	* the stack.
	2663	*/
	2664	if (thread->continuation != NULL && thread->kernel_stack != 0)
	2665	stack_free(thread);
	2666
		/* An idle old thread only gets a trace point; no quantum or ledger work. */
	2667	if (thread->state & TH_IDLE) {
	2668	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2669	MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) \| DBG_FUNC_NONE,
	2670	(uintptr_t)thread_tid(thread), 0, thread->state,
	2671	sched_run_buckets[TH_BUCKET_RUN], 0);
	2672	} else {
	2673	int64_t consumed;
	2674	int64_t remainder = 0;
	2675
		/* Unused part of the quantum at the time of the switch, if any. */
	2676	if (processor->quantum_end > processor->last_dispatch)
	2677	remainder = processor->quantum_end -
	2678	processor->last_dispatch;
	2679
		/* Amount billed below: the thread's recorded quantum minus the unused part. */
	2680	consumed = thread->quantum_remaining - remainder;
	2681
	2682	if ((thread->reason & AST_LEDGER) == 0) {
	2683	/*
	2684	* Bill CPU time to both the task and
	2685	* the individual thread.
	2686	*/
	2687	ledger_credit_thread(thread, thread->t_ledger,
	2688	task_ledgers.cpu_time, consumed);
	2689	ledger_credit_thread(thread, thread->t_threadledger,
	2690	thread_ledgers.cpu_time, consumed);
	2691	if (thread->t_bankledger) {
	2692	ledger_credit_thread(thread, thread->t_bankledger,
	2693	bank_ledgers.cpu_time,
	2694	(consumed - thread->t_deduct_bank_ledger_time));
	2695	}
	2696	thread->t_deduct_bank_ledger_time = 0;
	2697	}
	2698
		/* Wake lock first, then thread lock; released in the reverse order below. */
	2699	wake_lock(thread);
	2700	thread_lock(thread);
	2701
	2702	/*
	2703	* Apply a priority floor if the thread holds a kernel resource
	2704	* Do this before checking starting_pri to avoid overpenalizing
	2705	* repeated rwlock blockers.
	2706	*/
	2707	if (__improbable(thread->rwlock_count != 0))
	2708	lck_rw_set_promotion_locked(thread);
	2709
	2710	boolean_t keep_quantum = processor->first_timeslice;
	2711
	2712	/*
	2713	* Treat a thread which has dropped priority since it got on core
	2714	* as having expired its quantum.
	2715	*/
	2716	if (processor->starting_pri > thread->sched_pri)
	2717	keep_quantum = FALSE;
	2718
	2719	/* Compute remainder of current quantum. */
	2720	if (keep_quantum &&
	2721	processor->quantum_end > processor->last_dispatch)
	2722	thread->quantum_remaining = (uint32_t)remainder;
	2723	else
	2724	thread->quantum_remaining = 0;
	2725
	2726	if (thread->sched_mode == TH_MODE_REALTIME) {
	2727	/*
	2728	* Cancel the deadline if the thread has
	2729	* consumed the entire quantum.
	2730	*/
	2731	if (thread->quantum_remaining == 0) {
	2732	thread->realtime.deadline = UINT64_MAX;
	2733	}
	2734	} else {
	2735	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	2736	/*
	2737	* For non-realtime threads treat a tiny
	2738	* remaining quantum as an expired quantum
	2739	* but include what's left next time.
	2740	*/
	2741	if (thread->quantum_remaining < min_std_quantum) {
	2742	thread->reason \|= AST_QUANTUM;
	2743	thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
	2744	}
	2745	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	2746	}
	2747
	2748	/*
	2749	* If we are doing a direct handoff then
	2750	* take the remainder of the quantum.
	2751	*/
	2752	if ((thread->reason & (AST_HANDOFF\|AST_QUANTUM)) == AST_HANDOFF) {
	2753	self->quantum_remaining = thread->quantum_remaining;
	2754	thread->reason \|= AST_QUANTUM;
	2755	thread->quantum_remaining = 0;
	2756	} else {
	2757	#if defined(CONFIG_SCHED_MULTIQ)
	2758	if (SCHED(sched_groups_enabled) &&
	2759	thread->sched_group == self->sched_group) {
	2760	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2761	MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
	2762	self->reason, (uintptr_t)thread_tid(thread),
	2763	self->quantum_remaining, thread->quantum_remaining, 0);
	2764
	2765	self->quantum_remaining = thread->quantum_remaining;
	2766	thread->quantum_remaining = 0;
	2767	/* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
	2768	}
	2769	#endif /* defined(CONFIG_SCHED_MULTIQ) */
	2770	}
	2771
	2772	thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
	2773
	2774	if (!(thread->state & TH_WAIT)) {
	2775	/*
	2776	* Still runnable.
	2777	*/
	2778	thread->last_made_runnable_time = thread->last_basepri_change_time = processor->last_dispatch;
	2779
	2780	machine_thread_going_off_core(thread, FALSE, processor->last_dispatch);
	2781
	2782	ast_t reason = thread->reason;
	2783	sched_options_t options = SCHED_NONE;
	2784
	2785	if (reason & AST_REBALANCE) {
	2786	options \|= SCHED_REBALANCE;
	2787	if (reason & AST_QUANTUM) {
	2788	/* Having gone to the trouble of forcing this thread off a less preferred core,
	2789	* we should force the preferable core to reschedule immediately to give this
	2790	* thread a chance to run instead of just sitting on the run queue where
	2791	* it may just be stolen back by the idle core we just forced it off.
	2792	* But only do this at the end of a quantum to prevent cascading effects.
	2793	*/
	2794	options \|= SCHED_PREEMPT;
	2795	}
	2796	}
	2797
		/*
		* Queue placement: tail after a quantum expiry, head after a
		* preemption, otherwise tail with a preemption check.
		*/
	2798	if (reason & AST_QUANTUM)
	2799	options \|= SCHED_TAILQ;
	2800	else if (reason & AST_PREEMPT)
	2801	options \|= SCHED_HEADQ;
	2802	else
	2803	options \|= (SCHED_PREEMPT \| SCHED_TAILQ);
	2804
	2805	thread_setrun(thread, options);
	2806
	2807	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2808	MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) \| DBG_FUNC_NONE,
	2809	(uintptr_t)thread_tid(thread), thread->reason, thread->state,
	2810	sched_run_buckets[TH_BUCKET_RUN], 0);
	2811
		/* Clear wake_active and issue the wakeup on it only after the thread lock is dropped. */
	2812	if (thread->wake_active) {
	2813	thread->wake_active = FALSE;
	2814	thread_unlock(thread);
	2815
	2816	thread_wakeup(&thread->wake_active);
	2817	} else {
	2818	thread_unlock(thread);
	2819	}
	2820
	2821	wake_unlock(thread);
	2822	} else {
	2823	/*
	2824	* Waiting.
	2825	*/
	2826	boolean_t should_terminate = FALSE;
	2827	uint32_t new_run_count;
	2828
	2829	/* Only the first call to thread_dispatch
	2830	* after explicit termination should add
	2831	* the thread to the termination queue
	2832	*/
	2833	if ((thread->state & (TH_TERMINATE\|TH_TERMINATE2)) == TH_TERMINATE) {
	2834	should_terminate = TRUE;
	2835	thread->state \|= TH_TERMINATE2;
	2836	}
	2837
	2838	thread->state &= ~TH_RUN;
	2839	thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE;
	2840	thread->chosen_processor = PROCESSOR_NULL;
	2841
	2842	new_run_count = sched_run_decr(thread);
	2843
	2844	#if CONFIG_SCHED_SFI
		/* Record when an SFI-induced wait began (not for terminating threads). */
	2845	if ((thread->state & (TH_WAIT \| TH_TERMINATE)) == TH_WAIT) {
	2846	if (thread->reason & AST_SFI) {
	2847	thread->wait_sfi_begin_time = processor->last_dispatch;
	2848	}
	2849	}
	2850	#endif
	2851
	2852	machine_thread_going_off_core(thread, should_terminate, processor->last_dispatch);
	2853
	2854	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2855	MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) \| DBG_FUNC_NONE,
	2856	(uintptr_t)thread_tid(thread), thread->reason, thread->state,
	2857	new_run_count, 0);
	2858
	2859	(*thread->sched_call)(SCHED_CALL_BLOCK, thread);
	2860
		/* Clear wake_active and issue the wakeup on it only after the thread lock is dropped. */
	2861	if (thread->wake_active) {
	2862	thread->wake_active = FALSE;
	2863	thread_unlock(thread);
	2864
	2865	thread_wakeup(&thread->wake_active);
	2866	} else {
	2867	thread_unlock(thread);
	2868	}
	2869
	2870	wake_unlock(thread);
	2871
		/* Enqueue for termination only after both locks are released. */
	2872	if (should_terminate)
	2873	thread_terminate_enqueue(thread);
	2874	}
	2875	}
	2876	}
	2877
	2878	int urgency = THREAD_URGENCY_NONE;
	2879	uint64_t latency = 0;
	2880
	2881	/* Update (new) current thread and reprogram quantum timer */
	2882	thread_lock(self);
	2883
	2884	if (!(self->state & TH_IDLE)) {
	2885	uint64_t arg1, arg2;
	2886
	2887	#if CONFIG_SCHED_SFI
	2888	ast_t new_ast;
	2889
	2890	new_ast = sfi_thread_needs_ast(self, NULL);
	2891
	2892	if (new_ast != AST_NONE) {
	2893	ast_on(new_ast);
	2894	}
	2895	#endif
	2896
	2897	assertf(processor->last_dispatch >= self->last_made_runnable_time,
	2898	"Non-monotonic time? dispatch at 0x%llx, runnable at 0x%llx",
	2899	processor->last_dispatch, self->last_made_runnable_time);
	2900
	2901	assert(self->last_made_runnable_time <= self->last_basepri_change_time);
	2902
		/* Time between becoming runnable and being dispatched. */
	2903	latency = processor->last_dispatch - self->last_made_runnable_time;
	2904	assert(latency >= self->same_pri_latency);
	2905
	2906	urgency = thread_get_urgency(self, &arg1, &arg2);
	2907
	2908	thread_tell_urgency(urgency, arg1, arg2, latency, self);
	2909
	2910	/*
	2911	* Get a new quantum if none remaining.
	2912	*/
	2913	if (self->quantum_remaining == 0) {
	2914	thread_quantum_init(self);
	2915	}
	2916
	2917	/*
	2918	* Set up quantum timer and timeslice.
	2919	*/
	2920	processor->quantum_end = processor->last_dispatch + self->quantum_remaining;
	2921	timer_call_quantum_timer_enter(&processor->quantum_timer, self,
	2922	processor->quantum_end, processor->last_dispatch);
	2923
	2924	processor->first_timeslice = TRUE;
	2925	} else {
		/* The idle thread runs without a quantum timer. */
	2926	timer_call_quantum_timer_cancel(&processor->quantum_timer);
	2927	processor->first_timeslice = FALSE;
	2928
	2929	thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
	2930	}
	2931
	2932	assert(self->block_hint == kThreadWaitNone);
	2933	self->computation_epoch = processor->last_dispatch;
	2934	self->reason = AST_NONE;
		/* Remembered so a later dispatch can detect a priority drop while on core. */
	2935	processor->starting_pri = self->sched_pri;
	2936
	2937	thread_unlock(self);
	2938
	2939	machine_thread_going_on_core(self, urgency, latency, self->same_pri_latency,
	2940	processor->last_dispatch);
	2941
	2942	#if defined(CONFIG_SCHED_DEFERRED_AST)
	2943	/*
	2944	* TODO: Can we state that redispatching our old thread is also
	2945	* uninteresting?
	2946	*/
	2947	if ((((volatile uint32_t)sched_run_buckets[TH_BUCKET_RUN]) == 1) &&
	2948	!(self->state & TH_IDLE)) {
	2949	pset_cancel_deferred_dispatch(processor->processor_set, processor);
	2950	}
	2951	#endif
	2952
	2953	}
	2954
	2955	/*
	2956	* thread_block_reason:
	2957	*
	2958	* Forces a reschedule, blocking the caller if a wait
	2959	* has been asserted.
	2960	*
	2961	* If a continuation is specified, then thread_invoke will
	2962	* attempt to discard the thread's kernel stack. When the
	2963	* thread resumes, it will execute the continuation function
	2964	* on a new kernel stack.
		*
		* "continuation" and "parameter" are stored on the current thread
		* before rescheduling. "reason" is passed to thread_select() and
		* thread_invoke(); AST_YIELD additionally clears the processor's
		* first_timeslice flag.
		*
		* Returns the current thread's wait_result.
	2965	*/
	2966	counter(mach_counter_t c_thread_block_calls = 0;)
	2967
	2968	wait_result_t
	2969	thread_block_reason(
	2970	thread_continue_t continuation,
	2971	void *parameter,
	2972	ast_t reason)
	2973	{
	2974	thread_t self = current_thread();
	2975	processor_t processor;
	2976	thread_t new_thread;
	2977	spl_t s;
	2978
	2979	counter(++c_thread_block_calls);
	2980
		/* Raise to splsched for the whole select/invoke sequence; restored by splx() below. */
	2981	s = splsched();
	2982
	2983	processor = current_processor();
	2984
	2985	/* If we're explicitly yielding, force a subsequent quantum */
	2986	if (reason & AST_YIELD)
	2987	processor->first_timeslice = FALSE;
	2988
	2989	/* We're handling all scheduling AST's */
	2990	ast_off(AST_SCHEDULING);
	2991
	2992	#if PROC_REF_DEBUG
		/* Debug check: a non-kernel thread must hold no proc refs when blocking with a continuation. */
	2993	if ((continuation != NULL) && (self->task != kernel_task)) {
	2994	if (uthread_get_proc_refcount(self->uthread) != 0) {
	2995	panic("thread_block_reason with continuation uthread %p with uu_proc_refcount != 0", self->uthread);
	2996	}
	2997	}
	2998	#endif
	2999
	3000	self->continuation = continuation;
	3001	self->parameter = parameter;
	3002
		/* Trace only when some state bit other than TH_RUN or TH_IDLE is set. */
	3003	if (self->state & ~(TH_RUN \| TH_IDLE)) {
	3004	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	3005	MACHDBG_CODE(DBG_MACH_SCHED,MACH_BLOCK),
	3006	reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
	3007	}
	3008
		/*
		* Pick a thread and try to switch to it; thread_invoke() returning
		* FALSE means the switch did not happen, so select again.
		*/
	3009	do {
	3010	thread_lock(self);
	3011	new_thread = thread_select(self, processor, &reason);
	3012	thread_unlock(self);
	3013	} while (!thread_invoke(self, new_thread, reason));
	3014
	3015	splx(s);
	3016
	3017	return (self->wait_result);
	3018	}
	3019
	3020	/*
	3021	* thread_block:
	3022	*
	3023	* Block the current thread if a wait has been asserted.
	3024	*/
	3025	wait_result_t
	3026	thread_block(
	3027	thread_continue_t continuation)
	3028	{
	3029	return thread_block_reason(continuation, NULL, AST_NONE);
	3030	}
	3031
	3032	wait_result_t
	3033	thread_block_parameter(
	3034	thread_continue_t continuation,
	3035	void *parameter)
	3036	{
	3037	return thread_block_reason(continuation, parameter, AST_NONE);
	3038	}
	3039
	3040	/*
	3041	* thread_run:
	3042	*
	3043	* Switch directly from the current thread to the
	3044	* new thread, handing off our quantum if appropriate.
	3045	*
	3046	* New thread must be runnable, and not on a run queue.
	3047	*
	3048	* Called at splsched.
	3049	*/
	3050	int
	3051	thread_run(
	3052	thread_t self,
	3053	thread_continue_t continuation,
	3054	void *parameter,
	3055	thread_t new_thread)
	3056	{
	3057	ast_t reason = AST_HANDOFF;
	3058
	3059	self->continuation = continuation;
	3060	self->parameter = parameter;
	3061
	3062	while (!thread_invoke(self, new_thread, reason)) {
	3063	/* the handoff failed, so we have to fall back to the normal block path */
	3064	processor_t processor = current_processor();
	3065
	3066	reason = AST_NONE;
	3067
	3068	thread_lock(self);
	3069	new_thread = thread_select(self, processor, &reason);
	3070	thread_unlock(self);
	3071	}
	3072
	3073	return (self->wait_result);
	3074	}
	3075
	3076	/*
	3077	* thread_continue:
	3078	*
	3079	* Called at splsched when a thread first receives
	3080	* a new stack after a continuation.
	3081	*/
	3082	void
	3083	thread_continue(
	3084	thread_t thread)
	3085	{
	3086	thread_t self = current_thread();
	3087	thread_continue_t continuation;
	3088	void *parameter;
	3089
	3090	DTRACE_SCHED(on__cpu);
	3091
	3092	continuation = self->continuation;
	3093	parameter = self->parameter;
	3094
	3095	#if KPERF
	3096	kperf_on_cpu(self, continuation, NULL);
	3097	#endif
	3098
	3099	thread_dispatch(thread, self);
	3100
	3101	self->continuation = self->parameter = NULL;
	3102
	3103	#if INTERRUPT_MASKED_DEBUG
	3104	/* Reset interrupt-masked spin debugging timeout */
	3105	ml_spin_debug_clear(self);
	3106	#endif
	3107
	3108	if (thread != THREAD_NULL)
	3109	(void)spllo();
	3110
	3111	TLOG(1, "thread_continue: calling call_continuation \n");
	3112	call_continuation(continuation, parameter, self->wait_result);
	3113	/NOTREACHED/
	3114	}
	3115
	3116	void
	3117	thread_quantum_init(thread_t thread)
	3118	{
	3119	if (thread->sched_mode == TH_MODE_REALTIME) {
	3120	thread->quantum_remaining = thread->realtime.computation;
	3121	} else {
	3122	thread->quantum_remaining = SCHED(initial_quantum_size)(thread);
	3123	}
	3124	}
	3125
	3126	uint32_t
	3127	sched_timeshare_initial_quantum_size(thread_t thread)
	3128	{
	3129	if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG)
	3130	return bg_quantum;
	3131	else
	3132	return std_quantum;
	3133	}
	3134
	3135	/*
	3136	* run_queue_init:
	3137	*
	3138	* Initialize a run queue before first use.
	3139	*/
	3140	void
	3141	run_queue_init(
	3142	run_queue_t rq)
	3143	{
	3144	rq->highq = NOPRI;
	3145	for (u_int i = 0; i < BITMAP_LEN(NRQS); i++)
	3146	rq->bitmap[i] = 0;
	3147	rq->urgency = rq->count = 0;
	3148	for (int i = 0; i < NRQS; i++)
	3149	queue_init(&rq->queues[i]);
	3150	}
	3151
	3152	/*
	3153	* run_queue_dequeue:
	3154	*
	3155	* Perform a dequeue operation on a run queue,
	3156	* and return the resulting thread.
	3157	*
	3158	* The run queue must be locked (see thread_run_queue_remove()
	3159	* for more info), and not empty.
	3160	*/
	3161	thread_t
	3162	run_queue_dequeue(
	3163	run_queue_t rq,
	3164	integer_t options)
	3165	{
	3166	thread_t thread;
	3167	queue_t queue = &rq->queues[rq->highq];
	3168
	3169	if (options & SCHED_HEADQ) {
	3170	thread = qe_dequeue_head(queue, struct thread, runq_links);
	3171	} else {
	3172	thread = qe_dequeue_tail(queue, struct thread, runq_links);
	3173	}
	3174
	3175	assert(thread != THREAD_NULL);
	3176	assert_thread_magic(thread);
	3177
	3178	thread->runq = PROCESSOR_NULL;
	3179	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	3180	rq->count--;
	3181	if (SCHED(priority_is_urgent)(rq->highq)) {
	3182	rq->urgency--; assert(rq->urgency >= 0);
	3183	}
	3184	if (queue_empty(queue)) {
	3185	bitmap_clear(rq->bitmap, rq->highq);
	3186	rq->highq = bitmap_first(rq->bitmap, NRQS);
	3187	}
	3188
	3189	return thread;
	3190	}
	3191
	3192	/*
	3193	* run_queue_enqueue:
	3194	*
	3195	* Perform a enqueue operation on a run queue.
	3196	*
	3197	* The run queue must be locked (see thread_run_queue_remove()
	3198	* for more info).
	3199	*/
	3200	boolean_t
	3201	run_queue_enqueue(
	3202	run_queue_t rq,
	3203	thread_t thread,
	3204	integer_t options)
	3205	{
	3206	queue_t queue = &rq->queues[thread->sched_pri];
	3207	boolean_t result = FALSE;
	3208
	3209	assert_thread_magic(thread);
	3210
	3211	if (queue_empty(queue)) {
	3212	enqueue_tail(queue, &thread->runq_links);
	3213
	3214	rq_bitmap_set(rq->bitmap, thread->sched_pri);
	3215	if (thread->sched_pri > rq->highq) {
	3216	rq->highq = thread->sched_pri;
	3217	result = TRUE;
	3218	}
	3219	} else {
	3220	if (options & SCHED_TAILQ)
	3221	enqueue_tail(queue, &thread->runq_links);
	3222	else
	3223	enqueue_head(queue, &thread->runq_links);
	3224	}
	3225	if (SCHED(priority_is_urgent)(thread->sched_pri))
	3226	rq->urgency++;
	3227	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	3228	rq->count++;
	3229
	3230	return (result);
	3231	}
	3232
	3233	/*
	3234	* run_queue_remove:
	3235	*
	3236	* Remove a specific thread from a runqueue.
	3237	*
	3238	* The run queue must be locked.
	3239	*/
	3240	void
	3241	run_queue_remove(
	3242	run_queue_t rq,
	3243	thread_t thread)
	3244	{
	3245	assert(thread->runq != PROCESSOR_NULL);
	3246	assert_thread_magic(thread);
	3247
	3248	remqueue(&thread->runq_links);
	3249	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	3250	rq->count--;
	3251	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
	3252	rq->urgency--; assert(rq->urgency >= 0);
	3253	}
	3254
	3255	if (queue_empty(&rq->queues[thread->sched_pri])) {
	3256	/* update run queue status */
	3257	bitmap_clear(rq->bitmap, thread->sched_pri);
	3258	rq->highq = bitmap_first(rq->bitmap, NRQS);
	3259	}
	3260
	3261	thread->runq = PROCESSOR_NULL;
	3262	}
	3263
	3264	/* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
	3265	void
	3266	sched_rtglobal_runq_scan(sched_update_scan_context_t scan_context)
	3267	{
	3268	spl_t s;
	3269	thread_t thread;
	3270
	3271	processor_set_t pset = &pset0;
	3272
	3273	s = splsched();
	3274	rt_lock_lock(pset);
	3275
	3276	qe_foreach_element_safe(thread, &pset->rt_runq.queue, runq_links) {
	3277	if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
	3278	scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
	3279	}
	3280	}
	3281
	3282	rt_lock_unlock(pset);
	3283	splx(s);
	3284	}
	3285
	3286	int64_t
	3287	sched_rtglobal_runq_count_sum(void)
	3288	{
	3289	return pset0.rt_runq.runq_stats.count_sum;
	3290	}
	3291
	3292	/*
	3293	* realtime_queue_insert:
	3294	*
	3295	* Enqueue a thread for realtime execution.
	3296	*/
	3297	static boolean_t
	3298	realtime_queue_insert(processor_t processor, processor_set_t pset, thread_t thread)
	3299	{
	3300	queue_t queue = &SCHED(rt_runq)(pset)->queue;
	3301	uint64_t deadline = thread->realtime.deadline;
	3302	boolean_t preempt = FALSE;
	3303
	3304	rt_lock_lock(pset);
	3305
	3306	if (queue_empty(queue)) {
	3307	enqueue_tail(queue, &thread->runq_links);
	3308	preempt = TRUE;
	3309	} else {
	3310	/* Insert into rt_runq in thread deadline order */
	3311	queue_entry_t iter;
	3312	qe_foreach(iter, queue) {
	3313	thread_t iter_thread = qe_element(iter, struct thread, runq_links);
	3314	assert_thread_magic(iter_thread);
	3315
	3316	if (deadline < iter_thread->realtime.deadline) {
	3317	if (iter == queue_first(queue))
	3318	preempt = TRUE;
	3319	insque(&thread->runq_links, queue_prev(iter));
	3320	break;
	3321	} else if (iter == queue_last(queue)) {
	3322	enqueue_tail(queue, &thread->runq_links);
	3323	break;
	3324	}
	3325	}
	3326	}
	3327
	3328	thread->runq = processor;
	3329	SCHED_STATS_RUNQ_CHANGE(&SCHED(rt_runq)(pset)->runq_stats, rt_runq_count(pset));
	3330	rt_runq_count_incr(pset);
	3331
	3332	rt_lock_unlock(pset);
	3333
	3334	return (preempt);
	3335	}
	3336
	3337	/*
	3338	* realtime_setrun:
	3339	*
	3340	* Dispatch a thread for realtime execution.
		*
		* Records the target in thread->chosen_processor, inserts the
		* thread on the pset's realtime run queue (unless it is handed
		* directly to an idle processor), and decides whether the target
		* processor must be preempted or woken.  Any IPI is selected with
		* sched_ipi_action() while the pset lock is held and delivered with
		* sched_ipi_perform() after the lock has been dropped.
	3341	*
	3342	* Thread must be locked. Associated pset must
	3343	* be locked, and is returned unlocked.
	3344	*/
	3345	static void
	3346	realtime_setrun(
	3347	processor_t processor,
	3348	thread_t thread)
	3349	{
	3350	processor_set_t pset = processor->processor_set;
	3351	ast_t preempt;
	3352
	3353	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	3354
	3355	thread->chosen_processor = processor;
	3356
	3357	/* <rdar://problem/15102234> */
	3358	assert(thread->bound_processor == PROCESSOR_NULL);
	3359
	3360	/*
	3361	* Dispatch directly onto idle processor.
		*
		* NOTE(review): the assert above requires bound_processor to be
		* PROCESSOR_NULL, while this test requires it to equal processor
		* (already dereferenced above), so this path looks reachable only
		* when asserts are compiled out -- confirm.
	3362	*/
	3363	if ( (thread->bound_processor == processor)
	3364	&& processor->state == PROCESSOR_IDLE) {
	3365	re_queue_tail(&pset->active_queue, &processor->processor_queue);
	3366
	3367	pset->active_processor_count++;
	3368	sched_update_pset_load_average(pset);
	3369
		/* Hand the thread over via next_thread; it is never put on the RT run queue here */
	3370	processor->next_thread = thread;
	3371	processor_state_update_from_thread(processor, thread);
	3372	processor->deadline = thread->realtime.deadline;
	3373	processor->state = PROCESSOR_DISPATCHING;
	3374
	3375	ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_BOUND_THR);
	3376	pset_unlock(pset);
	3377	sched_ipi_perform(processor, ipi_type);
	3378	return;
	3379	}
	3380
		/*
		* Preempt urgently when the processor is running below realtime
		* priority, or when this thread's deadline is earlier than the
		* deadline recorded on the processor; otherwise do not preempt.
		*/
	3381	if (processor->current_pri < BASEPRI_RTQUEUES)
	3382	preempt = (AST_PREEMPT \| AST_URGENT);
	3383	else if (thread->realtime.deadline < processor->deadline)
	3384	preempt = (AST_PREEMPT \| AST_URGENT);
	3385	else
	3386	preempt = AST_NONE;
	3387
		/* The return value (thread placed at head of queue) is not used here */
	3388	realtime_queue_insert(processor, pset, thread);
	3389
	3390	ipi_type = SCHED_IPI_NONE;
	3391	if (preempt != AST_NONE) {
	3392	if (processor->state == PROCESSOR_IDLE) {
		/* Idle target: move it to the active queue and mark it DISPATCHING with no direct handoff */
	3393	re_queue_tail(&pset->active_queue, &processor->processor_queue);
	3394
	3395	pset->active_processor_count++;
	3396	sched_update_pset_load_average(pset);
	3397
	3398	processor->next_thread = THREAD_NULL;
	3399	processor_state_update_from_thread(processor, thread);
	3400	processor->deadline = thread->realtime.deadline;
	3401	processor->state = PROCESSOR_DISPATCHING;
	3402	if (processor == current_processor()) {
	3403	ast_on(preempt);
	3404	} else {
	3405	ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_PREEMPT);
	3406	}
	3407	} else if (processor->state == PROCESSOR_DISPATCHING) {
		/*
		* Already dispatching: no AST or IPI is raised.  Only refresh the
		* processor's recorded state and deadline, and only when no thread
		* was handed to it directly and this thread has a higher priority
		* or an earlier deadline than what is recorded.
		*/
	3408	if ((processor->next_thread == THREAD_NULL) && ((processor->current_pri < thread->sched_pri) \|\| (processor->deadline > thread->realtime.deadline))) {
	3409	processor_state_update_from_thread(processor, thread);
	3410	processor->deadline = thread->realtime.deadline;
	3411	}
	3412	} else {
		/* Any other state: request preemption locally, or choose a non-idle IPI for a remote processor */
	3413	if (processor == current_processor()) {
	3414	ast_on(preempt);
	3415	} else {
	3416	ipi_type = sched_ipi_action(processor, thread, false, SCHED_IPI_EVENT_PREEMPT);
	3417	}
	3418	}
	3419	} else {
	3420	/* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
	3421	}
	3422
		/* Drop the pset lock first; the IPI chosen above (possibly SCHED_IPI_NONE) is sent unlocked */
	3423	pset_unlock(pset);
	3424	sched_ipi_perform(processor, ipi_type);
	3425	}
	3426
	3427
	3428	sched_ipi_type_t sched_ipi_deferred_policy(processor_set_t pset, processor_t dst,
	3429	__unused sched_ipi_event_t event)
	3430	{
	3431	#if defined(CONFIG_SCHED_DEFERRED_AST)
	3432	if (!bit_test(pset->pending_deferred_AST_cpu_mask, dst->cpu_id)) {
	3433	return SCHED_IPI_DEFERRED;
	3434	}
	3435	#else /* CONFIG_SCHED_DEFERRED_AST */
	3436	panic("Request for deferred IPI on an unsupported platform; pset: %p CPU: %d", pset, dst->cpu_id);
	3437	#endif /* CONFIG_SCHED_DEFERRED_AST */
	3438	return SCHED_IPI_NONE;
	3439	}
	3440
	3441	sched_ipi_type_t sched_ipi_action(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
	3442	{
	3443	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	3444	assert(dst != NULL);
	3445
	3446	processor_set_t pset = dst->processor_set;
	3447	if (current_processor() == dst) {
	3448	return SCHED_IPI_NONE;
	3449	}
	3450
	3451	if (bit_test(pset->pending_AST_cpu_mask, dst->cpu_id)) {
	3452	return SCHED_IPI_NONE;
	3453	}
	3454
	3455	ipi_type = SCHED(ipi_policy)(dst, thread, dst_idle, event);
	3456	switch(ipi_type) {
	3457	case SCHED_IPI_NONE:
	3458	return SCHED_IPI_NONE;
	3459	#if defined(CONFIG_SCHED_DEFERRED_AST)
	3460	case SCHED_IPI_DEFERRED:
	3461	bit_set(pset->pending_deferred_AST_cpu_mask, dst->cpu_id);
	3462	break;
	3463	#endif /* CONFIG_SCHED_DEFERRED_AST */
	3464	default:
	3465	bit_set(pset->pending_AST_cpu_mask, dst->cpu_id);
	3466	break;
	3467	}
	3468	return ipi_type;
	3469	}
	3470
	3471	sched_ipi_type_t sched_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
	3472	{
	3473	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	3474	boolean_t deferred_ipi_supported = false;
	3475	processor_set_t pset = dst->processor_set;
	3476
	3477	#if defined(CONFIG_SCHED_DEFERRED_AST)
	3478	deferred_ipi_supported = true;
	3479	#endif /* CONFIG_SCHED_DEFERRED_AST */
	3480
	3481	switch(event) {
	3482	case SCHED_IPI_EVENT_SPILL:
	3483	case SCHED_IPI_EVENT_SMT_REBAL:
	3484	case SCHED_IPI_EVENT_REBALANCE:
	3485	case SCHED_IPI_EVENT_BOUND_THR:
	3486	/*
	3487	* The spill, SMT rebalance, rebalance and the bound thread
	3488	* scenarios use immediate IPIs always.
	3489	*/
	3490	ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
	3491	break;
	3492	case SCHED_IPI_EVENT_PREEMPT:
	3493	/* In the preemption case, use immediate IPIs for RT threads */
	3494	if (thread && (thread->sched_pri >= BASEPRI_RTQUEUES)) {
	3495	ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
	3496	break;
	3497	}
	3498
	3499	/*
	3500	* For Non-RT threads preemption,
	3501	* If the core is active, use immediate IPIs.
	3502	* If the core is idle, use deferred IPIs if supported; otherwise immediate IPI.
	3503	*/
	3504	if (deferred_ipi_supported && dst_idle) {
	3505	return sched_ipi_deferred_policy(pset, dst, event);
	3506	}
	3507	ipi_type = dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
	3508	break;
	3509	default:
	3510	panic("Unrecognized scheduler IPI event type %d", event);
	3511	}
	3512	assert(ipi_type != SCHED_IPI_NONE);
	3513	return ipi_type;
	3514	}
	3515
	3516	void sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi)
	3517	{
	3518	switch (ipi) {
	3519	case SCHED_IPI_NONE:
	3520	break;
	3521	case SCHED_IPI_IDLE:
	3522	machine_signal_idle(dst);
	3523	break;
	3524	case SCHED_IPI_IMMEDIATE:
	3525	cause_ast_check(dst);
	3526	break;
	3527	case SCHED_IPI_DEFERRED:
	3528	machine_signal_idle_deferred(dst);
	3529	break;
	3530	default:
	3531	panic("Unrecognized scheduler IPI type: %d", ipi);
	3532	}
	3533	}
	3534
	3535	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	3536
	3537	boolean_t
	3538	priority_is_urgent(int priority)
	3539	{
	3540	return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE;
	3541	}
	3542
	3543	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	3544
	3545	/*
	3546	* processor_setrun:
	3547	*
	3548	* Dispatch a thread for execution on a
	3549	* processor.
		*
		* Records the target in thread->chosen_processor.  The thread is
		* either handed directly to an idle processor through next_thread,
		* or enqueued with SCHED(processor_enqueue) followed by a decision
		* on whether the processor must leave idle or be interrupted.
		* Any IPI is selected under the pset lock and delivered with
		* sched_ipi_perform() after the lock has been dropped.
	3550	*
	3551	* Thread must be locked. Associated pset must
	3552	* be locked, and is returned unlocked.
	3553	*/
	3554	static void
	3555	processor_setrun(
	3556	processor_t processor,
	3557	thread_t thread,
	3558	integer_t options)
	3559	{
	3560	processor_set_t pset = processor->processor_set;
	3561	ast_t preempt;
		/* What the target processor needs once the thread is enqueued: leave idle, be interrupted, or nothing */
	3562	enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
	3563
	3564	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
	3565
	3566	thread->chosen_processor = processor;
	3567
	3568	/*
	3569	* Dispatch directly onto idle processor.
		* Taken when the scheduler allows direct dispatch to idle
		* processors or the thread is bound to this processor; the
		* thread is passed via next_thread and is not enqueued.
	3570	*/
	3571	if ( (SCHED(direct_dispatch_to_idle_processors) \|\|
	3572	thread->bound_processor == processor)
	3573	&& processor->state == PROCESSOR_IDLE) {
	3574
	3575	re_queue_tail(&pset->active_queue, &processor->processor_queue);
	3576
	3577	pset->active_processor_count++;
	3578	sched_update_pset_load_average(pset);
	3579
	3580	processor->next_thread = thread;
	3581	processor_state_update_from_thread(processor, thread);
	3582	processor->deadline = UINT64_MAX;
	3583	processor->state = PROCESSOR_DISPATCHING;
	3584
	3585	ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_BOUND_THR);
	3586	pset_unlock(pset);
	3587	sched_ipi_perform(processor, ipi_type);
	3588	return;
	3589	}
	3590
	3591	/*
	3592	* Set preemption mode.
		*
		* In order:
		*  - urgent priority above the processor's current priority:
		*    urgent preemption;
		*  - the processor's active thread wants eager preemption:
		*    urgent preemption;
		*  - timeshare thread running below its base priority: plain
		*    preemption only if SCHED_PREEMPT was requested, the base
		*    priority is urgent and sched_pri beats the processor's
		*    current priority, otherwise none;
		*  - anything else: plain preemption iff SCHED_PREEMPT was
		*    requested.
	3593	*/
	3594	#if defined(CONFIG_SCHED_DEFERRED_AST)
	3595	/* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
	3596	#endif
	3597	if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri)
	3598	preempt = (AST_PREEMPT \| AST_URGENT);
	3599	else if(processor->active_thread && thread_eager_preemption(processor->active_thread))
	3600	preempt = (AST_PREEMPT \| AST_URGENT);
	3601	else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
	3602	if(SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
	3603	preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
	3604	} else {
	3605	preempt = AST_NONE;
	3606	}
	3607	} else
	3608	preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
	3609
	3610	SCHED(processor_enqueue)(processor, thread, options);
	3611	sched_update_pset_load_average(pset);
	3612
	3613	if (preempt != AST_NONE) {
	3614	if (processor->state == PROCESSOR_IDLE) {
		/* Idle target: move it to the active queue and mark it DISPATCHING; the thread stays enqueued */
	3615	re_queue_tail(&pset->active_queue, &processor->processor_queue);
	3616	pset->active_processor_count++;
	3617	processor->next_thread = THREAD_NULL;
	3618	processor_state_update_from_thread(processor, thread);
	3619	processor->deadline = UINT64_MAX;
	3620	processor->state = PROCESSOR_DISPATCHING;
	3621	ipi_action = eExitIdle;
	3622	} else if ( processor->state == PROCESSOR_DISPATCHING) {
		/* Already dispatching: no IPI; only refresh the recorded state if nothing was handed off directly and this thread has higher priority */
	3623	if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) {
	3624	processor_state_update_from_thread(processor, thread);
	3625	processor->deadline = UINT64_MAX;
	3626	}
	3627	} else if ( (processor->state == PROCESSOR_RUNNING \|\|
	3628	processor->state == PROCESSOR_SHUTDOWN) &&
	3629	(thread->sched_pri >= processor->current_pri)) {
	3630	ipi_action = eInterruptRunning;
	3631	}
	3632	} else {
	3633	/*
	3634	* New thread is not important enough to preempt what is running, but
	3635	* special processor states may need special handling
	3636	*/
	3637	if (processor->state == PROCESSOR_SHUTDOWN &&
	3638	thread->sched_pri >= processor->current_pri ) {
	3639	ipi_action = eInterruptRunning;
	3640	} else if (processor->state == PROCESSOR_IDLE) {
	3641	re_queue_tail(&pset->active_queue, &processor->processor_queue);
	3642
	3643	pset->active_processor_count++;
		/*
		* NOTE(review): the load-average update is disabled in this
		* branch.  It already ran after the enqueue above, but that was
		* before active_processor_count was incremented -- confirm
		* whether this is intentional.
		*/
	3644	// sched_update_pset_load_average(pset);
	3645
	3646	processor->next_thread = THREAD_NULL;
	3647	processor_state_update_from_thread(processor, thread);
	3648	processor->deadline = UINT64_MAX;
	3649	processor->state = PROCESSOR_DISPATCHING;
	3650
	3651	ipi_action = eExitIdle;
	3652	}
	3653	}
	3654
	3655	if (ipi_action != eDoNothing) {
	3656	if (processor == current_processor()) {
		/*
		* Target is this processor: no IPI, just raise the AST if
		* csw_check_locked() reports that a context switch is due.
		* Note that preempt is AST_NONE when this is reached through
		* the non-preempt branch above.
		*/
	3657	if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
	3658	ast_on(preempt);
	3659	} else {
		/* Remote target: SCHED_REBALANCE in options selects the rebalance event, otherwise preempt */
	3660	sched_ipi_event_t event = (options & SCHED_REBALANCE) ? SCHED_IPI_EVENT_REBALANCE : SCHED_IPI_EVENT_PREEMPT;
	3661	ipi_type = sched_ipi_action(processor, thread, (ipi_action == eExitIdle), event);
	3662	}
	3663	}
		/* Drop the pset lock first; the IPI chosen above (possibly SCHED_IPI_NONE) is sent unlocked */
	3664	pset_unlock(pset);
	3665	sched_ipi_perform(processor, ipi_type);
	3666	}
	3667
	3668	/*
	3669	* choose_next_pset:
	3670	*
	3671	* Return the next sibling pset containing
	3672	* available processors.
	3673	*
	3674	* Returns the original pset if none other is
	3675	* suitable.
	3676	*/
	3677	static processor_set_t
	3678	choose_next_pset(
	3679	processor_set_t pset)
	3680	{
	3681	processor_set_t nset = pset;
	3682
	3683	do {
	3684	nset = next_pset(nset);
	3685	} while (nset->online_processor_count < 1 && nset != pset);
	3686
	3687	return (nset);
	3688	}
	3689
	3690	/*
	3691	* choose_processor:
	3692	*
	3693	* Choose a processor for the thread, beginning at
	3694	* the pset. Accepts an optional processor hint in
	3695	* the pset.
		*
		* Outline of the search as implemented below:
		*  1. Validate the hint; an idle hint (or, for a realtime thread,
		*     a hint running below realtime priority) is returned at once.
		*  2. Walk the psets starting at pset.  In each one, return the
		*     first recommended idle processor without a pending AST;
		*     otherwise collect candidates from the active and idle
		*     secondary queues and return one the thread can preempt.
		*  3. If no pset produced a result, fall back to the collected
		*     secondary / lowest-run-count candidates, and finally to
		*     master_processor.
	3696	*
	3697	* Returns a processor, possibly from a different pset.
	3698	*
	3699	* The thread must be locked. The pset must be locked,
	3700	* and the resulting pset is locked on return.
	3701	*/
	3702	processor_t
	3703	choose_processor(
	3704	processor_set_t pset,
	3705	processor_t processor,
	3706	thread_t thread)
	3707	{
	3708	processor_set_t nset, cset = pset;
	3709
	3710	assert(thread->sched_pri <= BASEPRI_RTQUEUES);
	3711
	3712	/*
	3713	* Prefer the hinted processor, when appropriate.
	3714	*/
	3715
	3716	/* Fold last processor hint from secondary processor to its primary */
	3717	if (processor != PROCESSOR_NULL) {
	3718	processor = processor->processor_primary;
	3719	}
	3720
	3721	/*
	3722	* Only consult platform layer if pset is active, which
	3723	* it may not be in some cases when a multi-set system
	3724	* is going to sleep.
	3725	*/
	3726	if (pset->online_processor_count) {
	3727	if ((processor == PROCESSOR_NULL) \|\| (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
	3728	processor_t mc_processor = machine_choose_processor(pset, processor);
	3729	if (mc_processor != PROCESSOR_NULL)
	3730	processor = mc_processor->processor_primary;
	3731	}
	3732	}
	3733
	3734	/*
	3735	* At this point, we may have a processor hint, and we may have
	3736	* an initial starting pset. If the hint is not in the pset, or
	3737	* if the hint is for a processor in an invalid state, discard
	3738	* the hint.
	3739	*/
	3740	if (processor != PROCESSOR_NULL) {
	3741	if (processor->processor_set != pset) {
	3742	processor = PROCESSOR_NULL;
	3743	} else if (!processor->is_recommended) {
	3744	processor = PROCESSOR_NULL;
	3745	} else {
	3746	switch (processor->state) {
	3747	case PROCESSOR_START:
	3748	case PROCESSOR_SHUTDOWN:
	3749	case PROCESSOR_OFF_LINE:
	3750	/*
	3751	* Hint is for a processor that cannot support running new threads.
	3752	*/
	3753	processor = PROCESSOR_NULL;
	3754	break;
	3755	case PROCESSOR_IDLE:
	3756	/*
	3757	* Hint is for an idle processor. Assume it is no worse than any other
	3758	* idle processor. The platform layer had an opportunity to provide
	3759	* the "least cost idle" processor above.
	3760	*/
	3761	return (processor);
	3762	case PROCESSOR_RUNNING:
	3763	case PROCESSOR_DISPATCHING:
	3764	/*
	3765	* Hint is for an active CPU. This fast-path allows
	3766	* realtime threads to preempt non-realtime threads
	3767	* to regain their previous executing processor.
	3768	*/
	3769	if ((thread->sched_pri >= BASEPRI_RTQUEUES) &&
	3770	(processor->current_pri < BASEPRI_RTQUEUES))
	3771	return (processor);
	3772
	3773	/* Otherwise, use hint as part of search below */
	3774	break;
	3775	default:
	3776	processor = PROCESSOR_NULL;
	3777	break;
	3778	}
	3779	}
	3780	}
	3781
	3782	/*
	3783	* Iterate through the processor sets to locate
	3784	* an appropriate processor. Seed results with
	3785	* a last-processor hint, if available, so that
	3786	* a search must find something strictly better
	3787	* to replace it.
	3788	*
	3789	* A primary/secondary pair of SMT processors are
	3790	* "unpaired" if the primary is busy but its
	3791	* corresponding secondary is idle (so the physical
	3792	* core has full use of its resources).
	3793	*/
	3794
		/*
		* Search state, filled in by the loops below:
		*  lowest_priority / lp_processor: active primary with the
		*    lowest current_pri;
		*  lowest_secondary_priority / lp_paired_secondary_processor:
		*    active secondary with the lowest current_pri;
		*  lowest_unpaired_primary_priority / lp_unpaired_primary_processor /
		*    lp_unpaired_secondary_processor: running or dispatching primary
		*    with the lowest current_pri whose secondary is on the idle
		*    secondary queue, together with that secondary;
		*  lowest_count / lc_processor: active processor with the smallest
		*    run queue count;
		*  furthest_deadline / fd_processor: processor at realtime priority
		*    with the latest deadline.
		* The priority minima start at MAXPRI + 1 so that any real
		* priority compares lower.
		*/
	3795	integer_t lowest_priority = MAXPRI + 1;
	3796	integer_t lowest_secondary_priority = MAXPRI + 1;
	3797	integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
	3798	integer_t lowest_count = INT_MAX;
	3799	uint64_t furthest_deadline = 1;
	3800	processor_t lp_processor = PROCESSOR_NULL;
	3801	processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
	3802	processor_t lp_unpaired_secondary_processor = PROCESSOR_NULL;
	3803	processor_t lp_paired_secondary_processor = PROCESSOR_NULL;
	3804	processor_t lc_processor = PROCESSOR_NULL;
	3805	processor_t fd_processor = PROCESSOR_NULL;
	3806
	3807	if (processor != PROCESSOR_NULL) {
	3808	/* All other states should be enumerated above. */
	3809	assert(processor->state == PROCESSOR_RUNNING \|\| processor->state == PROCESSOR_DISPATCHING);
	3810
	3811	lowest_priority = processor->current_pri;
	3812	lp_processor = processor;
	3813
	3814	if (processor->current_pri >= BASEPRI_RTQUEUES) {
	3815	furthest_deadline = processor->deadline;
	3816	fd_processor = processor;
	3817	}
	3818
	3819	lowest_count = SCHED(processor_runq_count)(processor);
	3820	lc_processor = processor;
	3821	}
	3822
	3823	do {
	3824
	3825	/*
	3826	* Choose an idle processor, in pset traversal order
		* (skipping any with a pending AST or not recommended)
	3827	*/
	3828	qe_foreach_element(processor, &cset->idle_queue, processor_queue) {
	3829	if (bit_test(cset->pending_AST_cpu_mask, processor->cpu_id)) {
	3830	continue;
	3831	}
	3832	if (processor->is_recommended)
	3833	return processor;
	3834	}
	3835
	3836	/*
	3837	* Otherwise, enumerate active and idle processors to find primary candidates
	3838	* with lower priority/etc.
	3839	*/
	3840
	3841	qe_foreach_element(processor, &cset->active_queue, processor_queue) {
	3842
	3843	if (!processor->is_recommended) {
	3844	continue;
	3845	}
	3846	if (bit_test(cset->pending_AST_cpu_mask, processor->cpu_id)) {
	3847	continue;
	3848	}
	3849
	3850	integer_t cpri = processor->current_pri;
		/* A processor that is not its own primary is an SMT secondary and is tracked separately */
	3851	if (processor->processor_primary != processor) {
	3852	if (cpri < lowest_secondary_priority) {
	3853	lowest_secondary_priority = cpri;
	3854	lp_paired_secondary_processor = processor;
	3855	}
	3856	} else {
	3857	if (cpri < lowest_priority) {
	3858	lowest_priority = cpri;
	3859	lp_processor = processor;
	3860	}
	3861	}
	3862
	3863	if ((cpri >= BASEPRI_RTQUEUES) && (processor->deadline > furthest_deadline)) {
	3864	furthest_deadline = processor->deadline;
	3865	fd_processor = processor;
	3866	}
	3867
	3868	integer_t ccount = SCHED(processor_runq_count)(processor);
	3869	if (ccount < lowest_count) {
	3870	lowest_count = ccount;
	3871	lc_processor = processor;
	3872	}
	3873	}
	3874
	3875	/*
	3876	* For SMT configs, these idle secondary processors must have active primary. Otherwise
	3877	* the idle primary would have short-circuited the loop above
	3878	*/
	3879	qe_foreach_element(processor, &cset->idle_secondary_queue, processor_queue) {
	3880
	3881	if (!processor->is_recommended) {
	3882	continue;
	3883	}
	3884
	3885	processor_t cprimary = processor->processor_primary;
	3886
	3887	if (bit_test(cset->pending_AST_cpu_mask, cprimary->cpu_id)) {
	3888	continue;
	3889	}
	3890
	3891	/* If the primary processor is offline or starting up, it's not a candidate for this path */
	3892	if (cprimary->state == PROCESSOR_RUNNING \|\| cprimary->state == PROCESSOR_DISPATCHING) {
	3893	integer_t primary_pri = cprimary->current_pri;
	3894
	3895	if (primary_pri < lowest_unpaired_primary_priority) {
	3896	lowest_unpaired_primary_priority = primary_pri;
	3897	lp_unpaired_primary_processor = cprimary;
	3898	lp_unpaired_secondary_processor = processor;
	3899	}
	3900	}
	3901	}
	3902
	3903
	3904	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
	3905
	3906	/*
	3907	* For realtime threads, the most important aspect is
	3908	* scheduling latency, so we attempt to assign threads
	3909	* to good preemption candidates (assuming an idle primary
	3910	* processor was not available above).
	3911	*/
	3912
	3913	if (thread->sched_pri > lowest_unpaired_primary_priority) {
	3914	/* Move to end of active queue so that the next thread doesn't also pick it */
	3915	re_queue_tail(&cset->active_queue, &lp_unpaired_primary_processor->processor_queue);
	3916	return lp_unpaired_primary_processor;
	3917	}
	3918	if (thread->sched_pri > lowest_priority) {
	3919	/* Move to end of active queue so that the next thread doesn't also pick it */
	3920	re_queue_tail(&cset->active_queue, &lp_processor->processor_queue);
	3921	return lp_processor;
	3922	}
	3923	if (sched_allow_rt_smt && (thread->sched_pri > lowest_secondary_priority)) {
	3924	return lp_paired_secondary_processor;
	3925	}
		/*
		* NOTE(review): furthest_deadline starts at 1 with fd_processor
		* still PROCESSOR_NULL, so a thread whose realtime.deadline is 0
		* would get PROCESSOR_NULL back here if no realtime processor was
		* seen -- confirm callers never pass a zero deadline.
		*/
	3926	if (thread->realtime.deadline < furthest_deadline)
	3927	return fd_processor;
	3928
	3929	/*
	3930	* If all primary and secondary CPUs are busy with realtime
	3931	* threads with deadlines earlier than us, move on to next
	3932	* pset.
	3933	*/
	3934	}
	3935	else {
	3936
	3937	if (thread->sched_pri > lowest_unpaired_primary_priority) {
	3938	/* Move to end of active queue so that the next thread doesn't also pick it */
	3939	re_queue_tail(&cset->active_queue, &lp_unpaired_primary_processor->processor_queue);
	3940	return lp_unpaired_primary_processor;
	3941	}
	3942	if (thread->sched_pri > lowest_priority) {
	3943	/* Move to end of active queue so that the next thread doesn't also pick it */
	3944	re_queue_tail(&cset->active_queue, &lp_processor->processor_queue);
	3945	return lp_processor;
	3946	}
	3947
	3948	/*
	3949	* If all primary processors in this pset are running a higher
	3950	* priority thread, move on to next pset. Only when we have
	3951	* exhausted this search do we fall back to other heuristics.
	3952	*/
	3953	}
	3954
	3955	/*
	3956	* Move onto the next processor set.
		* The current pset lock is dropped before the next one is taken;
		* when the walk wraps back to the starting pset, the last pset
		* visited is left locked.
	3957	*/
	3958	nset = next_pset(cset);
	3959
	3960	if (nset != pset) {
	3961	pset_unlock(cset);
	3962
	3963	cset = nset;
	3964	pset_lock(cset);
	3965	}
	3966	} while (nset != pset);
	3967
	3968	/*
	3969	* Make sure that we pick a running processor,
	3970	* and that the correct processor set is locked.
	3971	* Since we may have unlocked the candidate processor's
	3972	* pset, it may have changed state.
	3973	*
	3974	* All primary processors are running a higher priority
	3975	* thread, so the only options left are enqueuing on
	3976	* the secondary processor that would perturb the least priority
	3977	* primary, or the least busy primary.
		*
		* Each candidate is cleared once it has been tried, so a rejected
		* candidate is not retried and the loop eventually reaches
		* master_processor, which is always accepted.
	3978	*/
	3979	do {
	3980
	3981	/* lowest_priority is evaluated in the main loops above */
	3982	if (lp_unpaired_secondary_processor != PROCESSOR_NULL) {
	3983	processor = lp_unpaired_secondary_processor;
	3984	lp_unpaired_secondary_processor = PROCESSOR_NULL;
	3985	} else if (lp_paired_secondary_processor != PROCESSOR_NULL) {
	3986	processor = lp_paired_secondary_processor;
	3987	lp_paired_secondary_processor = PROCESSOR_NULL;
	3988	} else if (lc_processor != PROCESSOR_NULL) {
	3989	processor = lc_processor;
	3990	lc_processor = PROCESSOR_NULL;
	3991	} else {
	3992	/*
	3993	* All processors are executing higher
	3994	* priority threads, and the lowest_count
	3995	* candidate was not usable
	3996	*/
	3997	processor = master_processor;
	3998	}
	3999
	4000	/*
	4001	* Check that the correct processor set is
	4002	* returned locked.
	4003	*/
	4004	if (cset != processor->processor_set) {
	4005	pset_unlock(cset);
	4006	cset = processor->processor_set;
	4007	pset_lock(cset);
	4008	}
	4009
	4010	/*
	4011	* We must verify that the chosen processor is still available.
	4012	* master_processor is an exception, since we may need to preempt
	4013	* a running thread on it during processor shutdown (for sleep),
	4014	* and that thread needs to be enqueued on its runqueue to run
	4015	* when the processor is restarted.
	4016	*/
	4017	if (processor != master_processor && (processor->state == PROCESSOR_SHUTDOWN \|\| processor->state == PROCESSOR_OFF_LINE))
	4018	processor = PROCESSOR_NULL;
	4019
	4020	} while (processor == PROCESSOR_NULL);
	4021
		/* As with the returns above: presumably moved to the tail so the next caller does not pick the same processor */
	4022	if (processor->state == PROCESSOR_RUNNING) {
	4023	re_queue_tail(&cset->active_queue, &processor->processor_queue);
	4024	}
	4025
	4026	return (processor);
	4027	}
	4028
	4029	/*
	4030	* thread_setrun:
	4031	*
	4032	* Dispatch thread for execution, onto an idle
	4033	* processor or run queue, and signal a preemption
	4034	* as appropriate.
	4035	*
	4036	* Thread must be locked.
	4037	*/
	4038	void
	4039	thread_setrun(
	4040	thread_t thread,
	4041	integer_t options)
	4042	{
	4043	processor_t processor;
	4044	processor_set_t pset;
	4045
	4046	assert((thread->state & (TH_RUN\|TH_WAIT\|TH_UNINT\|TH_TERMINATE\|TH_TERMINATE2)) == TH_RUN);
	4047	assert(thread->runq == PROCESSOR_NULL);
	4048
	4049	/*
	4050	* Update priority if needed.
	4051	*/
	4052	if (SCHED(can_update_priority)(thread))
	4053	SCHED(update_priority)(thread);
	4054
	4055	thread->sfi_class = sfi_thread_classify(thread);
	4056
	4057	assert(thread->runq == PROCESSOR_NULL);
	4058
	4059	#if __SMP__
	4060	if (thread->bound_processor == PROCESSOR_NULL) {
	4061	/*
	4062	* Unbound case.
	4063	*/
	4064	if (thread->affinity_set != AFFINITY_SET_NULL) {
	4065	/*
	4066	* Use affinity set policy hint.
	4067	*/
	4068	pset = thread->affinity_set->aset_pset;
	4069	pset_lock(pset);
	4070
	4071	processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
	4072	pset = processor->processor_set;
	4073
	4074	SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)\|DBG_FUNC_NONE,
	4075	(uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
	4076	} else if (thread->last_processor != PROCESSOR_NULL) {
	4077	/*
	4078	* Simple (last processor) affinity case.
	4079	*/
	4080	processor = thread->last_processor;
	4081	pset = processor->processor_set;
	4082	pset_lock(pset);
	4083	processor = SCHED(choose_processor)(pset, processor, thread);
	4084	pset = processor->processor_set;
	4085
	4086	SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)\|DBG_FUNC_NONE,
	4087	(uintptr_t)thread_tid(thread), thread->last_processor->cpu_id, processor->cpu_id, processor->state, 0);
	4088	} else {
	4089	/*
	4090	* No Affinity case:
	4091	*
	4092	* Utilitize a per task hint to spread threads
	4093	* among the available processor sets.
	4094	*/
	4095	task_t task = thread->task;
	4096
	4097	pset = task->pset_hint;
	4098	if (pset == PROCESSOR_SET_NULL)
	4099	pset = current_processor()->processor_set;
	4100
	4101	pset = choose_next_pset(pset);
	4102	pset_lock(pset);
	4103
	4104	processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
	4105	pset = processor->processor_set;
	4106	task->pset_hint = pset;
	4107
	4108	SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)\|DBG_FUNC_NONE,
	4109	(uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
	4110	}
	4111	} else {
	4112	/*
	4113	* Bound case:
	4114	*
	4115	* Unconditionally dispatch on the processor.
	4116	*/
	4117	processor = thread->bound_processor;
	4118	pset = processor->processor_set;
	4119	pset_lock(pset);
	4120
	4121	SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)\|DBG_FUNC_NONE,
	4122	(uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
	4123	}
	4124	#else /* !__SMP__ */
	4125	/* Only one processor to choose */
	4126	assert(thread->bound_processor == PROCESSOR_NULL \|\| thread->bound_processor == master_processor);
	4127	processor = master_processor;
	4128	pset = processor->processor_set;
	4129	pset_lock(pset);
	4130	#endif /* !__SMP__ */
	4131
	4132	/*
	4133	* Dispatch the thread on the chosen processor.
	4134	* TODO: This should be based on sched_mode, not sched_pri
	4135	*/
	4136	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
	4137	realtime_setrun(processor, thread);
	4138	} else {
	4139	processor_setrun(processor, thread, options);
	4140	}
	4141	/* pset is now unlocked */
	4142	if (thread->bound_processor == PROCESSOR_NULL) {
	4143	SCHED(check_spill)(pset, thread);
	4144	}
	4145	}
	4146
	4147	processor_set_t
	4148	task_choose_pset(
	4149	task_t task)
	4150	{
	4151	processor_set_t pset = task->pset_hint;
	4152
	4153	if (pset != PROCESSOR_SET_NULL)
	4154	pset = choose_next_pset(pset);
	4155
	4156	return (pset);
	4157	}
	4158
	4159	/*
	4160	* Check for a preemption point in
	4161	* the current context.
	4162	*
	4163	* Called at splsched with thread locked.
	4164	*/
	4165	ast_t
	4166	csw_check(
	4167	processor_t processor,
	4168	ast_t check_reason)
	4169	{
	4170	processor_set_t pset = processor->processor_set;
	4171	ast_t result;
	4172
	4173	pset_lock(pset);
	4174
	4175	/* If we were sent a remote AST and interrupted a running processor, acknowledge it here with pset lock held */
	4176	bit_clear(pset->pending_AST_cpu_mask, processor->cpu_id);
	4177
	4178	result = csw_check_locked(processor, pset, check_reason);
	4179
	4180	pset_unlock(pset);
	4181
	4182	return result;
	4183	}
	4184
	4185	/*
	4186	* Check for preemption at splsched with
	4187	* pset and thread locked
	4188	*/
	4189	ast_t
	4190	csw_check_locked(
	4191	processor_t processor,
	4192	processor_set_t pset,
	4193	ast_t check_reason)
	4194	{
	4195	ast_t result;
	4196	thread_t thread = processor->active_thread;
	4197
	4198	if (processor->first_timeslice) {
	4199	if (rt_runq_count(pset) > 0)
	4200	return (check_reason \| AST_PREEMPT \| AST_URGENT);
	4201	}
	4202	else {
	4203	if (rt_runq_count(pset) > 0) {
	4204	if (BASEPRI_RTQUEUES > processor->current_pri)
	4205	return (check_reason \| AST_PREEMPT \| AST_URGENT);
	4206	else
	4207	return (check_reason \| AST_PREEMPT);
	4208	}
	4209	}
	4210
	4211	result = SCHED(processor_csw_check)(processor);
	4212	if (result != AST_NONE)
	4213	return (check_reason \| result \| (thread_eager_preemption(thread) ? AST_URGENT : AST_NONE));
	4214
	4215	#if __SMP__
	4216
	4217	/*
	4218	* If the current thread is running on a processor that is no longer recommended, gently
	4219	* (non-urgently) get to a point and then block, and which point thread_select() should
	4220	* try to idle the processor and re-dispatch the thread to a recommended processor.
	4221	*/
	4222	if (!processor->is_recommended) {
	4223	return (check_reason \| AST_PREEMPT);
	4224	}
	4225
	4226	/*
	4227	* Same for avoid-processor
	4228	*
	4229	* TODO: Should these set AST_REBALANCE?
	4230	*/
	4231	if (SCHED(avoid_processor_enabled) && SCHED(thread_avoid_processor)(processor, thread)) {
	4232	return (check_reason \| AST_PREEMPT);
	4233	}
	4234
	4235	/*
	4236	* Even though we could continue executing on this processor, a
	4237	* secondary SMT core should try to shed load to another primary core.
	4238	*
	4239	* TODO: Should this do the same check that thread_select does? i.e.
	4240	* if no bound threads target this processor, and idle primaries exist, preempt
	4241	* The case of RT threads existing is already taken care of above
	4242	* Consider Capri in this scenario.
	4243	*
	4244	* if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue))
	4245	*
	4246	* TODO: Alternatively - check if only primary is idle, or check if primary's pri is lower than mine.
	4247	*/
	4248
	4249	if (processor->current_pri < BASEPRI_RTQUEUES &&
	4250	processor->processor_primary != processor)
	4251	return (check_reason \| AST_PREEMPT);
	4252	#endif
	4253
	4254	if (thread->state & TH_SUSP)
	4255	return (check_reason \| AST_PREEMPT);
	4256
	4257	#if CONFIG_SCHED_SFI
	4258	/*
	4259	* Current thread may not need to be preempted, but maybe needs
	4260	* an SFI wait?
	4261	*/
	4262	result = sfi_thread_needs_ast(thread, NULL);
	4263	if (result != AST_NONE)
	4264	return (check_reason \| result);
	4265	#endif
	4266
	4267	return (AST_NONE);
	4268	}
	4269
	4270	/*
	4271	* set_sched_pri:
	4272	*
	4273	* Set the scheduled priority of the specified thread.
	4274	*
	4275	* This may cause the thread to change queues.
	4276	*
	4277	* Thread must be locked.
	4278	*/
	4279	void
	4280	set_sched_pri(
	4281	thread_t thread,
	4282	int new_priority)
	4283	{
	4284	thread_t cthread = current_thread();
	4285	boolean_t is_current_thread = (thread == cthread) ? TRUE : FALSE;
	4286	int curgency, nurgency;
	4287	uint64_t urgency_param1, urgency_param2;
	4288	boolean_t removed_from_runq = FALSE;
	4289
	4290	int old_priority = thread->sched_pri;
	4291
	4292	/* If we're already at this priority, no need to mess with the runqueue */
	4293	if (new_priority == old_priority)
	4294	return;
	4295
	4296	if (is_current_thread) {
	4297	assert(thread->runq == PROCESSOR_NULL);
	4298	curgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
	4299	} else {
	4300	removed_from_runq = thread_run_queue_remove(thread);
	4301	}
	4302
	4303	thread->sched_pri = new_priority;
	4304
	4305	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
	4306	(uintptr_t)thread_tid(thread),
	4307	thread->base_pri,
	4308	thread->sched_pri,
	4309	thread->sched_usage,
	4310	0);
	4311
	4312	if (is_current_thread) {
	4313	nurgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
	4314	/*
	4315	* set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
	4316	* class alterations from user space to occur relatively infrequently, hence
	4317	* those are lazily handled. QoS classes have distinct priority bands, and QoS
	4318	* inheritance is expected to involve priority changes.
	4319	*/
	4320	uint64_t ctime = mach_approximate_time();
	4321	if (nurgency != curgency) {
	4322	thread_tell_urgency(nurgency, urgency_param1, urgency_param2, 0, thread);
	4323	}
	4324	machine_thread_going_on_core(thread, nurgency, 0, 0, ctime);
	4325	}
	4326
	4327	if (removed_from_runq)
	4328	thread_run_queue_reinsert(thread, SCHED_PREEMPT \| SCHED_TAILQ);
	4329	else if (thread->state & TH_RUN) {
	4330	processor_t processor = thread->last_processor;
	4331
	4332	if (is_current_thread) {
	4333	processor_state_update_from_thread(processor, thread);
	4334
	4335	/*
	4336	* When dropping in priority, check if the thread no longer belongs on core.
	4337	* If a thread raises its own priority, don't aggressively rebalance it.
	4338	* <rdar://problem/31699165>
	4339	*/
	4340	if (new_priority < old_priority) {
	4341	ast_t preempt;
	4342
	4343	if ((preempt = csw_check(processor, AST_NONE)) != AST_NONE)
	4344	ast_on(preempt);
	4345	}
	4346	} else if (processor != PROCESSOR_NULL && processor->active_thread == thread) {
	4347	cause_ast_check(processor);
	4348	}
	4349	}
	4350	}
	4351
	4352	/*
	4353	* thread_run_queue_remove_for_handoff
	4354	*
	4355	* Pull a thread or its (recursive) push target out of the runqueue
	4356	* so that it is ready for thread_run()
	4357	*
	4358	* Called at splsched
	4359	*
	4360	* Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
	4361	* This may be different than the thread that was passed in.
	4362	*/
	4363	thread_t
	4364	thread_run_queue_remove_for_handoff(thread_t thread) {
	4365
	4366	thread_t pulled_thread = THREAD_NULL;
	4367
	4368	thread_lock(thread);
	4369
	4370	/*
	4371	* Check that the thread is not bound
	4372	* to a different processor, and that realtime
	4373	* is not involved.
	4374	*
	4375	* Next, pull it off its run queue. If it
	4376	* doesn't come, it's not eligible.
	4377	*/
	4378
	4379	processor_t processor = current_processor();
	4380	if (processor->current_pri < BASEPRI_RTQUEUES && thread->sched_pri < BASEPRI_RTQUEUES &&
	4381	(thread->bound_processor == PROCESSOR_NULL \|\| thread->bound_processor == processor)) {
	4382
	4383	if (thread_run_queue_remove(thread))
	4384	pulled_thread = thread;
	4385	}
	4386
	4387	thread_unlock(thread);
	4388
	4389	return pulled_thread;
	4390	}
	4391
	4392	/*
	4393	* thread_run_queue_remove:
	4394	*
	4395	* Remove a thread from its current run queue and
	4396	* return TRUE if successful.
	4397	*
	4398	* Thread must be locked.
	4399	*
	4400	* If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
	4401	* run queues because the caller locked the thread. Otherwise
	4402	* the thread is on a run queue, but could be chosen for dispatch
	4403	* and removed by another processor under a different lock, which
	4404	* will set thread->runq to PROCESSOR_NULL.
	4405	*
	4406	* Hence the thread select path must not rely on anything that could
	4407	* be changed under the thread lock after calling this function,
	4408	* most importantly thread->sched_pri.
	4409	*/
	4410	boolean_t
	4411	thread_run_queue_remove(
	4412	thread_t thread)
	4413	{
	4414	boolean_t removed = FALSE;
	4415	processor_t processor = thread->runq;
	4416
	4417	if ((thread->state & (TH_RUN\|TH_WAIT)) == TH_WAIT) {
	4418	/* Thread isn't runnable */
	4419	assert(thread->runq == PROCESSOR_NULL);
	4420	return FALSE;
	4421	}
	4422
	4423	if (processor == PROCESSOR_NULL) {
	4424	/*
	4425	* The thread is either not on the runq,
	4426	* or is in the midst of being removed from the runq.
	4427	*
	4428	* runq is set to NULL under the pset lock, not the thread
	4429	* lock, so the thread may still be in the process of being dequeued
	4430	* from the runq. It will wait in invoke for the thread lock to be
	4431	* dropped.
	4432	*/
	4433
	4434	return FALSE;
	4435	}
	4436
	4437	if (thread->sched_pri < BASEPRI_RTQUEUES) {
	4438	return SCHED(processor_queue_remove)(processor, thread);
	4439	}
	4440
	4441	processor_set_t pset = processor->processor_set;
	4442
	4443	rt_lock_lock(pset);
	4444
	4445	if (thread->runq != PROCESSOR_NULL) {
	4446	/*
	4447	* Thread is on the RT run queue and we have a lock on
	4448	* that run queue.
	4449	*/
	4450
	4451	remqueue(&thread->runq_links);
	4452	SCHED_STATS_RUNQ_CHANGE(&SCHED(rt_runq)(pset)->runq_stats, rt_runq_count(pset));
	4453	rt_runq_count_decr(pset);
	4454
	4455	thread->runq = PROCESSOR_NULL;
	4456
	4457	removed = TRUE;
	4458	}
	4459
	4460	rt_lock_unlock(pset);
	4461
	4462	return (removed);
	4463	}
	4464
	4465	/*
	4466	* Put the thread back where it goes after a thread_run_queue_remove
	4467	*
	4468	* Thread must have been removed under the same thread lock hold
	4469	*
	4470	* thread locked, at splsched
	4471	*/
	4472	void
	4473	thread_run_queue_reinsert(thread_t thread, integer_t options)
	4474	{
	4475	assert(thread->runq == PROCESSOR_NULL);
	4476	assert(thread->state & (TH_RUN));
	4477
	4478	thread_setrun(thread, options);
	4479	}
	4480
	4481	void
	4482	sys_override_cpu_throttle(int flag)
	4483	{
	4484	if (flag == CPU_THROTTLE_ENABLE)
	4485	cpu_throttle_enabled = 1;
	4486	if (flag == CPU_THROTTLE_DISABLE)
	4487	cpu_throttle_enabled = 0;
	4488	}
	4489
	4490	int
	4491	thread_get_urgency(thread_t thread, uint64_t arg1, uint64_t arg2)
	4492	{
	4493	if (thread == NULL \|\| (thread->state & TH_IDLE)) {
	4494	*arg1 = 0;
	4495	*arg2 = 0;
	4496
	4497	return (THREAD_URGENCY_NONE);
	4498	} else if (thread->sched_mode == TH_MODE_REALTIME) {
	4499	*arg1 = thread->realtime.period;
	4500	*arg2 = thread->realtime.deadline;
	4501
	4502	return (THREAD_URGENCY_REAL_TIME);
	4503	} else if (cpu_throttle_enabled &&
	4504	((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
	4505	/*
	4506	* Background urgency applied when thread priority is MAXPRI_THROTTLE or lower and thread is not promoted
	4507	*/
	4508	*arg1 = thread->sched_pri;
	4509	*arg2 = thread->base_pri;
	4510
	4511	return (THREAD_URGENCY_BACKGROUND);
	4512	} else {
	4513	/* For otherwise unclassified threads, report throughput QoS
	4514	* parameters
	4515	*/
	4516	*arg1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
	4517	*arg2 = proc_get_effective_task_policy(thread->task, TASK_POLICY_THROUGH_QOS);
	4518
	4519	return (THREAD_URGENCY_NORMAL);
	4520	}
	4521	}
	4522
	4523	perfcontrol_class_t
	4524	thread_get_perfcontrol_class(thread_t thread)
	4525	{
	4526	/* Special case handling */
	4527	if (thread->state & TH_IDLE)
	4528	return PERFCONTROL_CLASS_IDLE;
	4529	if (thread->task == kernel_task)
	4530	return PERFCONTROL_CLASS_KERNEL;
	4531	if (thread->sched_mode == TH_MODE_REALTIME)
	4532	return PERFCONTROL_CLASS_REALTIME;
	4533
	4534	/* perfcontrol_class based on base_pri */
	4535	if (thread->base_pri <= MAXPRI_THROTTLE)
	4536	return PERFCONTROL_CLASS_BACKGROUND;
	4537	else if (thread->base_pri <= BASEPRI_UTILITY)
	4538	return PERFCONTROL_CLASS_UTILITY;
	4539	else if (thread->base_pri <= BASEPRI_DEFAULT)
	4540	return PERFCONTROL_CLASS_NONUI;
	4541	else if (thread->base_pri <= BASEPRI_FOREGROUND)
	4542	return PERFCONTROL_CLASS_UI;
	4543	else
	4544	return PERFCONTROL_CLASS_ABOVEUI;
	4545	}
	4546
	4547	/*
	4548	* This is the processor idle loop, which just looks for other threads
	4549	* to execute. Processor idle threads invoke this without supplying a
	4550	* current thread to idle without an asserted wait state.
	4551	*
	4552	* Returns a the next thread to execute if dispatched directly.
	4553	*/
	4554
	4555	#if 0
	4556	#define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
	4557	#else
	4558	#define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
	4559	#endif
	4560
	4561	thread_t
	4562	processor_idle(
	4563	thread_t thread,
	4564	processor_t processor)
	4565	{
	4566	processor_set_t pset = processor->processor_set;
	4567	thread_t new_thread;
	4568	int state;
	4569	(void)splsched();
	4570
	4571	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	4572	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_START,
	4573	(uintptr_t)thread_tid(thread), 0, 0, 0, 0);
	4574
	4575	SCHED_STATS_CPU_IDLE_START(processor);
	4576
	4577	timer_switch(&PROCESSOR_DATA(processor, system_state),
	4578	mach_absolute_time(), &PROCESSOR_DATA(processor, idle_state));
	4579	PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state);
	4580
	4581	while (1) {
	4582	/*
	4583	* Ensure that updates to my processor and pset state,
	4584	* made by the IPI source processor before sending the IPI,
	4585	* are visible on this processor now (even though we don't
	4586	* take the pset lock yet).
	4587	*/
	4588	atomic_thread_fence(memory_order_acquire);
	4589
	4590	if (processor->state != PROCESSOR_IDLE)
	4591	break;
	4592	if (bit_test(pset->pending_AST_cpu_mask, processor->cpu_id))
	4593	break;
	4594	#if defined(CONFIG_SCHED_DEFERRED_AST)
	4595	if (bit_test(pset->pending_deferred_AST_cpu_mask, processor->cpu_id))
	4596	break;
	4597	#endif
	4598	if (processor->is_recommended && (processor->processor_primary == processor)) {
	4599	if (rt_runq_count(pset))
	4600	break;
	4601	} else {
	4602	if (SCHED(processor_bound_count)(processor))
	4603	break;
	4604	}
	4605
	4606	#if CONFIG_SCHED_IDLE_IN_PLACE
	4607	if (thread != THREAD_NULL) {
	4608	/* Did idle-in-place thread wake up */
	4609	if ((thread->state & (TH_WAIT\|TH_SUSP)) != TH_WAIT \|\| thread->wake_active)
	4610	break;
	4611	}
	4612	#endif
	4613
	4614	IDLE_KERNEL_DEBUG_CONSTANT(
	4615	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -1, 0);
	4616
	4617	machine_track_platform_idle(TRUE);
	4618
	4619	machine_idle();
	4620
	4621	machine_track_platform_idle(FALSE);
	4622
	4623	(void)splsched();
	4624
	4625	IDLE_KERNEL_DEBUG_CONSTANT(
	4626	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0);
	4627
	4628	if (!SCHED(processor_queue_empty)(processor)) {
	4629	/* Secondary SMT processors respond to directed wakeups
	4630	* exclusively. Some platforms induce 'spurious' SMT wakeups.
	4631	*/
	4632	if (processor->processor_primary == processor)
	4633	break;
	4634	}
	4635	}
	4636
	4637	timer_switch(&PROCESSOR_DATA(processor, idle_state),
	4638	mach_absolute_time(), &PROCESSOR_DATA(processor, system_state));
	4639	PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state);
	4640
	4641	pset_lock(pset);
	4642
	4643	/* If we were sent a remote AST and came out of idle, acknowledge it here with pset lock held */
	4644	bit_clear(pset->pending_AST_cpu_mask, processor->cpu_id);
	4645	#if defined(CONFIG_SCHED_DEFERRED_AST)
	4646	bit_clear(pset->pending_deferred_AST_cpu_mask, processor->cpu_id);
	4647	#endif
	4648
	4649	state = processor->state;
	4650	if (state == PROCESSOR_DISPATCHING) {
	4651	/*
	4652	* Commmon case -- cpu dispatched.
	4653	*/
	4654	new_thread = processor->next_thread;
	4655	processor->next_thread = THREAD_NULL;
	4656	processor->state = PROCESSOR_RUNNING;
	4657
	4658	if ((new_thread != THREAD_NULL) && (SCHED(processor_queue_has_priority)(processor, new_thread->sched_pri, FALSE) \|\|
	4659	(rt_runq_count(pset) > 0)) ) {
	4660	/* Something higher priority has popped up on the runqueue - redispatch this thread elsewhere */
	4661	processor_state_update_idle(processor);
	4662	processor->deadline = UINT64_MAX;
	4663
	4664	pset_unlock(pset);
	4665
	4666	thread_lock(new_thread);
	4667	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REDISPATCH), (uintptr_t)thread_tid(new_thread), new_thread->sched_pri, rt_runq_count(pset), 0, 0);
	4668	thread_setrun(new_thread, SCHED_HEADQ);
	4669	thread_unlock(new_thread);
	4670
	4671	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	4672	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_END,
	4673	(uintptr_t)thread_tid(thread), state, 0, 0, 0);
	4674
	4675	return (THREAD_NULL);
	4676	}
	4677
	4678	sched_update_pset_load_average(pset);
	4679
	4680	pset_unlock(pset);
	4681
	4682	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	4683	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_END,
	4684	(uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0);
	4685
	4686	return (new_thread);
	4687
	4688	} else if (state == PROCESSOR_IDLE) {
	4689	re_queue_tail(&pset->active_queue, &processor->processor_queue);
	4690
	4691	pset->active_processor_count++;
	4692	sched_update_pset_load_average(pset);
	4693
	4694	processor->state = PROCESSOR_RUNNING;
	4695	processor_state_update_idle(processor);
	4696	processor->deadline = UINT64_MAX;
	4697
	4698	} else if (state == PROCESSOR_SHUTDOWN) {
	4699	/*
	4700	* Going off-line. Force a
	4701	* reschedule.
	4702	*/
	4703	if ((new_thread = processor->next_thread) != THREAD_NULL) {
	4704	processor->next_thread = THREAD_NULL;
	4705	processor_state_update_idle(processor);
	4706	processor->deadline = UINT64_MAX;
	4707
	4708	pset_unlock(pset);
	4709
	4710	thread_lock(new_thread);
	4711	thread_setrun(new_thread, SCHED_HEADQ);
	4712	thread_unlock(new_thread);
	4713
	4714	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	4715	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_END,
	4716	(uintptr_t)thread_tid(thread), state, 0, 0, 0);
	4717
	4718	return (THREAD_NULL);
	4719	}
	4720	}
	4721
	4722	pset_unlock(pset);
	4723
	4724	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	4725	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_END,
	4726	(uintptr_t)thread_tid(thread), state, 0, 0, 0);
	4727
	4728	return (THREAD_NULL);
	4729	}
	4730
	4731	/*
	4732	* Each processor has a dedicated thread which
	4733	* executes the idle loop when there is no suitable
	4734	* previous context.
	4735	*/
	4736	void
	4737	idle_thread(void)
	4738	{
	4739	processor_t processor = current_processor();
	4740	thread_t new_thread;
	4741
	4742	new_thread = processor_idle(THREAD_NULL, processor);
	4743	if (new_thread != THREAD_NULL) {
	4744	thread_run(processor->idle_thread, (thread_continue_t)idle_thread, NULL, new_thread);
	4745	/NOTREACHED/
	4746	}
	4747
	4748	thread_block((thread_continue_t)idle_thread);
	4749	/NOTREACHED/
	4750	}
	4751
	4752	kern_return_t
	4753	idle_thread_create(
	4754	processor_t processor)
	4755	{
	4756	kern_return_t result;
	4757	thread_t thread;
	4758	spl_t s;
	4759	char name[MAXTHREADNAMESIZE];
	4760
	4761	result = kernel_thread_create((thread_continue_t)idle_thread, NULL, MAXPRI_KERNEL, &thread);
	4762	if (result != KERN_SUCCESS)
	4763	return (result);
	4764
	4765	snprintf(name, sizeof(name), "idle #%d", processor->cpu_id);
	4766	thread_set_thread_name(thread, name);
	4767
	4768	s = splsched();
	4769	thread_lock(thread);
	4770	thread->bound_processor = processor;
	4771	processor->idle_thread = thread;
	4772	thread->sched_pri = thread->base_pri = IDLEPRI;
	4773	thread->state = (TH_RUN \| TH_IDLE);
	4774	thread->options \|= TH_OPT_IDLE_THREAD;
	4775	thread_unlock(thread);
	4776	splx(s);
	4777
	4778	thread_deallocate(thread);
	4779
	4780	return (KERN_SUCCESS);
	4781	}
	4782
	4783	/*
	4784	* sched_startup:
	4785	*
	4786	* Kicks off scheduler services.
	4787	*
	4788	* Called at splsched.
	4789	*/
	4790	void
	4791	sched_startup(void)
	4792	{
	4793	kern_return_t result;
	4794	thread_t thread;
	4795
	4796	simple_lock_init(&sched_vm_group_list_lock, 0);
	4797
	4798	#if __arm__ \|\| __arm64__
	4799	simple_lock_init(&sched_recommended_cores_lock, 0);
	4800	#endif /* __arm__ \|\| __arm64__ */
	4801
	4802	result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
	4803	(void *)SCHED(maintenance_continuation), MAXPRI_KERNEL, &thread);
	4804	if (result != KERN_SUCCESS)
	4805	panic("sched_startup");
	4806
	4807	thread_deallocate(thread);
	4808
	4809	assert_thread_magic(thread);
	4810
	4811	/*
	4812	* Yield to the sched_init_thread once, to
	4813	* initialize our own thread after being switched
	4814	* back to.
	4815	*
	4816	* The current thread is the only other thread
	4817	* active at this point.
	4818	*/
	4819	thread_block(THREAD_CONTINUE_NULL);
	4820	}
	4821
	4822	#if __arm64__
	4823	static _Atomic uint64_t sched_perfcontrol_callback_deadline;
	4824	#endif /* __arm64__ */
	4825
	4826
	4827	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	4828
	4829	static volatile uint64_t sched_maintenance_deadline;
	4830	static uint64_t sched_tick_last_abstime;
	4831	static uint64_t sched_tick_delta;
	4832	uint64_t sched_tick_max_delta;
	4833
	4834
	4835	/*
	4836	* sched_init_thread:
	4837	*
	4838	* Perform periodic bookkeeping functions about ten
	4839	* times per second.
	4840	*/
	4841	void
	4842	sched_timeshare_maintenance_continue(void)
	4843	{
	4844	uint64_t sched_tick_ctime, late_time;
	4845
	4846	struct sched_update_scan_context scan_context = {
	4847	.earliest_bg_make_runnable_time = UINT64_MAX,
	4848	.earliest_normal_make_runnable_time = UINT64_MAX,
	4849	.earliest_rt_make_runnable_time = UINT64_MAX
	4850	};
	4851
	4852	sched_tick_ctime = mach_absolute_time();
	4853
	4854	if (__improbable(sched_tick_last_abstime == 0)) {
	4855	sched_tick_last_abstime = sched_tick_ctime;
	4856	late_time = 0;
	4857	sched_tick_delta = 1;
	4858	} else {
	4859	late_time = sched_tick_ctime - sched_tick_last_abstime;
	4860	sched_tick_delta = late_time / sched_tick_interval;
	4861	/* Ensure a delta of 1, since the interval could be slightly
	4862	* smaller than the sched_tick_interval due to dispatch
	4863	* latencies.
	4864	*/
	4865	sched_tick_delta = MAX(sched_tick_delta, 1);
	4866
	4867	/* In the event interrupt latencies or platform
	4868	* idle events that advanced the timebase resulted
	4869	* in periods where no threads were dispatched,
	4870	* cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
	4871	* iterations.
	4872	*/
	4873	sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);
	4874
	4875	sched_tick_last_abstime = sched_tick_ctime;
	4876	sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
	4877	}
	4878
	4879	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)\|DBG_FUNC_START,
	4880	sched_tick_delta, late_time, 0, 0, 0);
	4881
	4882	/* Add a number of pseudo-ticks corresponding to the elapsed interval
	4883	* This could be greater than 1 if substantial intervals where
	4884	* all processors are idle occur, which rarely occurs in practice.
	4885	*/
	4886
	4887	sched_tick += sched_tick_delta;
	4888
	4889	update_vm_info();
	4890
	4891	/*
	4892	* Compute various averages.
	4893	*/
	4894	compute_averages(sched_tick_delta);
	4895
	4896	/*
	4897	* Scan the run queues for threads which
	4898	* may need to be updated, and find the earliest runnable thread on the runqueue
	4899	* to report its latency.
	4900	*/
	4901	SCHED(thread_update_scan)(&scan_context);
	4902
	4903	SCHED(rt_runq_scan)(&scan_context);
	4904
	4905	uint64_t ctime = mach_absolute_time();
	4906
	4907	uint64_t bg_max_latency = (ctime > scan_context.earliest_bg_make_runnable_time) ?
	4908	ctime - scan_context.earliest_bg_make_runnable_time : 0;
	4909
	4910	uint64_t default_max_latency = (ctime > scan_context.earliest_normal_make_runnable_time) ?
	4911	ctime - scan_context.earliest_normal_make_runnable_time : 0;
	4912
	4913	uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
	4914	ctime - scan_context.earliest_rt_make_runnable_time : 0;
	4915
	4916	machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);
	4917
	4918	/*
	4919	* Check to see if the special sched VM group needs attention.
	4920	*/
	4921	sched_vm_group_maintenance();
	4922
	4923	#if __arm__ \|\| __arm64__
	4924	/* Check to see if the recommended cores failsafe is active */
	4925	sched_recommended_cores_maintenance();
	4926	#endif /* __arm__ \|\| __arm64__ */
	4927
	4928
	4929	#if DEBUG \|\| DEVELOPMENT
	4930	#if __x86_64__
	4931	#include <i386/misc_protos.h>
	4932	/* Check for long-duration interrupts */
	4933	mp_interrupt_watchdog();
	4934	#endif /* __x86_64__ */
	4935	#endif /* DEBUG \|\| DEVELOPMENT */
	4936
	4937	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) \| DBG_FUNC_END,
	4938	sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
	4939	sched_pri_shifts[TH_BUCKET_SHARE_UT], 0, 0);
	4940
	4941	assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
	4942	thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
	4943	/NOTREACHED/
	4944	}
	4945
	4946	static uint64_t sched_maintenance_wakeups;
	4947
	4948	/*
	4949	* Determine if the set of routines formerly driven by a maintenance timer
	4950	* must be invoked, based on a deadline comparison. Signals the scheduler
	4951	* maintenance thread on deadline expiration. Must be invoked at an interval
	4952	* lower than the "sched_tick_interval", currently accomplished by
	4953	* invocation via the quantum expiration timer and at context switch time.
	4954	* Performance matters: this routine reuses a timestamp approximating the
	4955	* current absolute time received from the caller, and should perform
	4956	* no more than a comparison against the deadline in the common case.
	4957	*/
	4958	void
	4959	sched_timeshare_consider_maintenance(uint64_t ctime) {
	4960	uint64_t ndeadline, deadline = sched_maintenance_deadline;
	4961
	4962	if (__improbable(ctime >= deadline)) {
	4963	if (__improbable(current_thread() == sched_maintenance_thread))
	4964	return;
	4965	OSMemoryBarrier();
	4966
	4967	ndeadline = ctime + sched_tick_interval;
	4968
	4969	if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) {
	4970	thread_wakeup((event_t)sched_timeshare_maintenance_continue);
	4971	sched_maintenance_wakeups++;
	4972	}
	4973	}
	4974
	4975	#if __arm64__
	4976	uint64_t perf_deadline = __c11_atomic_load(&sched_perfcontrol_callback_deadline, memory_order_relaxed);
	4977
	4978	if (__improbable(perf_deadline && ctime >= perf_deadline)) {
	4979	/* CAS in 0, if success, make callback. Otherwise let the next context switch check again. */
	4980	if (__c11_atomic_compare_exchange_strong(&sched_perfcontrol_callback_deadline, &perf_deadline, 0,
	4981	memory_order_relaxed, memory_order_relaxed)) {
	4982	machine_perfcontrol_deadline_passed(perf_deadline);
	4983	}
	4984	}
	4985	#endif /* __arm64__ */
	4986
	4987	}
	4988
	4989	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	4990
	4991	void
	4992	sched_init_thread(void (*continuation)(void))
	4993	{
	4994	thread_block(THREAD_CONTINUE_NULL);
	4995
	4996	thread_t thread = current_thread();
	4997
	4998	thread_set_thread_name(thread, "sched_maintenance_thread");
	4999
	5000	sched_maintenance_thread = thread;
	5001
	5002	continuation();
	5003
	5004	/NOTREACHED/
	5005	}
	5006
	5007	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	5008
	5009	/*
	5010	* thread_update_scan / runq_scan:
	5011	*
	5012	* Scan the run queues to account for timesharing threads
	5013	* which need to be updated.
	5014	*
	5015	* Scanner runs in two passes. Pass one squirrels likely
	5016	* threads away in an array, pass two does the update.
	5017	*
	5018	* This is necessary because the run queue is locked for
	5019	* the candidate scan, but the thread is locked for the update.
	5020	*
	5021	* Array should be sized to make forward progress, without
	5022	* disabling preemption for long periods.
	5023	*/
	5024
	5025	#define THREAD_UPDATE_SIZE 128
	5026
	5027	static thread_t thread_update_array[THREAD_UPDATE_SIZE];
	5028	static uint32_t thread_update_count = 0;
	5029
	5030	/* Returns TRUE if thread was added, FALSE if thread_update_array is full */
	5031	boolean_t
	5032	thread_update_add_thread(thread_t thread)
	5033	{
	5034	if (thread_update_count == THREAD_UPDATE_SIZE)
	5035	return (FALSE);
	5036
	5037	thread_update_array[thread_update_count++] = thread;
	5038	thread_reference_internal(thread);
	5039	return (TRUE);
	5040	}
	5041
	5042	void
	5043	thread_update_process_threads(void)
	5044	{
	5045	assert(thread_update_count <= THREAD_UPDATE_SIZE);
	5046
	5047	for (uint32_t i = 0 ; i < thread_update_count ; i++) {
	5048	thread_t thread = thread_update_array[i];
	5049	assert_thread_magic(thread);
	5050	thread_update_array[i] = THREAD_NULL;
	5051
	5052	spl_t s = splsched();
	5053	thread_lock(thread);
	5054	if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) {
	5055	SCHED(update_priority)(thread);
	5056	}
	5057	thread_unlock(thread);
	5058	splx(s);
	5059
	5060	thread_deallocate(thread);
	5061	}
	5062
	5063	thread_update_count = 0;
	5064	}
	5065
	5066	/*
	5067	* Scan a runq for candidate threads.
	5068	*
	5069	* Returns TRUE if retry is needed.
	5070	*/
	5071	boolean_t
	5072	runq_scan(
	5073	run_queue_t runq,
	5074	sched_update_scan_context_t scan_context)
	5075	{
	5076	int count = runq->count;
	5077	int queue_index;
	5078
	5079	assert(count >= 0);
	5080
	5081	if (count == 0)
	5082	return FALSE;
	5083
	5084	for (queue_index = bitmap_first(runq->bitmap, NRQS);
	5085	queue_index >= 0;
	5086	queue_index = bitmap_next(runq->bitmap, queue_index)) {
	5087
	5088	thread_t thread;
	5089	queue_t queue = &runq->queues[queue_index];
	5090
	5091	qe_foreach_element(thread, queue, runq_links) {
	5092	assert(count > 0);
	5093	assert_thread_magic(thread);
	5094
	5095	if (thread->sched_stamp != sched_tick &&
	5096	thread->sched_mode == TH_MODE_TIMESHARE) {
	5097	if (thread_update_add_thread(thread) == FALSE)
	5098	return TRUE;
	5099	}
	5100
	5101	if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
	5102	if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
	5103	scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
	5104	}
	5105	} else {
	5106	if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
	5107	scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
	5108	}
	5109	}
	5110	count--;
	5111	}
	5112	}
	5113
	5114	return FALSE;
	5115	}
	5116
	5117	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	5118
	5119	boolean_t
	5120	thread_eager_preemption(thread_t thread)
	5121	{
	5122	return ((thread->sched_flags & TH_SFLAG_EAGERPREEMPT) != 0);
	5123	}
	5124
	5125	void
	5126	thread_set_eager_preempt(thread_t thread)
	5127	{
	5128	spl_t x;
	5129	processor_t p;
	5130	ast_t ast = AST_NONE;
	5131
	5132	x = splsched();
	5133	p = current_processor();
	5134
	5135	thread_lock(thread);
	5136	thread->sched_flags \|= TH_SFLAG_EAGERPREEMPT;
	5137
	5138	if (thread == current_thread()) {
	5139
	5140	ast = csw_check(p, AST_NONE);
	5141	thread_unlock(thread);
	5142	if (ast != AST_NONE) {
	5143	(void) thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
	5144	}
	5145	} else {
	5146	p = thread->last_processor;
	5147
	5148	if (p != PROCESSOR_NULL && p->state == PROCESSOR_RUNNING &&
	5149	p->active_thread == thread) {
	5150	cause_ast_check(p);
	5151	}
	5152
	5153	thread_unlock(thread);
	5154	}
	5155
	5156	splx(x);
	5157	}
	5158
	5159	void
	5160	thread_clear_eager_preempt(thread_t thread)
	5161	{
	5162	spl_t x;
	5163
	5164	x = splsched();
	5165	thread_lock(thread);
	5166
	5167	thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
	5168
	5169	thread_unlock(thread);
	5170	splx(x);
	5171	}
	5172
	5173	/*
	5174	* Scheduling statistics
	5175	*/
	5176	void
	5177	sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
	5178	{
	5179	struct processor_sched_statistics *stats;
	5180	boolean_t to_realtime = FALSE;
	5181
	5182	stats = &processor->processor_data.sched_stats;
	5183	stats->csw_count++;
	5184
	5185	if (otherpri >= BASEPRI_REALTIME) {
	5186	stats->rt_sched_count++;
	5187	to_realtime = TRUE;
	5188	}
	5189
	5190	if ((reasons & AST_PREEMPT) != 0) {
	5191	stats->preempt_count++;
	5192
	5193	if (selfpri >= BASEPRI_REALTIME) {
	5194	stats->preempted_rt_count++;
	5195	}
	5196
	5197	if (to_realtime) {
	5198	stats->preempted_by_rt_count++;
	5199	}
	5200
	5201	}
	5202	}
	5203
	5204	void
	5205	sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
	5206	{
	5207	uint64_t timestamp = mach_absolute_time();
	5208
	5209	stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
	5210	stats->last_change_timestamp = timestamp;
	5211	}
	5212
	5213	/*
	5214	* For calls from assembly code
	5215	*/
	5216	#undef thread_wakeup
	5217	void
	5218	thread_wakeup(
	5219	event_t x);
	5220
	5221	void
	5222	thread_wakeup(
	5223	event_t x)
	5224	{
	5225	thread_wakeup_with_result(x, THREAD_AWAKENED);
	5226	}
	5227
	5228	boolean_t
	5229	preemption_enabled(void)
	5230	{
	5231	return (get_preemption_level() == 0 && ml_get_interrupts_enabled());
	5232	}
	5233
	5234	static void
	5235	sched_timer_deadline_tracking_init(void) {
	5236	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
	5237	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
	5238	}
	5239
	5240	#if __arm__ \|\| __arm64__
	5241
	5242	uint32_t perfcontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
	5243	uint32_t perfcontrol_requested_recommended_core_count = MAX_CPUS;
	5244	boolean_t perfcontrol_failsafe_active = FALSE;
	5245
	5246	uint64_t perfcontrol_failsafe_maintenance_runnable_time;
	5247	uint64_t perfcontrol_failsafe_activation_time;
	5248	uint64_t perfcontrol_failsafe_deactivation_time;
	5249
	5250	/* data covering who likely caused it and how long they ran */
	5251	#define FAILSAFE_NAME_LEN 33 /* (2MAXCOMLEN)+1 from size of p_name /
	5252	char perfcontrol_failsafe_name[FAILSAFE_NAME_LEN];
	5253	int perfcontrol_failsafe_pid;
	5254	uint64_t perfcontrol_failsafe_tid;
	5255	uint64_t perfcontrol_failsafe_thread_timer_at_start;
	5256	uint64_t perfcontrol_failsafe_thread_timer_last_seen;
	5257	uint32_t perfcontrol_failsafe_recommended_at_trigger;
	5258
	5259	/*
	5260	* Perf controller calls here to update the recommended core bitmask.
	5261	* If the failsafe is active, we don't immediately apply the new value.
	5262	* Instead, we store the new request and use it after the failsafe deactivates.
	5263	*
	5264	* If the failsafe is not active, immediately apply the update.
	5265	*
	5266	* No scheduler locks are held, no other locks are held that scheduler might depend on,
	5267	* interrupts are enabled
	5268	*
	5269	* currently prototype is in osfmk/arm/machine_routines.h
	5270	*/
	5271	void
	5272	sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores)
	5273	{
	5274	assert(preemption_enabled());
	5275
	5276	spl_t s = splsched();
	5277	simple_lock(&sched_recommended_cores_lock);
	5278
	5279	perfcontrol_requested_recommended_cores = recommended_cores;
	5280	perfcontrol_requested_recommended_core_count = __builtin_popcountll(recommended_cores);
	5281
	5282	if (perfcontrol_failsafe_active == FALSE)
	5283	sched_update_recommended_cores(perfcontrol_requested_recommended_cores);
	5284	else
	5285	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	5286	MACHDBG_CODE(DBG_MACH_SCHED,MACH_REC_CORES_FAILSAFE) \| DBG_FUNC_NONE,
	5287	perfcontrol_requested_recommended_cores,
	5288	sched_maintenance_thread->last_made_runnable_time, 0, 0, 0);
	5289
	5290	simple_unlock(&sched_recommended_cores_lock);
	5291	splx(s);
	5292	}
	5293
	5294	/*
	5295	* Consider whether we need to activate the recommended cores failsafe
	5296	*
	5297	* Called from quantum timer interrupt context of a realtime thread
	5298	* No scheduler locks are held, interrupts are disabled
	5299	*/
	5300	void
	5301	sched_consider_recommended_cores(uint64_t ctime, thread_t cur_thread)
	5302	{
	5303	/*
	5304	* Check if a realtime thread is starving the system
	5305	* and bringing up non-recommended cores would help
	5306	*
	5307	* TODO: Is this the correct check for recommended == possible cores?
	5308	* TODO: Validate the checks without the relevant lock are OK.
	5309	*/
	5310
	5311	if (__improbable(perfcontrol_failsafe_active == TRUE)) {
	5312	/* keep track of how long the responsible thread runs */
	5313
	5314	simple_lock(&sched_recommended_cores_lock);
	5315
	5316	if (perfcontrol_failsafe_active == TRUE &&
	5317	cur_thread->thread_id == perfcontrol_failsafe_tid) {
	5318	perfcontrol_failsafe_thread_timer_last_seen = timer_grab(&cur_thread->user_timer) +
	5319	timer_grab(&cur_thread->system_timer);
	5320	}
	5321
	5322	simple_unlock(&sched_recommended_cores_lock);
	5323
	5324	/* we're already trying to solve the problem, so bail */
	5325	return;
	5326	}
	5327
	5328	/* The failsafe won't help if there are no more processors to enable */
	5329	if (__probable(perfcontrol_requested_recommended_core_count >= processor_count))
	5330	return;
	5331
	5332	uint64_t too_long_ago = ctime - perfcontrol_failsafe_starvation_threshold;
	5333
	5334	/* Use the maintenance thread as our canary in the coal mine */
	5335	thread_t m_thread = sched_maintenance_thread;
	5336
	5337	/* If it doesn't look bad, nothing to see here */
	5338	if (__probable(m_thread->last_made_runnable_time >= too_long_ago))
	5339	return;
	5340
	5341	/* It looks bad, take the lock to be sure */
	5342	thread_lock(m_thread);
	5343
	5344	if (m_thread->runq == PROCESSOR_NULL \|\|
	5345	(m_thread->state & (TH_RUN\|TH_WAIT)) != TH_RUN \|\|
	5346	m_thread->last_made_runnable_time >= too_long_ago) {
	5347	/*
	5348	* Maintenance thread is either on cpu or blocked, and
	5349	* therefore wouldn't benefit from more cores
	5350	*/
	5351	thread_unlock(m_thread);
	5352	return;
	5353	}
	5354
	5355	uint64_t maintenance_runnable_time = m_thread->last_made_runnable_time;
	5356
	5357	thread_unlock(m_thread);
	5358
	5359	/*
	5360	* There are cores disabled at perfcontrol's recommendation, but the
	5361	* system is so overloaded that the maintenance thread can't run.
	5362	* That likely means that perfcontrol can't run either, so it can't fix
	5363	* the recommendation. We have to kick in a failsafe to keep from starving.
	5364	*
	5365	* When the maintenance thread has been starved for too long,
	5366	* ignore the recommendation from perfcontrol and light up all the cores.
	5367	*
	5368	* TODO: Consider weird states like boot, sleep, or debugger
	5369	*/
	5370
	5371	simple_lock(&sched_recommended_cores_lock);
	5372
	5373	if (perfcontrol_failsafe_active == TRUE) {
	5374	simple_unlock(&sched_recommended_cores_lock);
	5375	return;
	5376	}
	5377
	5378	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	5379	MACHDBG_CODE(DBG_MACH_SCHED,MACH_REC_CORES_FAILSAFE) \| DBG_FUNC_START,
	5380	perfcontrol_requested_recommended_cores, maintenance_runnable_time, 0, 0, 0);
	5381
	5382	perfcontrol_failsafe_active = TRUE;
	5383	perfcontrol_failsafe_activation_time = mach_absolute_time();
	5384	perfcontrol_failsafe_maintenance_runnable_time = maintenance_runnable_time;
	5385	perfcontrol_failsafe_recommended_at_trigger = perfcontrol_requested_recommended_cores;
	5386
	5387	/* Capture some data about who screwed up (assuming that the thread on core is at fault) */
	5388	task_t task = cur_thread->task;
	5389	perfcontrol_failsafe_pid = task_pid(task);
	5390	strlcpy(perfcontrol_failsafe_name, proc_name_address(task->bsd_info), sizeof(perfcontrol_failsafe_name));
	5391
	5392	perfcontrol_failsafe_tid = cur_thread->thread_id;
	5393
	5394	/* Blame the thread for time it has run recently */
	5395	uint64_t recent_computation = (ctime - cur_thread->computation_epoch) + cur_thread->computation_metered;
	5396
	5397	uint64_t last_seen = timer_grab(&cur_thread->user_timer) + timer_grab(&cur_thread->system_timer);
	5398
	5399	/* Compute the start time of the bad behavior in terms of the thread's on core time */
	5400	perfcontrol_failsafe_thread_timer_at_start = last_seen - recent_computation;
	5401	perfcontrol_failsafe_thread_timer_last_seen = last_seen;
	5402
	5403	/* Ignore the previously recommended core configuration */
	5404	sched_update_recommended_cores(ALL_CORES_RECOMMENDED);
	5405
	5406	simple_unlock(&sched_recommended_cores_lock);
	5407	}
	5408
	5409	/*
	5410	* Now that our bacon has been saved by the failsafe, consider whether to turn it off
	5411	*
	5412	* Runs in the context of the maintenance thread, no locks held
	5413	*/
	5414	static void
	5415	sched_recommended_cores_maintenance(void)
	5416	{
	5417	/* Common case - no failsafe, nothing to be done here */
	5418	if (__probable(perfcontrol_failsafe_active == FALSE))
	5419	return;
	5420
	5421	uint64_t ctime = mach_absolute_time();
	5422
	5423	boolean_t print_diagnostic = FALSE;
	5424	char p_name[FAILSAFE_NAME_LEN] = "";
	5425
	5426	spl_t s = splsched();
	5427	simple_lock(&sched_recommended_cores_lock);
	5428
	5429	/* Check again, under the lock, to avoid races */
	5430	if (perfcontrol_failsafe_active == FALSE)
	5431	goto out;
	5432
	5433	/*
	5434	* Ensure that the other cores get another few ticks to run some threads
	5435	* If we don't have this hysteresis, the maintenance thread is the first
	5436	* to run, and then it immediately kills the other cores
	5437	*/
	5438	if ((ctime - perfcontrol_failsafe_activation_time) < perfcontrol_failsafe_starvation_threshold)
	5439	goto out;
	5440
	5441	/* Capture some diagnostic state under the lock so we can print it out later */
	5442
	5443	int pid = perfcontrol_failsafe_pid;
	5444	uint64_t tid = perfcontrol_failsafe_tid;
	5445
	5446	uint64_t thread_usage = perfcontrol_failsafe_thread_timer_last_seen -
	5447	perfcontrol_failsafe_thread_timer_at_start;
	5448	uint32_t rec_cores_before = perfcontrol_failsafe_recommended_at_trigger;
	5449	uint32_t rec_cores_after = perfcontrol_requested_recommended_cores;
	5450	uint64_t failsafe_duration = ctime - perfcontrol_failsafe_activation_time;
	5451	strlcpy(p_name, perfcontrol_failsafe_name, sizeof(p_name));
	5452
	5453	print_diagnostic = TRUE;
	5454
	5455	/* Deactivate the failsafe and reinstate the requested recommendation settings */
	5456
	5457	perfcontrol_failsafe_deactivation_time = ctime;
	5458	perfcontrol_failsafe_active = FALSE;
	5459
	5460	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	5461	MACHDBG_CODE(DBG_MACH_SCHED,MACH_REC_CORES_FAILSAFE) \| DBG_FUNC_END,
	5462	perfcontrol_requested_recommended_cores, failsafe_duration, 0, 0, 0);
	5463
	5464	sched_update_recommended_cores(perfcontrol_requested_recommended_cores);
	5465
	5466	out:
	5467	simple_unlock(&sched_recommended_cores_lock);
	5468	splx(s);
	5469
	5470	if (print_diagnostic) {
	5471	uint64_t failsafe_duration_ms = 0, thread_usage_ms = 0;
	5472
	5473	absolutetime_to_nanoseconds(failsafe_duration, &failsafe_duration_ms);
	5474	failsafe_duration_ms = failsafe_duration_ms / NSEC_PER_MSEC;
	5475
	5476	absolutetime_to_nanoseconds(thread_usage, &thread_usage_ms);
	5477	thread_usage_ms = thread_usage_ms / NSEC_PER_MSEC;
	5478
	5479	printf("recommended core failsafe kicked in for %lld ms "
	5480	"likely due to %s[%d] thread 0x%llx spending "
	5481	"%lld ms on cpu at realtime priority - "
	5482	"new recommendation: 0x%x -> 0x%x\n",
	5483	failsafe_duration_ms, p_name, pid, tid, thread_usage_ms,
	5484	rec_cores_before, rec_cores_after);
	5485	}
	5486	}
	5487
	5488	/*
	5489	* Apply a new recommended cores mask to the processors it affects
	5490	* Runs after considering failsafes and such
	5491	*
	5492	* Iterate over processors and update their ->is_recommended field.
	5493	* If a processor is running, we let it drain out at its next
	5494	* quantum expiration or blocking point. If a processor is idle, there
	5495	* may be more work for it to do, so IPI it.
	5496	*
	5497	* interrupts disabled, sched_recommended_cores_lock is held
	5498	*/
	5499	static void
	5500	sched_update_recommended_cores(uint32_t recommended_cores)
	5501	{
	5502	processor_set_t pset, nset;
	5503	processor_t processor;
	5504	uint64_t needs_exit_idle_mask = 0x0;
	5505
	5506	processor = processor_list;
	5507	pset = processor->processor_set;
	5508
	5509	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	5510	MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) \| DBG_FUNC_START,
	5511	recommended_cores, perfcontrol_failsafe_active, 0, 0, 0);
	5512
	5513	if (__builtin_popcount(recommended_cores) == 0) {
	5514	recommended_cores \|= 0x1U; /* add boot processor or we hang */
	5515	}
	5516
	5517	/* First set recommended cores */
	5518	pset_lock(pset);
	5519	do {
	5520
	5521	nset = processor->processor_set;
	5522	if (nset != pset) {
	5523	pset_unlock(pset);
	5524	pset = nset;
	5525	pset_lock(pset);
	5526	}
	5527
	5528	pset->recommended_bitmask = recommended_cores;
	5529
	5530	if (recommended_cores & (1ULL << processor->cpu_id)) {
	5531	processor->is_recommended = TRUE;
	5532
	5533	if (processor->state == PROCESSOR_IDLE) {
	5534	if (processor->processor_primary == processor) {
	5535	re_queue_head(&pset->idle_queue, &processor->processor_queue);
	5536	} else {
	5537	re_queue_head(&pset->idle_secondary_queue, &processor->processor_queue);
	5538	}
	5539	if (processor != current_processor()) {
	5540	needs_exit_idle_mask \|= (1ULL << processor->cpu_id);
	5541	}
	5542	}
	5543	}
	5544	} while ((processor = processor->processor_list) != NULL);
	5545	pset_unlock(pset);
	5546
	5547	/* Now shutdown not recommended cores */
	5548	processor = processor_list;
	5549	pset = processor->processor_set;
	5550
	5551	pset_lock(pset);
	5552	do {
	5553
	5554	nset = processor->processor_set;
	5555	if (nset != pset) {
	5556	pset_unlock(pset);
	5557	pset = nset;
	5558	pset_lock(pset);
	5559	}
	5560
	5561	if (!(recommended_cores & (1ULL << processor->cpu_id))) {
	5562	processor->is_recommended = FALSE;
	5563	if (processor->state == PROCESSOR_IDLE) {
	5564	re_queue_head(&pset->unused_queue, &processor->processor_queue);
	5565	}
	5566	SCHED(processor_queue_shutdown)(processor);
	5567	/* pset unlocked */
	5568
	5569	SCHED(rt_queue_shutdown)(processor);
	5570
	5571	pset_lock(pset);
	5572	}
	5573	} while ((processor = processor->processor_list) != NULL);
	5574	pset_unlock(pset);
	5575
	5576	/* Issue all pending IPIs now that the pset lock has been dropped */
	5577	for (int cpuid = lsb_first(needs_exit_idle_mask); cpuid >= 0; cpuid = lsb_next(needs_exit_idle_mask, cpuid)) {
	5578	processor = processor_array[cpuid];
	5579	machine_signal_idle(processor);
	5580	}
	5581
	5582	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	5583	MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) \| DBG_FUNC_END,
	5584	needs_exit_idle_mask, 0, 0, 0, 0);
	5585	}
	5586	#endif /* __arm__ \|\| __arm64__ */
	5587
	5588	void thread_set_options(uint32_t thopt) {
	5589	spl_t x;
	5590	thread_t t = current_thread();
	5591
	5592	x = splsched();
	5593	thread_lock(t);
	5594
	5595	t->options \|= thopt;
	5596
	5597	thread_unlock(t);
	5598	splx(x);
	5599	}
	5600
	5601	void thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint) {
	5602	thread->pending_block_hint = block_hint;
	5603	}
	5604
	5605	uint32_t qos_max_parallelism(int qos, uint64_t options)
	5606	{
	5607	return SCHED(qos_max_parallelism)(qos, options);
	5608	}
	5609
	5610	uint32_t sched_qos_max_parallelism(__unused int qos, uint64_t options)
	5611	{
	5612	host_basic_info_data_t hinfo;
	5613	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
	5614	/* Query the machine layer for core information */
	5615	__assert_only kern_return_t kret = host_info(host_self(), HOST_BASIC_INFO,
	5616	(host_info_t)&hinfo, &count);
	5617	assert(kret == KERN_SUCCESS);
	5618
	5619	/* We would not want multiple realtime threads running on the
	5620	* same physical core; even for SMT capable machines.
	5621	*/
	5622	if (options & QOS_PARALLELISM_REALTIME) {
	5623	return hinfo.physical_cpu;
	5624	}
	5625
	5626	if (options & QOS_PARALLELISM_COUNT_LOGICAL) {
	5627	return hinfo.logical_cpu;
	5628	} else {
	5629	return hinfo.physical_cpu;
	5630	}
	5631	}
	5632
	5633	#if __arm64__
	5634
	5635	/*
	5636	* Set up or replace old timer with new timer
	5637	*
	5638	* Returns true if canceled old timer, false if it did not
	5639	*/
	5640	boolean_t
	5641	sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
	5642	{
	5643	/*
	5644	* Exchange deadline for new deadline, if old deadline was nonzero,
	5645	* then I cancelled the callback, otherwise I didn't
	5646	*/
	5647
	5648	uint64_t old_deadline = __c11_atomic_load(&sched_perfcontrol_callback_deadline,
	5649	memory_order_relaxed);
	5650
	5651
	5652	while (!__c11_atomic_compare_exchange_weak(&sched_perfcontrol_callback_deadline,
	5653	&old_deadline, new_deadline,
	5654	memory_order_relaxed, memory_order_relaxed));
	5655
	5656
	5657	/* now old_deadline contains previous value, which might not be the same if it raced */
	5658
	5659	return (old_deadline != 0) ? TRUE : FALSE;
	5660	}
	5661
	5662	#endif /* __arm64__ */
	5663
	5664	int
	5665	sched_get_pset_load_average(processor_set_t pset)
	5666	{
	5667	return pset->load_average >> (PSET_LOAD_NUMERATOR_SHIFT - PSET_LOAD_FRACTIONAL_SHIFT);
	5668	}
	5669
	5670	void
	5671	sched_update_pset_load_average(processor_set_t pset)
	5672	{
	5673	#if DEBUG
	5674	queue_entry_t iter;
	5675	int count = 0;
	5676	qe_foreach(iter, &pset->active_queue) {
	5677	count++;
	5678	}
	5679	assertf(count == pset->active_processor_count, "count %d pset->active_processor_count %d\n", count, pset->active_processor_count);
	5680	#endif
	5681
	5682	int load = ((pset->active_processor_count + pset->pset_runq.count + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
	5683	int new_load_average = (pset->load_average + load) >> 1;
	5684
	5685	pset->load_average = new_load_average;
	5686
	5687	#if (DEVELOPMENT \|\| DEBUG)
	5688	#endif
	5689	}
	5690
	5691	/* pset is locked */
	5692	static processor_t
	5693	choose_processor_for_realtime_thread(processor_set_t pset)
	5694	{
	5695	uint64_t cpu_map = (pset->cpu_bitmask & pset->recommended_bitmask & ~pset->pending_AST_cpu_mask);
	5696
	5697	for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
	5698	processor_t processor = processor_array[cpuid];
	5699
	5700	if (processor->processor_primary != processor) {
	5701	continue;
	5702	}
	5703
	5704	if (processor->state == PROCESSOR_IDLE) {
	5705	return processor;
	5706	}
	5707
	5708	if ((processor->state != PROCESSOR_RUNNING) && (processor->state != PROCESSOR_DISPATCHING)) {
	5709	continue;
	5710	}
	5711
	5712	if (processor->current_pri >= BASEPRI_RTQUEUES) {
	5713	continue;
	5714	}
	5715
	5716	return processor;
	5717
	5718	}
	5719
	5720	if (!sched_allow_rt_smt) {
	5721	return PROCESSOR_NULL;
	5722	}
	5723
	5724	/* Consider secondary processors */
	5725	for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
	5726	processor_t processor = processor_array[cpuid];
	5727
	5728	if (processor->processor_primary == processor) {
	5729	continue;
	5730	}
	5731
	5732	if (processor->state == PROCESSOR_IDLE) {
	5733	return processor;
	5734	}
	5735
	5736	if ((processor->state != PROCESSOR_RUNNING) && (processor->state != PROCESSOR_DISPATCHING)) {
	5737	continue;
	5738	}
	5739
	5740	if (processor->current_pri >= BASEPRI_RTQUEUES) {
	5741	continue;
	5742	}
	5743
	5744	return processor;
	5745
	5746	}
	5747
	5748	return PROCESSOR_NULL;
	5749	}
	5750
	5751	/* pset is locked */
	5752	static bool
	5753	all_available_primaries_are_running_realtime_threads(processor_set_t pset)
	5754	{
	5755	uint64_t cpu_map = (pset->cpu_bitmask & pset->recommended_bitmask);
	5756
	5757	for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
	5758	processor_t processor = processor_array[cpuid];
	5759
	5760	if (processor->processor_primary != processor) {
	5761	continue;
	5762	}
	5763
	5764	if ((processor->state != PROCESSOR_RUNNING) && (processor->state != PROCESSOR_DISPATCHING)) {
	5765	continue;
	5766	}
	5767
	5768	if (processor->current_pri < BASEPRI_RTQUEUES) {
	5769	return false;
	5770	}
	5771	}
	5772
	5773	return true;
	5774	}