git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2012 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	/*
	29	* @OSF_FREE_COPYRIGHT@
	30	*/
	31	/*
	32	* Mach Operating System
	33	* Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
	34	* All Rights Reserved.
	35	*
	36	* Permission to use, copy, modify and distribute this software and its
	37	* documentation is hereby granted, provided that both the copyright
	38	* notice and this permission notice appear in all copies of the
	39	* software, derivative works or modified versions, and any portions
	40	* thereof, and that both notices appear in supporting documentation.
	41	*
	42	* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
	43	* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
	44	* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
	45	*
	46	* Carnegie Mellon requests users of this software to return to
	47	*
	48	* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
	49	* School of Computer Science
	50	* Carnegie Mellon University
	51	* Pittsburgh PA 15213-3890
	52	*
	53	* any improvements or extensions that they make and grant Carnegie Mellon
	54	* the rights to redistribute these changes.
	55	*/
	56	/*
	57	*/
	58	/*
	59	* File: sched_prim.c
	60	* Author: Avadis Tevanian, Jr.
	61	* Date: 1986
	62	*
	63	* Scheduling primitives
	64	*
	65	*/
	66
	67	#include <debug.h>
	68
	69	#include <mach/mach_types.h>
	70	#include <mach/machine.h>
	71	#include <mach/policy.h>
	72	#include <mach/sync_policy.h>
	73	#include <mach/thread_act.h>
	74
	75	#include <machine/machine_routines.h>
	76	#include <machine/sched_param.h>
	77	#include <machine/machine_cpu.h>
	78	#include <machine/machlimits.h>
	79
	80	#ifdef CONFIG_MACH_APPROXIMATE_TIME
	81	#include <machine/commpage.h>
	82	#endif
	83
	84	#include <kern/kern_types.h>
	85	#include <kern/clock.h>
	86	#include <kern/counters.h>
	87	#include <kern/cpu_number.h>
	88	#include <kern/cpu_data.h>
	89	#include <kern/smp.h>
	90	#include <kern/debug.h>
	91	#include <kern/macro_help.h>
	92	#include <kern/machine.h>
	93	#include <kern/misc_protos.h>
	94	#include <kern/processor.h>
	95	#include <kern/queue.h>
	96	#include <kern/sched.h>
	97	#include <kern/sched_prim.h>
	98	#include <kern/sfi.h>
	99	#include <kern/syscall_subr.h>
	100	#include <kern/task.h>
	101	#include <kern/thread.h>
	102	#include <kern/ledger.h>
	103	#include <kern/timer_queue.h>
	104	#include <kern/waitq.h>
	105
	106	#include <vm/pmap.h>
	107	#include <vm/vm_kern.h>
	108	#include <vm/vm_map.h>
	109
	110	#include <mach/sdt.h>
	111
	112	#include <sys/kdebug.h>
	113
	114	#include <kern/pms.h>
	115
	116	#if defined(CONFIG_TELEMETRY) && defined(CONFIG_SCHED_TIMESHARE_CORE)
	117	#include <kern/telemetry.h>
	118	#endif
	119
	120	struct rt_queue rt_runq;
	121
	122	uintptr_t sched_thread_on_rt_queue = (uintptr_t)0xDEAFBEE0;
	123
	124	/* Lock RT runq, must be done with interrupts disabled (under splsched()) */
	125	#if __SMP__
	126	decl_simple_lock_data(static,rt_lock);
	127	#define rt_lock_init() simple_lock_init(&rt_lock, 0)
	128	#define rt_lock_lock() simple_lock(&rt_lock)
	129	#define rt_lock_unlock() simple_unlock(&rt_lock)
	130	#else
	131	#define rt_lock_init() do { } while(0)
	132	#define rt_lock_lock() do { } while(0)
	133	#define rt_lock_unlock() do { } while(0)
	134	#endif
	135
	136	#define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */
	137	int default_preemption_rate = DEFAULT_PREEMPTION_RATE;
	138
	139	#define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */
	140	int default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
	141
	142	#define MAX_UNSAFE_QUANTA 800
	143	int max_unsafe_quanta = MAX_UNSAFE_QUANTA;
	144
	145	#define MAX_POLL_QUANTA 2
	146	int max_poll_quanta = MAX_POLL_QUANTA;
	147
	148	#define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */
	149	int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
	150
	151	uint64_t max_poll_computation;
	152
	153	uint64_t max_unsafe_computation;
	154	uint64_t sched_safe_duration;
	155
	156	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	157
	158	uint32_t std_quantum;
	159	uint32_t min_std_quantum;
	160	uint32_t bg_quantum;
	161
	162	uint32_t std_quantum_us;
	163	uint32_t bg_quantum_us;
	164
	165	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	166
	167	uint32_t thread_depress_time;
	168	uint32_t default_timeshare_computation;
	169	uint32_t default_timeshare_constraint;
	170
	171	uint32_t max_rt_quantum;
	172	uint32_t min_rt_quantum;
	173
	174	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	175
	176	unsigned sched_tick;
	177	uint32_t sched_tick_interval;
	178	#if defined(CONFIG_TELEMETRY)
	179	uint32_t sched_telemetry_interval;
	180	#endif /* CONFIG_TELEMETRY */
	181
	182	uint32_t sched_pri_shift = INT8_MAX;
	183	uint32_t sched_background_pri_shift = INT8_MAX;
	184	uint32_t sched_combined_fgbg_pri_shift = INT8_MAX;
	185	uint32_t sched_fixed_shift;
	186	uint32_t sched_use_combined_fgbg_decay = 0;
	187
	188	uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */
	189
	190	/* Allow foreground to decay past default to resolve inversions */
	191	#define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
	192	int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
	193
	194	/* Defaults for timer deadline profiling */
	195	#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
	196	* 2ms */
	197	#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
	198	<= 5ms */
	199
	200	uint64_t timer_deadline_tracking_bin_1;
	201	uint64_t timer_deadline_tracking_bin_2;
	202
	203	thread_t sched_maintenance_thread;
	204
	205	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	206
	207	uint64_t sched_one_second_interval;
	208
	209	uint32_t sched_run_count, sched_share_count, sched_background_count;
	210	uint32_t sched_load_average, sched_mach_factor;
	211
	212	/* Forwards */
	213
	214	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	215
	216	static void load_shift_init(void);
	217	static void preempt_pri_init(void);
	218
	219	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	220
	221	static thread_t thread_select(
	222	thread_t thread,
	223	processor_t processor,
	224	ast_t reason);
	225
	226	#if CONFIG_SCHED_IDLE_IN_PLACE
	227	static thread_t thread_select_idle(
	228	thread_t thread,
	229	processor_t processor);
	230	#endif
	231
	232	thread_t processor_idle(
	233	thread_t thread,
	234	processor_t processor);
	235
	236	ast_t
	237	csw_check_locked( processor_t processor,
	238	processor_set_t pset,
	239	ast_t check_reason);
	240
	241	static void processor_setrun(
	242	processor_t processor,
	243	thread_t thread,
	244	integer_t options);
	245
	246	static void
	247	sched_realtime_init(void);
	248
	249	static void
	250	sched_realtime_timebase_init(void);
	251
	252	static void
	253	sched_timer_deadline_tracking_init(void);
	254
	255	#if DEBUG
	256	extern int debug_task;
	257	#define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
	258	#else
	259	#define TLOG(a, fmt, args...) do {} while (0)
	260	#endif
	261
	262	static processor_t
	263	thread_bind_internal(
	264	thread_t thread,
	265	processor_t processor);
	266
	267	static void
	268	sched_vm_group_maintenance(void);
	269
	270	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	271	int8_t sched_load_shifts[NRQS];
	272	int sched_preempt_pri[NRQBM];
	273	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	274
	275	const struct sched_dispatch_table *sched_current_dispatch = NULL;
	276
	277	/*
	278	* Statically allocate a buffer to hold the longest possible
	279	* scheduler description string, as currently implemented.
	280	* bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
	281	* to export to userspace via sysctl(3). If either version
	282	* changes, update the other.
	283	*
	284	* Note that in addition to being an upper bound on the strings
	285	* in the kernel, it's also an exact parameter to PE_get_default(),
	286	* which interrogates the device tree on some platforms. That
	287	* API requires the caller know the exact size of the device tree
	288	* property, so we need both a legacy size (32) and the current size
	289	* (48) to deal with old and new device trees. The device tree property
	290	* is similarly padded to a fixed size so that the same kernel image
	291	* can run on multiple devices with different schedulers configured
	292	* in the device tree.
	293	*/
	294	char sched_string[SCHED_STRING_MAX_LENGTH];
	295
	296	uint32_t sched_debug_flags;
	297
	298	/* Global flag which indicates whether Background Stepper Context is enabled */
	299	static int cpu_throttle_enabled = 1;
	300
	301	void
	302	sched_init(void)
	303	{
	304	char sched_arg[SCHED_STRING_MAX_LENGTH] = { '\0' };
	305
	306	/* Check for runtime selection of the scheduler algorithm */
	307	if (!PE_parse_boot_argn("sched", sched_arg, sizeof (sched_arg))) {
	308	/* If no boot-args override, look in device tree */
	309	if (!PE_get_default("kern.sched", sched_arg,
	310	SCHED_STRING_MAX_LENGTH)) {
	311	sched_arg[0] = '\0';
	312	}
	313	}
	314
	315
	316	if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
	317	/* No boot-args, check in device tree */
	318	if (!PE_get_default("kern.sched_pri_decay_limit",
	319	&sched_pri_decay_band_limit,
	320	sizeof(sched_pri_decay_band_limit))) {
	321	/* Allow decay all the way to normal limits */
	322	sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
	323	}
	324	}
	325
	326	kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);
	327
	328	if (strlen(sched_arg) > 0) {
	329	if (0) {
	330	/* Allow pattern below */
	331	#if defined(CONFIG_SCHED_TRADITIONAL)
	332	} else if (0 == strcmp(sched_arg, sched_traditional_dispatch.sched_name)) {
	333	sched_current_dispatch = &sched_traditional_dispatch;
	334	} else if (0 == strcmp(sched_arg, sched_traditional_with_pset_runqueue_dispatch.sched_name)) {
	335	sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
	336	#endif
	337	#if defined(CONFIG_SCHED_PROTO)
	338	} else if (0 == strcmp(sched_arg, sched_proto_dispatch.sched_name)) {
	339	sched_current_dispatch = &sched_proto_dispatch;
	340	#endif
	341	#if defined(CONFIG_SCHED_GRRR)
	342	} else if (0 == strcmp(sched_arg, sched_grrr_dispatch.sched_name)) {
	343	sched_current_dispatch = &sched_grrr_dispatch;
	344	#endif
	345	#if defined(CONFIG_SCHED_MULTIQ)
	346	} else if (0 == strcmp(sched_arg, sched_multiq_dispatch.sched_name)) {
	347	sched_current_dispatch = &sched_multiq_dispatch;
	348	} else if (0 == strcmp(sched_arg, sched_dualq_dispatch.sched_name)) {
	349	sched_current_dispatch = &sched_dualq_dispatch;
	350	#endif
	351	} else {
	352	#if defined(CONFIG_SCHED_TRADITIONAL)
	353	printf("Unrecognized scheduler algorithm: %s\n", sched_arg);
	354	printf("Scheduler: Using instead: %s\n", sched_traditional_with_pset_runqueue_dispatch.sched_name);
	355	sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
	356	#else
	357	panic("Unrecognized scheduler algorithm: %s", sched_arg);
	358	#endif
	359	}
	360	kprintf("Scheduler: Runtime selection of %s\n", SCHED(sched_name));
	361	} else {
	362	#if defined(CONFIG_SCHED_MULTIQ)
	363	sched_current_dispatch = &sched_multiq_dispatch;
	364	#elif defined(CONFIG_SCHED_TRADITIONAL)
	365	sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
	366	#elif defined(CONFIG_SCHED_PROTO)
	367	sched_current_dispatch = &sched_proto_dispatch;
	368	#elif defined(CONFIG_SCHED_GRRR)
	369	sched_current_dispatch = &sched_grrr_dispatch;
	370	#else
	371	#error No default scheduler implementation
	372	#endif
	373	kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
	374	}
	375
	376	strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));
	377
	378	if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
	379	kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
	380	}
	381
	382	SCHED(init)();
	383	sched_realtime_init();
	384	ast_init();
	385	sched_timer_deadline_tracking_init();
	386
	387	SCHED(pset_init)(&pset0);
	388	SCHED(processor_init)(master_processor);
	389	}
	390
	391	void
	392	sched_timebase_init(void)
	393	{
	394	uint64_t abstime;
	395
	396	clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
	397	sched_one_second_interval = abstime;
	398
	399	SCHED(timebase_init)();
	400	sched_realtime_timebase_init();
	401	}
	402
	403	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	404
	405	void
	406	sched_timeshare_init(void)
	407	{
	408	/*
	409	* Calculate the timeslicing quantum
	410	* in us.
	411	*/
	412	if (default_preemption_rate < 1)
	413	default_preemption_rate = DEFAULT_PREEMPTION_RATE;
	414	std_quantum_us = (1000 * 1000) / default_preemption_rate;
	415
	416	printf("standard timeslicing quantum is %d us\n", std_quantum_us);
	417
	418	if (default_bg_preemption_rate < 1)
	419	default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
	420	bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
	421
	422	printf("standard background quantum is %d us\n", bg_quantum_us);
	423
	424	load_shift_init();
	425	preempt_pri_init();
	426	sched_tick = 0;
	427	}
	428
	429	void
	430	sched_timeshare_timebase_init(void)
	431	{
	432	uint64_t abstime;
	433	uint32_t shift;
	434
	435	/* standard timeslicing quantum */
	436	clock_interval_to_absolutetime_interval(
	437	std_quantum_us, NSEC_PER_USEC, &abstime);
	438	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	439	std_quantum = (uint32_t)abstime;
	440
	441	/* smallest remaining quantum (250 us) */
	442	clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
	443	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	444	min_std_quantum = (uint32_t)abstime;
	445
	446	/* quantum for background tasks */
	447	clock_interval_to_absolutetime_interval(
	448	bg_quantum_us, NSEC_PER_USEC, &abstime);
	449	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	450	bg_quantum = (uint32_t)abstime;
	451
	452	/* scheduler tick interval */
	453	clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
	454	NSEC_PER_USEC, &abstime);
	455	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	456	sched_tick_interval = (uint32_t)abstime;
	457
	458	/*
	459	* Compute conversion factor from usage to
	460	* timesharing priorities with 5/8 ** n aging.
	461	*/
	462	abstime = (abstime * 5) / 3;
	463	for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift)
	464	abstime >>= 1;
	465	sched_fixed_shift = shift;
	466
	467	max_unsafe_computation = ((uint64_t)max_unsafe_quanta) * std_quantum;
	468	sched_safe_duration = 2 * ((uint64_t)max_unsafe_quanta) * std_quantum;
	469
	470	max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
	471	thread_depress_time = 1 * std_quantum;
	472	default_timeshare_computation = std_quantum / 2;
	473	default_timeshare_constraint = std_quantum;
	474
	475	#if defined(CONFIG_TELEMETRY)
	476	/* interval for high frequency telemetry */
	477	clock_interval_to_absolutetime_interval(10, NSEC_PER_MSEC, &abstime);
	478	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	479	sched_telemetry_interval = (uint32_t)abstime;
	480	#endif
	481	}
	482
	483	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	484
	485	static void
	486	sched_realtime_init(void)
	487	{
	488	rt_lock_init();
	489
	490	rt_runq.count = 0;
	491	queue_init(&rt_runq.queue);
	492	}
	493
	494	static void
	495	sched_realtime_timebase_init(void)
	496	{
	497	uint64_t abstime;
	498
	499	/* smallest rt computaton (50 us) */
	500	clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
	501	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	502	min_rt_quantum = (uint32_t)abstime;
	503
	504	/* maximum rt computation (50 ms) */
	505	clock_interval_to_absolutetime_interval(
	506	50, 1000*NSEC_PER_USEC, &abstime);
	507	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	508	max_rt_quantum = (uint32_t)abstime;
	509
	510	}
	511
	512	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	513
	514	/*
	515	* Set up values for timeshare
	516	* loading factors.
	517	*/
	518	static void
	519	load_shift_init(void)
	520	{
	521	int8_t k, *p = sched_load_shifts;
	522	uint32_t i, j;
	523
	524	uint32_t sched_decay_penalty = 1;
	525
	526	if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof (sched_decay_penalty))) {
	527	kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
	528	}
	529
	530	if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof (sched_decay_usage_age_factor))) {
	531	kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
	532	}
	533
	534	if (PE_parse_boot_argn("sched_use_combined_fgbg_decay", &sched_use_combined_fgbg_decay, sizeof (sched_use_combined_fgbg_decay))) {
	535	kprintf("Overriding schedule fg/bg decay calculation: %u\n", sched_use_combined_fgbg_decay);
	536	}
	537
	538	if (sched_decay_penalty == 0) {
	539	/*
	540	* There is no penalty for timeshare threads for using too much
	541	* CPU, so set all load shifts to INT8_MIN. Even under high load,
	542	* sched_pri_shift will be >INT8_MAX, and there will be no
	543	* penalty applied to threads (nor will sched_usage be updated per
	544	* thread).
	545	*/
	546	for (i = 0; i < NRQS; i++) {
	547	sched_load_shifts[i] = INT8_MIN;
	548	}
	549
	550	return;
	551	}
	552
	553	p++ = INT8_MIN; p++ = 0;
	554
	555	/*
	556	* For a given system load "i", the per-thread priority
	557	* penalty per quantum of CPU usage is ~2^k priority
	558	* levels. "sched_decay_penalty" can cause more
	559	* array entries to be filled with smaller "k" values
	560	*/
	561	for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
	562	for (j <<= 1; (i < j) && (i < NRQS); ++i)
	563	*p++ = k;
	564	}
	565	}
	566
	567	static void
	568	preempt_pri_init(void)
	569	{
	570	int i, *p = sched_preempt_pri;
	571
	572	for (i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i)
	573	setbit(i, p);
	574
	575	for (i = BASEPRI_PREEMPT; i <= MAXPRI; ++i)
	576	setbit(i, p);
	577	}
	578
	579	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	580
	581	/*
	582	* Thread wait timer expiration.
	583	*/
	584	void
	585	thread_timer_expire(
	586	void *p0,
	587	__unused void *p1)
	588	{
	589	thread_t thread = p0;
	590	spl_t s;
	591
	592	s = splsched();
	593	thread_lock(thread);
	594	if (--thread->wait_timer_active == 0) {
	595	if (thread->wait_timer_is_set) {
	596	thread->wait_timer_is_set = FALSE;
	597	clear_wait_internal(thread, THREAD_TIMED_OUT);
	598	}
	599	}
	600	thread_unlock(thread);
	601	splx(s);
	602	}
	603
	604	/*
	605	* thread_unblock:
	606	*
	607	* Unblock thread on wake up.
	608	*
	609	* Returns TRUE if the thread should now be placed on the runqueue.
	610	*
	611	* Thread must be locked.
	612	*
	613	* Called at splsched().
	614	*/
	615	boolean_t
	616	thread_unblock(
	617	thread_t thread,
	618	wait_result_t wresult)
	619	{
	620	boolean_t ready_for_runq = FALSE;
	621	thread_t cthread = current_thread();
	622	uint32_t new_run_count;
	623
	624	/*
	625	* Set wait_result.
	626	*/
	627	thread->wait_result = wresult;
	628
	629	/*
	630	* Cancel pending wait timer.
	631	*/
	632	if (thread->wait_timer_is_set) {
	633	if (timer_call_cancel(&thread->wait_timer))
	634	thread->wait_timer_active--;
	635	thread->wait_timer_is_set = FALSE;
	636	}
	637
	638	/*
	639	* Update scheduling state: not waiting,
	640	* set running.
	641	*/
	642	thread->state &= ~(TH_WAIT\|TH_UNINT);
	643
	644	if (!(thread->state & TH_RUN)) {
	645	thread->state \|= TH_RUN;
	646	thread->last_made_runnable_time = mach_approximate_time();
	647
	648	ready_for_runq = TRUE;
	649
	650	(*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
	651
	652	/*
	653	* Update run counts.
	654	*/
	655	new_run_count = sched_run_incr(thread);
	656	if (thread->sched_mode == TH_MODE_TIMESHARE) {
	657	sched_share_incr(thread);
	658
	659	if (thread->sched_flags & TH_SFLAG_THROTTLED)
	660	sched_background_incr(thread);
	661	}
	662	} else {
	663	/*
	664	* Signal if idling on another processor.
	665	*/
	666	#if CONFIG_SCHED_IDLE_IN_PLACE
	667	if (thread->state & TH_IDLE) {
	668	processor_t processor = thread->last_processor;
	669
	670	if (processor != current_processor())
	671	machine_signal_idle(processor);
	672	}
	673	#else
	674	assert((thread->state & TH_IDLE) == 0);
	675	#endif
	676
	677	new_run_count = sched_run_count; /* updated in thread_select_idle() */
	678	}
	679
	680
	681	/*
	682	* Calculate deadline for real-time threads.
	683	*/
	684	if (thread->sched_mode == TH_MODE_REALTIME) {
	685	uint64_t ctime;
	686
	687	ctime = mach_absolute_time();
	688	thread->realtime.deadline = thread->realtime.constraint + ctime;
	689	}
	690
	691	/*
	692	* Clear old quantum, fail-safe computation, etc.
	693	*/
	694	thread->quantum_remaining = 0;
	695	thread->computation_metered = 0;
	696	thread->reason = AST_NONE;
	697
	698	/* Obtain power-relevant interrupt and "platform-idle exit" statistics.
	699	* We also account for "double hop" thread signaling via
	700	* the thread callout infrastructure.
	701	* DRK: consider removing the callout wakeup counters in the future
	702	* they're present for verification at the moment.
	703	*/
	704	boolean_t aticontext, pidle;
	705	ml_get_power_state(&aticontext, &pidle);
	706
	707	if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
	708	ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
	709	DTRACE_SCHED2(iwakeup, struct thread , thread, struct proc , thread->task->bsd_info);
	710
	711	uint64_t ttd = PROCESSOR_DATA(current_processor(), timer_call_ttd);
	712
	713	if (ttd) {
	714	if (ttd <= timer_deadline_tracking_bin_1)
	715	thread->thread_timer_wakeups_bin_1++;
	716	else
	717	if (ttd <= timer_deadline_tracking_bin_2)
	718	thread->thread_timer_wakeups_bin_2++;
	719	}
	720
	721	if (pidle) {
	722	ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
	723	}
	724
	725	} else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
	726	if (cthread->callout_woken_from_icontext) {
	727	ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
	728	thread->thread_callout_interrupt_wakeups++;
	729	if (cthread->callout_woken_from_platform_idle) {
	730	ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
	731	thread->thread_callout_platform_idle_wakeups++;
	732	}
	733
	734	cthread->callout_woke_thread = TRUE;
	735	}
	736	}
	737
	738	if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
	739	thread->callout_woken_from_icontext = aticontext;
	740	thread->callout_woken_from_platform_idle = pidle;
	741	thread->callout_woke_thread = FALSE;
	742	}
	743
	744	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	745	MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) \| DBG_FUNC_NONE,
	746	(uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result, new_run_count, 0);
	747
	748	DTRACE_SCHED2(wakeup, struct thread , thread, struct proc , thread->task->bsd_info);
	749
	750	return (ready_for_runq);
	751	}
	752
	753	/*
	754	* Routine: thread_go
	755	* Purpose:
	756	* Unblock and dispatch thread.
	757	* Conditions:
	758	* thread lock held, IPC locks may be held.
	759	* thread must have been pulled from wait queue under same lock hold.
	760	* thread must have been waiting
	761	* Returns:
	762	* KERN_SUCCESS - Thread was set running
	763	*
	764	* TODO: This should return void
	765	*/
	766	kern_return_t
	767	thread_go(
	768	thread_t thread,
	769	wait_result_t wresult)
	770	{
	771	assert(thread->at_safe_point == FALSE);
	772	assert(thread->wait_event == NO_EVENT64);
	773	assert(thread->waitq == NULL);
	774
	775	assert(!(thread->state & (TH_TERMINATE\|TH_TERMINATE2)));
	776	assert(thread->state & TH_WAIT);
	777
	778
	779	if (thread_unblock(thread, wresult))
	780	thread_setrun(thread, SCHED_PREEMPT \| SCHED_TAILQ);
	781
	782	return (KERN_SUCCESS);
	783	}
	784
	785	/*
	786	* Routine: thread_mark_wait_locked
	787	* Purpose:
	788	* Mark a thread as waiting. If, given the circumstances,
	789	* it doesn't want to wait (i.e. already aborted), then
	790	* indicate that in the return value.
	791	* Conditions:
	792	* at splsched() and thread is locked.
	793	*/
	794	__private_extern__
	795	wait_result_t
	796	thread_mark_wait_locked(
	797	thread_t thread,
	798	wait_interrupt_t interruptible)
	799	{
	800	boolean_t at_safe_point;
	801
	802	assert(thread == current_thread());
	803	assert(!(thread->state & (TH_WAIT\|TH_IDLE\|TH_UNINT\|TH_TERMINATE2)));
	804
	805	/*
	806	* The thread may have certain types of interrupts/aborts masked
	807	* off. Even if the wait location says these types of interrupts
	808	* are OK, we have to honor mask settings (outer-scoped code may
	809	* not be able to handle aborts at the moment).
	810	*/
	811	if (interruptible > (thread->options & TH_OPT_INTMASK))
	812	interruptible = thread->options & TH_OPT_INTMASK;
	813
	814	at_safe_point = (interruptible == THREAD_ABORTSAFE);
	815
	816	if ( interruptible == THREAD_UNINT \|\|
	817	!(thread->sched_flags & TH_SFLAG_ABORT) \|\|
	818	(!at_safe_point &&
	819	(thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
	820
	821	if ( !(thread->state & TH_TERMINATE))
	822	DTRACE_SCHED(sleep);
	823
	824	thread->state \|= (interruptible) ? TH_WAIT : (TH_WAIT \| TH_UNINT);
	825	thread->at_safe_point = at_safe_point;
	826	return (thread->wait_result = THREAD_WAITING);
	827	}
	828	else
	829	if (thread->sched_flags & TH_SFLAG_ABORTSAFELY)
	830	thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
	831
	832	return (thread->wait_result = THREAD_INTERRUPTED);
	833	}
	834
	835	/*
	836	* Routine: thread_interrupt_level
	837	* Purpose:
	838	* Set the maximum interruptible state for the
	839	* current thread. The effective value of any
	840	* interruptible flag passed into assert_wait
	841	* will never exceed this.
	842	*
	843	* Useful for code that must not be interrupted,
	844	* but which calls code that doesn't know that.
	845	* Returns:
	846	* The old interrupt level for the thread.
	847	*/
	848	__private_extern__
	849	wait_interrupt_t
	850	thread_interrupt_level(
	851	wait_interrupt_t new_level)
	852	{
	853	thread_t thread = current_thread();
	854	wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
	855
	856	thread->options = (thread->options & ~TH_OPT_INTMASK) \| (new_level & TH_OPT_INTMASK);
	857
	858	return result;
	859	}
	860
	861	/*
	862	* Check to see if an assert wait is possible, without actually doing one.
	863	* This is used by debug code in locks and elsewhere to verify that it is
	864	* always OK to block when trying to take a blocking lock (since waiting
	865	* for the actual assert_wait to catch the case may make it hard to detect
	866	* this case.
	867	*/
	868	boolean_t
	869	assert_wait_possible(void)
	870	{
	871
	872	thread_t thread;
	873
	874	#if DEBUG
	875	if(debug_mode) return TRUE; /* Always succeed in debug mode */
	876	#endif
	877
	878	thread = current_thread();
	879
	880	return (thread == NULL \|\| waitq_wait_possible(thread));
	881	}
	882
	883	/*
	884	* assert_wait:
	885	*
	886	* Assert that the current thread is about to go to
	887	* sleep until the specified event occurs.
	888	*/
	889	wait_result_t
	890	assert_wait(
	891	event_t event,
	892	wait_interrupt_t interruptible)
	893	{
	894	if (__improbable(event == NO_EVENT))
	895	panic("%s() called with NO_EVENT", __func__);
	896
	897	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	898	MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)\|DBG_FUNC_NONE,
	899	VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
	900
	901	struct waitq *waitq;
	902	waitq = global_eventq(event);
	903	return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
	904	}
	905
	906	wait_result_t
	907	assert_wait_timeout(
	908	event_t event,
	909	wait_interrupt_t interruptible,
	910	uint32_t interval,
	911	uint32_t scale_factor)
	912	{
	913	thread_t thread = current_thread();
	914	wait_result_t wresult;
	915	uint64_t deadline;
	916	spl_t s;
	917
	918	if (__improbable(event == NO_EVENT))
	919	panic("%s() called with NO_EVENT", __func__);
	920
	921	struct waitq *waitq;
	922	waitq = global_eventq(event);
	923
	924	s = splsched();
	925	waitq_lock(waitq);
	926	thread_lock(thread);
	927
	928	clock_interval_to_deadline(interval, scale_factor, &deadline);
	929
	930	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	931	MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)\|DBG_FUNC_NONE,
	932	VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
	933
	934	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	935	interruptible,
	936	TIMEOUT_URGENCY_SYS_NORMAL,
	937	deadline, TIMEOUT_NO_LEEWAY,
	938	thread);
	939
	940	thread_unlock(thread);
	941	waitq_unlock(waitq);
	942	splx(s);
	943	return wresult;
	944	}
	945
	946	wait_result_t
	947	assert_wait_timeout_with_leeway(
	948	event_t event,
	949	wait_interrupt_t interruptible,
	950	wait_timeout_urgency_t urgency,
	951	uint32_t interval,
	952	uint32_t leeway,
	953	uint32_t scale_factor)
	954	{
	955	thread_t thread = current_thread();
	956	wait_result_t wresult;
	957	uint64_t deadline;
	958	uint64_t abstime;
	959	uint64_t slop;
	960	uint64_t now;
	961	spl_t s;
	962
	963	if (__improbable(event == NO_EVENT))
	964	panic("%s() called with NO_EVENT", __func__);
	965
	966	now = mach_absolute_time();
	967	clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
	968	deadline = now + abstime;
	969
	970	clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
	971
	972	struct waitq *waitq;
	973	waitq = global_eventq(event);
	974
	975	s = splsched();
	976	waitq_lock(waitq);
	977	thread_lock(thread);
	978
	979	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	980	MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)\|DBG_FUNC_NONE,
	981	VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
	982
	983	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	984	interruptible,
	985	urgency, deadline, slop,
	986	thread);
	987
	988	thread_unlock(thread);
	989	waitq_unlock(waitq);
	990	splx(s);
	991	return wresult;
	992	}
	993
	994	wait_result_t
	995	assert_wait_deadline(
	996	event_t event,
	997	wait_interrupt_t interruptible,
	998	uint64_t deadline)
	999	{
	1000	thread_t thread = current_thread();
	1001	wait_result_t wresult;
	1002	spl_t s;
	1003
	1004	if (__improbable(event == NO_EVENT))
	1005	panic("%s() called with NO_EVENT", __func__);
	1006
	1007	struct waitq *waitq;
	1008	waitq = global_eventq(event);
	1009
	1010	s = splsched();
	1011	waitq_lock(waitq);
	1012	thread_lock(thread);
	1013
	1014	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	1015	MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)\|DBG_FUNC_NONE,
	1016	VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
	1017
	1018	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	1019	interruptible,
	1020	TIMEOUT_URGENCY_SYS_NORMAL, deadline,
	1021	TIMEOUT_NO_LEEWAY, thread);
	1022	thread_unlock(thread);
	1023	waitq_unlock(waitq);
	1024	splx(s);
	1025	return wresult;
	1026	}
	1027
	1028	wait_result_t
	1029	assert_wait_deadline_with_leeway(
	1030	event_t event,
	1031	wait_interrupt_t interruptible,
	1032	wait_timeout_urgency_t urgency,
	1033	uint64_t deadline,
	1034	uint64_t leeway)
	1035	{
	1036	thread_t thread = current_thread();
	1037	wait_result_t wresult;
	1038	spl_t s;
	1039
	1040	if (__improbable(event == NO_EVENT))
	1041	panic("%s() called with NO_EVENT", __func__);
	1042
	1043	struct waitq *waitq;
	1044	waitq = global_eventq(event);
	1045
	1046	s = splsched();
	1047	waitq_lock(waitq);
	1048	thread_lock(thread);
	1049
	1050	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	1051	MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)\|DBG_FUNC_NONE,
	1052	VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
	1053
	1054	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	1055	interruptible,
	1056	urgency, deadline, leeway,
	1057	thread);
	1058
	1059	thread_unlock(thread);
	1060	waitq_unlock(waitq);
	1061	splx(s);
	1062	return wresult;
	1063	}
	1064
	1065	/*
	1066	* thread_isoncpu:
	1067	*
	1068	* Return TRUE if a thread is running on a processor such that an AST
	1069	* is needed to pull it out of userspace execution, or if executing in
	1070	* the kernel, bring to a context switch boundary that would cause
	1071	* thread state to be serialized in the thread PCB.
	1072	*
	1073	* Thread locked, returns the same way. While locked, fields
	1074	* like "state" cannot change. "runq" can change only from set to unset.
	1075	*/
	1076	static inline boolean_t
	1077	thread_isoncpu(thread_t thread)
	1078	{
	1079	/* Not running or runnable */
	1080	if (!(thread->state & TH_RUN))
	1081	return (FALSE);
	1082
	1083	/* Waiting on a runqueue, not currently running */
	1084	/* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
	1085	if (thread->runq != PROCESSOR_NULL)
	1086	return (FALSE);
	1087
	1088	/*
	1089	* Thread does not have a stack yet
	1090	* It could be on the stack alloc queue or preparing to be invoked
	1091	*/
	1092	if (!thread->kernel_stack)
	1093	return (FALSE);
	1094
	1095	/*
	1096	* Thread must be running on a processor, or
	1097	* about to run, or just did run. In all these
	1098	* cases, an AST to the processor is needed
	1099	* to guarantee that the thread is kicked out
	1100	* of userspace and the processor has
	1101	* context switched (and saved register state).
	1102	*/
	1103	return (TRUE);
	1104	}
	1105
	1106	/*
	1107	* thread_stop:
	1108	*
	1109	* Force a preemption point for a thread and wait
	1110	* for it to stop running on a CPU. If a stronger
	1111	* guarantee is requested, wait until no longer
	1112	* runnable. Arbitrates access among
	1113	* multiple stop requests. (released by unstop)
	1114	*
	1115	* The thread must enter a wait state and stop via a
	1116	* separate means.
	1117	*
	1118	* Returns FALSE if interrupted.
	1119	*/
	1120	boolean_t
	1121	thread_stop(
	1122	thread_t thread,
	1123	boolean_t until_not_runnable)
	1124	{
	1125	wait_result_t wresult;
	1126	spl_t s = splsched();
	1127	boolean_t oncpu;
	1128
	1129	wake_lock(thread);
	1130	thread_lock(thread);
	1131
	1132	while (thread->state & TH_SUSP) {
	1133	thread->wake_active = TRUE;
	1134	thread_unlock(thread);
	1135
	1136	wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
	1137	wake_unlock(thread);
	1138	splx(s);
	1139
	1140	if (wresult == THREAD_WAITING)
	1141	wresult = thread_block(THREAD_CONTINUE_NULL);
	1142
	1143	if (wresult != THREAD_AWAKENED)
	1144	return (FALSE);
	1145
	1146	s = splsched();
	1147	wake_lock(thread);
	1148	thread_lock(thread);
	1149	}
	1150
	1151	thread->state \|= TH_SUSP;
	1152
	1153	while ((oncpu = thread_isoncpu(thread)) \|\|
	1154	(until_not_runnable && (thread->state & TH_RUN))) {
	1155	processor_t processor;
	1156
	1157	if (oncpu) {
	1158	assert(thread->state & TH_RUN);
	1159	processor = thread->chosen_processor;
	1160	cause_ast_check(processor);
	1161	}
	1162
	1163	thread->wake_active = TRUE;
	1164	thread_unlock(thread);
	1165
	1166	wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
	1167	wake_unlock(thread);
	1168	splx(s);
	1169
	1170	if (wresult == THREAD_WAITING)
	1171	wresult = thread_block(THREAD_CONTINUE_NULL);
	1172
	1173	if (wresult != THREAD_AWAKENED) {
	1174	thread_unstop(thread);
	1175	return (FALSE);
	1176	}
	1177
	1178	s = splsched();
	1179	wake_lock(thread);
	1180	thread_lock(thread);
	1181	}
	1182
	1183	thread_unlock(thread);
	1184	wake_unlock(thread);
	1185	splx(s);
	1186
	1187	/*
	1188	* We return with the thread unlocked. To prevent it from
	1189	* transitioning to a runnable state (or from TH_RUN to
	1190	* being on the CPU), the caller must ensure the thread
	1191	* is stopped via an external means (such as an AST)
	1192	*/
	1193
	1194	return (TRUE);
	1195	}
	1196
	1197	/*
	1198	* thread_unstop:
	1199	*
	1200	* Release a previous stop request and set
	1201	* the thread running if appropriate.
	1202	*
	1203	* Use only after a successful stop operation.
	1204	*/
	1205	void
	1206	thread_unstop(
	1207	thread_t thread)
	1208	{
	1209	spl_t s = splsched();
	1210
	1211	wake_lock(thread);
	1212	thread_lock(thread);
	1213
	1214	assert((thread->state & (TH_RUN\|TH_WAIT\|TH_SUSP)) != TH_SUSP);
	1215
	1216	if (thread->state & TH_SUSP) {
	1217	thread->state &= ~TH_SUSP;
	1218
	1219	if (thread->wake_active) {
	1220	thread->wake_active = FALSE;
	1221	thread_unlock(thread);
	1222
	1223	thread_wakeup(&thread->wake_active);
	1224	wake_unlock(thread);
	1225	splx(s);
	1226
	1227	return;
	1228	}
	1229	}
	1230
	1231	thread_unlock(thread);
	1232	wake_unlock(thread);
	1233	splx(s);
	1234	}
	1235
	1236	/*
	1237	* thread_wait:
	1238	*
	1239	* Wait for a thread to stop running. (non-interruptible)
	1240	*
	1241	*/
	1242	void
	1243	thread_wait(
	1244	thread_t thread,
	1245	boolean_t until_not_runnable)
	1246	{
	1247	wait_result_t wresult;
	1248	boolean_t oncpu;
	1249	processor_t processor;
	1250	spl_t s = splsched();
	1251
	1252	wake_lock(thread);
	1253	thread_lock(thread);
	1254
	1255	/*
	1256	* Wait until not running on a CPU. If stronger requirement
	1257	* desired, wait until not runnable. Assumption: if thread is
	1258	* on CPU, then TH_RUN is set, so we're not waiting in any case
	1259	* where the original, pure "TH_RUN" check would have let us
	1260	* finish.
	1261	*/
	1262	while ((oncpu = thread_isoncpu(thread)) \|\|
	1263	(until_not_runnable && (thread->state & TH_RUN))) {
	1264
	1265	if (oncpu) {
	1266	assert(thread->state & TH_RUN);
	1267	processor = thread->chosen_processor;
	1268	cause_ast_check(processor);
	1269	}
	1270
	1271	thread->wake_active = TRUE;
	1272	thread_unlock(thread);
	1273
	1274	wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
	1275	wake_unlock(thread);
	1276	splx(s);
	1277
	1278	if (wresult == THREAD_WAITING)
	1279	thread_block(THREAD_CONTINUE_NULL);
	1280
	1281	s = splsched();
	1282	wake_lock(thread);
	1283	thread_lock(thread);
	1284	}
	1285
	1286	thread_unlock(thread);
	1287	wake_unlock(thread);
	1288	splx(s);
	1289	}
	1290
	1291	/*
	1292	* Routine: clear_wait_internal
	1293	*
	1294	* Clear the wait condition for the specified thread.
	1295	* Start the thread executing if that is appropriate.
	1296	* Arguments:
	1297	* thread thread to awaken
	1298	* result Wakeup result the thread should see
	1299	* Conditions:
	1300	* At splsched
	1301	* the thread is locked.
	1302	* Returns:
	1303	* KERN_SUCCESS thread was rousted out a wait
	1304	* KERN_FAILURE thread was waiting but could not be rousted
	1305	* KERN_NOT_WAITING thread was not waiting
	1306	*/
	1307	__private_extern__ kern_return_t
	1308	clear_wait_internal(
	1309	thread_t thread,
	1310	wait_result_t wresult)
	1311	{
	1312	uint32_t i = LockTimeOut;
	1313	struct waitq *waitq = thread->waitq;
	1314
	1315	do {
	1316	if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT))
	1317	return (KERN_FAILURE);
	1318
	1319	if (waitq != NULL) {
	1320	assert(waitq_irq_safe(waitq)); //irqs are already disabled!
	1321	if (waitq_lock_try(waitq)) {
	1322	waitq_pull_thread_locked(waitq, thread);
	1323	waitq_unlock(waitq);
	1324	} else {
	1325	thread_unlock(thread);
	1326	delay(1);
	1327	thread_lock(thread);
	1328	if (waitq != thread->waitq)
	1329	return KERN_NOT_WAITING;
	1330	continue;
	1331	}
	1332	}
	1333
	1334	/* TODO: Can we instead assert TH_TERMINATE is not set? */
	1335	if ((thread->state & (TH_WAIT\|TH_TERMINATE)) == TH_WAIT)
	1336	return (thread_go(thread, wresult));
	1337	else
	1338	return (KERN_NOT_WAITING);
	1339	} while ((--i > 0) \|\| machine_timeout_suspended());
	1340
	1341	panic("clear_wait_internal: deadlock: thread=%p, wq=%p, cpu=%d\n",
	1342	thread, waitq, cpu_number());
	1343
	1344	return (KERN_FAILURE);
	1345	}
	1346
	1347
	1348	/*
	1349	* clear_wait:
	1350	*
	1351	* Clear the wait condition for the specified thread. Start the thread
	1352	* executing if that is appropriate.
	1353	*
	1354	* parameters:
	1355	* thread thread to awaken
	1356	* result Wakeup result the thread should see
	1357	*/
	1358	kern_return_t
	1359	clear_wait(
	1360	thread_t thread,
	1361	wait_result_t result)
	1362	{
	1363	kern_return_t ret;
	1364	spl_t s;
	1365
	1366	s = splsched();
	1367	thread_lock(thread);
	1368	ret = clear_wait_internal(thread, result);
	1369	thread_unlock(thread);
	1370	splx(s);
	1371	return ret;
	1372	}
	1373
	1374
	1375	/*
	1376	* thread_wakeup_prim:
	1377	*
	1378	* Common routine for thread_wakeup, thread_wakeup_with_result,
	1379	* and thread_wakeup_one.
	1380	*
	1381	*/
	1382	kern_return_t
	1383	thread_wakeup_prim(
	1384	event_t event,
	1385	boolean_t one_thread,
	1386	wait_result_t result)
	1387	{
	1388	return (thread_wakeup_prim_internal(event, one_thread, result, -1));
	1389	}
	1390
	1391
	1392	kern_return_t
	1393	thread_wakeup_prim_internal(
	1394	event_t event,
	1395	boolean_t one_thread,
	1396	wait_result_t result,
	1397	int priority)
	1398	{
	1399	if (__improbable(event == NO_EVENT))
	1400	panic("%s() called with NO_EVENT", __func__);
	1401
	1402	struct waitq *wq;
	1403
	1404	wq = global_eventq(event);
	1405	priority = (priority == -1 ? WAITQ_ALL_PRIORITIES : priority);
	1406
	1407	if (one_thread)
	1408	return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, priority);
	1409	else
	1410	return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, priority);
	1411	}
	1412
	1413	/*
	1414	* thread_bind:
	1415	*
	1416	* Force the current thread to execute on the specified processor.
	1417	* Takes effect after the next thread_block().
	1418	*
	1419	* Returns the previous binding. PROCESSOR_NULL means
	1420	* not bound.
	1421	*
	1422	* XXX - DO NOT export this to users - XXX
	1423	*/
	1424	processor_t
	1425	thread_bind(
	1426	processor_t processor)
	1427	{
	1428	thread_t self = current_thread();
	1429	processor_t prev;
	1430	spl_t s;
	1431
	1432	s = splsched();
	1433	thread_lock(self);
	1434
	1435	prev = thread_bind_internal(self, processor);
	1436
	1437	thread_unlock(self);
	1438	splx(s);
	1439
	1440	return (prev);
	1441	}
	1442
	1443	/*
	1444	* thread_bind_internal:
	1445	*
	1446	* If the specified thread is not the current thread, and it is currently
	1447	* running on another CPU, a remote AST must be sent to that CPU to cause
	1448	* the thread to migrate to its bound processor. Otherwise, the migration
	1449	* will occur at the next quantum expiration or blocking point.
	1450	*
	1451	* When the thread is the current thread, and explicit thread_block() should
	1452	* be used to force the current processor to context switch away and
	1453	* let the thread migrate to the bound processor.
	1454	*
	1455	* Thread must be locked, and at splsched.
	1456	*/
	1457
	1458	static processor_t
	1459	thread_bind_internal(
	1460	thread_t thread,
	1461	processor_t processor)
	1462	{
	1463	processor_t prev;
	1464
	1465	/* <rdar://problem/15102234> */
	1466	assert(thread->sched_pri < BASEPRI_RTQUEUES);
	1467	/* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
	1468	assert(thread->runq == PROCESSOR_NULL);
	1469
	1470	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND), thread_tid(thread), processor ? (uintptr_t)processor->cpu_id : (uintptr_t)-1, 0, 0, 0);
	1471
	1472	prev = thread->bound_processor;
	1473	thread->bound_processor = processor;
	1474
	1475	return (prev);
	1476	}
	1477
	1478	/*
	1479	* thread_vm_bind_group_add:
	1480	*
	1481	* The "VM bind group" is a special mechanism to mark a collection
	1482	* of threads from the VM subsystem that, in general, should be scheduled
	1483	* with only one CPU of parallelism. To accomplish this, we initially
	1484	* bind all the threads to the master processor, which has the effect
	1485	* that only one of the threads in the group can execute at once, including
	1486	* preempting threads in the group that are a lower priority. Future
	1487	* mechanisms may use more dynamic mechanisms to prevent the collection
	1488	* of VM threads from using more CPU time than desired.
	1489	*
	1490	* The current implementation can result in priority inversions where
	1491	* compute-bound priority 95 or realtime threads that happen to have
	1492	* landed on the master processor prevent the VM threads from running.
	1493	* When this situation is detected, we unbind the threads for one
	1494	* scheduler tick to allow the scheduler to run the threads an
	1495	* additional CPUs, before restoring the binding (assuming high latency
	1496	* is no longer a problem).
	1497	*/
	1498
	1499	/*
	1500	* The current max is provisioned for:
	1501	* vm_compressor_swap_trigger_thread (92)
	1502	* 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
	1503	* vm_pageout_continue (92)
	1504	* memorystatus_thread (95)
	1505	*/
	1506	#define MAX_VM_BIND_GROUP_COUNT (5)
	1507	decl_simple_lock_data(static,sched_vm_group_list_lock);
	1508	static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
	1509	static int sched_vm_group_thread_count;
	1510	static boolean_t sched_vm_group_temporarily_unbound = FALSE;
	1511
	1512	void
	1513	thread_vm_bind_group_add(void)
	1514	{
	1515	thread_t self = current_thread();
	1516
	1517	thread_reference_internal(self);
	1518	self->options \|= TH_OPT_SCHED_VM_GROUP;
	1519
	1520	simple_lock(&sched_vm_group_list_lock);
	1521	assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
	1522	sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
	1523	simple_unlock(&sched_vm_group_list_lock);
	1524
	1525	thread_bind(master_processor);
	1526
	1527	/* Switch to bound processor if not already there */
	1528	thread_block(THREAD_CONTINUE_NULL);
	1529	}
	1530
	1531	static void
	1532	sched_vm_group_maintenance(void)
	1533	{
	1534	uint64_t ctime = mach_absolute_time();
	1535	uint64_t longtime = ctime - sched_tick_interval;
	1536	int i;
	1537	spl_t s;
	1538	boolean_t high_latency_observed = FALSE;
	1539	boolean_t runnable_and_not_on_runq_observed = FALSE;
	1540	boolean_t bind_target_changed = FALSE;
	1541	processor_t bind_target = PROCESSOR_NULL;
	1542
	1543	/* Make sure nobody attempts to add new threads while we are enumerating them */
	1544	simple_lock(&sched_vm_group_list_lock);
	1545
	1546	s = splsched();
	1547
	1548	for (i=0; i < sched_vm_group_thread_count; i++) {
	1549	thread_t thread = sched_vm_group_thread_list[i];
	1550	assert(thread != THREAD_NULL);
	1551	thread_lock(thread);
	1552	if ((thread->state & (TH_RUN\|TH_WAIT)) == TH_RUN) {
	1553	if (thread->runq != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
	1554	high_latency_observed = TRUE;
	1555	} else if (thread->runq == PROCESSOR_NULL) {
	1556	/* There are some cases where a thread be transitiong that also fall into this case */
	1557	runnable_and_not_on_runq_observed = TRUE;
	1558	}
	1559	}
	1560	thread_unlock(thread);
	1561
	1562	if (high_latency_observed && runnable_and_not_on_runq_observed) {
	1563	/* All the things we are looking for are true, stop looking */
	1564	break;
	1565	}
	1566	}
	1567
	1568	splx(s);
	1569
	1570	if (sched_vm_group_temporarily_unbound) {
	1571	/* If we turned off binding, make sure everything is OK before rebinding */
	1572	if (!high_latency_observed) {
	1573	/* rebind */
	1574	bind_target_changed = TRUE;
	1575	bind_target = master_processor;
	1576	sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
	1577	}
	1578	} else {
	1579	/*
	1580	* Check if we're in a bad state, which is defined by high
	1581	* latency with no core currently executing a thread. If a
	1582	* single thread is making progress on a CPU, that means the
	1583	* binding concept to reduce parallelism is working as
	1584	* designed.
	1585	*/
	1586	if (high_latency_observed && !runnable_and_not_on_runq_observed) {
	1587	/* unbind */
	1588	bind_target_changed = TRUE;
	1589	bind_target = PROCESSOR_NULL;
	1590	sched_vm_group_temporarily_unbound = TRUE;
	1591	}
	1592	}
	1593
	1594	if (bind_target_changed) {
	1595	s = splsched();
	1596	for (i=0; i < sched_vm_group_thread_count; i++) {
	1597	thread_t thread = sched_vm_group_thread_list[i];
	1598	boolean_t removed;
	1599	assert(thread != THREAD_NULL);
	1600
	1601	thread_lock(thread);
	1602	removed = thread_run_queue_remove(thread);
	1603	if (removed \|\| ((thread->state & (TH_RUN \| TH_WAIT)) == TH_WAIT)) {
	1604	thread_bind_internal(thread, bind_target);
	1605	} else {
	1606	/*
	1607	* Thread was in the middle of being context-switched-to,
	1608	* or was in the process of blocking. To avoid switching the bind
	1609	* state out mid-flight, defer the change if possible.
	1610	*/
	1611	if (bind_target == PROCESSOR_NULL) {
	1612	thread_bind_internal(thread, bind_target);
	1613	} else {
	1614	sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
	1615	}
	1616	}
	1617
	1618	if (removed) {
	1619	thread_run_queue_reinsert(thread, SCHED_PREEMPT \| SCHED_TAILQ);
	1620	}
	1621	thread_unlock(thread);
	1622	}
	1623	splx(s);
	1624	}
	1625
	1626	simple_unlock(&sched_vm_group_list_lock);
	1627	}
	1628
	/*
	 * SMT rebalancing.
	 *
	 * Invoked prior to idle entry to determine if, on SMT capable
	 * processors, an SMT rebalancing opportunity exists when a core is
	 * (instantaneously) idle, but other SMT-capable cores may be
	 * over-committed.
	 *
	 * TODO: some possible negatives:
	 *   - IPI thrash if this core does not remain idle following the
	 *     load balancing ASTs.
	 *   - Idle "thrash", when IPI issue is followed by idle entry/core
	 *     power down followed by a wakeup shortly thereafter.
	 */

	#if (DEVELOPMENT || DEBUG)
	/* When zero, sched_SMT_balance() skips the scan entirely. */
	int sched_smt_balance = 1;
	#endif

	#if __SMP__
	/*
	 * Pick the processor, if any, that should receive a rebalancing AST
	 * check.  Returns NULL when balancing is disabled, when cprocessor is
	 * not SMT, when its sibling is not idle, or when no entry on the
	 * pset's active queue qualifies.  Does not touch the pset lock.
	 */
	static processor_t
	sched_SMT_balance_find_target(processor_t cprocessor, processor_set_t cpset)
	{
		processor_t sibling;
		processor_t candidate;

	#if (DEVELOPMENT || DEBUG)
		if (__improbable(sched_smt_balance == 0)) {
			return NULL;
		}
	#endif

		assert(cprocessor == current_processor());
		if (cprocessor->is_SMT == FALSE) {
			return NULL;
		}

		sibling = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;

		/*
		 * Determine if both this processor and its sibling are idle,
		 * indicating an SMT rebalancing opportunity.
		 */
		if (sibling->state != PROCESSOR_IDLE) {
			return NULL;
		}

		/*
		 * Look for a running secondary whose primary is also running,
		 * that is below the realtime priority band and has no AST
		 * already pending in the pset's mask.
		 */
		for (candidate = (processor_t)queue_first(&cpset->active_queue);
			 !queue_end(&cpset->active_queue, (queue_entry_t)candidate);
			 candidate = (processor_t)queue_next((queue_entry_t)candidate)) {
			if ((candidate->state == PROCESSOR_RUNNING) &&
				(candidate->processor_primary != candidate) &&
				(candidate->processor_primary->state == PROCESSOR_RUNNING) &&
				(candidate->current_pri < BASEPRI_RTQUEUES) &&
				((cpset->pending_AST_cpu_mask & (1ULL << candidate->cpu_id)) == 0)) {
				assert(candidate != cprocessor);
				return candidate;
			}
		}

		return NULL;
	}

	/* Invoked with pset locked, returns with pset unlocked */
	static void
	sched_SMT_balance(processor_t cprocessor, processor_set_t cpset) {
		processor_t ast_processor = sched_SMT_balance_find_target(cprocessor, cpset);

		pset_unlock(cpset);

		/* The AST check is issued only after the pset lock is released. */
		if (ast_processor) {
			KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE),
				ast_processor->cpu_id, ast_processor->state,
				ast_processor->processor_primary->state, 0, 0);
			cause_ast_check(ast_processor);
		}
	}
	#endif /* __SMP__ */
	1690
	1691	/*
	1692	* thread_select:
	1693	*
	1694	* Select a new thread for the current processor to execute.
	1695	*
	1696	* May select the current thread, which must be locked.
	1697	*/
	1698	static thread_t
	1699	thread_select(
	1700	thread_t thread,
	1701	processor_t processor,
	1702	ast_t reason)
	1703	{
	1704	processor_set_t pset = processor->processor_set;
	1705	thread_t new_thread = THREAD_NULL;
	1706
	1707	assert(processor == current_processor());
	1708	assert((thread->state & (TH_RUN\|TH_TERMINATE2)) == TH_RUN);
	1709
	1710	do {
	1711	/*
	1712	* Update the priority.
	1713	*/
	1714	if (SCHED(can_update_priority)(thread))
	1715	SCHED(update_priority)(thread);
	1716
	1717	processor->current_pri = thread->sched_pri;
	1718	processor->current_thmode = thread->sched_mode;
	1719	processor->current_sfi_class = thread->sfi_class;
	1720
	1721	pset_lock(pset);
	1722
	1723	assert(processor->state != PROCESSOR_OFF_LINE);
	1724
	1725	if (!processor->is_recommended) {
	1726	/*
	1727	* The performance controller has provided a hint to not dispatch more threads,
	1728	* unless they are bound to us (and thus we are the only option
	1729	*/
	1730	if (!SCHED(processor_bound_count)(processor)) {
	1731	goto idle;
	1732	}
	1733	} else if (processor->processor_primary != processor) {
	1734	/*
	1735	* Should this secondary SMT processor attempt to find work? For pset runqueue systems,
	1736	* we should look for work only under the same conditions that choose_processor()
	1737	* would have assigned work, which is when all primary processors have been assigned work.
	1738	*
	1739	* An exception is that bound threads are dispatched to a processor without going through
	1740	* choose_processor(), so in those cases we should continue trying to dequeue work.
	1741	*/
	1742	if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue) && !rt_runq.count) {
	1743	goto idle;
	1744	}
	1745	}
	1746
	1747	rt_lock_lock();
	1748
	1749	/*
	1750	* Test to see if the current thread should continue
	1751	* to run on this processor. Must not be attempting to wait, and not
	1752	* bound to a different processor, nor be in the wrong
	1753	* processor set, nor be forced to context switch by TH_SUSP.
	1754	*
	1755	* Note that there are never any RT threads in the regular runqueue.
	1756	*
	1757	* This code is very insanely tricky.
	1758	*/
	1759
	1760	if (((thread->state & (TH_TERMINATE\|TH_IDLE\|TH_WAIT\|TH_RUN\|TH_SUSP)) == TH_RUN) &&
	1761	(thread->sched_pri >= BASEPRI_RTQUEUES \|\| processor->processor_primary == processor) &&
	1762	(thread->bound_processor == PROCESSOR_NULL \|\| thread->bound_processor == processor) &&
	1763	(thread->affinity_set == AFFINITY_SET_NULL \|\| thread->affinity_set->aset_pset == pset)) {
	1764	/*
	1765	* RT threads with un-expired quantum stay on processor,
	1766	* unless there's a valid RT thread with an earlier deadline.
	1767	*/
	1768	if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
	1769	if (rt_runq.count > 0) {
	1770	thread_t next_rt;
	1771
	1772	next_rt = (thread_t)queue_first(&rt_runq.queue);
	1773
	1774	assert(next_rt->runq == THREAD_ON_RT_RUNQ);
	1775
	1776	if (next_rt->realtime.deadline < processor->deadline &&
	1777	(next_rt->bound_processor == PROCESSOR_NULL \|\|
	1778	next_rt->bound_processor == processor)) {
	1779	/* The next RT thread is better, so pick it off the runqueue. */
	1780	goto pick_new_rt_thread;
	1781	}
	1782	}
	1783
	1784	/* This is still the best RT thread to run. */
	1785	processor->deadline = thread->realtime.deadline;
	1786
	1787	rt_lock_unlock();
	1788	pset_unlock(pset);
	1789
	1790	return (thread);
	1791	}
	1792
	1793	if ((rt_runq.count == 0) &&
	1794	SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
	1795	/* This thread is still the highest priority runnable (non-idle) thread */
	1796	processor->deadline = UINT64_MAX;
	1797
	1798	rt_lock_unlock();
	1799	pset_unlock(pset);
	1800
	1801	return (thread);
	1802	}
	1803	}
	1804
	1805	/* OK, so we're not going to run the current thread. Look at the RT queue. */
	1806	if (rt_runq.count > 0) {
	1807	thread_t next_rt = (thread_t)queue_first(&rt_runq.queue);
	1808
	1809	assert(next_rt->runq == THREAD_ON_RT_RUNQ);
	1810
	1811	if (__probable((next_rt->bound_processor == PROCESSOR_NULL \|\|
	1812	(next_rt->bound_processor == processor)))) {
	1813	pick_new_rt_thread:
	1814	new_thread = (thread_t)dequeue_head(&rt_runq.queue);
	1815
	1816	new_thread->runq = PROCESSOR_NULL;
	1817	SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
	1818	rt_runq.count--;
	1819
	1820	processor->deadline = new_thread->realtime.deadline;
	1821
	1822	rt_lock_unlock();
	1823	pset_unlock(pset);
	1824
	1825	return (new_thread);
	1826	}
	1827	}
	1828
	1829	processor->deadline = UINT64_MAX;
	1830	rt_lock_unlock();
	1831
	1832	/* No RT threads, so let's look at the regular threads. */
	1833	if ((new_thread = SCHED(choose_thread)(processor, MINPRI, reason)) != THREAD_NULL) {
	1834	pset_unlock(pset);
	1835	return (new_thread);
	1836	}
	1837
	1838	#if __SMP__
	1839	if (SCHED(steal_thread_enabled)) {
	1840	/*
	1841	* No runnable threads, attempt to steal
	1842	* from other processors. Returns with pset lock dropped.
	1843	*/
	1844
	1845	if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
	1846	return (new_thread);
	1847	}
	1848
	1849	/*
	1850	* If other threads have appeared, shortcut
	1851	* around again.
	1852	*/
	1853	if (!SCHED(processor_queue_empty)(processor) \|\| rt_runq.count > 0)
	1854	continue;
	1855
	1856	pset_lock(pset);
	1857	}
	1858	#endif
	1859
	1860	idle:
	1861	/*
	1862	* Nothing is runnable, so set this processor idle if it
	1863	* was running.
	1864	*/
	1865	if (processor->state == PROCESSOR_RUNNING) {
	1866	remqueue((queue_entry_t)processor);
	1867	processor->state = PROCESSOR_IDLE;
	1868
	1869	if (processor->processor_primary == processor) {
	1870	enqueue_head(&pset->idle_queue, (queue_entry_t)processor);
	1871	}
	1872	else {
	1873	enqueue_head(&pset->idle_secondary_queue, (queue_entry_t)processor);
	1874	}
	1875	}
	1876
	1877	#if __SMP__
	1878	/* Invoked with pset locked, returns with pset unlocked */
	1879	sched_SMT_balance(processor, pset);
	1880	#else
	1881	pset_unlock(pset);
	1882	#endif
	1883
	1884	#if CONFIG_SCHED_IDLE_IN_PLACE
	1885	/*
	1886	* Choose idle thread if fast idle is not possible.
	1887	*/
	1888	if (processor->processor_primary != processor)
	1889	return (processor->idle_thread);
	1890
	1891	if ((thread->state & (TH_IDLE\|TH_TERMINATE\|TH_SUSP)) \|\| !(thread->state & TH_WAIT) \|\| thread->wake_active \|\| thread->sched_pri >= BASEPRI_RTQUEUES)
	1892	return (processor->idle_thread);
	1893
	1894	/*
	1895	* Perform idling activities directly without a
	1896	* context switch. Return dispatched thread,
	1897	* else check again for a runnable thread.
	1898	*/
	1899	new_thread = thread_select_idle(thread, processor);
	1900
	1901	#else /* !CONFIG_SCHED_IDLE_IN_PLACE */
	1902
	1903	/*
	1904	* Do a full context switch to idle so that the current
	1905	* thread can start running on another processor without
	1906	* waiting for the fast-idled processor to wake up.
	1907	*/
	1908	new_thread = processor->idle_thread;
	1909
	1910	#endif /* !CONFIG_SCHED_IDLE_IN_PLACE */
	1911
	1912	} while (new_thread == THREAD_NULL);
	1913
	1914	return (new_thread);
	1915	}
	1916
	#if CONFIG_SCHED_IDLE_IN_PLACE
	/*
	 *	thread_select_idle:
	 *
	 *	Idle the processor using the current thread context.
	 *
	 *	Called with thread locked, then dropped and relocked.
	 *
	 *	Returns the thread handed back by processor_idle().
	 */
	static thread_t
	thread_select_idle(
		thread_t		thread,
		processor_t		processor)
	{
		thread_t		dispatched;
		uint64_t		urg_param1, urg_param2;
		int				urg;

		/* Remove the thread from the run/share/background counts while it idles. */
		if (thread->sched_mode == TH_MODE_TIMESHARE) {
			if (thread->sched_flags & TH_SFLAG_THROTTLED) {
				sched_background_decr(thread);
			}

			sched_share_decr(thread);
		}
		sched_run_decr(thread);

		thread->state |= TH_IDLE;
		processor->current_pri = IDLEPRI;
		processor->current_thmode = TH_MODE_NONE;
		processor->current_sfi_class = SFI_CLASS_KERNEL;

		/* Reload precise timing global policy to thread-local policy */
		thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);

		thread_unlock(thread);

		/*
		 *	Switch execution timing to processor idle thread.
		 */
		processor->last_dispatch = mach_absolute_time();

	#ifdef CONFIG_MACH_APPROXIMATE_TIME
		commpage_update_mach_approximate_time(processor->last_dispatch);
	#endif

		thread->last_run_time = processor->last_dispatch;
		thread_timer_event(processor->last_dispatch, &processor->idle_thread->system_timer);
		PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer;

		/*
		 *	Cancel the quantum timer while idling.
		 */
		timer_call_cancel(&processor->quantum_timer);
		processor->first_timeslice = FALSE;

		(*thread->sched_call)(SCHED_CALL_BLOCK, thread);

		thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, NULL);

		/*
		 *	Enable interrupts and perform idling activities.  No
		 *	preemption due to TH_IDLE being set.
		 */
		spllo();
		dispatched = processor_idle(thread, processor);

		/*
		 *	Return at splsched.
		 */
		(*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);

		thread_lock(thread);

		/*
		 *	If awakened, switch to thread timer and start a new quantum.
		 *	Otherwise skip; we will context switch to another thread or return here.
		 */
		if ((thread->state & TH_WAIT) == 0) {
			processor->last_dispatch = mach_absolute_time();
			thread_timer_event(processor->last_dispatch, &thread->system_timer);
			PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;

			thread_quantum_init(thread);
			processor->quantum_end = processor->last_dispatch + thread->quantum_remaining;
			timer_call_enter1(&processor->quantum_timer, thread, processor->quantum_end,
				TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
			processor->first_timeslice = TRUE;

			thread->computation_epoch = processor->last_dispatch;
		}

		thread->state &= ~TH_IDLE;

		urg = thread_get_urgency(thread, &urg_param1, &urg_param2);

		thread_tell_urgency(urg, urg_param1, urg_param2, 0, dispatched);

		/* Restore the counts dropped on entry, in the mirrored order. */
		sched_run_incr(thread);
		if (thread->sched_mode == TH_MODE_TIMESHARE) {
			sched_share_incr(thread);

			if (thread->sched_flags & TH_SFLAG_THROTTLED) {
				sched_background_incr(thread);
			}
		}

		return dispatched;
	}
	#endif /* CONFIG_SCHED_IDLE_IN_PLACE */
	2022
	2023	/*
	2024	* thread_invoke
	2025	*
	2026	* Called at splsched with neither thread locked.
	2027	*
	2028	* Perform a context switch and start executing the new thread.
	2029	*
	2030	* Returns FALSE when the context switch didn't happen.
	2031	* The reference to the new thread is still consumed.
		* FALSE is returned only when stack_alloc_try() fails for the new
		* thread; that thread is then passed to thread_stack_enqueue().
	2032	*
	2033	* "self" is what is currently running on the processor,
	2034	* "thread" is the new thread to context switch to
	2035	* (which may be the same thread in some cases)
		* "reason" is recorded in self->reason before switching away.
		*
		* Three paths end in call_continuation() rather than returning:
		* the stack handoff (which runs the new thread's continuation),
		* the same-thread case when self has a continuation, and resumption
		* after a full context switch when a continuation was recorded.
	2036	*/
	2037	static boolean_t
	2038	thread_invoke(
	2039	thread_t self,
	2040	thread_t thread,
	2041	ast_t reason)
	2042	{
		/* A non-zero preemption level at this point is fatal: panic with a hint. */
	2043	if (__improbable(get_preemption_level() != 0)) {
	2044	int pl = get_preemption_level();
	2045	panic("thread_invoke: preemption_level %d, possible cause: %s",
	2046	pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
	2047	"blocking while holding a spinlock, or within interrupt context"));
	2048	}
	2049
		/*
		* Local copies of self's continuation and parameter. The stack
		* handoff path below overwrites them with the new thread's values.
		*/
	2050	thread_continue_t continuation = self->continuation;
	2051	void *parameter = self->parameter;
	2052	processor_t processor;
	2053
		/* Single timestamp used for all accounting in this switch. */
	2054	uint64_t ctime = mach_absolute_time();
	2055
	2056	#ifdef CONFIG_MACH_APPROXIMATE_TIME
	2057	commpage_update_mach_approximate_time(ctime);
	2058	#endif
	2059
	2060	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	2061	sched_timeshare_consider_maintenance(ctime);
	2062	#endif
	2063
	2064	assert(self == current_thread());
	2065	assert(self->runq == PROCESSOR_NULL);
	2066	assert((self->state & (TH_RUN\|TH_TERMINATE2)) == TH_RUN);
	2067
	2068	thread_lock(thread);
	2069
		/* The new thread must be runnable, off any run queue, and allowed on this processor. */
	2070	assert((thread->state & (TH_RUN\|TH_WAIT\|TH_UNINT\|TH_TERMINATE\|TH_TERMINATE2)) == TH_RUN);
	2071	assert(thread->bound_processor == PROCESSOR_NULL \|\| thread->bound_processor == current_processor());
	2072	assert(thread->runq == PROCESSOR_NULL);
	2073
	2074	/* Reload precise timing global policy to thread-local policy */
	2075	thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
	2076
	2077	/* Update SFI class based on other factors */
	2078	thread->sfi_class = sfi_thread_classify(thread);
	2079
	2080	/* Allow realtime threads to hang onto a stack. */
	2081	if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack)
	2082	self->reserved_stack = self->kernel_stack;
	2083
		/*
		* self supplied a continuation: attempt a stack handoff when the
		* new thread has no kernel stack of its own.
		*/
	2084	if (continuation != NULL) {
	2085	if (!thread->kernel_stack) {
	2086	/*
	2087	* If we are using a privileged stack,
	2088	* check to see whether we can exchange it with
	2089	* that of the other thread.
	2090	*/
	2091	if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack)
	2092	goto need_stack;
	2093
	2094	/*
	2095	* Context switch by performing a stack handoff.
	2096	*/
		/* From here on the locals describe the NEW thread's continuation. */
	2097	continuation = thread->continuation;
	2098	parameter = thread->parameter;
	2099
		/* Publish the new thread as the processor's active thread and count the switch. */
	2100	processor = current_processor();
	2101	processor->active_thread = thread;
	2102	processor->current_pri = thread->sched_pri;
	2103	processor->current_thmode = thread->sched_mode;
	2104	processor->current_sfi_class = thread->sfi_class;
	2105	if (thread->last_processor != processor && thread->last_processor != NULL) {
	2106	if (thread->last_processor->processor_set != processor->processor_set)
	2107	thread->ps_switch++;
	2108	thread->p_switch++;
	2109	}
	2110	thread->last_processor = processor;
	2111	thread->c_switch++;
	2112	ast_context(thread);
	2113
	2114	thread_unlock(thread);
	2115
	2116	self->reason = reason;
	2117
	2118	processor->last_dispatch = ctime;
	2119	self->last_run_time = ctime;
	2120	thread_timer_event(ctime, &thread->system_timer);
	2121	PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
	2122
	2123	/*
	2124	* Since non-precise user/kernel time doesn't update the state timer
	2125	* during privilege transitions, synthesize an event now.
	2126	*/
	2127	if (!thread->precise_user_kernel_time) {
	2128	timer_switch(PROCESSOR_DATA(processor, current_state),
	2129	ctime,
	2130	PROCESSOR_DATA(processor, current_state));
	2131	}
	2132
	2133	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2134	MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)\|DBG_FUNC_NONE,
	2135	self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
	2136
		/* Trace when the thread runs somewhere other than its chosen processor. */
	2137	if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
	2138	SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)\|DBG_FUNC_NONE,
	2139	(uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
	2140	}
	2141
	2142	DTRACE_SCHED2(off__cpu, struct thread , thread, struct proc , thread->task->bsd_info);
	2143
	2144	SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
	2145
	2146	TLOG(1, "thread_invoke: calling stack_handoff\n");
	2147	stack_handoff(self, thread);
	2148
	2149	/* 'self' is now off core */
	2150	assert(thread == current_thread());
	2151
	2152	DTRACE_SCHED(on__cpu);
	2153
		/* Argument order: (old thread, new current thread). */
	2154	thread_dispatch(self, thread);
	2155
		/* Clear the stored continuation; the local copy is what gets called. */
	2156	thread->continuation = thread->parameter = NULL;
	2157
	2158	counter(c_thread_invoke_hits++);
	2159
	2160	(void) spllo();
	2161
	2162	assert(continuation);
	2163	call_continuation(continuation, parameter, thread->wait_result);
	2164	/NOTREACHED/
	2165	}
	2166	else if (thread == self) {
	2167	/* same thread but with continuation */
	2168	ast_context(self);
	2169	counter(++c_thread_invoke_same);
	2170
	2171	thread_unlock(self);
	2172
	2173	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2174	MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) \| DBG_FUNC_NONE,
	2175	self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
	2176
	2177	self->continuation = self->parameter = NULL;
	2178
	2179	(void) spllo();
	2180
	2181	call_continuation(continuation, parameter, self->wait_result);
	2182	/NOTREACHED/
	2183	}
		/*
		* Remaining case: the new thread already has a kernel stack and is
		* not self, so control falls through to the full context save below.
		*/
	2184	} else {
	2185	/*
	2186	* Check that the other thread has a stack
	2187	*/
	2188	if (!thread->kernel_stack) {
		/* Also entered by goto from the handoff path above. */
	2189	need_stack:
	2190	if (!stack_alloc_try(thread)) {
	2191	counter(c_thread_invoke_misses++);
	2192	thread_unlock(thread);
	2193	thread_stack_enqueue(thread);
	2194	return (FALSE);
	2195	}
	2196	} else if (thread == self) {
		/* Switching to ourselves without a continuation: no context switch needed. */
	2197	ast_context(self);
	2198	counter(++c_thread_invoke_same);
	2199	thread_unlock(self);
	2200
	2201	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2202	MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) \| DBG_FUNC_NONE,
	2203	self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
	2204
	2205	return (TRUE);
	2206	}
	2207	}
	2208
	2209	/*
	2210	* Context switch by full context save.
	2211	*/
		/* Same processor bookkeeping as the handoff path, performed under the thread lock. */
	2212	processor = current_processor();
	2213	processor->active_thread = thread;
	2214	processor->current_pri = thread->sched_pri;
	2215	processor->current_thmode = thread->sched_mode;
	2216	processor->current_sfi_class = thread->sfi_class;
	2217	if (thread->last_processor != processor && thread->last_processor != NULL) {
	2218	if (thread->last_processor->processor_set != processor->processor_set)
	2219	thread->ps_switch++;
	2220	thread->p_switch++;
	2221	}
	2222	thread->last_processor = processor;
	2223	thread->c_switch++;
	2224	ast_context(thread);
	2225
	2226	thread_unlock(thread);
	2227
	2228	counter(c_thread_invoke_csw++);
	2229
	2230	self->reason = reason;
	2231
	2232	processor->last_dispatch = ctime;
	2233	self->last_run_time = ctime;
	2234	thread_timer_event(ctime, &thread->system_timer);
	2235	PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
	2236
	2237	/*
	2238	* Since non-precise user/kernel time doesn't update the state timer
	2239	* during privilege transitions, synthesize an event now.
	2240	*/
	2241	if (!thread->precise_user_kernel_time) {
	2242	timer_switch(PROCESSOR_DATA(processor, current_state),
	2243	ctime,
	2244	PROCESSOR_DATA(processor, current_state));
	2245	}
	2246
	2247	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2248	MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) \| DBG_FUNC_NONE,
	2249	self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
	2250
	2251	if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
	2252	SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)\|DBG_FUNC_NONE,
	2253	(uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
	2254	}
	2255
	2256	DTRACE_SCHED2(off__cpu, struct thread , thread, struct proc , thread->task->bsd_info);
	2257
	2258	SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
	2259
	2260	/*
	2261	* This is where we actually switch register context,
	2262	* and address space if required. We will next run
	2263	* as a result of a subsequent context switch.
	2264	*
	2265	* Once registers are switched and the processor is running "thread",
	2266	* the stack variables and non-volatile registers will contain whatever
	2267	* was there the last time that thread blocked. No local variables should
	2268	* be used after this point, except for the special case of "thread", which
	2269	* the platform layer returns as the previous thread running on the processor
	2270	* via the function call ABI as a return register, and "self", which may have
	2271	* been stored on the stack or a non-volatile register, but a stale idea of
	2272	* what was on the CPU is newly-accurate because that thread is again
	2273	* running on the CPU.
	2274	*/
	2275	assert(continuation == self->continuation);
		/* After this call "thread" is whatever machine_switch_context() returned. */
	2276	thread = machine_switch_context(self, continuation, thread);
	2277	assert(self == current_thread());
	2278	TLOG(1,"thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
	2279
	2280	DTRACE_SCHED(on__cpu);
	2281
	2282	/*
	2283	* We have been resumed and are set to run.
	2284	*/
		/* Argument order: (old thread, new current thread). */
	2285	thread_dispatch(thread, self);
	2286
		/* Resumed with a continuation: clear the stored copy, drop to spllo, and call it. */
	2287	if (continuation) {
	2288	self->continuation = self->parameter = NULL;
	2289
	2290	(void) spllo();
	2291
	2292	call_continuation(continuation, parameter, self->wait_result);
	2293	/NOTREACHED/
	2294	}
	2295
	2296	return (TRUE);
	2297	}
	2298
	2299	#if defined(CONFIG_SCHED_DEFERRED_AST)
	2300	/*
	2301	* pset_cancel_deferred_dispatch:
	2302	*
	2303	* Cancels all ASTs that we can cancel for the given processor set
	2304	* if the current processor is running the last runnable thread in the
	2305	* system.
	2306	*
	2307	* This function assumes the current thread is runnable. This must
	2308	* be called with the pset unlocked.
	2309	*/
	2310	static void
	2311	pset_cancel_deferred_dispatch(
	2312	processor_set_t pset,
	2313	processor_t processor)
	2314	{
	2315	processor_t active_processor = NULL;
	2316	uint32_t sampled_sched_run_count;
	2317
	2318	pset_lock(pset);
	2319	sampled_sched_run_count = (volatile uint32_t) sched_run_count;
	2320
	2321	/*
	2322	* If we have emptied the run queue, and our current thread is runnable, we
	2323	* should tell any processors that are still DISPATCHING that they will
	2324	* probably not have any work to do. In the event that there are no
	2325	* pending signals that we can cancel, this is also uninteresting.
	2326	*
	2327	* In the unlikely event that another thread becomes runnable while we are
	2328	* doing this (sched_run_count is atomically updated, not guarded), the
	2329	* codepath making it runnable SHOULD (a dangerous word) need the pset lock
	2330	* in order to dispatch it to a processor in our pset. So, the other
	2331	* codepath will wait while we squash all cancelable ASTs, get the pset
	2332	* lock, and then dispatch the freshly runnable thread. So this should be
	2333	* correct (we won't accidentally have a runnable thread that hasn't been
	2334	* dispatched to an idle processor), if not ideal (we may be restarting the
	2335	* dispatch process, which could have some overhead).
	2336	*
	2337	*/
	2338	if ((sampled_sched_run_count == 1) &&
	2339	(pset->pending_deferred_AST_cpu_mask)) {
	2340	qe_foreach_element_safe(active_processor, &pset->active_queue, processor_queue) {
	2341	/*
	2342	* If a processor is DISPATCHING, it could be because of
	2343	* a cancelable signal.
	2344	*
	2345	* IF the processor is not our
	2346	* current processor (the current processor should not
	2347	* be DISPATCHING, so this is a bit paranoid), AND there
	2348	* is a cancelable signal pending on the processor, AND
	2349	* there is no non-cancelable signal pending (as there is
	2350	* no point trying to backtrack on bringing the processor
	2351	* up if a signal we cannot cancel is outstanding), THEN
	2352	* it should make sense to roll back the processor state
	2353	* to the IDLE state.
	2354	*
	2355	* If the racey nature of this approach (as the signal
	2356	* will be arbitrated by hardware, and can fire as we
	2357	* roll back state) results in the core responding
	2358	* despite being pushed back to the IDLE state, it
	2359	* should be no different than if the core took some
	2360	* interrupt while IDLE.
	2361	*/
	2362	if ((active_processor->state == PROCESSOR_DISPATCHING) &&
	2363	(pset->pending_deferred_AST_cpu_mask & (1ULL << active_processor->cpu_id)) &&
	2364	(!(pset->pending_AST_cpu_mask & (1ULL << active_processor->cpu_id))) &&
	2365	(active_processor != processor)) {
	2366	/*
	2367	* Squash all of the processor state back to some
	2368	* reasonable facsimile of PROCESSOR_IDLE.
	2369	*
	2370	* TODO: What queue policy do we actually want here?
	2371	* We want to promote selection of a good processor
	2372	* to run on. Do we want to enqueue at the head?
	2373	* The tail? At the (relative) old position in the
	2374	* queue? Or something else entirely?
	2375	*/
	2376	re_queue_head(&pset->idle_queue, (queue_entry_t)active_processor);
	2377
	2378	assert(active_processor->next_thread == THREAD_NULL);
	2379
	2380	active_processor->current_pri = IDLEPRI;
	2381	active_processor->current_thmode = TH_MODE_FIXED;
	2382	active_processor->current_sfi_class = SFI_CLASS_KERNEL;
	2383	active_processor->deadline = UINT64_MAX;
	2384	active_processor->state = PROCESSOR_IDLE;
	2385	pset->pending_deferred_AST_cpu_mask &= ~(1U << active_processor->cpu_id);
	2386	machine_signal_idle_cancel(active_processor);
	2387	}
	2388
	2389	}
	2390	}
	2391
	2392	pset_unlock(pset);
	2393	}
	2394	#else
	2395	/* We don't support deferred ASTs; everything is candycanes and sunshine. */
	2396	#endif
	2397
	2398	/*
	2399	* thread_dispatch:
	2400	*
	2401	* Handle threads at context switch. Re-dispatch other thread
	2402	* if still running, otherwise update run state and perform
	2403	* special actions. Update quantum for other thread and begin
	2404	* the quantum for ourselves.
	2405	*
	2406	* "thread" is the old thread that we have switched away from.
		* It may be THREAD_NULL, in which case only "self" is updated.
	2407	* "self" is the new current thread that we have context switched to
	2408	*
	2409	* Called at splsched.
	2410	*/
	2411	void
	2412	thread_dispatch(
	2413	thread_t thread,
	2414	thread_t self)
	2415	{
	2416	processor_t processor = self->last_processor;
	2417
	2418	assert(processor == current_processor());
	2419	assert(self == current_thread());
	2420	assert(thread != self);
	2421
	2422	if (thread != THREAD_NULL) {
	2423	/*
	2424	* If blocked at a continuation, discard
	2425	* the stack.
	2426	*/
	2427	if (thread->continuation != NULL && thread->kernel_stack != 0)
	2428	stack_free(thread);
	2429
		/* A TH_IDLE old thread is only traced; the accounting below is skipped for it. */
	2430	if (thread->state & TH_IDLE) {
	2431	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2432	MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) \| DBG_FUNC_NONE,
	2433	(uintptr_t)thread_tid(thread), 0, thread->state, sched_run_count, 0);
	2434	} else {
	2435	int64_t consumed;
	2436	int64_t remainder = 0;
	2437
		/*
		* remainder: time left between last_dispatch and quantum_end
		* (zero when quantum_end has already passed).
		* consumed: the part of quantum_remaining the old thread used.
		*/
	2438	if (processor->quantum_end > processor->last_dispatch)
	2439	remainder = processor->quantum_end -
	2440	processor->last_dispatch;
	2441
	2442	consumed = thread->quantum_remaining - remainder;
	2443
		/* Billing is skipped when AST_LEDGER is set in the old thread's reason. */
	2444	if ((thread->reason & AST_LEDGER) == 0) {
	2445	/*
	2446	* Bill CPU time to both the task and
	2447	* the individual thread.
	2448	*/
	2449	ledger_credit(thread->t_ledger,
	2450	task_ledgers.cpu_time, consumed);
	2451	ledger_credit(thread->t_threadledger,
	2452	thread_ledgers.cpu_time, consumed);
	2453	#ifdef CONFIG_BANK
	2454	if (thread->t_bankledger) {
	2455	ledger_credit(thread->t_bankledger,
	2456	bank_ledgers.cpu_time,
	2457	(consumed - thread->t_deduct_bank_ledger_time));
	2458
	2459	}
	2460	thread->t_deduct_bank_ledger_time =0;
	2461	#endif
	2462	}
	2463
		/* Lock order: wake lock first, then thread lock; released in reverse below. */
	2464	wake_lock(thread);
	2465	thread_lock(thread);
	2466
	2467	/*
	2468	* Compute remainder of current quantum.
	2469	*/
	2470	if (processor->first_timeslice &&
	2471	processor->quantum_end > processor->last_dispatch)
	2472	thread->quantum_remaining = (uint32_t)remainder;
	2473	else
	2474	thread->quantum_remaining = 0;
	2475
	2476	if (thread->sched_mode == TH_MODE_REALTIME) {
	2477	/*
	2478	* Cancel the deadline if the thread has
	2479	* consumed the entire quantum.
	2480	*/
	2481	if (thread->quantum_remaining == 0) {
	2482	thread->realtime.deadline = UINT64_MAX;
	2483	}
	2484	} else {
	2485	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	2486	/*
	2487	* For non-realtime threads treat a tiny
	2488	* remaining quantum as an expired quantum
	2489	* but include what's left next time.
	2490	*/
	2491	if (thread->quantum_remaining < min_std_quantum) {
	2492	thread->reason \|= AST_QUANTUM;
	2493	thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
	2494	}
	2495	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	2496	}
	2497
	2498	/*
	2499	* If we are doing a direct handoff then
	2500	* take the remainder of the quantum.
	2501	*/
		/* Taken only when AST_HANDOFF is set and AST_QUANTUM is not. */
	2502	if ((thread->reason & (AST_HANDOFF\|AST_QUANTUM)) == AST_HANDOFF) {
	2503	self->quantum_remaining = thread->quantum_remaining;
	2504	thread->reason \|= AST_QUANTUM;
	2505	thread->quantum_remaining = 0;
	2506	} else {
	2507	#if defined(CONFIG_SCHED_MULTIQ)
		/* Threads in the same sched_group also pass their leftover quantum to self. */
	2508	if (SCHED(sched_groups_enabled) &&
	2509	thread->sched_group == self->sched_group) {
	2510	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2511	MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
	2512	self->reason, (uintptr_t)thread_tid(thread),
	2513	self->quantum_remaining, thread->quantum_remaining, 0);
	2514
	2515	self->quantum_remaining = thread->quantum_remaining;
	2516	thread->quantum_remaining = 0;
	2517	/* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
	2518	}
	2519	#endif /* defined(CONFIG_SCHED_MULTIQ) */
	2520	}
	2521
		/* Accumulate the time run since computation_epoch was last set. */
	2522	thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
	2523
		/*
		* rwlock_count is non-zero and the disLkRWPrio option is off:
		* make sure sched_pri is at least max(base_pri, BASEPRI_BACKGROUND)
		* and mark the thread TH_SFLAG_RW_PROMOTED.
		* NOTE(review): presumably rwlock_count counts held rw locks - confirm.
		*/
	2524	if ((thread->rwlock_count != 0) && !(LcksOpts & disLkRWPrio)) {
	2525	integer_t priority;
	2526
	2527	priority = thread->sched_pri;
	2528
	2529	if (priority < thread->base_pri)
	2530	priority = thread->base_pri;
	2531	if (priority < BASEPRI_BACKGROUND)
	2532	priority = BASEPRI_BACKGROUND;
	2533
	2534	if ((thread->sched_pri < priority) \|\| !(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
	2535	KERNEL_DEBUG_CONSTANT(
	2536	MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_PROMOTE) \| DBG_FUNC_NONE,
	2537	(uintptr_t)thread_tid(thread), thread->sched_pri, thread->base_pri, priority, 0);
	2538
	2539	thread->sched_flags \|= TH_SFLAG_RW_PROMOTED;
	2540
	2541	if (thread->sched_pri < priority)
	2542	set_sched_pri(thread, priority);
	2543	}
	2544	}
	2545
	2546	if (!(thread->state & TH_WAIT)) {
	2547	/*
	2548	* Still runnable.
	2549	*/
	2550	thread->last_made_runnable_time = mach_approximate_time();
	2551
	2552	machine_thread_going_off_core(thread, FALSE);
	2553
		/*
		* Requeue options by reason: AST_QUANTUM -> tail;
		* AST_PREEMPT -> head; anything else -> tail with SCHED_PREEMPT.
		*/
	2554	if (thread->reason & AST_QUANTUM)
	2555	thread_setrun(thread, SCHED_TAILQ);
	2556	else if (thread->reason & AST_PREEMPT)
	2557	thread_setrun(thread, SCHED_HEADQ);
	2558	else
	2559	thread_setrun(thread, SCHED_PREEMPT \| SCHED_TAILQ);
	2560
	2561	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2562	MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) \| DBG_FUNC_NONE,
	2563	(uintptr_t)thread_tid(thread), thread->reason, thread->state, sched_run_count, 0);
	2564
		/* wake_active is cleared under the thread lock; the wakeup is issued after unlocking. */
	2565	if (thread->wake_active) {
	2566	thread->wake_active = FALSE;
	2567	thread_unlock(thread);
	2568
	2569	thread_wakeup(&thread->wake_active);
	2570	} else {
	2571	thread_unlock(thread);
	2572	}
	2573
	2574	wake_unlock(thread);
	2575	} else {
	2576	/*
	2577	* Waiting.
	2578	*/
	2579	boolean_t should_terminate = FALSE;
	2580	uint32_t new_run_count;
	2581
	2582	/* Only the first call to thread_dispatch
	2583	* after explicit termination should add
	2584	* the thread to the termination queue
	2585	*/
	2586	if ((thread->state & (TH_TERMINATE\|TH_TERMINATE2)) == TH_TERMINATE) {
	2587	should_terminate = TRUE;
	2588	thread->state \|= TH_TERMINATE2;
	2589	}
	2590
		/* The thread is no longer running: clear TH_RUN and its runnable/placement hints. */
	2591	thread->state &= ~TH_RUN;
	2592	thread->last_made_runnable_time = ~0ULL;
	2593	thread->chosen_processor = PROCESSOR_NULL;
	2594
		/* Undo the run/share/background counts for this thread. */
	2595	if (thread->sched_mode == TH_MODE_TIMESHARE) {
	2596	if (thread->sched_flags & TH_SFLAG_THROTTLED)
	2597	sched_background_decr(thread);
	2598
	2599	sched_share_decr(thread);
	2600	}
	2601	new_run_count = sched_run_decr(thread);
	2602
	2603	#if CONFIG_SCHED_SFI
		/* Waiting (not terminating) because of AST_SFI: record when the wait began. */
	2604	if ((thread->state & (TH_WAIT \| TH_TERMINATE)) == TH_WAIT) {
	2605	if (thread->reason & AST_SFI) {
	2606	thread->wait_sfi_begin_time = processor->last_dispatch;
	2607	}
	2608	}
	2609	#endif
	2610
	2611	machine_thread_going_off_core(thread, should_terminate);
	2612
	2613	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2614	MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) \| DBG_FUNC_NONE,
	2615	(uintptr_t)thread_tid(thread), thread->reason, thread->state, new_run_count, 0);
	2616
		/* Invoke the thread's sched_call hook with SCHED_CALL_BLOCK while still holding the thread lock. */
	2617	(*thread->sched_call)(SCHED_CALL_BLOCK, thread);
	2618
	2619	if (thread->wake_active) {
	2620	thread->wake_active = FALSE;
	2621	thread_unlock(thread);
	2622
	2623	thread_wakeup(&thread->wake_active);
	2624	} else {
	2625	thread_unlock(thread);
	2626	}
	2627
	2628	wake_unlock(thread);
	2629
		/* Enqueue for termination only after both locks have been dropped. */
	2630	if (should_terminate)
	2631	thread_terminate_enqueue(thread);
	2632	}
	2633	}
	2634	}
	2635
	2636	/* Update (new) current thread and reprogram quantum timer */
	2637	thread_lock(self);
	2638	if (!(self->state & TH_IDLE)) {
	2639	uint64_t arg1, arg2;
	2640	int urgency;
	2641	uint64_t latency;
	2642
	2643	#if CONFIG_SCHED_SFI
	2644	ast_t new_ast;
	2645
	2646	new_ast = sfi_thread_needs_ast(self, NULL);
	2647
	2648	if (new_ast != AST_NONE) {
	2649	ast_on(new_ast);
	2650	}
	2651	#endif
	2652
		/* latency: time from last_made_runnable_time to this dispatch. */
	2653	assert(processor->last_dispatch >= self->last_made_runnable_time);
	2654	latency = processor->last_dispatch - self->last_made_runnable_time;
	2655
	2656	urgency = thread_get_urgency(self, &arg1, &arg2);
	2657
	2658	thread_tell_urgency(urgency, arg1, arg2, latency, self);
	2659
	2660	machine_thread_going_on_core(self, urgency, latency);
	2661
	2662	/*
	2663	* Get a new quantum if none remaining.
	2664	*/
	2665	if (self->quantum_remaining == 0) {
	2666	thread_quantum_init(self);
	2667	}
	2668
	2669	/*
	2670	* Set up quantum timer and timeslice.
	2671	*/
	2672	processor->quantum_end = processor->last_dispatch + self->quantum_remaining;
	2673	timer_call_enter1(&processor->quantum_timer, self, processor->quantum_end, TIMER_CALL_SYS_CRITICAL \| TIMER_CALL_LOCAL);
	2674
	2675	processor->first_timeslice = TRUE;
	2676	} else {
		/* self is TH_IDLE: no quantum timer, and no urgency is reported. */
	2677	timer_call_cancel(&processor->quantum_timer);
	2678	processor->first_timeslice = FALSE;
	2679
	2680	thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
	2681	machine_thread_going_on_core(self, THREAD_URGENCY_NONE, 0);
	2682	}
	2683
		/* Start a new metering epoch for self and reset its reason. */
	2684	self->computation_epoch = processor->last_dispatch;
	2685	self->reason = AST_NONE;
	2686
	2687	thread_unlock(self);
	2688
	2689	#if defined(CONFIG_SCHED_DEFERRED_AST)
	2690	/*
	2691	* TODO: Can we state that redispatching our old thread is also
	2692	* uninteresting?
	2693	*/
		/* Unlocked pre-check only; pset_cancel_deferred_dispatch() re-samples sched_run_count under the pset lock. */
	2694	if ((((volatile uint32_t)sched_run_count) == 1) &&
	2695	!(self->state & TH_IDLE)) {
	2696	pset_cancel_deferred_dispatch(processor->processor_set, processor);
	2697	}
	2698	#endif
	2699
	2700	}
	2701
	2702	/*
	2703	* thread_block_reason:
	2704	*
	2705	* Forces a reschedule, blocking the caller if a wait
	2706	* has been asserted.
	2707	*
	2708	* If a continuation is specified, then thread_invoke will
	2709	* attempt to discard the thread's kernel stack. When the
	2710	* thread resumes, it will execute the continuation function
	2711	* on a new kernel stack.
	2712	*/
	2713	counter(mach_counter_t c_thread_block_calls = 0;)
	2714
	2715	wait_result_t
	2716	thread_block_reason(
	2717	thread_continue_t continuation,
	2718	void *parameter,
	2719	ast_t reason)
	2720	{
	2721	thread_t self = current_thread();
	2722	processor_t processor;
	2723	thread_t new_thread;
	2724	spl_t s;
	2725
	2726	counter(++c_thread_block_calls);
	2727
	2728	s = splsched();
	2729
	2730	processor = current_processor();
	2731
	2732	/* If we're explicitly yielding, force a subsequent quantum */
	2733	if (reason & AST_YIELD)
	2734	processor->first_timeslice = FALSE;
	2735
	2736	/* We're handling all scheduling AST's */
	2737	ast_off(AST_SCHEDULING);
	2738
	2739	self->continuation = continuation;
	2740	self->parameter = parameter;
	2741
	2742	if (self->state & ~(TH_RUN \| TH_IDLE)) {
	2743	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2744	MACHDBG_CODE(DBG_MACH_SCHED,MACH_BLOCK),
	2745	reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
	2746	}
	2747
	2748	do {
	2749	thread_lock(self);
	2750	new_thread = thread_select(self, processor, reason);
	2751	thread_unlock(self);
	2752	} while (!thread_invoke(self, new_thread, reason));
	2753
	2754	splx(s);
	2755
	2756	return (self->wait_result);
	2757	}
	2758
	2759	/*
	2760	* thread_block:
	2761	*
	2762	* Block the current thread if a wait has been asserted.
	2763	*/
	2764	wait_result_t
	2765	thread_block(
	2766	thread_continue_t continuation)
	2767	{
	2768	return thread_block_reason(continuation, NULL, AST_NONE);
	2769	}
	2770
	2771	wait_result_t
	2772	thread_block_parameter(
	2773	thread_continue_t continuation,
	2774	void *parameter)
	2775	{
	2776	return thread_block_reason(continuation, parameter, AST_NONE);
	2777	}
	2778
	2779	/*
	2780	* thread_run:
	2781	*
	2782	* Switch directly from the current thread to the
	2783	* new thread, handing off our quantum if appropriate.
	2784	*
	2785	* New thread must be runnable, and not on a run queue.
	2786	*
	2787	* Called at splsched.
	2788	*/
	2789	int
	2790	thread_run(
	2791	thread_t self,
	2792	thread_continue_t continuation,
	2793	void *parameter,
	2794	thread_t new_thread)
	2795	{
	2796	ast_t handoff = AST_HANDOFF;
	2797
	2798	self->continuation = continuation;
	2799	self->parameter = parameter;
	2800
	2801	while (!thread_invoke(self, new_thread, handoff)) {
	2802	processor_t processor = current_processor();
	2803
	2804	thread_lock(self);
	2805	new_thread = thread_select(self, processor, AST_NONE);
	2806	thread_unlock(self);
	2807	handoff = AST_NONE;
	2808	}
	2809
	2810	return (self->wait_result);
	2811	}
	2812
	2813	/*
	2814	* thread_continue:
	2815	*
	2816	* Called at splsched when a thread first receives
	2817	* a new stack after a continuation.
	2818	*/
	2819	void
	2820	thread_continue(
	2821	thread_t thread)
	2822	{
	2823	thread_t self = current_thread();
	2824	thread_continue_t continuation;
	2825	void *parameter;
	2826
	2827	DTRACE_SCHED(on__cpu);
	2828
	2829	continuation = self->continuation;
	2830	parameter = self->parameter;
	2831
	2832	thread_dispatch(thread, self);
	2833
	2834	self->continuation = self->parameter = NULL;
	2835
	2836	if (thread != THREAD_NULL)
	2837	(void)spllo();
	2838
	2839	TLOG(1, "thread_continue: calling call_continuation \n");
	2840	call_continuation(continuation, parameter, self->wait_result);
	2841	/NOTREACHED/
	2842	}
	2843
	2844	void
	2845	thread_quantum_init(thread_t thread)
	2846	{
	2847	if (thread->sched_mode == TH_MODE_REALTIME) {
	2848	thread->quantum_remaining = thread->realtime.computation;
	2849	} else {
	2850	thread->quantum_remaining = SCHED(initial_quantum_size)(thread);
	2851	}
	2852	}
	2853
	2854	uint32_t
	2855	sched_timeshare_initial_quantum_size(thread_t thread)
	2856	{
	2857	if ((thread == THREAD_NULL) \|\| !(thread->sched_flags & TH_SFLAG_THROTTLED))
	2858	return std_quantum;
	2859	else
	2860	return bg_quantum;
	2861	}
	2862
	2863	/*
	2864	* run_queue_init:
	2865	*
	2866	* Initialize a run queue before first use.
	2867	*/
	2868	void
	2869	run_queue_init(
	2870	run_queue_t rq)
	2871	{
	2872	int i;
	2873
	2874	rq->highq = IDLEPRI;
	2875	for (i = 0; i < NRQBM; i++)
	2876	rq->bitmap[i] = 0;
	2877	setbit(MAXPRI - IDLEPRI, rq->bitmap);
	2878	rq->urgency = rq->count = 0;
	2879	for (i = 0; i < NRQS; i++)
	2880	queue_init(&rq->queues[i]);
	2881	}
	2882
	2883	/*
	2884	* run_queue_dequeue:
	2885	*
	2886	* Perform a dequeue operation on a run queue,
	2887	* and return the resulting thread.
	2888	*
	2889	* The run queue must be locked (see thread_run_queue_remove()
	2890	* for more info), and not empty.
	2891	*/
	2892	thread_t
	2893	run_queue_dequeue(
	2894	run_queue_t rq,
	2895	integer_t options)
	2896	{
	2897	thread_t thread;
	2898	queue_t queue = rq->queues + rq->highq;
	2899
	2900	if (options & SCHED_HEADQ) {
	2901	thread = (thread_t)dequeue_head(queue);
	2902	}
	2903	else {
	2904	thread = (thread_t)dequeue_tail(queue);
	2905	}
	2906
	2907	thread->runq = PROCESSOR_NULL;
	2908	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	2909	rq->count--;
	2910	if (SCHED(priority_is_urgent)(rq->highq)) {
	2911	rq->urgency--; assert(rq->urgency >= 0);
	2912	}
	2913	if (queue_empty(queue)) {
	2914	if (rq->highq != IDLEPRI)
	2915	clrbit(MAXPRI - rq->highq, rq->bitmap);
	2916	rq->highq = MAXPRI - ffsbit(rq->bitmap);
	2917	}
	2918
	2919	return (thread);
	2920	}
	2921
	2922	/*
	2923	* run_queue_enqueue:
	2924	*
	2925	* Perform a enqueue operation on a run queue.
	2926	*
	2927	* The run queue must be locked (see thread_run_queue_remove()
	2928	* for more info).
	2929	*/
	2930	boolean_t
	2931	run_queue_enqueue(
	2932	run_queue_t rq,
	2933	thread_t thread,
	2934	integer_t options)
	2935	{
	2936	queue_t queue = rq->queues + thread->sched_pri;
	2937	boolean_t result = FALSE;
	2938
	2939	if (queue_empty(queue)) {
	2940	enqueue_tail(queue, (queue_entry_t)thread);
	2941
	2942	setbit(MAXPRI - thread->sched_pri, rq->bitmap);
	2943	if (thread->sched_pri > rq->highq) {
	2944	rq->highq = thread->sched_pri;
	2945	result = TRUE;
	2946	}
	2947	} else {
	2948	if (options & SCHED_TAILQ)
	2949	enqueue_tail(queue, (queue_entry_t)thread);
	2950	else
	2951	enqueue_head(queue, (queue_entry_t)thread);
	2952	}
	2953	if (SCHED(priority_is_urgent)(thread->sched_pri))
	2954	rq->urgency++;
	2955	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	2956	rq->count++;
	2957
	2958	return (result);
	2959
	2960	}
	2961
	2962	/*
	2963	* run_queue_remove:
	2964	*
	2965	* Remove a specific thread from a runqueue.
	2966	*
	2967	* The run queue must be locked.
	2968	*/
	2969	void
	2970	run_queue_remove(
	2971	run_queue_t rq,
	2972	thread_t thread)
	2973	{
	2974
	2975	remqueue((queue_entry_t)thread);
	2976	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	2977	rq->count--;
	2978	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
	2979	rq->urgency--; assert(rq->urgency >= 0);
	2980	}
	2981
	2982	if (queue_empty(rq->queues + thread->sched_pri)) {
	2983	/* update run queue status */
	2984	if (thread->sched_pri != IDLEPRI)
	2985	clrbit(MAXPRI - thread->sched_pri, rq->bitmap);
	2986	rq->highq = MAXPRI - ffsbit(rq->bitmap);
	2987	}
	2988
	2989	thread->runq = PROCESSOR_NULL;
	2990	}
	2991
	2992	/* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
	2993	void
	2994	rt_runq_scan(sched_update_scan_context_t scan_context)
	2995	{
	2996	spl_t s;
	2997	thread_t thread;
	2998
	2999	s = splsched();
	3000	rt_lock_lock();
	3001
	3002	qe_foreach_element_safe(thread, &rt_runq.queue, links) {
	3003	if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
	3004	scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
	3005	}
	3006	}
	3007
	3008	rt_lock_unlock();
	3009	splx(s);
	3010	}
	3011
	3012
	3013	/*
	3014	* realtime_queue_insert:
	3015	*
	3016	* Enqueue a thread for realtime execution.
	3017	*/
	3018	static boolean_t
	3019	realtime_queue_insert(
	3020	thread_t thread)
	3021	{
	3022	queue_t queue = &rt_runq.queue;
	3023	uint64_t deadline = thread->realtime.deadline;
	3024	boolean_t preempt = FALSE;
	3025
	3026	rt_lock_lock();
	3027
	3028	if (queue_empty(queue)) {
	3029	enqueue_tail(queue, (queue_entry_t)thread);
	3030	preempt = TRUE;
	3031	}
	3032	else {
	3033	register thread_t entry = (thread_t)queue_first(queue);
	3034
	3035	while (TRUE) {
	3036	if ( queue_end(queue, (queue_entry_t)entry) \|\|
	3037	deadline < entry->realtime.deadline ) {
	3038	entry = (thread_t)queue_prev((queue_entry_t)entry);
	3039	break;
	3040	}
	3041
	3042	entry = (thread_t)queue_next((queue_entry_t)entry);
	3043	}
	3044
	3045	if ((queue_entry_t)entry == queue)
	3046	preempt = TRUE;
	3047
	3048	insque((queue_entry_t)thread, (queue_entry_t)entry);
	3049	}
	3050
	3051	thread->runq = THREAD_ON_RT_RUNQ;
	3052	SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
	3053	rt_runq.count++;
	3054
	3055	rt_lock_unlock();
	3056
	3057	return (preempt);
	3058	}
	3059
	3060	/*
	3061	* realtime_setrun:
	3062	*
	3063	* Dispatch a thread for realtime execution.
	3064	*
	3065	* Thread must be locked. Associated pset must
	3066	* be locked, and is returned unlocked.
	3067	*/
	3068	static void
	3069	realtime_setrun(
	3070	processor_t processor,
	3071	thread_t thread)
	3072	{
	3073	processor_set_t pset = processor->processor_set;
	3074	ast_t preempt;
	3075
	3076	boolean_t do_signal_idle = FALSE, do_cause_ast = FALSE;
	3077
	3078	thread->chosen_processor = processor;
	3079
	3080	/* <rdar://problem/15102234> */
	3081	assert(thread->bound_processor == PROCESSOR_NULL);
	3082
	3083	/*
	3084	* Dispatch directly onto idle processor.
	3085	*/
	3086	if ( (thread->bound_processor == processor)
	3087	&& processor->state == PROCESSOR_IDLE) {
	3088	remqueue((queue_entry_t)processor);
	3089	enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
	3090
	3091	processor->next_thread = thread;
	3092	processor->current_pri = thread->sched_pri;
	3093	processor->current_thmode = thread->sched_mode;
	3094	processor->current_sfi_class = thread->sfi_class;
	3095	processor->deadline = thread->realtime.deadline;
	3096	processor->state = PROCESSOR_DISPATCHING;
	3097
	3098	if (processor != current_processor()) {
	3099	if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
	3100	/* cleared on exit from main processor_idle() loop */
	3101	pset->pending_AST_cpu_mask \|= (1ULL << processor->cpu_id);
	3102	do_signal_idle = TRUE;
	3103	}
	3104	}
	3105	pset_unlock(pset);
	3106
	3107	if (do_signal_idle) {
	3108	machine_signal_idle(processor);
	3109	}
	3110	return;
	3111	}
	3112
	3113	if (processor->current_pri < BASEPRI_RTQUEUES)
	3114	preempt = (AST_PREEMPT \| AST_URGENT);
	3115	else if (thread->realtime.deadline < processor->deadline)
	3116	preempt = (AST_PREEMPT \| AST_URGENT);
	3117	else
	3118	preempt = AST_NONE;
	3119
	3120	realtime_queue_insert(thread);
	3121
	3122	if (preempt != AST_NONE) {
	3123	if (processor->state == PROCESSOR_IDLE) {
	3124	remqueue((queue_entry_t)processor);
	3125	enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
	3126	processor->next_thread = THREAD_NULL;
	3127	processor->current_pri = thread->sched_pri;
	3128	processor->current_thmode = thread->sched_mode;
	3129	processor->current_sfi_class = thread->sfi_class;
	3130	processor->deadline = thread->realtime.deadline;
	3131	processor->state = PROCESSOR_DISPATCHING;
	3132	if (processor == current_processor()) {
	3133	ast_on(preempt);
	3134	} else {
	3135	if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
	3136	/* cleared on exit from main processor_idle() loop */
	3137	pset->pending_AST_cpu_mask \|= (1ULL << processor->cpu_id);
	3138	do_signal_idle = TRUE;
	3139	}
	3140	}
	3141	} else if (processor->state == PROCESSOR_DISPATCHING) {
	3142	if ((processor->next_thread == THREAD_NULL) && ((processor->current_pri < thread->sched_pri) \|\| (processor->deadline > thread->realtime.deadline))) {
	3143	processor->current_pri = thread->sched_pri;
	3144	processor->current_thmode = thread->sched_mode;
	3145	processor->current_sfi_class = thread->sfi_class;
	3146	processor->deadline = thread->realtime.deadline;
	3147	}
	3148	} else {
	3149	if (processor == current_processor()) {
	3150	ast_on(preempt);
	3151	} else {
	3152	if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
	3153	/* cleared after IPI causes csw_check() to be called */
	3154	pset->pending_AST_cpu_mask \|= (1ULL << processor->cpu_id);
	3155	do_cause_ast = TRUE;
	3156	}
	3157	}
	3158	}
	3159	} else {
	3160	/* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
	3161	}
	3162
	3163	pset_unlock(pset);
	3164
	3165	if (do_signal_idle) {
	3166	machine_signal_idle(processor);
	3167	} else if (do_cause_ast) {
	3168	cause_ast_check(processor);
	3169	}
	3170	}
	3171
	3172
	3173	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	3174
	3175	boolean_t
	3176	priority_is_urgent(int priority)
	3177	{
	3178	return testbit(priority, sched_preempt_pri) ? TRUE : FALSE;
	3179	}
	3180
	3181	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	3182
	3183	/*
	3184	* processor_setrun:
	3185	*
	3186	* Dispatch a thread for execution on a
	3187	* processor.
	3188	*
	3189	* Thread must be locked. Associated pset must
	3190	* be locked, and is returned unlocked.
	3191	*/
	3192	static void
	3193	processor_setrun(
	3194	processor_t processor,
	3195	thread_t thread,
	3196	integer_t options)
	3197	{
	3198	processor_set_t pset = processor->processor_set;
	3199	ast_t preempt;
	3200	enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
	3201	enum { eNoSignal, eDoSignal, eDoDeferredSignal } do_signal_idle = eNoSignal;
	3202
	3203	boolean_t do_cause_ast = FALSE;
	3204
	3205	thread->chosen_processor = processor;
	3206
	3207	/*
	3208	* Dispatch directly onto idle processor.
	3209	*/
	3210	if ( (SCHED(direct_dispatch_to_idle_processors) \|\|
	3211	thread->bound_processor == processor)
	3212	&& processor->state == PROCESSOR_IDLE) {
	3213	remqueue((queue_entry_t)processor);
	3214	enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
	3215
	3216	processor->next_thread = thread;
	3217	processor->current_pri = thread->sched_pri;
	3218	processor->current_thmode = thread->sched_mode;
	3219	processor->current_sfi_class = thread->sfi_class;
	3220	processor->deadline = UINT64_MAX;
	3221	processor->state = PROCESSOR_DISPATCHING;
	3222
	3223	if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
	3224	/* cleared on exit from main processor_idle() loop */
	3225	pset->pending_AST_cpu_mask \|= (1ULL << processor->cpu_id);
	3226	do_signal_idle = eDoSignal;
	3227	}
	3228
	3229	pset_unlock(pset);
	3230
	3231	if (do_signal_idle == eDoSignal) {
	3232	machine_signal_idle(processor);
	3233	}
	3234
	3235	return;
	3236	}
	3237
	3238	/*
	3239	* Set preemption mode.
	3240	*/
	3241	#if defined(CONFIG_SCHED_DEFERRED_AST)
	3242	/* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
	3243	#endif
	3244	if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri)
	3245	preempt = (AST_PREEMPT \| AST_URGENT);
	3246	else if(processor->active_thread && thread_eager_preemption(processor->active_thread))
	3247	preempt = (AST_PREEMPT \| AST_URGENT);
	3248	else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
	3249	if(SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
	3250	preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
	3251	} else {
	3252	preempt = AST_NONE;
	3253	}
	3254	} else
	3255	preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
	3256
	3257	SCHED(processor_enqueue)(processor, thread, options);
	3258
	3259	if (preempt != AST_NONE) {
	3260	if (processor->state == PROCESSOR_IDLE) {
	3261	remqueue((queue_entry_t)processor);
	3262	enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
	3263	processor->next_thread = THREAD_NULL;
	3264	processor->current_pri = thread->sched_pri;
	3265	processor->current_thmode = thread->sched_mode;
	3266	processor->current_sfi_class = thread->sfi_class;
	3267	processor->deadline = UINT64_MAX;
	3268	processor->state = PROCESSOR_DISPATCHING;
	3269
	3270	ipi_action = eExitIdle;
	3271	} else if ( processor->state == PROCESSOR_DISPATCHING) {
	3272	if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) {
	3273	processor->current_pri = thread->sched_pri;
	3274	processor->current_thmode = thread->sched_mode;
	3275	processor->current_sfi_class = thread->sfi_class;
	3276	processor->deadline = UINT64_MAX;
	3277	}
	3278	} else if ( (processor->state == PROCESSOR_RUNNING \|\|
	3279	processor->state == PROCESSOR_SHUTDOWN) &&
	3280	(thread->sched_pri >= processor->current_pri)) {
	3281	ipi_action = eInterruptRunning;
	3282	}
	3283	} else {
	3284	/*
	3285	* New thread is not important enough to preempt what is running, but
	3286	* special processor states may need special handling
	3287	*/
	3288	if (processor->state == PROCESSOR_SHUTDOWN &&
	3289	thread->sched_pri >= processor->current_pri ) {
	3290	ipi_action = eInterruptRunning;
	3291	} else if ( processor->state == PROCESSOR_IDLE &&
	3292	processor != current_processor() ) {
	3293	remqueue((queue_entry_t)processor);
	3294	enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
	3295	processor->next_thread = THREAD_NULL;
	3296	processor->current_pri = thread->sched_pri;
	3297	processor->current_thmode = thread->sched_mode;
	3298	processor->current_sfi_class = thread->sfi_class;
	3299	processor->deadline = UINT64_MAX;
	3300	processor->state = PROCESSOR_DISPATCHING;
	3301
	3302	ipi_action = eExitIdle;
	3303	}
	3304	}
	3305
	3306	switch (ipi_action) {
	3307	case eDoNothing:
	3308	break;
	3309	case eExitIdle:
	3310	if (processor == current_processor()) {
	3311	if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
	3312	ast_on(preempt);
	3313	} else {
	3314	#if defined(CONFIG_SCHED_DEFERRED_AST)
	3315	if (!(pset->pending_deferred_AST_cpu_mask & (1ULL << processor->cpu_id)) &&
	3316	!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
	3317	/* cleared on exit from main processor_idle() loop */
	3318	pset->pending_deferred_AST_cpu_mask \|= (1ULL << processor->cpu_id);
	3319	do_signal_idle = eDoDeferredSignal;
	3320	}
	3321	#else
	3322	if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
	3323	/* cleared on exit from main processor_idle() loop */
	3324	pset->pending_AST_cpu_mask \|= (1ULL << processor->cpu_id);
	3325	do_signal_idle = eDoSignal;
	3326	}
	3327	#endif
	3328	}
	3329	break;
	3330	case eInterruptRunning:
	3331	if (processor == current_processor()) {
	3332	if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
	3333	ast_on(preempt);
	3334	} else {
	3335	if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
	3336	/* cleared after IPI causes csw_check() to be called */
	3337	pset->pending_AST_cpu_mask \|= (1ULL << processor->cpu_id);
	3338	do_cause_ast = TRUE;
	3339	}
	3340	}
	3341	break;
	3342	}
	3343
	3344	pset_unlock(pset);
	3345
	3346	if (do_signal_idle == eDoSignal) {
	3347	machine_signal_idle(processor);
	3348	}
	3349	#if defined(CONFIG_SCHED_DEFERRED_AST)
	3350	else if (do_signal_idle == eDoDeferredSignal) {
	3351	/*
	3352	* TODO: The ability to cancel this signal could make
	3353	* sending it outside of the pset lock an issue. Do
	3354	* we need to address this? Or would the only fallout
	3355	* be that the core takes a signal? As long as we do
	3356	* not run the risk of having a core marked as signal
	3357	* outstanding, with no real signal outstanding, the
	3358	* only result should be that we fail to cancel some
	3359	* signals.
	3360	*/
	3361	machine_signal_idle_deferred(processor);
	3362	}
	3363	#endif
	3364	else if (do_cause_ast) {
	3365	cause_ast_check(processor);
	3366	}
	3367	}
	3368
	3369	/*
	3370	* choose_next_pset:
	3371	*
	3372	* Return the next sibling pset containing
	3373	* available processors.
	3374	*
	3375	* Returns the original pset if none other is
	3376	* suitable.
	3377	*/
	3378	static processor_set_t
	3379	choose_next_pset(
	3380	processor_set_t pset)
	3381	{
	3382	processor_set_t nset = pset;
	3383
	3384	do {
	3385	nset = next_pset(nset);
	3386	} while (nset->online_processor_count < 1 && nset != pset);
	3387
	3388	return (nset);
	3389	}
	3390
	3391	/*
	3392	* choose_processor:
	3393	*
	3394	* Choose a processor for the thread, beginning at
	3395	* the pset. Accepts an optional processor hint in
	3396	* the pset.
	3397	*
	3398	* Returns a processor, possibly from a different pset.
	3399	*
	3400	* The thread must be locked. The pset must be locked,
	3401	* and the resulting pset is locked on return.
	3402	*/
	3403	processor_t
	3404	choose_processor(
	3405	processor_set_t pset,
	3406	processor_t processor,
	3407	thread_t thread)
	3408	{
	3409	processor_set_t nset, cset = pset;
	3410
	3411	/*
	3412	* Prefer the hinted processor, when appropriate.
	3413	*/
	3414
	3415	/* Fold last processor hint from secondary processor to its primary */
	3416	if (processor != PROCESSOR_NULL) {
	3417	processor = processor->processor_primary;
	3418	}
	3419
	3420	/*
	3421	* Only consult platform layer if pset is active, which
	3422	* it may not be in some cases when a multi-set system
	3423	* is going to sleep.
	3424	*/
	3425	if (pset->online_processor_count) {
	3426	if ((processor == PROCESSOR_NULL) \|\| (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
	3427	processor_t mc_processor = machine_choose_processor(pset, processor);
	3428	if (mc_processor != PROCESSOR_NULL)
	3429	processor = mc_processor->processor_primary;
	3430	}
	3431	}
	3432
	3433	/*
	3434	* At this point, we may have a processor hint, and we may have
	3435	* an initial starting pset. If the hint is not in the pset, or
	3436	* if the hint is for a processor in an invalid state, discard
	3437	* the hint.
	3438	*/
	3439	if (processor != PROCESSOR_NULL) {
	3440	if (processor->processor_set != pset) {
	3441	processor = PROCESSOR_NULL;
	3442	} else if (!processor->is_recommended) {
	3443	processor = PROCESSOR_NULL;
	3444	} else {
	3445	switch (processor->state) {
	3446	case PROCESSOR_START:
	3447	case PROCESSOR_SHUTDOWN:
	3448	case PROCESSOR_OFF_LINE:
	3449	/*
	3450	* Hint is for a processor that cannot support running new threads.
	3451	*/
	3452	processor = PROCESSOR_NULL;
	3453	break;
	3454	case PROCESSOR_IDLE:
	3455	/*
	3456	* Hint is for an idle processor. Assume it is no worse than any other
	3457	* idle processor. The platform layer had an opportunity to provide
	3458	* the "least cost idle" processor above.
	3459	*/
	3460	return (processor);
	3461	break;
	3462	case PROCESSOR_RUNNING:
	3463	case PROCESSOR_DISPATCHING:
	3464	/*
	3465	* Hint is for an active CPU. This fast-path allows
	3466	* realtime threads to preempt non-realtime threads
	3467	* to regain their previous executing processor.
	3468	*/
	3469	if ((thread->sched_pri >= BASEPRI_RTQUEUES) &&
	3470	(processor->current_pri < BASEPRI_RTQUEUES))
	3471	return (processor);
	3472
	3473	/* Otherwise, use hint as part of search below */
	3474	break;
	3475	default:
	3476	processor = PROCESSOR_NULL;
	3477	break;
	3478	}
	3479	}
	3480	}
	3481
	3482	/*
	3483	* Iterate through the processor sets to locate
	3484	* an appropriate processor. Seed results with
	3485	* a last-processor hint, if available, so that
	3486	* a search must find something strictly better
	3487	* to replace it.
	3488	*
	3489	* A primary/secondary pair of SMT processors are
	3490	* "unpaired" if the primary is busy but its
	3491	* corresponding secondary is idle (so the physical
	3492	* core has full use of its resources).
	3493	*/
	3494
	3495	integer_t lowest_priority = MAXPRI + 1;
	3496	integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
	3497	integer_t lowest_count = INT_MAX;
	3498	uint64_t furthest_deadline = 1;
	3499	processor_t lp_processor = PROCESSOR_NULL;
	3500	processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
	3501	processor_t lp_unpaired_secondary_processor = PROCESSOR_NULL;
	3502	processor_t lc_processor = PROCESSOR_NULL;
	3503	processor_t fd_processor = PROCESSOR_NULL;
	3504
	3505	if (processor != PROCESSOR_NULL) {
	3506	/* All other states should be enumerated above. */
	3507	assert(processor->state == PROCESSOR_RUNNING \|\| processor->state == PROCESSOR_DISPATCHING);
	3508
	3509	lowest_priority = processor->current_pri;
	3510	lp_processor = processor;
	3511
	3512	if (processor->current_pri >= BASEPRI_RTQUEUES) {
	3513	furthest_deadline = processor->deadline;
	3514	fd_processor = processor;
	3515	}
	3516
	3517	lowest_count = SCHED(processor_runq_count)(processor);
	3518	lc_processor = processor;
	3519	}
	3520
	3521	do {
	3522
	3523	/*
	3524	* Choose an idle processor, in pset traversal order
	3525	*/
	3526	qe_foreach_element(processor, &cset->idle_queue, processor_queue) {
	3527	if (processor->is_recommended)
	3528	return processor;
	3529	}
	3530
	3531	/*
	3532	* Otherwise, enumerate active and idle processors to find candidates
	3533	* with lower priority/etc.
	3534	*/
	3535
	3536	qe_foreach_element(processor, &cset->active_queue, processor_queue) {
	3537
	3538	if (!processor->is_recommended) {
	3539	continue;
	3540	}
	3541
	3542	integer_t cpri = processor->current_pri;
	3543	if (cpri < lowest_priority) {
	3544	lowest_priority = cpri;
	3545	lp_processor = processor;
	3546	}
	3547
	3548	if ((cpri >= BASEPRI_RTQUEUES) && (processor->deadline > furthest_deadline)) {
	3549	furthest_deadline = processor->deadline;
	3550	fd_processor = processor;
	3551	}
	3552
	3553	integer_t ccount = SCHED(processor_runq_count)(processor);
	3554	if (ccount < lowest_count) {
	3555	lowest_count = ccount;
	3556	lc_processor = processor;
	3557	}
	3558	}
	3559
	3560	/*
	3561	* For SMT configs, these idle secondary processors must have active primary. Otherwise
	3562	* the idle primary would have short-circuited the loop above
	3563	*/
	3564	qe_foreach_element(processor, &cset->idle_secondary_queue, processor_queue) {
	3565
	3566	if (!processor->is_recommended) {
	3567	continue;
	3568	}
	3569
	3570	processor_t cprimary = processor->processor_primary;
	3571
	3572	/* If the primary processor is offline or starting up, it's not a candidate for this path */
	3573	if (cprimary->state == PROCESSOR_RUNNING \|\| cprimary->state == PROCESSOR_DISPATCHING) {
	3574	integer_t primary_pri = cprimary->current_pri;
	3575
	3576	if (primary_pri < lowest_unpaired_primary_priority) {
	3577	lowest_unpaired_primary_priority = primary_pri;
	3578	lp_unpaired_primary_processor = cprimary;
	3579	lp_unpaired_secondary_processor = processor;
	3580	}
	3581	}
	3582	}
	3583
	3584
	3585	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
	3586
	3587	/*
	3588	* For realtime threads, the most important aspect is
	3589	* scheduling latency, so we attempt to assign threads
	3590	* to good preemption candidates (assuming an idle primary
	3591	* processor was not available above).
	3592	*/
	3593
	3594	if (thread->sched_pri > lowest_unpaired_primary_priority) {
	3595	/* Move to end of active queue so that the next thread doesn't also pick it */
	3596	re_queue_tail(&cset->active_queue, (queue_entry_t)lp_unpaired_primary_processor);
	3597	return lp_unpaired_primary_processor;
	3598	}
	3599	if (thread->sched_pri > lowest_priority) {
	3600	/* Move to end of active queue so that the next thread doesn't also pick it */
	3601	re_queue_tail(&cset->active_queue, (queue_entry_t)lp_processor);
	3602	return lp_processor;
	3603	}
	3604	if (thread->realtime.deadline < furthest_deadline)
	3605	return fd_processor;
	3606
	3607	/*
	3608	* If all primary and secondary CPUs are busy with realtime
	3609	* threads with deadlines earlier than us, move on to next
	3610	* pset.
	3611	*/
	3612	}
	3613	else {
	3614
	3615	if (thread->sched_pri > lowest_unpaired_primary_priority) {
	3616	/* Move to end of active queue so that the next thread doesn't also pick it */
	3617	re_queue_tail(&cset->active_queue, (queue_entry_t)lp_unpaired_primary_processor);
	3618	return lp_unpaired_primary_processor;
	3619	}
	3620	if (thread->sched_pri > lowest_priority) {
	3621	/* Move to end of active queue so that the next thread doesn't also pick it */
	3622	re_queue_tail(&cset->active_queue, (queue_entry_t)lp_processor);
	3623	return lp_processor;
	3624	}
	3625
	3626	/*
	3627	* If all primary processor in this pset are running a higher
	3628	* priority thread, move on to next pset. Only when we have
	3629	* exhausted this search do we fall back to other heuristics.
	3630	*/
	3631	}
	3632
	3633	/*
	3634	* Move onto the next processor set.
	3635	*/
	3636	nset = next_pset(cset);
	3637
	3638	if (nset != pset) {
	3639	pset_unlock(cset);
	3640
	3641	cset = nset;
	3642	pset_lock(cset);
	3643	}
	3644	} while (nset != pset);
	3645
	3646	/*
	3647	* Make sure that we pick a running processor,
	3648	* and that the correct processor set is locked.
	3649	* Since we may have unlock the candidate processor's
	3650	* pset, it may have changed state.
	3651	*
	3652	* All primary processors are running a higher priority
	3653	* thread, so the only options left are enqueuing on
	3654	* the secondary processor that would perturb the least priority
	3655	* primary, or the least busy primary.
	3656	*/
	3657	do {
	3658
	3659	/* lowest_priority is evaluated in the main loops above */
	3660	if (lp_unpaired_secondary_processor != PROCESSOR_NULL) {
	3661	processor = lp_unpaired_secondary_processor;
	3662	lp_unpaired_secondary_processor = PROCESSOR_NULL;
	3663	} else if (lc_processor != PROCESSOR_NULL) {
	3664	processor = lc_processor;
	3665	lc_processor = PROCESSOR_NULL;
	3666	} else {
	3667	/*
	3668	* All processors are executing higher
	3669	* priority threads, and the lowest_count
	3670	* candidate was not usable
	3671	*/
	3672	processor = master_processor;
	3673	}
	3674
	3675	/*
	3676	* Check that the correct processor set is
	3677	* returned locked.
	3678	*/
	3679	if (cset != processor->processor_set) {
	3680	pset_unlock(cset);
	3681	cset = processor->processor_set;
	3682	pset_lock(cset);
	3683	}
	3684
	3685	/*
	3686	* We must verify that the chosen processor is still available.
	3687	* master_processor is an exception, since we may need to preempt
	3688	* a running thread on it during processor shutdown (for sleep),
	3689	* and that thread needs to be enqueued on its runqueue to run
	3690	* when the processor is restarted.
	3691	*/
	3692	if (processor != master_processor && (processor->state == PROCESSOR_SHUTDOWN \|\| processor->state == PROCESSOR_OFF_LINE))
	3693	processor = PROCESSOR_NULL;
	3694
	3695	} while (processor == PROCESSOR_NULL);
	3696
	3697	return (processor);
	3698	}
	3699
	3700	/*
	3701	* thread_setrun:
	3702	*
	3703	* Dispatch thread for execution, onto an idle
	3704	* processor or run queue, and signal a preemption
	3705	* as appropriate.
	3706	*
	3707	* Thread must be locked.
	3708	*/
	3709	void
	3710	thread_setrun(
	3711	thread_t thread,
	3712	integer_t options)
	3713	{
	3714	processor_t processor;
	3715	processor_set_t pset;
	3716
	3717	assert((thread->state & (TH_RUN\|TH_WAIT\|TH_UNINT\|TH_TERMINATE\|TH_TERMINATE2)) == TH_RUN);
	3718	assert(thread->runq == PROCESSOR_NULL);
	3719
	3720	/*
	3721	* Update priority if needed.
	3722	*/
	3723	if (SCHED(can_update_priority)(thread))
	3724	SCHED(update_priority)(thread);
	3725
	3726	thread->sfi_class = sfi_thread_classify(thread);
	3727
	3728	assert(thread->runq == PROCESSOR_NULL);
	3729
	3730	#if __SMP__
	3731	if (thread->bound_processor == PROCESSOR_NULL) {
	3732	/*
	3733	* Unbound case.
	3734	*/
	3735	if (thread->affinity_set != AFFINITY_SET_NULL) {
	3736	/*
	3737	* Use affinity set policy hint.
	3738	*/
	3739	pset = thread->affinity_set->aset_pset;
	3740	pset_lock(pset);
	3741
	3742	processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
	3743
	3744	SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)\|DBG_FUNC_NONE,
	3745	(uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
	3746	} else if (thread->last_processor != PROCESSOR_NULL) {
	3747	/*
	3748	* Simple (last processor) affinity case.
	3749	*/
	3750	processor = thread->last_processor;
	3751	pset = processor->processor_set;
	3752	pset_lock(pset);
	3753	processor = SCHED(choose_processor)(pset, processor, thread);
	3754
	3755	SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)\|DBG_FUNC_NONE,
	3756	(uintptr_t)thread_tid(thread), thread->last_processor->cpu_id, processor->cpu_id, processor->state, 0);
	3757	} else {
	3758	/*
	3759	* No Affinity case:
	3760	*
	3761	* Utilitize a per task hint to spread threads
	3762	* among the available processor sets.
	3763	*/
	3764	task_t task = thread->task;
	3765
	3766	pset = task->pset_hint;
	3767	if (pset == PROCESSOR_SET_NULL)
	3768	pset = current_processor()->processor_set;
	3769
	3770	pset = choose_next_pset(pset);
	3771	pset_lock(pset);
	3772
	3773	processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
	3774	task->pset_hint = processor->processor_set;
	3775
	3776	SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)\|DBG_FUNC_NONE,
	3777	(uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
	3778	}
	3779	} else {
	3780	/*
	3781	* Bound case:
	3782	*
	3783	* Unconditionally dispatch on the processor.
	3784	*/
	3785	processor = thread->bound_processor;
	3786	pset = processor->processor_set;
	3787	pset_lock(pset);
	3788
	3789	SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)\|DBG_FUNC_NONE,
	3790	(uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
	3791	}
	3792	#else /* !__SMP__ */
	3793	/* Only one processor to choose */
	3794	assert(thread->bound_processor == PROCESSOR_NULL \|\| thread->bound_processor == master_processor);
	3795	processor = master_processor;
	3796	pset = processor->processor_set;
	3797	pset_lock(pset);
	3798	#endif /* !__SMP__ */
	3799
	3800	/*
	3801	* Dispatch the thread on the chosen processor.
	3802	* TODO: This should be based on sched_mode, not sched_pri
	3803	*/
	3804	if (thread->sched_pri >= BASEPRI_RTQUEUES)
	3805	realtime_setrun(processor, thread);
	3806	else
	3807	processor_setrun(processor, thread, options);
	3808	}
	3809
	3810	processor_set_t
	3811	task_choose_pset(
	3812	task_t task)
	3813	{
	3814	processor_set_t pset = task->pset_hint;
	3815
	3816	if (pset != PROCESSOR_SET_NULL)
	3817	pset = choose_next_pset(pset);
	3818
	3819	return (pset);
	3820	}
	3821
	3822	/*
	3823	* Check for a preemption point in
	3824	* the current context.
	3825	*
	3826	* Called at splsched with thread locked.
	3827	*/
	3828	ast_t
	3829	csw_check(
	3830	processor_t processor,
	3831	ast_t check_reason)
	3832	{
	3833	processor_set_t pset = processor->processor_set;
	3834	ast_t result;
	3835
	3836	pset_lock(pset);
	3837
	3838	/* If we were sent a remote AST and interrupted a running processor, acknowledge it here with pset lock held */
	3839	pset->pending_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
	3840
	3841	result = csw_check_locked(processor, pset, check_reason);
	3842
	3843	pset_unlock(pset);
	3844
	3845	return result;
	3846	}
	3847
	3848	/*
	3849	* Check for preemption at splsched with
	3850	* pset and thread locked
	3851	*/
	3852	ast_t
	3853	csw_check_locked(
	3854	processor_t processor,
	3855	processor_set_t pset __unused,
	3856	ast_t check_reason)
	3857	{
	3858	ast_t result;
	3859	thread_t thread = processor->active_thread;
	3860
	3861	if (processor->first_timeslice) {
	3862	if (rt_runq.count > 0)
	3863	return (check_reason \| AST_PREEMPT \| AST_URGENT);
	3864	}
	3865	else {
	3866	if (rt_runq.count > 0) {
	3867	if (BASEPRI_RTQUEUES > processor->current_pri)
	3868	return (check_reason \| AST_PREEMPT \| AST_URGENT);
	3869	else
	3870	return (check_reason \| AST_PREEMPT);
	3871	}
	3872	}
	3873
	3874	result = SCHED(processor_csw_check)(processor);
	3875	if (result != AST_NONE)
	3876	return (check_reason \| result \| (thread_eager_preemption(thread) ? AST_URGENT : AST_NONE));
	3877
	3878	#if __SMP__
	3879
	3880	/*
	3881	* If the current thread is running on a processor that is no longer recommended, gently
	3882	* (non-urgently) get to a point and then block, and which point thread_select() should
	3883	* try to idle the processor and re-dispatch the thread to a recommended processor.
	3884	*/
	3885	if (!processor->is_recommended)
	3886	return (check_reason \| AST_PREEMPT);
	3887
	3888	/*
	3889	* Even though we could continue executing on this processor, a
	3890	* secondary SMT core should try to shed load to another primary core.
	3891	*
	3892	* TODO: Should this do the same check that thread_select does? i.e.
	3893	* if no bound threads target this processor, and idle primaries exist, preempt
	3894	* The case of RT threads existing is already taken care of above
	3895	* Consider Capri in this scenario.
	3896	*
	3897	* if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue))
	3898	*
	3899	* TODO: Alternatively - check if only primary is idle, or check if primary's pri is lower than mine.
	3900	*/
	3901
	3902	if (processor->current_pri < BASEPRI_RTQUEUES &&
	3903	processor->processor_primary != processor)
	3904	return (check_reason \| AST_PREEMPT);
	3905	#endif
	3906
	3907	if (thread->state & TH_SUSP)
	3908	return (check_reason \| AST_PREEMPT);
	3909
	3910	#if CONFIG_SCHED_SFI
	3911	/*
	3912	* Current thread may not need to be preempted, but maybe needs
	3913	* an SFI wait?
	3914	*/
	3915	result = sfi_thread_needs_ast(thread, NULL);
	3916	if (result != AST_NONE)
	3917	return (check_reason \| result);
	3918	#endif
	3919
	3920	return (AST_NONE);
	3921	}
	3922
	3923	/*
	3924	* set_sched_pri:
	3925	*
	3926	* Set the scheduled priority of the specified thread.
	3927	*
	3928	* This may cause the thread to change queues.
	3929	*
	3930	* Thread must be locked.
	3931	*/
	3932	void
	3933	set_sched_pri(
	3934	thread_t thread,
	3935	int priority)
	3936	{
	3937	thread_t cthread = current_thread();
	3938	boolean_t is_current_thread = (thread == cthread) ? TRUE : FALSE;
	3939	int curgency, nurgency;
	3940	uint64_t urgency_param1, urgency_param2;
	3941	boolean_t removed_from_runq = FALSE;
	3942
	3943	/* If we're already at this priority, no need to mess with the runqueue */
	3944	if (priority == thread->sched_pri)
	3945	return;
	3946
	3947	if (is_current_thread) {
	3948	assert(thread->runq == PROCESSOR_NULL);
	3949	curgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
	3950	} else {
	3951	removed_from_runq = thread_run_queue_remove(thread);
	3952	}
	3953
	3954	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
	3955	(uintptr_t)thread_tid(thread),
	3956	thread->base_pri,
	3957	thread->sched_pri,
	3958	0, /* eventually, 'reason' */
	3959	0);
	3960
	3961	thread->sched_pri = priority;
	3962
	3963	if (is_current_thread) {
	3964	nurgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
	3965	/*
	3966	* set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
	3967	* class alterations from user space to occur relatively infrequently, hence
	3968	* those are lazily handled. QoS classes have distinct priority bands, and QoS
	3969	* inheritance is expected to involve priority changes.
	3970	*/
	3971	if (nurgency != curgency) {
	3972	thread_tell_urgency(nurgency, urgency_param1, urgency_param2, 0, thread);
	3973	machine_thread_going_on_core(thread, nurgency, 0);
	3974	}
	3975	}
	3976
	3977	/* TODO: Should this be TAILQ if it went down, HEADQ if it went up? */
	3978	if (removed_from_runq)
	3979	thread_run_queue_reinsert(thread, SCHED_PREEMPT \| SCHED_TAILQ);
	3980	else if (thread->state & TH_RUN) {
	3981	processor_t processor = thread->last_processor;
	3982
	3983	if (is_current_thread) {
	3984	ast_t preempt;
	3985
	3986	processor->current_pri = priority;
	3987	processor->current_thmode = thread->sched_mode;
	3988	processor->current_sfi_class = thread->sfi_class = sfi_thread_classify(thread);
	3989	if ((preempt = csw_check(processor, AST_NONE)) != AST_NONE)
	3990	ast_on(preempt);
	3991	} else if (processor != PROCESSOR_NULL && processor->active_thread == thread)
	3992	cause_ast_check(processor);
	3993	}
	3994	}
	3995
	3996	/*
	3997	* thread_run_queue_remove_for_handoff
	3998	*
	3999	* Pull a thread or its (recursive) push target out of the runqueue
	4000	* so that it is ready for thread_run()
	4001	*
	4002	* Called at splsched
	4003	*
	4004	* Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
	4005	* This may be different than the thread that was passed in.
	4006	*/
	4007	thread_t
	4008	thread_run_queue_remove_for_handoff(thread_t thread) {
	4009
	4010	thread_t pulled_thread = THREAD_NULL;
	4011
	4012	thread_lock(thread);
	4013
	4014	/*
	4015	* Check that the thread is not bound
	4016	* to a different processor, and that realtime
	4017	* is not involved.
	4018	*
	4019	* Next, pull it off its run queue. If it
	4020	* doesn't come, it's not eligible.
	4021	*/
	4022
	4023	processor_t processor = current_processor();
	4024	if (processor->current_pri < BASEPRI_RTQUEUES && thread->sched_pri < BASEPRI_RTQUEUES &&
	4025	(thread->bound_processor == PROCESSOR_NULL \|\| thread->bound_processor == processor)) {
	4026
	4027	if (thread_run_queue_remove(thread))
	4028	pulled_thread = thread;
	4029	}
	4030
	4031	thread_unlock(thread);
	4032
	4033	return pulled_thread;
	4034	}
	4035
	4036	/*
	4037	* thread_run_queue_remove:
	4038	*
	4039	* Remove a thread from its current run queue and
	4040	* return TRUE if successful.
	4041	*
	4042	* Thread must be locked.
	4043	*
	4044	* If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
	4045	* run queues because the caller locked the thread. Otherwise
	4046	* the thread is on a run queue, but could be chosen for dispatch
	4047	* and removed by another processor under a different lock, which
	4048	* will set thread->runq to PROCESSOR_NULL.
	4049	*
	4050	* Hence the thread select path must not rely on anything that could
	4051	* be changed under the thread lock after calling this function,
	4052	* most importantly thread->sched_pri.
	4053	*/
	4054	boolean_t
	4055	thread_run_queue_remove(
	4056	thread_t thread)
	4057	{
	4058	boolean_t removed = FALSE;
	4059	processor_t processor = thread->runq;
	4060
	4061	if ((thread->state & (TH_RUN\|TH_WAIT)) == TH_WAIT) {
	4062	/* Thread isn't runnable */
	4063	assert(thread->runq == PROCESSOR_NULL);
	4064	return FALSE;
	4065	}
	4066
	4067	if (processor == PROCESSOR_NULL) {
	4068	/*
	4069	* The thread is either not on the runq,
	4070	* or is in the midst of being removed from the runq.
	4071	*
	4072	* runq is set to NULL under the pset lock, not the thread
	4073	* lock, so the thread may still be in the process of being dequeued
	4074	* from the runq. It will wait in invoke for the thread lock to be
	4075	* dropped.
	4076	*/
	4077
	4078	return FALSE;
	4079	}
	4080
	4081	if (thread->sched_pri < BASEPRI_RTQUEUES) {
	4082	return SCHED(processor_queue_remove)(processor, thread);
	4083	}
	4084
	4085	rt_lock_lock();
	4086
	4087	if (thread->runq != PROCESSOR_NULL) {
	4088	/*
	4089	* Thread is on the RT run queue and we have a lock on
	4090	* that run queue.
	4091	*/
	4092
	4093	assert(thread->runq == THREAD_ON_RT_RUNQ);
	4094
	4095	remqueue((queue_entry_t)thread);
	4096	SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
	4097	rt_runq.count--;
	4098
	4099	thread->runq = PROCESSOR_NULL;
	4100
	4101	removed = TRUE;
	4102	}
	4103
	4104	rt_lock_unlock();
	4105
	4106	return (removed);
	4107	}
	4108
	4109	/*
	4110	* Put the thread back where it goes after a thread_run_queue_remove
	4111	*
	4112	* Thread must have been removed under the same thread lock hold
	4113	*
	4114	* thread locked, at splsched
	4115	*/
	4116	void
	4117	thread_run_queue_reinsert(thread_t thread, integer_t options)
	4118	{
	4119	assert(thread->runq == PROCESSOR_NULL);
	4120
	4121	assert(thread->state & (TH_RUN));
	4122	thread_setrun(thread, options);
	4123
	4124	}
	4125
	4126	void
	4127	sys_override_cpu_throttle(int flag)
	4128	{
	4129	if (flag == CPU_THROTTLE_ENABLE)
	4130	cpu_throttle_enabled = 1;
	4131	if (flag == CPU_THROTTLE_DISABLE)
	4132	cpu_throttle_enabled = 0;
	4133	}
	4134
	4135	int
	4136	thread_get_urgency(thread_t thread, uint64_t arg1, uint64_t arg2)
	4137	{
	4138	if (thread == NULL \|\| (thread->state & TH_IDLE)) {
	4139	*arg1 = 0;
	4140	*arg2 = 0;
	4141
	4142	return (THREAD_URGENCY_NONE);
	4143	} else if (thread->sched_mode == TH_MODE_REALTIME) {
	4144	*arg1 = thread->realtime.period;
	4145	*arg2 = thread->realtime.deadline;
	4146
	4147	return (THREAD_URGENCY_REAL_TIME);
	4148	} else if (cpu_throttle_enabled &&
	4149	((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
	4150	/*
	4151	* Background urgency applied when thread priority is MAXPRI_THROTTLE or lower and thread is not promoted
	4152	* TODO: Use TH_SFLAG_THROTTLED instead?
	4153	*/
	4154	*arg1 = thread->sched_pri;
	4155	*arg2 = thread->base_pri;
	4156
	4157	return (THREAD_URGENCY_BACKGROUND);
	4158	} else {
	4159	/* For otherwise unclassified threads, report throughput QoS
	4160	* parameters
	4161	*/
	4162	*arg1 = thread->effective_policy.t_through_qos;
	4163	*arg2 = thread->task->effective_policy.t_through_qos;
	4164
	4165	return (THREAD_URGENCY_NORMAL);
	4166	}
	4167	}
	4168
	4169
	4170	/*
	4171	* This is the processor idle loop, which just looks for other threads
	4172	* to execute. Processor idle threads invoke this without supplying a
	4173	* current thread to idle without an asserted wait state.
	4174	*
	4175	* Returns a the next thread to execute if dispatched directly.
	4176	*/
	4177
	4178	#if 0
	4179	#define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
	4180	#else
	4181	#define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
	4182	#endif
	4183
	4184	thread_t
	4185	processor_idle(
	4186	thread_t thread,
	4187	processor_t processor)
	4188	{
	4189	processor_set_t pset = processor->processor_set;
	4190	thread_t new_thread;
	4191	int state;
	4192	(void)splsched();
	4193
	4194	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	4195	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_START,
	4196	(uintptr_t)thread_tid(thread), 0, 0, 0, 0);
	4197
	4198	SCHED_STATS_CPU_IDLE_START(processor);
	4199
	4200	timer_switch(&PROCESSOR_DATA(processor, system_state),
	4201	mach_absolute_time(), &PROCESSOR_DATA(processor, idle_state));
	4202	PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state);
	4203
	4204	while (1) {
	4205	if (processor->state != PROCESSOR_IDLE) /* unsafe, but worst case we loop around once */
	4206	break;
	4207	if (pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))
	4208	break;
	4209	if (processor->is_recommended) {
	4210	if (rt_runq.count)
	4211	break;
	4212	} else {
	4213	if (SCHED(processor_bound_count)(processor))
	4214	break;
	4215	}
	4216
	4217	#if CONFIG_SCHED_IDLE_IN_PLACE
	4218	if (thread != THREAD_NULL) {
	4219	/* Did idle-in-place thread wake up */
	4220	if ((thread->state & (TH_WAIT\|TH_SUSP)) != TH_WAIT \|\| thread->wake_active)
	4221	break;
	4222	}
	4223	#endif
	4224
	4225	IDLE_KERNEL_DEBUG_CONSTANT(
	4226	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -1, 0);
	4227
	4228	machine_track_platform_idle(TRUE);
	4229
	4230	machine_idle();
	4231
	4232	machine_track_platform_idle(FALSE);
	4233
	4234	(void)splsched();
	4235
	4236	IDLE_KERNEL_DEBUG_CONSTANT(
	4237	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -2, 0);
	4238
	4239	if (!SCHED(processor_queue_empty)(processor)) {
	4240	/* Secondary SMT processors respond to directed wakeups
	4241	* exclusively. Some platforms induce 'spurious' SMT wakeups.
	4242	*/
	4243	if (processor->processor_primary == processor)
	4244	break;
	4245	}
	4246	}
	4247
	4248	timer_switch(&PROCESSOR_DATA(processor, idle_state),
	4249	mach_absolute_time(), &PROCESSOR_DATA(processor, system_state));
	4250	PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state);
	4251
	4252	pset_lock(pset);
	4253
	4254	/* If we were sent a remote AST and came out of idle, acknowledge it here with pset lock held */
	4255	pset->pending_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
	4256	#if defined(CONFIG_SCHED_DEFERRED_AST)
	4257	pset->pending_deferred_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
	4258	#endif
	4259
	4260	state = processor->state;
	4261	if (state == PROCESSOR_DISPATCHING) {
	4262	/*
	4263	* Commmon case -- cpu dispatched.
	4264	*/
	4265	new_thread = processor->next_thread;
	4266	processor->next_thread = THREAD_NULL;
	4267	processor->state = PROCESSOR_RUNNING;
	4268
	4269	if ((new_thread != THREAD_NULL) && (SCHED(processor_queue_has_priority)(processor, new_thread->sched_pri, FALSE) \|\|
	4270	(rt_runq.count > 0)) ) {
	4271	/* Something higher priority has popped up on the runqueue - redispatch this thread elsewhere */
	4272	processor->current_pri = IDLEPRI;
	4273	processor->current_thmode = TH_MODE_FIXED;
	4274	processor->current_sfi_class = SFI_CLASS_KERNEL;
	4275	processor->deadline = UINT64_MAX;
	4276
	4277	pset_unlock(pset);
	4278
	4279	thread_lock(new_thread);
	4280	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REDISPATCH), (uintptr_t)thread_tid(new_thread), new_thread->sched_pri, rt_runq.count, 0, 0);
	4281	thread_setrun(new_thread, SCHED_HEADQ);
	4282	thread_unlock(new_thread);
	4283
	4284	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	4285	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_END,
	4286	(uintptr_t)thread_tid(thread), state, 0, 0, 0);
	4287
	4288	return (THREAD_NULL);
	4289	}
	4290
	4291	pset_unlock(pset);
	4292
	4293	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	4294	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_END,
	4295	(uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0);
	4296
	4297	return (new_thread);
	4298	}
	4299	else
	4300	if (state == PROCESSOR_IDLE) {
	4301	remqueue((queue_entry_t)processor);
	4302
	4303	processor->state = PROCESSOR_RUNNING;
	4304	processor->current_pri = IDLEPRI;
	4305	processor->current_thmode = TH_MODE_FIXED;
	4306	processor->current_sfi_class = SFI_CLASS_KERNEL;
	4307	processor->deadline = UINT64_MAX;
	4308	enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
	4309	}
	4310	else
	4311	if (state == PROCESSOR_SHUTDOWN) {
	4312	/*
	4313	* Going off-line. Force a
	4314	* reschedule.
	4315	*/
	4316	if ((new_thread = processor->next_thread) != THREAD_NULL) {
	4317	processor->next_thread = THREAD_NULL;
	4318	processor->current_pri = IDLEPRI;
	4319	processor->current_thmode = TH_MODE_FIXED;
	4320	processor->current_sfi_class = SFI_CLASS_KERNEL;
	4321	processor->deadline = UINT64_MAX;
	4322
	4323	pset_unlock(pset);
	4324
	4325	thread_lock(new_thread);
	4326	thread_setrun(new_thread, SCHED_HEADQ);
	4327	thread_unlock(new_thread);
	4328
	4329	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	4330	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_END,
	4331	(uintptr_t)thread_tid(thread), state, 0, 0, 0);
	4332
	4333	return (THREAD_NULL);
	4334	}
	4335	}
	4336
	4337	pset_unlock(pset);
	4338
	4339	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	4340	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_END,
	4341	(uintptr_t)thread_tid(thread), state, 0, 0, 0);
	4342
	4343	return (THREAD_NULL);
	4344	}
	4345
	4346	/*
	4347	* Each processor has a dedicated thread which
	4348	* executes the idle loop when there is no suitable
	4349	* previous context.
	4350	*/
	4351	void
	4352	idle_thread(void)
	4353	{
	4354	processor_t processor = current_processor();
	4355	thread_t new_thread;
	4356
	4357	new_thread = processor_idle(THREAD_NULL, processor);
	4358	if (new_thread != THREAD_NULL) {
	4359	thread_run(processor->idle_thread, (thread_continue_t)idle_thread, NULL, new_thread);
	4360	/NOTREACHED/
	4361	}
	4362
	4363	thread_block((thread_continue_t)idle_thread);
	4364	/NOTREACHED/
	4365	}
	4366
	4367	kern_return_t
	4368	idle_thread_create(
	4369	processor_t processor)
	4370	{
	4371	kern_return_t result;
	4372	thread_t thread;
	4373	spl_t s;
	4374
	4375	result = kernel_thread_create((thread_continue_t)idle_thread, NULL, MAXPRI_KERNEL, &thread);
	4376	if (result != KERN_SUCCESS)
	4377	return (result);
	4378
	4379	s = splsched();
	4380	thread_lock(thread);
	4381	thread->bound_processor = processor;
	4382	processor->idle_thread = thread;
	4383	thread->sched_pri = thread->base_pri = IDLEPRI;
	4384	thread->state = (TH_RUN \| TH_IDLE);
	4385	thread->options \|= TH_OPT_IDLE_THREAD;
	4386	thread_unlock(thread);
	4387	splx(s);
	4388
	4389	thread_deallocate(thread);
	4390
	4391	return (KERN_SUCCESS);
	4392	}
	4393
	4394	/*
	4395	* sched_startup:
	4396	*
	4397	* Kicks off scheduler services.
	4398	*
	4399	* Called at splsched.
	4400	*/
	4401	void
	4402	sched_startup(void)
	4403	{
	4404	kern_return_t result;
	4405	thread_t thread;
	4406
	4407	simple_lock_init(&sched_vm_group_list_lock, 0);
	4408
	4409	result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
	4410	(void *)SCHED(maintenance_continuation), MAXPRI_KERNEL, &thread);
	4411	if (result != KERN_SUCCESS)
	4412	panic("sched_startup");
	4413
	4414	thread_deallocate(thread);
	4415
	4416	/*
	4417	* Yield to the sched_init_thread once, to
	4418	* initialize our own thread after being switched
	4419	* back to.
	4420	*
	4421	* The current thread is the only other thread
	4422	* active at this point.
	4423	*/
	4424	thread_block(THREAD_CONTINUE_NULL);
	4425	}
	4426
	4427	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	4428
	4429	static volatile uint64_t sched_maintenance_deadline;
	4430	#if defined(CONFIG_TELEMETRY)
	4431	static volatile uint64_t sched_telemetry_deadline = 0;
	4432	#endif
	4433	static uint64_t sched_tick_last_abstime;
	4434	static uint64_t sched_tick_delta;
	4435	uint64_t sched_tick_max_delta;
	4436	/*
	4437	* sched_init_thread:
	4438	*
	4439	* Perform periodic bookkeeping functions about ten
	4440	* times per second.
	4441	*/
	4442	void
	4443	sched_timeshare_maintenance_continue(void)
	4444	{
	4445	uint64_t sched_tick_ctime, late_time;
	4446
	4447	struct sched_update_scan_context scan_context = {
	4448	.earliest_bg_make_runnable_time = UINT64_MAX,
	4449	.earliest_normal_make_runnable_time = UINT64_MAX,
	4450	.earliest_rt_make_runnable_time = UINT64_MAX
	4451	};
	4452
	4453	sched_tick_ctime = mach_absolute_time();
	4454
	4455	if (__improbable(sched_tick_last_abstime == 0)) {
	4456	sched_tick_last_abstime = sched_tick_ctime;
	4457	late_time = 0;
	4458	sched_tick_delta = 1;
	4459	} else {
	4460	late_time = sched_tick_ctime - sched_tick_last_abstime;
	4461	sched_tick_delta = late_time / sched_tick_interval;
	4462	/* Ensure a delta of 1, since the interval could be slightly
	4463	* smaller than the sched_tick_interval due to dispatch
	4464	* latencies.
	4465	*/
	4466	sched_tick_delta = MAX(sched_tick_delta, 1);
	4467
	4468	/* In the event interrupt latencies or platform
	4469	* idle events that advanced the timebase resulted
	4470	* in periods where no threads were dispatched,
	4471	* cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
	4472	* iterations.
	4473	*/
	4474	sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);
	4475
	4476	sched_tick_last_abstime = sched_tick_ctime;
	4477	sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
	4478	}
	4479
	4480	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)\|DBG_FUNC_START,
	4481	sched_tick_delta,
	4482	late_time,
	4483	0,
	4484	0,
	4485	0);
	4486
	4487	/* Add a number of pseudo-ticks corresponding to the elapsed interval
	4488	* This could be greater than 1 if substantial intervals where
	4489	* all processors are idle occur, which rarely occurs in practice.
	4490	*/
	4491
	4492	sched_tick += sched_tick_delta;
	4493
	4494	/*
	4495	* Compute various averages.
	4496	*/
	4497	compute_averages(sched_tick_delta);
	4498
	4499	/*
	4500	* Scan the run queues for threads which
	4501	* may need to be updated.
	4502	*/
	4503	SCHED(thread_update_scan)(&scan_context);
	4504
	4505	rt_runq_scan(&scan_context);
	4506
	4507	uint64_t ctime = mach_absolute_time();
	4508
	4509	machine_max_runnable_latency(ctime > scan_context.earliest_bg_make_runnable_time ? ctime - scan_context.earliest_bg_make_runnable_time : 0,
	4510	ctime > scan_context.earliest_normal_make_runnable_time ? ctime - scan_context.earliest_normal_make_runnable_time : 0,
	4511	ctime > scan_context.earliest_rt_make_runnable_time ? ctime - scan_context.earliest_rt_make_runnable_time : 0);
	4512
	4513	/*
	4514	* Check to see if the special sched VM group needs attention.
	4515	*/
	4516	sched_vm_group_maintenance();
	4517
	4518	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)\|DBG_FUNC_END,
	4519	sched_pri_shift,
	4520	sched_background_pri_shift,
	4521	0,
	4522	0,
	4523	0);
	4524
	4525	assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
	4526	thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
	4527	/NOTREACHED/
	4528	}
	4529
	4530	static uint64_t sched_maintenance_wakeups;
	4531
	4532	/*
	4533	* Determine if the set of routines formerly driven by a maintenance timer
	4534	* must be invoked, based on a deadline comparison. Signals the scheduler
	4535	* maintenance thread on deadline expiration. Must be invoked at an interval
	4536	* lower than the "sched_tick_interval", currently accomplished by
	4537	* invocation via the quantum expiration timer and at context switch time.
	4538	* Performance matters: this routine reuses a timestamp approximating the
	4539	* current absolute time received from the caller, and should perform
	4540	* no more than a comparison against the deadline in the common case.
	4541	*/
	4542	void
	4543	sched_timeshare_consider_maintenance(uint64_t ctime) {
	4544	uint64_t ndeadline, deadline = sched_maintenance_deadline;
	4545
	4546	if (__improbable(ctime >= deadline)) {
	4547	if (__improbable(current_thread() == sched_maintenance_thread))
	4548	return;
	4549	OSMemoryBarrier();
	4550
	4551	ndeadline = ctime + sched_tick_interval;
	4552
	4553	if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) {
	4554	thread_wakeup((event_t)sched_timeshare_maintenance_continue);
	4555	sched_maintenance_wakeups++;
	4556	}
	4557	}
	4558
	4559	#if defined(CONFIG_TELEMETRY)
	4560	/*
	4561	* Windowed telemetry is driven by the scheduler. It should be safe
	4562	* to call compute_telemetry_windowed() even when windowed telemetry
	4563	* is disabled, but we should try to avoid doing extra work for no
	4564	* reason.
	4565	*/
	4566	if (telemetry_window_enabled) {
	4567	deadline = sched_telemetry_deadline;
	4568
	4569	if (__improbable(ctime >= deadline)) {
	4570	ndeadline = ctime + sched_telemetry_interval;
	4571
	4572	if (__probable(__sync_bool_compare_and_swap(&sched_telemetry_deadline, deadline, ndeadline))) {
	4573	compute_telemetry_windowed();
	4574	}
	4575	}
	4576	}
	4577	#endif /* CONFIG_TELEMETRY */
	4578	}
	4579
	4580	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	4581
	4582	void
	4583	sched_init_thread(void (*continuation)(void))
	4584	{
	4585	thread_block(THREAD_CONTINUE_NULL);
	4586
	4587	sched_maintenance_thread = current_thread();
	4588	continuation();
	4589
	4590	/NOTREACHED/
	4591	}
	4592
	4593	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	4594
	4595	/*
	4596	* thread_update_scan / runq_scan:
	4597	*
	4598	* Scan the run queues to account for timesharing threads
	4599	* which need to be updated.
	4600	*
	4601	* Scanner runs in two passes. Pass one squirrels likely
	4602	* threads away in an array, pass two does the update.
	4603	*
	4604	* This is necessary because the run queue is locked for
	4605	* the candidate scan, but the thread is locked for the update.
	4606	*
	4607	* Array should be sized to make forward progress, without
	4608	* disabling preemption for long periods.
	4609	*/
	4610
	4611	#define THREAD_UPDATE_SIZE 128
	4612
	4613	static thread_t thread_update_array[THREAD_UPDATE_SIZE];
	4614	static int thread_update_count = 0;
	4615
	4616	/* Returns TRUE if thread was added, FALSE if thread_update_array is full */
	4617	boolean_t
	4618	thread_update_add_thread(thread_t thread)
	4619	{
	4620	if (thread_update_count == THREAD_UPDATE_SIZE)
	4621	return (FALSE);
	4622
	4623	thread_update_array[thread_update_count++] = thread;
	4624	thread_reference_internal(thread);
	4625	return (TRUE);
	4626	}
	4627
	4628	void
	4629	thread_update_process_threads(void)
	4630	{
	4631	while (thread_update_count > 0) {
	4632	spl_t s;
	4633	thread_t thread = thread_update_array[--thread_update_count];
	4634	thread_update_array[thread_update_count] = THREAD_NULL;
	4635
	4636	s = splsched();
	4637	thread_lock(thread);
	4638	if (!(thread->state & (TH_WAIT)) && (SCHED(can_update_priority)(thread))) {
	4639	SCHED(update_priority)(thread);
	4640	}
	4641	thread_unlock(thread);
	4642	splx(s);
	4643
	4644	thread_deallocate(thread);
	4645	}
	4646	}
	4647
	4648	/*
	4649	* Scan a runq for candidate threads.
	4650	*
	4651	* Returns TRUE if retry is needed.
	4652	*/
	4653	boolean_t
	4654	runq_scan(
	4655	run_queue_t runq,
	4656	sched_update_scan_context_t scan_context)
	4657	{
	4658	register int count;
	4659	register queue_t q;
	4660	register thread_t thread;
	4661
	4662	if ((count = runq->count) > 0) {
	4663	q = runq->queues + runq->highq;
	4664	while (count > 0) {
	4665	queue_iterate(q, thread, thread_t, links) {
	4666	if ( thread->sched_stamp != sched_tick &&
	4667	(thread->sched_mode == TH_MODE_TIMESHARE) ) {
	4668	if (thread_update_add_thread(thread) == FALSE)
	4669	return (TRUE);
	4670	}
	4671
	4672	if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
	4673	if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
	4674	scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
	4675	}
	4676	} else {
	4677	if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
	4678	scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
	4679	}
	4680	}
	4681
	4682	count--;
	4683	}
	4684
	4685	q--;
	4686	}
	4687	}
	4688
	4689	return (FALSE);
	4690	}
	4691
	4692	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	4693
	4694	boolean_t
	4695	thread_eager_preemption(thread_t thread)
	4696	{
	4697	return ((thread->sched_flags & TH_SFLAG_EAGERPREEMPT) != 0);
	4698	}
	4699
	4700	void
	4701	thread_set_eager_preempt(thread_t thread)
	4702	{
	4703	spl_t x;
	4704	processor_t p;
	4705	ast_t ast = AST_NONE;
	4706
	4707	x = splsched();
	4708	p = current_processor();
	4709
	4710	thread_lock(thread);
	4711	thread->sched_flags \|= TH_SFLAG_EAGERPREEMPT;
	4712
	4713	if (thread == current_thread()) {
	4714
	4715	ast = csw_check(p, AST_NONE);
	4716	thread_unlock(thread);
	4717	if (ast != AST_NONE) {
	4718	(void) thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
	4719	}
	4720	} else {
	4721	p = thread->last_processor;
	4722
	4723	if (p != PROCESSOR_NULL && p->state == PROCESSOR_RUNNING &&
	4724	p->active_thread == thread) {
	4725	cause_ast_check(p);
	4726	}
	4727
	4728	thread_unlock(thread);
	4729	}
	4730
	4731	splx(x);
	4732	}
	4733
	4734	void
	4735	thread_clear_eager_preempt(thread_t thread)
	4736	{
	4737	spl_t x;
	4738
	4739	x = splsched();
	4740	thread_lock(thread);
	4741
	4742	thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
	4743
	4744	thread_unlock(thread);
	4745	splx(x);
	4746	}
	4747
	4748	/*
	4749	* Scheduling statistics
	4750	*/
	4751	void
	4752	sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
	4753	{
	4754	struct processor_sched_statistics *stats;
	4755	boolean_t to_realtime = FALSE;
	4756
	4757	stats = &processor->processor_data.sched_stats;
	4758	stats->csw_count++;
	4759
	4760	if (otherpri >= BASEPRI_REALTIME) {
	4761	stats->rt_sched_count++;
	4762	to_realtime = TRUE;
	4763	}
	4764
	4765	if ((reasons & AST_PREEMPT) != 0) {
	4766	stats->preempt_count++;
	4767
	4768	if (selfpri >= BASEPRI_REALTIME) {
	4769	stats->preempted_rt_count++;
	4770	}
	4771
	4772	if (to_realtime) {
	4773	stats->preempted_by_rt_count++;
	4774	}
	4775
	4776	}
	4777	}
	4778
	4779	void
	4780	sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
	4781	{
	4782	uint64_t timestamp = mach_absolute_time();
	4783
	4784	stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
	4785	stats->last_change_timestamp = timestamp;
	4786	}
	4787
	4788	/*
	4789	* For calls from assembly code
	4790	*/
	4791	#undef thread_wakeup
	4792	void
	4793	thread_wakeup(
	4794	event_t x);
	4795
	4796	void
	4797	thread_wakeup(
	4798	event_t x)
	4799	{
	4800	thread_wakeup_with_result(x, THREAD_AWAKENED);
	4801	}
	4802
	4803	boolean_t
	4804	preemption_enabled(void)
	4805	{
	4806	return (get_preemption_level() == 0 && ml_get_interrupts_enabled());
	4807	}
	4808
	4809	static void
	4810	sched_timer_deadline_tracking_init(void) {
	4811	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
	4812	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
	4813	}
	4814
	4815
	4816	kern_return_t
	4817	sched_work_interval_notify(thread_t thread, uint64_t work_interval_id, uint64_t start, uint64_t finish, uint64_t deadline, uint64_t next_start, uint32_t flags)
	4818	{
	4819	int urgency;
	4820	uint64_t urgency_param1, urgency_param2;
	4821	spl_t s;
	4822
	4823	if (work_interval_id == 0) {
	4824	return (KERN_INVALID_ARGUMENT);
	4825	}
	4826
	4827	assert(thread == current_thread());
	4828
	4829	thread_mtx_lock(thread);
	4830	if (thread->work_interval_id != work_interval_id) {
	4831	thread_mtx_unlock(thread);
	4832	return (KERN_INVALID_ARGUMENT);
	4833	}
	4834	thread_mtx_unlock(thread);
	4835
	4836	s = splsched();
	4837	thread_lock(thread);
	4838	urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
	4839	thread_unlock(thread);
	4840	splx(s);
	4841
	4842	machine_work_interval_notify(thread, work_interval_id, start, finish, deadline, next_start, urgency, flags);
	4843	return (KERN_SUCCESS);
	4844	}
	4845
	4846	void thread_set_options(uint32_t thopt) {
	4847	spl_t x;
	4848	thread_t t = current_thread();
	4849
	4850	x = splsched();
	4851	thread_lock(t);
	4852
	4853	t->options \|= thopt;
	4854
	4855	thread_unlock(t);
	4856	splx(x);
	4857	}