1/*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_FREE_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 * File: sched_prim.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1986
62 *
63 * Scheduling primitives
64 *
65 */
66
67#include <debug.h>
68
69#include <mach/mach_types.h>
70#include <mach/machine.h>
71#include <mach/policy.h>
72#include <mach/sync_policy.h>
73#include <mach/thread_act.h>
74
75#include <machine/machine_routines.h>
76#include <machine/sched_param.h>
77#include <machine/machine_cpu.h>
78#include <machine/machlimits.h>
79
80#ifdef CONFIG_MACH_APPROXIMATE_TIME
81#include <machine/commpage.h>
82#endif
83
84#include <kern/kern_types.h>
85#include <kern/backtrace.h>
86#include <kern/clock.h>
87#include <kern/counters.h>
88#include <kern/cpu_number.h>
89#include <kern/cpu_data.h>
90#include <kern/smp.h>
91#include <kern/debug.h>
92#include <kern/macro_help.h>
93#include <kern/machine.h>
94#include <kern/misc_protos.h>
95#include <kern/processor.h>
96#include <kern/queue.h>
97#include <kern/sched.h>
98#include <kern/sched_prim.h>
99#include <kern/sfi.h>
100#include <kern/syscall_subr.h>
101#include <kern/task.h>
102#include <kern/thread.h>
103#include <kern/ledger.h>
104#include <kern/timer_queue.h>
105#include <kern/waitq.h>
106#include <kern/policy_internal.h>
107
108#include <vm/pmap.h>
109#include <vm/vm_kern.h>
110#include <vm/vm_map.h>
111
112#include <mach/sdt.h>
113
114#include <sys/kdebug.h>
115#include <kperf/kperf.h>
116#include <kern/kpc.h>
117
118#include <kern/pms.h>
119
120struct rt_queue rt_runq;
121
122uintptr_t sched_thread_on_rt_queue = (uintptr_t)0xDEAFBEE0;
123
124/* Lock RT runq, must be done with interrupts disabled (under splsched()) */
125#if __SMP__
126decl_simple_lock_data(static,rt_lock);
127#define rt_lock_init() simple_lock_init(&rt_lock, 0)
128#define rt_lock_lock() simple_lock(&rt_lock)
129#define rt_lock_unlock() simple_unlock(&rt_lock)
130#else
131#define rt_lock_init() do { } while(0)
132#define rt_lock_lock() do { } while(0)
133#define rt_lock_unlock() do { } while(0)
134#endif
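/*
 * Illustrative sketch (not from the original source): the rt_lock macros are
 * intended to be taken with interrupts already disabled, as thread_select()
 * below does, e.g.:
 *
 *     spl_t s = splsched();
 *     rt_lock_lock();
 *     ... examine or modify rt_runq ...
 *     rt_lock_unlock();
 *     splx(s);
 */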
135
136#define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */
137int default_preemption_rate = DEFAULT_PREEMPTION_RATE;
138
139#define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */
140int default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
141
142#define MAX_UNSAFE_QUANTA 800
143int max_unsafe_quanta = MAX_UNSAFE_QUANTA;
144
145#define MAX_POLL_QUANTA 2
146int max_poll_quanta = MAX_POLL_QUANTA;
147
148#define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */
149int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
150
151uint64_t max_poll_computation;
152
153uint64_t max_unsafe_computation;
154uint64_t sched_safe_duration;
155
156#if defined(CONFIG_SCHED_TIMESHARE_CORE)
157
158uint32_t std_quantum;
159uint32_t min_std_quantum;
160uint32_t bg_quantum;
161
162uint32_t std_quantum_us;
163uint32_t bg_quantum_us;
164
165#endif /* CONFIG_SCHED_TIMESHARE_CORE */
166
167uint32_t thread_depress_time;
168uint32_t default_timeshare_computation;
169uint32_t default_timeshare_constraint;
170
171uint32_t max_rt_quantum;
172uint32_t min_rt_quantum;
173
174#if defined(CONFIG_SCHED_TIMESHARE_CORE)
175
176unsigned sched_tick;
177uint32_t sched_tick_interval;
178
179uint32_t sched_pri_shifts[TH_BUCKET_MAX];
180uint32_t sched_fixed_shift;
181
182uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */
183
184/* Allow foreground to decay past default to resolve inversions */
185#define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
186int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
187
188/* Defaults for timer deadline profiling */
189#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
190 * 2ms */
191#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
192 <= 5ms */
193
194uint64_t timer_deadline_tracking_bin_1;
195uint64_t timer_deadline_tracking_bin_2;
196
197#endif /* CONFIG_SCHED_TIMESHARE_CORE */
198
199thread_t sched_maintenance_thread;
200
201
202uint64_t sched_one_second_interval;
203
204/* Forwards */
205
206#if defined(CONFIG_SCHED_TIMESHARE_CORE)
207
208static void load_shift_init(void);
209static void preempt_pri_init(void);
210
211#endif /* CONFIG_SCHED_TIMESHARE_CORE */
212
213static thread_t thread_select(
214 thread_t thread,
215 processor_t processor,
216 ast_t reason);
217
218#if CONFIG_SCHED_IDLE_IN_PLACE
219static thread_t thread_select_idle(
220 thread_t thread,
221 processor_t processor);
222#endif
223
224thread_t processor_idle(
225 thread_t thread,
226 processor_t processor);
227
228ast_t
229csw_check_locked( processor_t processor,
230 processor_set_t pset,
231 ast_t check_reason);
232
233static void processor_setrun(
234 processor_t processor,
235 thread_t thread,
236 integer_t options);
237
238static void
239sched_realtime_init(void);
240
241static void
242sched_realtime_timebase_init(void);
243
244static void
245sched_timer_deadline_tracking_init(void);
246
247#if DEBUG
248extern int debug_task;
249#define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
250#else
251#define TLOG(a, fmt, args...) do {} while (0)
252#endif
253
254static processor_t
255thread_bind_internal(
256 thread_t thread,
257 processor_t processor);
258
259static void
260sched_vm_group_maintenance(void);
261
262#if defined(CONFIG_SCHED_TIMESHARE_CORE)
263int8_t sched_load_shifts[NRQS];
264bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS)];
265#endif /* CONFIG_SCHED_TIMESHARE_CORE */
266
267const struct sched_dispatch_table *sched_current_dispatch = NULL;
268
269/*
270 * Statically allocate a buffer to hold the longest possible
271 * scheduler description string, as currently implemented.
272 * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
273 * to export to userspace via sysctl(3). If either version
274 * changes, update the other.
275 *
276 * Note that in addition to being an upper bound on the strings
277 * in the kernel, it's also an exact parameter to PE_get_default(),
278 * which interrogates the device tree on some platforms. That
279 * API requires the caller know the exact size of the device tree
280 * property, so we need both a legacy size (32) and the current size
281 * (48) to deal with old and new device trees. The device tree property
282 * is similarly padded to a fixed size so that the same kernel image
283 * can run on multiple devices with different schedulers configured
284 * in the device tree.
285 */
286char sched_string[SCHED_STRING_MAX_LENGTH];
287
288uint32_t sched_debug_flags;
289
290/* Global flag which indicates whether Background Stepper Context is enabled */
291static int cpu_throttle_enabled = 1;
292
293void
294sched_init(void)
295{
296 char sched_arg[SCHED_STRING_MAX_LENGTH] = { '\0' };
297
298 /* Check for runtime selection of the scheduler algorithm */
299 if (!PE_parse_boot_argn("sched", sched_arg, sizeof (sched_arg))) {
300 /* If no boot-args override, look in device tree */
301 if (!PE_get_default("kern.sched", sched_arg,
302 SCHED_STRING_MAX_LENGTH)) {
303 sched_arg[0] = '\0';
304 }
305 }
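 /*
  * Hypothetical example (not in the original): booting with a boot-arg such
  * as sched=dualq would select sched_dualq_dispatch below if the kernel was
  * built with CONFIG_SCHED_MULTIQ; the accepted strings are whatever each
  * dispatch table publishes in its sched_name field.
  */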
306
307
308 if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
309 /* No boot-args, check in device tree */
310 if (!PE_get_default("kern.sched_pri_decay_limit",
311 &sched_pri_decay_band_limit,
312 sizeof(sched_pri_decay_band_limit))) {
313 /* Allow decay all the way to normal limits */
314 sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
315 }
316 }
317
318 kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);
319
320 if (strlen(sched_arg) > 0) {
321 if (0) {
322 /* Allow pattern below */
323#if defined(CONFIG_SCHED_TRADITIONAL)
324 } else if (0 == strcmp(sched_arg, sched_traditional_dispatch.sched_name)) {
325 sched_current_dispatch = &sched_traditional_dispatch;
326 } else if (0 == strcmp(sched_arg, sched_traditional_with_pset_runqueue_dispatch.sched_name)) {
327 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
328#endif
329#if defined(CONFIG_SCHED_PROTO)
330 } else if (0 == strcmp(sched_arg, sched_proto_dispatch.sched_name)) {
331 sched_current_dispatch = &sched_proto_dispatch;
332#endif
333#if defined(CONFIG_SCHED_GRRR)
334 } else if (0 == strcmp(sched_arg, sched_grrr_dispatch.sched_name)) {
335 sched_current_dispatch = &sched_grrr_dispatch;
336#endif
337#if defined(CONFIG_SCHED_MULTIQ)
338 } else if (0 == strcmp(sched_arg, sched_multiq_dispatch.sched_name)) {
339 sched_current_dispatch = &sched_multiq_dispatch;
340 } else if (0 == strcmp(sched_arg, sched_dualq_dispatch.sched_name)) {
341 sched_current_dispatch = &sched_dualq_dispatch;
342#endif
343 } else {
344#if defined(CONFIG_SCHED_TRADITIONAL)
345 printf("Unrecognized scheduler algorithm: %s\n", sched_arg);
346 printf("Scheduler: Using instead: %s\n", sched_traditional_with_pset_runqueue_dispatch.sched_name);
347 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
348#else
349 panic("Unrecognized scheduler algorithm: %s", sched_arg);
350#endif
351 }
352 kprintf("Scheduler: Runtime selection of %s\n", SCHED(sched_name));
353 } else {
354#if defined(CONFIG_SCHED_MULTIQ)
355 sched_current_dispatch = &sched_multiq_dispatch;
356#elif defined(CONFIG_SCHED_TRADITIONAL)
357 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
358#elif defined(CONFIG_SCHED_PROTO)
359 sched_current_dispatch = &sched_proto_dispatch;
360#elif defined(CONFIG_SCHED_GRRR)
361 sched_current_dispatch = &sched_grrr_dispatch;
362#else
363#error No default scheduler implementation
364#endif
365 kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
366 }
367
368 strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));
369
370 if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
371 kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
372 }
373
374 SCHED(init)();
375 sched_realtime_init();
376 ast_init();
377 sched_timer_deadline_tracking_init();
378
379 SCHED(pset_init)(&pset0);
380 SCHED(processor_init)(master_processor);
381}
382
383void
384sched_timebase_init(void)
385{
386 uint64_t abstime;
387
388 clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
389 sched_one_second_interval = abstime;
390
391 SCHED(timebase_init)();
392 sched_realtime_timebase_init();
393}
394
395#if defined(CONFIG_SCHED_TIMESHARE_CORE)
396
397void
398sched_timeshare_init(void)
399{
400 /*
401 * Calculate the timeslicing quantum
402 * in us.
403 */
404 if (default_preemption_rate < 1)
405 default_preemption_rate = DEFAULT_PREEMPTION_RATE;
406 std_quantum_us = (1000 * 1000) / default_preemption_rate;
407
408 printf("standard timeslicing quantum is %d us\n", std_quantum_us);
409
410 if (default_bg_preemption_rate < 1)
411 default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
412 bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
413
414 printf("standard background quantum is %d us\n", bg_quantum_us);
415
416 load_shift_init();
417 preempt_pri_init();
418 sched_tick = 0;
419}
420
421void
422sched_timeshare_timebase_init(void)
423{
424 uint64_t abstime;
425 uint32_t shift;
426
427 /* standard timeslicing quantum */
428 clock_interval_to_absolutetime_interval(
429 std_quantum_us, NSEC_PER_USEC, &abstime);
430 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
431 std_quantum = (uint32_t)abstime;
432
433 /* smallest remaining quantum (250 us) */
434 clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
435 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
436 min_std_quantum = (uint32_t)abstime;
437
438 /* quantum for background tasks */
439 clock_interval_to_absolutetime_interval(
440 bg_quantum_us, NSEC_PER_USEC, &abstime);
441 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
442 bg_quantum = (uint32_t)abstime;
443
444 /* scheduler tick interval */
445 clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
446 NSEC_PER_USEC, &abstime);
447 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
448 sched_tick_interval = (uint32_t)abstime;
449
450 /*
451 * Compute conversion factor from usage to
452 * timesharing priorities with 5/8 ** n aging.
453 */
454 abstime = (abstime * 5) / 3;
455 for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift)
456 abstime >>= 1;
457 sched_fixed_shift = shift;
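 /*
  * Worked example (illustrative only, assuming a hypothetical 24 MHz timebase,
  * SCHED_TICK_SHIFT of 3, and BASEPRI_DEFAULT of 31): the 125 ms tick interval
  * is 3,000,000 absolute-time units, so abstime becomes 3,000,000 * 5 / 3 =
  * 5,000,000; eighteen right-shifts reduce that to 19, the first value not
  * above BASEPRI_DEFAULT, giving sched_fixed_shift = 18.
  */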
458
459 for (uint32_t i = 0 ; i < TH_BUCKET_MAX ; i++)
460 sched_pri_shifts[i] = INT8_MAX;
461
462 max_unsafe_computation = ((uint64_t)max_unsafe_quanta) * std_quantum;
463 sched_safe_duration = 2 * ((uint64_t)max_unsafe_quanta) * std_quantum;
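 /*
  * For scale (a sketch, not from the source): with the default 10 ms standard
  * quantum (default_preemption_rate of 100) and MAX_UNSAFE_QUANTA of 800,
  * max_unsafe_computation works out to roughly 8 seconds and
  * sched_safe_duration to roughly 16 seconds.
  */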
464
465 max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
466 thread_depress_time = 1 * std_quantum;
467 default_timeshare_computation = std_quantum / 2;
468 default_timeshare_constraint = std_quantum;
469
470}
471
472#endif /* CONFIG_SCHED_TIMESHARE_CORE */
473
474static void
475sched_realtime_init(void)
476{
477 rt_lock_init();
478
479 rt_runq.count = 0;
480 queue_init(&rt_runq.queue);
481}
482
483static void
484sched_realtime_timebase_init(void)
485{
486 uint64_t abstime;
487
 488 /* smallest rt computation (50 us) */
489 clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
490 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
491 min_rt_quantum = (uint32_t)abstime;
492
493 /* maximum rt computation (50 ms) */
494 clock_interval_to_absolutetime_interval(
495 50, 1000*NSEC_PER_USEC, &abstime);
496 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
497 max_rt_quantum = (uint32_t)abstime;
498
499}
500
501#if defined(CONFIG_SCHED_TIMESHARE_CORE)
502
503/*
504 * Set up values for timeshare
505 * loading factors.
506 */
507static void
508load_shift_init(void)
509{
510 int8_t k, *p = sched_load_shifts;
511 uint32_t i, j;
512
513 uint32_t sched_decay_penalty = 1;
514
515 if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof (sched_decay_penalty))) {
516 kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
517 }
518
519 if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof (sched_decay_usage_age_factor))) {
520 kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
521 }
522
523 if (sched_decay_penalty == 0) {
524 /*
525 * There is no penalty for timeshare threads for using too much
526 * CPU, so set all load shifts to INT8_MIN. Even under high load,
527 * sched_pri_shift will be >INT8_MAX, and there will be no
528 * penalty applied to threads (nor will sched_usage be updated per
529 * thread).
530 */
531 for (i = 0; i < NRQS; i++) {
532 sched_load_shifts[i] = INT8_MIN;
533 }
534
535 return;
536 }
537
538 *p++ = INT8_MIN; *p++ = 0;
539
540 /*
541 * For a given system load "i", the per-thread priority
542 * penalty per quantum of CPU usage is ~2^k priority
543 * levels. "sched_decay_penalty" can cause more
544 * array entries to be filled with smaller "k" values
545 */
546 for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
547 for (j <<= 1; (i < j) && (i < NRQS); ++i)
548 *p++ = k;
549 }
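 /*
  * Illustrative note (not in the original): with the default
  * sched_decay_penalty of 1, the loop above fills the table so that
  * sched_load_shifts[i] is roughly log2(i): indices 2-3 get 1, 4-7 get 2,
  * 8-15 get 3, and so on up to NRQS.
  */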
550}
551
552static void
553preempt_pri_init(void)
554{
555 bitmap_t *p = sched_preempt_pri;
556
557 for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i)
558 bitmap_set(p, i);
559
560 for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i)
561 bitmap_set(p, i);
562}
563
564#endif /* CONFIG_SCHED_TIMESHARE_CORE */
565
566/*
567 * Thread wait timer expiration.
568 */
569void
570thread_timer_expire(
571 void *p0,
572 __unused void *p1)
573{
574 thread_t thread = p0;
575 spl_t s;
576
577 assert_thread_magic(thread);
578
579 s = splsched();
580 thread_lock(thread);
581 if (--thread->wait_timer_active == 0) {
582 if (thread->wait_timer_is_set) {
583 thread->wait_timer_is_set = FALSE;
584 clear_wait_internal(thread, THREAD_TIMED_OUT);
585 }
586 }
587 thread_unlock(thread);
588 splx(s);
589}
590
591/*
592 * thread_unblock:
593 *
594 * Unblock thread on wake up.
595 *
596 * Returns TRUE if the thread should now be placed on the runqueue.
597 *
598 * Thread must be locked.
599 *
600 * Called at splsched().
601 */
602boolean_t
603thread_unblock(
604 thread_t thread,
605 wait_result_t wresult)
606{
607 boolean_t ready_for_runq = FALSE;
608 thread_t cthread = current_thread();
609 uint32_t new_run_count;
610
611 /*
612 * Set wait_result.
613 */
614 thread->wait_result = wresult;
615
616 /*
617 * Cancel pending wait timer.
618 */
619 if (thread->wait_timer_is_set) {
620 if (timer_call_cancel(&thread->wait_timer))
621 thread->wait_timer_active--;
622 thread->wait_timer_is_set = FALSE;
623 }
624
625 /*
626 * Update scheduling state: not waiting,
627 * set running.
628 */
629 thread->state &= ~(TH_WAIT|TH_UNINT);
630
631 if (!(thread->state & TH_RUN)) {
632 thread->state |= TH_RUN;
633 thread->last_made_runnable_time = mach_approximate_time();
634
635 ready_for_runq = TRUE;
636
637 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
638
639 /* Update the runnable thread count */
640 new_run_count = sched_run_incr(thread);
641 } else {
642 /*
643 * Either the thread is idling in place on another processor,
644 * or it hasn't finished context switching yet.
645 */
646#if CONFIG_SCHED_IDLE_IN_PLACE
647 if (thread->state & TH_IDLE) {
648 processor_t processor = thread->last_processor;
649
650 if (processor != current_processor())
651 machine_signal_idle(processor);
652 }
653#else
654 assert((thread->state & TH_IDLE) == 0);
655#endif
656 /*
657 * The run count is only dropped after the context switch completes
658 * and the thread is still waiting, so we should not run_incr here
659 */
660 new_run_count = sched_run_buckets[TH_BUCKET_RUN];
661 }
662
663
664 /*
665 * Calculate deadline for real-time threads.
666 */
667 if (thread->sched_mode == TH_MODE_REALTIME) {
668 uint64_t ctime;
669
670 ctime = mach_absolute_time();
671 thread->realtime.deadline = thread->realtime.constraint + ctime;
672 }
673
674 /*
675 * Clear old quantum, fail-safe computation, etc.
676 */
677 thread->quantum_remaining = 0;
678 thread->computation_metered = 0;
679 thread->reason = AST_NONE;
680 thread->block_hint = kThreadWaitNone;
681
682 /* Obtain power-relevant interrupt and "platform-idle exit" statistics.
683 * We also account for "double hop" thread signaling via
684 * the thread callout infrastructure.
 685 * DRK: consider removing the callout wakeup counters in the future;
686 * they're present for verification at the moment.
687 */
688 boolean_t aticontext, pidle;
689 ml_get_power_state(&aticontext, &pidle);
690
691 if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
692 ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
693 DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
694
695 uint64_t ttd = PROCESSOR_DATA(current_processor(), timer_call_ttd);
696
697 if (ttd) {
698 if (ttd <= timer_deadline_tracking_bin_1)
699 thread->thread_timer_wakeups_bin_1++;
700 else
701 if (ttd <= timer_deadline_tracking_bin_2)
702 thread->thread_timer_wakeups_bin_2++;
703 }
704
705 if (pidle) {
706 ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
707 }
708
709 } else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
710 if (cthread->callout_woken_from_icontext) {
711 ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
712 thread->thread_callout_interrupt_wakeups++;
713 if (cthread->callout_woken_from_platform_idle) {
714 ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
715 thread->thread_callout_platform_idle_wakeups++;
716 }
717
718 cthread->callout_woke_thread = TRUE;
719 }
720 }
721
722 if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
723 thread->callout_woken_from_icontext = aticontext;
724 thread->callout_woken_from_platform_idle = pidle;
725 thread->callout_woke_thread = FALSE;
726 }
727
728 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
729 MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
730 (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
731 sched_run_buckets[TH_BUCKET_RUN], 0);
732
733 DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
734
735 return (ready_for_runq);
736}
737
738/*
739 * Routine: thread_go
740 * Purpose:
741 * Unblock and dispatch thread.
742 * Conditions:
743 * thread lock held, IPC locks may be held.
744 * thread must have been pulled from wait queue under same lock hold.
745 * thread must have been waiting
746 * Returns:
747 * KERN_SUCCESS - Thread was set running
748 *
749 * TODO: This should return void
750 */
751kern_return_t
752thread_go(
753 thread_t thread,
754 wait_result_t wresult)
755{
756 assert_thread_magic(thread);
757
758 assert(thread->at_safe_point == FALSE);
759 assert(thread->wait_event == NO_EVENT64);
760 assert(thread->waitq == NULL);
761
762 assert(!(thread->state & (TH_TERMINATE|TH_TERMINATE2)));
763 assert(thread->state & TH_WAIT);
764
765
766 if (thread_unblock(thread, wresult)) {
767#if SCHED_TRACE_THREAD_WAKEUPS
768 backtrace(&thread->thread_wakeup_bt[0],
769 (sizeof(thread->thread_wakeup_bt)/sizeof(uintptr_t)));
770#endif
771 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
772 }
773
774 return (KERN_SUCCESS);
775}
776
777/*
778 * Routine: thread_mark_wait_locked
779 * Purpose:
780 * Mark a thread as waiting. If, given the circumstances,
781 * it doesn't want to wait (i.e. already aborted), then
782 * indicate that in the return value.
783 * Conditions:
784 * at splsched() and thread is locked.
785 */
786__private_extern__
787wait_result_t
788thread_mark_wait_locked(
789 thread_t thread,
790 wait_interrupt_t interruptible)
791{
792 boolean_t at_safe_point;
793
794 assert(!(thread->state & (TH_WAIT|TH_IDLE|TH_UNINT|TH_TERMINATE2)));
795
796 /*
797 * The thread may have certain types of interrupts/aborts masked
798 * off. Even if the wait location says these types of interrupts
799 * are OK, we have to honor mask settings (outer-scoped code may
800 * not be able to handle aborts at the moment).
801 */
802 if (interruptible > (thread->options & TH_OPT_INTMASK))
803 interruptible = thread->options & TH_OPT_INTMASK;
804
805 at_safe_point = (interruptible == THREAD_ABORTSAFE);
806
807 if ( interruptible == THREAD_UNINT ||
808 !(thread->sched_flags & TH_SFLAG_ABORT) ||
809 (!at_safe_point &&
810 (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
811
812 if ( !(thread->state & TH_TERMINATE))
813 DTRACE_SCHED(sleep);
814
815 thread->state |= (interruptible) ? TH_WAIT : (TH_WAIT | TH_UNINT);
816 thread->at_safe_point = at_safe_point;
817
818 /* TODO: pass this through assert_wait instead, have
819 * assert_wait just take a struct as an argument */
820 assert(!thread->block_hint);
821 thread->block_hint = thread->pending_block_hint;
822 thread->pending_block_hint = kThreadWaitNone;
823
824 return (thread->wait_result = THREAD_WAITING);
825 }
826 else
827 if (thread->sched_flags & TH_SFLAG_ABORTSAFELY)
828 thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
829 thread->pending_block_hint = kThreadWaitNone;
830
831 return (thread->wait_result = THREAD_INTERRUPTED);
832}
833
834/*
835 * Routine: thread_interrupt_level
836 * Purpose:
837 * Set the maximum interruptible state for the
838 * current thread. The effective value of any
839 * interruptible flag passed into assert_wait
840 * will never exceed this.
841 *
842 * Useful for code that must not be interrupted,
843 * but which calls code that doesn't know that.
844 * Returns:
845 * The old interrupt level for the thread.
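 *
 * A minimal usage sketch (illustrative only):
 *
 *     wait_interrupt_t old = thread_interrupt_level(THREAD_UNINT);
 *     ... call code that might otherwise wait interruptibly ...
 *     thread_interrupt_level(old);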
846 */
847__private_extern__
848wait_interrupt_t
849thread_interrupt_level(
850 wait_interrupt_t new_level)
851{
852 thread_t thread = current_thread();
853 wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
854
855 thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
856
857 return result;
858}
859
860/*
861 * Check to see if an assert wait is possible, without actually doing one.
862 * This is used by debug code in locks and elsewhere to verify that it is
863 * always OK to block when trying to take a blocking lock (since waiting
864 * for the actual assert_wait to catch the case may make it hard to detect
 865 * this case).
866 */
867boolean_t
868assert_wait_possible(void)
869{
870
871 thread_t thread;
872
873#if DEBUG
874 if(debug_mode) return TRUE; /* Always succeed in debug mode */
875#endif
876
877 thread = current_thread();
878
879 return (thread == NULL || waitq_wait_possible(thread));
880}
881
882/*
883 * assert_wait:
884 *
885 * Assert that the current thread is about to go to
886 * sleep until the specified event occurs.
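 *
 * Typical usage (a sketch; the same pattern appears in thread_stop() and
 * thread_wait() below):
 *
 *     wait_result_t wr = assert_wait(&some_event, THREAD_UNINT);
 *     if (wr == THREAD_WAITING)
 *             wr = thread_block(THREAD_CONTINUE_NULL);
 *     ... the wake side later calls thread_wakeup(&some_event) ...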
887 */
888wait_result_t
889assert_wait(
890 event_t event,
891 wait_interrupt_t interruptible)
892{
893 if (__improbable(event == NO_EVENT))
894 panic("%s() called with NO_EVENT", __func__);
895
896 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
897 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
898 VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
899
900 struct waitq *waitq;
901 waitq = global_eventq(event);
902 return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
903}
904
905/*
906 * assert_wait_queue:
907 *
908 * Return the global waitq for the specified event
909 */
910struct waitq *
911assert_wait_queue(
912 event_t event)
913{
914 return global_eventq(event);
915}
916
917wait_result_t
918assert_wait_timeout(
919 event_t event,
920 wait_interrupt_t interruptible,
921 uint32_t interval,
922 uint32_t scale_factor)
923{
924 thread_t thread = current_thread();
925 wait_result_t wresult;
926 uint64_t deadline;
927 spl_t s;
928
929 if (__improbable(event == NO_EVENT))
930 panic("%s() called with NO_EVENT", __func__);
931
932 struct waitq *waitq;
933 waitq = global_eventq(event);
934
935 s = splsched();
936 waitq_lock(waitq);
937
938 clock_interval_to_deadline(interval, scale_factor, &deadline);
939
940 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
941 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
942 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
943
944 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
945 interruptible,
946 TIMEOUT_URGENCY_SYS_NORMAL,
947 deadline, TIMEOUT_NO_LEEWAY,
948 thread);
949
950 waitq_unlock(waitq);
951 splx(s);
952 return wresult;
953}
954
955wait_result_t
956assert_wait_timeout_with_leeway(
957 event_t event,
958 wait_interrupt_t interruptible,
959 wait_timeout_urgency_t urgency,
960 uint32_t interval,
961 uint32_t leeway,
962 uint32_t scale_factor)
963{
964 thread_t thread = current_thread();
965 wait_result_t wresult;
966 uint64_t deadline;
967 uint64_t abstime;
968 uint64_t slop;
969 uint64_t now;
970 spl_t s;
971
972 if (__improbable(event == NO_EVENT))
973 panic("%s() called with NO_EVENT", __func__);
974
975 now = mach_absolute_time();
976 clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
977 deadline = now + abstime;
978
979 clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
980
981 struct waitq *waitq;
982 waitq = global_eventq(event);
983
984 s = splsched();
985 waitq_lock(waitq);
986
987 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
988 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
989 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
990
991 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
992 interruptible,
993 urgency, deadline, slop,
994 thread);
995
996 waitq_unlock(waitq);
997 splx(s);
998 return wresult;
999}
1000
1001wait_result_t
1002assert_wait_deadline(
1003 event_t event,
1004 wait_interrupt_t interruptible,
1005 uint64_t deadline)
1006{
1007 thread_t thread = current_thread();
1008 wait_result_t wresult;
1009 spl_t s;
1010
1011 if (__improbable(event == NO_EVENT))
1012 panic("%s() called with NO_EVENT", __func__);
1013
1014 struct waitq *waitq;
1015 waitq = global_eventq(event);
1016
1017 s = splsched();
1018 waitq_lock(waitq);
1019
1020 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1021 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1022 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1023
1024 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1025 interruptible,
1026 TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1027 TIMEOUT_NO_LEEWAY, thread);
1028 waitq_unlock(waitq);
1029 splx(s);
1030 return wresult;
1031}
1032
1033wait_result_t
1034assert_wait_deadline_with_leeway(
1035 event_t event,
1036 wait_interrupt_t interruptible,
1037 wait_timeout_urgency_t urgency,
1038 uint64_t deadline,
1039 uint64_t leeway)
1040{
1041 thread_t thread = current_thread();
1042 wait_result_t wresult;
1043 spl_t s;
1044
1045 if (__improbable(event == NO_EVENT))
1046 panic("%s() called with NO_EVENT", __func__);
1047
1048 struct waitq *waitq;
1049 waitq = global_eventq(event);
1050
1051 s = splsched();
1052 waitq_lock(waitq);
1053
1054 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1055 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1056 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1057
1058 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1059 interruptible,
1060 urgency, deadline, leeway,
1061 thread);
1062 waitq_unlock(waitq);
1063 splx(s);
1064 return wresult;
1065}
1066
1067/*
1068 * thread_isoncpu:
1069 *
1070 * Return TRUE if a thread is running on a processor such that an AST
1071 * is needed to pull it out of userspace execution, or if executing in
1072 * the kernel, bring to a context switch boundary that would cause
1073 * thread state to be serialized in the thread PCB.
1074 *
1075 * Thread locked, returns the same way. While locked, fields
1076 * like "state" cannot change. "runq" can change only from set to unset.
1077 */
1078static inline boolean_t
1079thread_isoncpu(thread_t thread)
1080{
1081 /* Not running or runnable */
1082 if (!(thread->state & TH_RUN))
1083 return (FALSE);
1084
1085 /* Waiting on a runqueue, not currently running */
1086 /* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
1087 if (thread->runq != PROCESSOR_NULL)
1088 return (FALSE);
1089
1090 /*
1091 * Thread does not have a stack yet
1092 * It could be on the stack alloc queue or preparing to be invoked
1093 */
1094 if (!thread->kernel_stack)
1095 return (FALSE);
1096
1097 /*
1098 * Thread must be running on a processor, or
1099 * about to run, or just did run. In all these
1100 * cases, an AST to the processor is needed
1101 * to guarantee that the thread is kicked out
1102 * of userspace and the processor has
1103 * context switched (and saved register state).
1104 */
1105 return (TRUE);
1106}
1107
1108/*
1109 * thread_stop:
1110 *
1111 * Force a preemption point for a thread and wait
1112 * for it to stop running on a CPU. If a stronger
1113 * guarantee is requested, wait until no longer
1114 * runnable. Arbitrates access among
1115 * multiple stop requests. (released by unstop)
1116 *
1117 * The thread must enter a wait state and stop via a
1118 * separate means.
1119 *
1120 * Returns FALSE if interrupted.
1121 */
1122boolean_t
1123thread_stop(
1124 thread_t thread,
1125 boolean_t until_not_runnable)
1126{
1127 wait_result_t wresult;
1128 spl_t s = splsched();
1129 boolean_t oncpu;
1130
1131 wake_lock(thread);
1132 thread_lock(thread);
1133
1134 while (thread->state & TH_SUSP) {
1135 thread->wake_active = TRUE;
1136 thread_unlock(thread);
1137
1138 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1139 wake_unlock(thread);
1140 splx(s);
1141
1142 if (wresult == THREAD_WAITING)
1143 wresult = thread_block(THREAD_CONTINUE_NULL);
1144
1145 if (wresult != THREAD_AWAKENED)
1146 return (FALSE);
1147
1148 s = splsched();
1149 wake_lock(thread);
1150 thread_lock(thread);
1151 }
1152
1153 thread->state |= TH_SUSP;
1154
1155 while ((oncpu = thread_isoncpu(thread)) ||
1156 (until_not_runnable && (thread->state & TH_RUN))) {
1157 processor_t processor;
1158
1159 if (oncpu) {
1160 assert(thread->state & TH_RUN);
1161 processor = thread->chosen_processor;
1162 cause_ast_check(processor);
1163 }
1164
1165 thread->wake_active = TRUE;
1166 thread_unlock(thread);
1167
1168 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1169 wake_unlock(thread);
1170 splx(s);
1171
1172 if (wresult == THREAD_WAITING)
1173 wresult = thread_block(THREAD_CONTINUE_NULL);
1174
1175 if (wresult != THREAD_AWAKENED) {
1176 thread_unstop(thread);
1177 return (FALSE);
1178 }
1179
1180 s = splsched();
1181 wake_lock(thread);
1182 thread_lock(thread);
1183 }
1184
1185 thread_unlock(thread);
1186 wake_unlock(thread);
1187 splx(s);
1188
1189 /*
1190 * We return with the thread unlocked. To prevent it from
1191 * transitioning to a runnable state (or from TH_RUN to
1192 * being on the CPU), the caller must ensure the thread
1193 * is stopped via an external means (such as an AST)
1194 */
1195
1196 return (TRUE);
1197}
1198
1199/*
1200 * thread_unstop:
1201 *
1202 * Release a previous stop request and set
1203 * the thread running if appropriate.
1204 *
1205 * Use only after a successful stop operation.
1206 */
1207void
1208thread_unstop(
1209 thread_t thread)
1210{
1211 spl_t s = splsched();
1212
1213 wake_lock(thread);
1214 thread_lock(thread);
1215
1216 assert((thread->state & (TH_RUN|TH_WAIT|TH_SUSP)) != TH_SUSP);
1217
1218 if (thread->state & TH_SUSP) {
1219 thread->state &= ~TH_SUSP;
1220
1221 if (thread->wake_active) {
1222 thread->wake_active = FALSE;
1223 thread_unlock(thread);
1224
1225 thread_wakeup(&thread->wake_active);
1226 wake_unlock(thread);
1227 splx(s);
1228
1229 return;
1230 }
1231 }
1232
1233 thread_unlock(thread);
1234 wake_unlock(thread);
1235 splx(s);
1236}
1237
1238/*
1239 * thread_wait:
1240 *
1241 * Wait for a thread to stop running. (non-interruptible)
1242 *
1243 */
1244void
1245thread_wait(
1246 thread_t thread,
1247 boolean_t until_not_runnable)
1248{
1249 wait_result_t wresult;
1250 boolean_t oncpu;
1251 processor_t processor;
1252 spl_t s = splsched();
1253
1254 wake_lock(thread);
1255 thread_lock(thread);
1256
1257 /*
1258 * Wait until not running on a CPU. If stronger requirement
1259 * desired, wait until not runnable. Assumption: if thread is
1260 * on CPU, then TH_RUN is set, so we're not waiting in any case
1261 * where the original, pure "TH_RUN" check would have let us
1262 * finish.
1263 */
1264 while ((oncpu = thread_isoncpu(thread)) ||
1265 (until_not_runnable && (thread->state & TH_RUN))) {
1266
1267 if (oncpu) {
1268 assert(thread->state & TH_RUN);
1269 processor = thread->chosen_processor;
1270 cause_ast_check(processor);
1271 }
1272
1273 thread->wake_active = TRUE;
1274 thread_unlock(thread);
1275
1276 wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
1277 wake_unlock(thread);
1278 splx(s);
1279
1280 if (wresult == THREAD_WAITING)
1281 thread_block(THREAD_CONTINUE_NULL);
1282
1283 s = splsched();
1284 wake_lock(thread);
1285 thread_lock(thread);
1286 }
1287
1288 thread_unlock(thread);
1289 wake_unlock(thread);
1290 splx(s);
1291}
1292
1293/*
1294 * Routine: clear_wait_internal
1295 *
1296 * Clear the wait condition for the specified thread.
1297 * Start the thread executing if that is appropriate.
1298 * Arguments:
1299 * thread thread to awaken
1300 * result Wakeup result the thread should see
1301 * Conditions:
1302 * At splsched
1303 * the thread is locked.
1304 * Returns:
1305 * KERN_SUCCESS thread was rousted out a wait
1306 * KERN_FAILURE thread was waiting but could not be rousted
1307 * KERN_NOT_WAITING thread was not waiting
1308 */
1309__private_extern__ kern_return_t
1310clear_wait_internal(
1311 thread_t thread,
1312 wait_result_t wresult)
1313{
1314 uint32_t i = LockTimeOutUsec;
1315 struct waitq *waitq = thread->waitq;
1316
1317 do {
1318 if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT))
1319 return (KERN_FAILURE);
1320
1321 if (waitq != NULL) {
1322 if (!waitq_pull_thread_locked(waitq, thread)) {
1323 thread_unlock(thread);
1324 delay(1);
1325 if (i > 0 && !machine_timeout_suspended())
1326 i--;
1327 thread_lock(thread);
1328 if (waitq != thread->waitq)
1329 return KERN_NOT_WAITING;
1330 continue;
1331 }
1332 }
1333
1334 /* TODO: Can we instead assert TH_TERMINATE is not set? */
1335 if ((thread->state & (TH_WAIT|TH_TERMINATE)) == TH_WAIT)
1336 return (thread_go(thread, wresult));
1337 else
1338 return (KERN_NOT_WAITING);
1339 } while (i > 0);
1340
1341 panic("clear_wait_internal: deadlock: thread=%p, wq=%p, cpu=%d\n",
1342 thread, waitq, cpu_number());
1343
1344 return (KERN_FAILURE);
1345}
1346
1347
1348/*
1349 * clear_wait:
1350 *
1351 * Clear the wait condition for the specified thread. Start the thread
1352 * executing if that is appropriate.
1353 *
1354 * parameters:
1355 * thread thread to awaken
1356 * result Wakeup result the thread should see
1357 */
1358kern_return_t
1359clear_wait(
1360 thread_t thread,
1361 wait_result_t result)
1362{
1363 kern_return_t ret;
1364 spl_t s;
1365
1366 s = splsched();
1367 thread_lock(thread);
1368 ret = clear_wait_internal(thread, result);
1369 thread_unlock(thread);
1370 splx(s);
1371 return ret;
1372}
1373
1374
1375/*
1376 * thread_wakeup_prim:
1377 *
1378 * Common routine for thread_wakeup, thread_wakeup_with_result,
1379 * and thread_wakeup_one.
1380 *
1381 */
1382kern_return_t
1383thread_wakeup_prim(
1384 event_t event,
1385 boolean_t one_thread,
1386 wait_result_t result)
1387{
1388 if (__improbable(event == NO_EVENT))
1389 panic("%s() called with NO_EVENT", __func__);
1390
1391 struct waitq *wq = global_eventq(event);
1392
1393 if (one_thread)
1394 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
1395 else
1396 return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
1397}
1398
1399/*
1400 * Wakeup a specified thread if and only if it's waiting for this event
1401 */
1402kern_return_t
1403thread_wakeup_thread(
1404 event_t event,
1405 thread_t thread)
1406{
1407 if (__improbable(event == NO_EVENT))
1408 panic("%s() called with NO_EVENT", __func__);
1409
1410 struct waitq *wq = global_eventq(event);
1411
1412 return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
1413}
1414
1415/*
1416 * Wakeup a thread waiting on an event and promote it to a priority.
1417 *
1418 * Requires woken thread to un-promote itself when done.
1419 */
1420kern_return_t
1421thread_wakeup_one_with_pri(
1422 event_t event,
1423 int priority)
1424{
1425 if (__improbable(event == NO_EVENT))
1426 panic("%s() called with NO_EVENT", __func__);
1427
1428 struct waitq *wq = global_eventq(event);
1429
1430 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1431}
1432
1433/*
1434 * Wakeup a thread waiting on an event,
1435 * promote it to a priority,
1436 * and return a reference to the woken thread.
1437 *
1438 * Requires woken thread to un-promote itself when done.
1439 */
1440thread_t
1441thread_wakeup_identify(event_t event,
1442 int priority)
1443{
1444 if (__improbable(event == NO_EVENT))
1445 panic("%s() called with NO_EVENT", __func__);
1446
1447 struct waitq *wq = global_eventq(event);
1448
1449 return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1450}
1451
1452/*
1453 * thread_bind:
1454 *
1455 * Force the current thread to execute on the specified processor.
1456 * Takes effect after the next thread_block().
1457 *
1458 * Returns the previous binding. PROCESSOR_NULL means
1459 * not bound.
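 *
 * Usage note: as thread_vm_bind_group_add() below illustrates, callers
 * typically bind and then call thread_block() so that the new binding,
 * which only takes effect at the next dispatch, actually migrates the
 * thread.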
1460 *
1461 * XXX - DO NOT export this to users - XXX
1462 */
1463processor_t
1464thread_bind(
1465 processor_t processor)
1466{
1467 thread_t self = current_thread();
1468 processor_t prev;
1469 spl_t s;
1470
1471 s = splsched();
1472 thread_lock(self);
1473
1474 prev = thread_bind_internal(self, processor);
1475
1476 thread_unlock(self);
1477 splx(s);
1478
1479 return (prev);
1480}
1481
1482/*
1483 * thread_bind_internal:
1484 *
1485 * If the specified thread is not the current thread, and it is currently
1486 * running on another CPU, a remote AST must be sent to that CPU to cause
1487 * the thread to migrate to its bound processor. Otherwise, the migration
1488 * will occur at the next quantum expiration or blocking point.
1489 *
 1490 * When the thread is the current thread, an explicit thread_block() should
1491 * be used to force the current processor to context switch away and
1492 * let the thread migrate to the bound processor.
1493 *
1494 * Thread must be locked, and at splsched.
1495 */
1496
1497static processor_t
1498thread_bind_internal(
1499 thread_t thread,
1500 processor_t processor)
1501{
1502 processor_t prev;
1503
1504 /* <rdar://problem/15102234> */
1505 assert(thread->sched_pri < BASEPRI_RTQUEUES);
1506 /* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
1507 assert(thread->runq == PROCESSOR_NULL);
1508
1509 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND), thread_tid(thread), processor ? (uintptr_t)processor->cpu_id : (uintptr_t)-1, 0, 0, 0);
1510
1511 prev = thread->bound_processor;
1512 thread->bound_processor = processor;
1513
1514 return (prev);
1515}
1516
1517/*
1518 * thread_vm_bind_group_add:
1519 *
1520 * The "VM bind group" is a special mechanism to mark a collection
1521 * of threads from the VM subsystem that, in general, should be scheduled
1522 * with only one CPU of parallelism. To accomplish this, we initially
1523 * bind all the threads to the master processor, which has the effect
1524 * that only one of the threads in the group can execute at once, including
1525 * preempting threads in the group that are a lower priority. Future
1526 * mechanisms may use more dynamic mechanisms to prevent the collection
1527 * of VM threads from using more CPU time than desired.
1528 *
1529 * The current implementation can result in priority inversions where
1530 * compute-bound priority 95 or realtime threads that happen to have
1531 * landed on the master processor prevent the VM threads from running.
1532 * When this situation is detected, we unbind the threads for one
 1533 * scheduler tick to allow the scheduler to run the threads on
1534 * additional CPUs, before restoring the binding (assuming high latency
1535 * is no longer a problem).
1536 */
1537
1538/*
1539 * The current max is provisioned for:
1540 * vm_compressor_swap_trigger_thread (92)
1541 * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
1542 * vm_pageout_continue (92)
1543 * memorystatus_thread (95)
1544 */
1545#define MAX_VM_BIND_GROUP_COUNT (5)
1546decl_simple_lock_data(static,sched_vm_group_list_lock);
1547static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
1548static int sched_vm_group_thread_count;
1549static boolean_t sched_vm_group_temporarily_unbound = FALSE;
1550
1551void
1552thread_vm_bind_group_add(void)
1553{
1554 thread_t self = current_thread();
1555
1556 thread_reference_internal(self);
1557 self->options |= TH_OPT_SCHED_VM_GROUP;
1558
1559 simple_lock(&sched_vm_group_list_lock);
1560 assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
1561 sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
1562 simple_unlock(&sched_vm_group_list_lock);
1563
1564 thread_bind(master_processor);
1565
1566 /* Switch to bound processor if not already there */
1567 thread_block(THREAD_CONTINUE_NULL);
1568}
1569
1570static void
1571sched_vm_group_maintenance(void)
1572{
1573 uint64_t ctime = mach_absolute_time();
1574 uint64_t longtime = ctime - sched_tick_interval;
1575 int i;
1576 spl_t s;
1577 boolean_t high_latency_observed = FALSE;
1578 boolean_t runnable_and_not_on_runq_observed = FALSE;
1579 boolean_t bind_target_changed = FALSE;
1580 processor_t bind_target = PROCESSOR_NULL;
1581
1582 /* Make sure nobody attempts to add new threads while we are enumerating them */
1583 simple_lock(&sched_vm_group_list_lock);
1584
1585 s = splsched();
1586
1587 for (i=0; i < sched_vm_group_thread_count; i++) {
1588 thread_t thread = sched_vm_group_thread_list[i];
1589 assert(thread != THREAD_NULL);
1590 thread_lock(thread);
1591 if ((thread->state & (TH_RUN|TH_WAIT)) == TH_RUN) {
1592 if (thread->runq != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
1593 high_latency_observed = TRUE;
1594 } else if (thread->runq == PROCESSOR_NULL) {
 1595 /* There are some cases where a thread may be transitioning that also fall into this case */
1596 runnable_and_not_on_runq_observed = TRUE;
1597 }
1598 }
1599 thread_unlock(thread);
1600
1601 if (high_latency_observed && runnable_and_not_on_runq_observed) {
1602 /* All the things we are looking for are true, stop looking */
1603 break;
1604 }
1605 }
1606
1607 splx(s);
1608
1609 if (sched_vm_group_temporarily_unbound) {
1610 /* If we turned off binding, make sure everything is OK before rebinding */
1611 if (!high_latency_observed) {
1612 /* rebind */
1613 bind_target_changed = TRUE;
1614 bind_target = master_processor;
1615 sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
1616 }
1617 } else {
1618 /*
1619 * Check if we're in a bad state, which is defined by high
1620 * latency with no core currently executing a thread. If a
1621 * single thread is making progress on a CPU, that means the
1622 * binding concept to reduce parallelism is working as
1623 * designed.
1624 */
1625 if (high_latency_observed && !runnable_and_not_on_runq_observed) {
1626 /* unbind */
1627 bind_target_changed = TRUE;
1628 bind_target = PROCESSOR_NULL;
1629 sched_vm_group_temporarily_unbound = TRUE;
1630 }
1631 }
1632
1633 if (bind_target_changed) {
1634 s = splsched();
1635 for (i=0; i < sched_vm_group_thread_count; i++) {
1636 thread_t thread = sched_vm_group_thread_list[i];
1637 boolean_t removed;
1638 assert(thread != THREAD_NULL);
1639
1640 thread_lock(thread);
1641 removed = thread_run_queue_remove(thread);
1642 if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
1643 thread_bind_internal(thread, bind_target);
1644 } else {
1645 /*
1646 * Thread was in the middle of being context-switched-to,
1647 * or was in the process of blocking. To avoid switching the bind
1648 * state out mid-flight, defer the change if possible.
1649 */
1650 if (bind_target == PROCESSOR_NULL) {
1651 thread_bind_internal(thread, bind_target);
1652 } else {
1653 sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
1654 }
1655 }
1656
1657 if (removed) {
1658 thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
1659 }
1660 thread_unlock(thread);
1661 }
1662 splx(s);
1663 }
1664
1665 simple_unlock(&sched_vm_group_list_lock);
1666}
1667
1668/* Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
1669 * rebalancing opportunity exists when a core is (instantaneously) idle, but
1670 * other SMT-capable cores may be over-committed. TODO: some possible negatives:
 1671 * IPI thrash if this core does not remain idle following the load balancing ASTs;
1672 * Idle "thrash", when IPI issue is followed by idle entry/core power down
1673 * followed by a wakeup shortly thereafter.
1674 */
1675
1676#if (DEVELOPMENT || DEBUG)
1677int sched_smt_balance = 1;
1678#endif
1679
1680#if __SMP__
1681/* Invoked with pset locked, returns with pset unlocked */
1682static void
1683sched_SMT_balance(processor_t cprocessor, processor_set_t cpset) {
1684 processor_t ast_processor = NULL;
1685
1686#if (DEVELOPMENT || DEBUG)
1687 if (__improbable(sched_smt_balance == 0))
1688 goto smt_balance_exit;
1689#endif
1690
1691 assert(cprocessor == current_processor());
1692 if (cprocessor->is_SMT == FALSE)
1693 goto smt_balance_exit;
1694
1695 processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;
1696
1697 /* Determine if both this processor and its sibling are idle,
1698 * indicating an SMT rebalancing opportunity.
1699 */
1700 if (sib_processor->state != PROCESSOR_IDLE)
1701 goto smt_balance_exit;
1702
1703 processor_t sprocessor;
1704
1705 qe_foreach_element(sprocessor, &cpset->active_queue, processor_queue) {
1706 if ((sprocessor->state == PROCESSOR_RUNNING) &&
1707 (sprocessor->processor_primary != sprocessor) &&
1708 (sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
1709 (sprocessor->current_pri < BASEPRI_RTQUEUES) &&
1710 ((cpset->pending_AST_cpu_mask & (1ULL << sprocessor->cpu_id)) == 0)) {
1711 assert(sprocessor != cprocessor);
1712 ast_processor = sprocessor;
1713 break;
1714 }
1715 }
1716
1717smt_balance_exit:
1718 pset_unlock(cpset);
1719
1720 if (ast_processor) {
1721 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
1722 cause_ast_check(ast_processor);
1723 }
1724}
1725#endif /* __SMP__ */
1726
1727/*
1728 * thread_select:
1729 *
1730 * Select a new thread for the current processor to execute.
1731 *
1732 * May select the current thread, which must be locked.
1733 */
1734static thread_t
1735thread_select(
1736 thread_t thread,
1737 processor_t processor,
1738 ast_t reason)
1739{
1740 processor_set_t pset = processor->processor_set;
1741 thread_t new_thread = THREAD_NULL;
1742
1743 assert(processor == current_processor());
1744 assert((thread->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN);
1745
1746 do {
1747 /*
1748 * Update the priority.
1749 */
1750 if (SCHED(can_update_priority)(thread))
1751 SCHED(update_priority)(thread);
1752
1753 processor->current_pri = thread->sched_pri;
1754 processor->current_thmode = thread->sched_mode;
1755 processor->current_sfi_class = thread->sfi_class;
1756
1757 pset_lock(pset);
1758
1759 assert(processor->state != PROCESSOR_OFF_LINE);
1760
1761 if (!processor->is_recommended) {
1762 /*
1763 * The performance controller has provided a hint to not dispatch more threads,
 1764 * unless they are bound to us (and thus we are the only option).
1765 */
1766 if (!SCHED(processor_bound_count)(processor)) {
1767 goto idle;
1768 }
1769 } else if (processor->processor_primary != processor) {
1770 /*
1771 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
1772 * we should look for work only under the same conditions that choose_processor()
1773 * would have assigned work, which is when all primary processors have been assigned work.
1774 *
1775 * An exception is that bound threads are dispatched to a processor without going through
1776 * choose_processor(), so in those cases we should continue trying to dequeue work.
1777 */
1778 if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue) && !rt_runq.count) {
1779 goto idle;
1780 }
1781 }
1782
1783 rt_lock_lock();
1784
1785 /*
1786 * Test to see if the current thread should continue
1787 * to run on this processor. Must not be attempting to wait, and not
1788 * bound to a different processor, nor be in the wrong
1789 * processor set, nor be forced to context switch by TH_SUSP.
1790 *
1791 * Note that there are never any RT threads in the regular runqueue.
1792 *
1793 * This code is very insanely tricky.
1794 */
1795
1796 if (((thread->state & (TH_TERMINATE|TH_IDLE|TH_WAIT|TH_RUN|TH_SUSP)) == TH_RUN) &&
1797 (thread->sched_pri >= BASEPRI_RTQUEUES || processor->processor_primary == processor) &&
1798 (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor) &&
1799 (thread->affinity_set == AFFINITY_SET_NULL || thread->affinity_set->aset_pset == pset)) {
1800 /*
1801 * RT threads with un-expired quantum stay on processor,
1802 * unless there's a valid RT thread with an earlier deadline.
1803 */
1804 if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
1805 if (rt_runq.count > 0) {
1806 thread_t next_rt = qe_queue_first(&rt_runq.queue, struct thread, runq_links);
1807
1808 assert(next_rt->runq == THREAD_ON_RT_RUNQ);
1809
1810 if (next_rt->realtime.deadline < processor->deadline &&
1811 (next_rt->bound_processor == PROCESSOR_NULL ||
1812 next_rt->bound_processor == processor)) {
1813 /* The next RT thread is better, so pick it off the runqueue. */
1814 goto pick_new_rt_thread;
1815 }
1816 }
1817
1818 /* This is still the best RT thread to run. */
1819 processor->deadline = thread->realtime.deadline;
1820
1821 rt_lock_unlock();
1822 pset_unlock(pset);
1823
1824 return (thread);
1825 }
1826
1827 if ((rt_runq.count == 0) &&
1828 SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
1829 /* This thread is still the highest priority runnable (non-idle) thread */
1830 processor->deadline = UINT64_MAX;
1831
1832 rt_lock_unlock();
1833 pset_unlock(pset);
1834
1835 return (thread);
1836 }
1837 }
1838
1839 /* OK, so we're not going to run the current thread. Look at the RT queue. */
1840 if (rt_runq.count > 0) {
1841 thread_t next_rt = qe_queue_first(&rt_runq.queue, struct thread, runq_links);
1842
1843 assert(next_rt->runq == THREAD_ON_RT_RUNQ);
1844
1845 if (__probable((next_rt->bound_processor == PROCESSOR_NULL ||
1846 (next_rt->bound_processor == processor)))) {
1847pick_new_rt_thread:
1848 new_thread = qe_dequeue_head(&rt_runq.queue, struct thread, runq_links);
1849
1850 new_thread->runq = PROCESSOR_NULL;
1851 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
1852 rt_runq.count--;
1853
1854 processor->deadline = new_thread->realtime.deadline;
1855
1856 rt_lock_unlock();
1857 pset_unlock(pset);
1858
1859 return (new_thread);
1860 }
1861 }
1862
1863 processor->deadline = UINT64_MAX;
1864 rt_lock_unlock();
1865
1866 /* No RT threads, so let's look at the regular threads. */
1867 if ((new_thread = SCHED(choose_thread)(processor, MINPRI, reason)) != THREAD_NULL) {
1868 pset_unlock(pset);
1869 return (new_thread);
1870 }
1871
1872#if __SMP__
1873 if (SCHED(steal_thread_enabled)) {
1874 /*
1875 * No runnable threads, attempt to steal
1876 * from other processors. Returns with pset lock dropped.
1877 */
1878
1879 if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
1880 return (new_thread);
1881 }
1882
1883 /*
1884 * If other threads have appeared, shortcut
1885 * around again.
1886 */
1887 if (!SCHED(processor_queue_empty)(processor) || rt_runq.count > 0)
1888 continue;
1889
1890 pset_lock(pset);
1891 }
1892#endif
1893
1894 idle:
1895 /*
1896 * Nothing is runnable, so set this processor idle if it
1897 * was running.
1898 */
1899 if (processor->state == PROCESSOR_RUNNING) {
1900 processor->state = PROCESSOR_IDLE;
1901
1902 if (processor->processor_primary == processor) {
1903 re_queue_head(&pset->idle_queue, &processor->processor_queue);
1904 } else {
1905 re_queue_head(&pset->idle_secondary_queue, &processor->processor_queue);
1906 }
1907 }
1908
1909#if __SMP__
1910 /* Invoked with pset locked, returns with pset unlocked */
1911 sched_SMT_balance(processor, pset);
1912#else
1913 pset_unlock(pset);
1914#endif
1915
1916#if CONFIG_SCHED_IDLE_IN_PLACE
1917 /*
1918 * Choose idle thread if fast idle is not possible.
1919 */
1920 if (processor->processor_primary != processor)
1921 return (processor->idle_thread);
1922
1923 if ((thread->state & (TH_IDLE|TH_TERMINATE|TH_SUSP)) || !(thread->state & TH_WAIT) || thread->wake_active || thread->sched_pri >= BASEPRI_RTQUEUES)
1924 return (processor->idle_thread);
1925
1926 /*
1927 * Perform idling activities directly without a
1928 * context switch. Return dispatched thread,
1929 * else check again for a runnable thread.
1930 */
1931 new_thread = thread_select_idle(thread, processor);
1932
1933#else /* !CONFIG_SCHED_IDLE_IN_PLACE */
1934
1935 /*
1936 * Do a full context switch to idle so that the current
1937 * thread can start running on another processor without
1938 * waiting for the fast-idled processor to wake up.
1939 */
1940 new_thread = processor->idle_thread;
1941
1942#endif /* !CONFIG_SCHED_IDLE_IN_PLACE */
1943
1944 } while (new_thread == THREAD_NULL);
1945
1946 return (new_thread);
1947}
1948
1949#if CONFIG_SCHED_IDLE_IN_PLACE
1950/*
1951 * thread_select_idle:
1952 *
1953 * Idle the processor using the current thread context.
1954 *
1955 * Called with thread locked, then dropped and relocked.
1956 */
1957static thread_t
1958thread_select_idle(
1959 thread_t thread,
1960 processor_t processor)
1961{
1962 thread_t new_thread;
1963 uint64_t arg1, arg2;
1964 int urgency;
1965
1966 sched_run_decr(thread);
1967
1968 thread->state |= TH_IDLE;
1969 processor->current_pri = IDLEPRI;
1970 processor->current_thmode = TH_MODE_NONE;
1971 processor->current_sfi_class = SFI_CLASS_KERNEL;
1972
1973 /* Reload precise timing global policy to thread-local policy */
1974 thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
1975
1976 thread_unlock(thread);
1977
1978 /*
1979 * Switch execution timing to processor idle thread.
1980 */
1981 processor->last_dispatch = mach_absolute_time();
1982
1983#ifdef CONFIG_MACH_APPROXIMATE_TIME
1984 commpage_update_mach_approximate_time(processor->last_dispatch);
1985#endif
1986
1987 thread->last_run_time = processor->last_dispatch;
1988 thread_timer_event(processor->last_dispatch, &processor->idle_thread->system_timer);
1989 PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer;
1990
1991 /*
1992 * Cancel the quantum timer while idling.
1993 */
1994 timer_call_cancel(&processor->quantum_timer);
1995 processor->first_timeslice = FALSE;
1996
1997 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
1998
1999 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, NULL);
2000
2001 /*
2002 * Enable interrupts and perform idling activities. No
2003 * preemption due to TH_IDLE being set.
2004 */
2005 spllo(); new_thread = processor_idle(thread, processor);
2006
2007 /*
2008 * Return at splsched.
2009 */
2010 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
2011
2012 thread_lock(thread);
2013
2014 /*
2015 * If awakened, switch to thread timer and start a new quantum.
2016 * Otherwise skip; we will context switch to another thread or return here.
2017 */
2018 if (!(thread->state & TH_WAIT)) {
2019 processor->last_dispatch = mach_absolute_time();
2020 thread_timer_event(processor->last_dispatch, &thread->system_timer);
2021 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2022
2023 thread_quantum_init(thread);
2024 processor->quantum_end = processor->last_dispatch + thread->quantum_remaining;
2025 timer_call_enter1(&processor->quantum_timer, thread, processor->quantum_end, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
2026 processor->first_timeslice = TRUE;
2027
2028 thread->computation_epoch = processor->last_dispatch;
2029 }
2030
2031 thread->state &= ~TH_IDLE;
2032
2033 urgency = thread_get_urgency(thread, &arg1, &arg2);
2034
2035 thread_tell_urgency(urgency, arg1, arg2, 0, new_thread);
2036
2037 sched_run_incr(thread);
2038
2039 return (new_thread);
2040}
2041#endif /* CONFIG_SCHED_IDLE_IN_PLACE */
2042
2043/*
2044 * thread_invoke
2045 *
2046 * Called at splsched with neither thread locked.
2047 *
2048 * Perform a context switch and start executing the new thread.
2049 *
2050 * Returns FALSE when the context switch didn't happen.
2051 * The reference to the new thread is still consumed.
2052 *
2053 * "self" is what is currently running on the processor,
2054 * "thread" is the new thread to context switch to
2055 * (which may be the same thread in some cases)
2056 */
2057static boolean_t
2058thread_invoke(
2059 thread_t self,
2060 thread_t thread,
2061 ast_t reason)
2062{
2063 if (__improbable(get_preemption_level() != 0)) {
2064 int pl = get_preemption_level();
2065 panic("thread_invoke: preemption_level %d, possible cause: %s",
2066 pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
2067 "blocking while holding a spinlock, or within interrupt context"));
2068 }
2069
2070 thread_continue_t continuation = self->continuation;
2071 void *parameter = self->parameter;
2072 processor_t processor;
2073
2074 uint64_t ctime = mach_absolute_time();
2075
2076#ifdef CONFIG_MACH_APPROXIMATE_TIME
2077 commpage_update_mach_approximate_time(ctime);
2078#endif
2079
2080#if defined(CONFIG_SCHED_TIMESHARE_CORE)
2081 sched_timeshare_consider_maintenance(ctime);
2082#endif
2083
2084 assert_thread_magic(self);
2085 assert(self == current_thread());
2086 assert(self->runq == PROCESSOR_NULL);
2087 assert((self->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN);
2088
2089 thread_lock(thread);
2090
2091 assert_thread_magic(thread);
2092 assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN);
2093 assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor());
2094 assert(thread->runq == PROCESSOR_NULL);
2095
2096 /* Reload precise timing global policy to thread-local policy */
2097 thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
2098
2099 /* Update SFI class based on other factors */
2100 thread->sfi_class = sfi_thread_classify(thread);
2101
2102 /* Allow realtime threads to hang onto a stack. */
2103 if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack)
2104 self->reserved_stack = self->kernel_stack;
2105
2106 if (continuation != NULL) {
2107 if (!thread->kernel_stack) {
2108 /*
2109 * If we are using a privileged stack,
2110 * check to see whether we can exchange it with
2111 * that of the other thread.
2112 */
2113 if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack)
2114 goto need_stack;
2115
2116 /*
2117 * Context switch by performing a stack handoff.
2118 */
2119 continuation = thread->continuation;
2120 parameter = thread->parameter;
2121
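			/* Make this processor's bookkeeping reflect the incoming thread. */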
2122 processor = current_processor();
2123 processor->active_thread = thread;
2124 processor->current_pri = thread->sched_pri;
2125 processor->current_thmode = thread->sched_mode;
2126 processor->current_sfi_class = thread->sfi_class;
2127 if (thread->last_processor != processor && thread->last_processor != NULL) {
2128 if (thread->last_processor->processor_set != processor->processor_set)
2129 thread->ps_switch++;
2130 thread->p_switch++;
2131 }
2132 thread->last_processor = processor;
2133 thread->c_switch++;
2134 ast_context(thread);
2135
2136 thread_unlock(thread);
2137
2138 self->reason = reason;
2139
2140 processor->last_dispatch = ctime;
2141 self->last_run_time = ctime;
2142 thread_timer_event(ctime, &thread->system_timer);
2143 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2144
2145 /*
2146 * Since non-precise user/kernel time doesn't update the state timer
2147 * during privilege transitions, synthesize an event now.
2148 */
2149 if (!thread->precise_user_kernel_time) {
2150 timer_switch(PROCESSOR_DATA(processor, current_state),
2151 ctime,
2152 PROCESSOR_DATA(processor, current_state));
2153 }
2154
2155 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2156 MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)|DBG_FUNC_NONE,
2157 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2158
2159 if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
2160 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
2161 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2162 }
2163
2164 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2165
2166 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2167
2168 TLOG(1, "thread_invoke: calling stack_handoff\n");
2169 stack_handoff(self, thread);
2170
2171 /* 'self' is now off core */
2172 assert(thread == current_thread());
2173
2174 DTRACE_SCHED(on__cpu);
2175
2176#if KPERF
2177 kperf_on_cpu(thread, continuation, NULL);
2178#endif /* KPERF */
2179
2180 thread_dispatch(self, thread);
2181
2182 thread->continuation = thread->parameter = NULL;
2183
2184 counter(c_thread_invoke_hits++);
2185
2186 (void) spllo();
2187
2188 assert(continuation);
2189 call_continuation(continuation, parameter, thread->wait_result);
2190 /*NOTREACHED*/
2191 }
2192 else if (thread == self) {
2193 /* same thread but with continuation */
2194 ast_context(self);
2195 counter(++c_thread_invoke_same);
2196
2197 thread_unlock(self);
2198
2199#if KPERF
2200 kperf_on_cpu(thread, continuation, NULL);
2201#endif /* KPERF */
2202
2203 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2204 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2205 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2206
2207 self->continuation = self->parameter = NULL;
2208
2209 (void) spllo();
2210
2211 call_continuation(continuation, parameter, self->wait_result);
2212 /*NOTREACHED*/
2213 }
2214 } else {
2215 /*
2216 * Check that the other thread has a stack
2217 */
2218 if (!thread->kernel_stack) {
2219need_stack:
2220 if (!stack_alloc_try(thread)) {
2221 counter(c_thread_invoke_misses++);
2222 thread_unlock(thread);
2223 thread_stack_enqueue(thread);
2224 return (FALSE);
2225 }
2226 } else if (thread == self) {
2227 ast_context(self);
2228 counter(++c_thread_invoke_same);
2229 thread_unlock(self);
2230
2231 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2232 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2233 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2234
2235 return (TRUE);
2236 }
2237 }
2238
2239 /*
2240 * Context switch by full context save.
2241 */
2242 processor = current_processor();
2243 processor->active_thread = thread;
2244 processor->current_pri = thread->sched_pri;
2245 processor->current_thmode = thread->sched_mode;
2246 processor->current_sfi_class = thread->sfi_class;
2247 if (thread->last_processor != processor && thread->last_processor != NULL) {
2248 if (thread->last_processor->processor_set != processor->processor_set)
2249 thread->ps_switch++;
2250 thread->p_switch++;
2251 }
2252 thread->last_processor = processor;
2253 thread->c_switch++;
2254 ast_context(thread);
2255
2256 thread_unlock(thread);
2257
2258 counter(c_thread_invoke_csw++);
2259
2260 self->reason = reason;
2261
2262 processor->last_dispatch = ctime;
2263 self->last_run_time = ctime;
2264 thread_timer_event(ctime, &thread->system_timer);
2265 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2266
2267 /*
2268 * Since non-precise user/kernel time doesn't update the state timer
2269 * during privilege transitions, synthesize an event now.
2270 */
2271 if (!thread->precise_user_kernel_time) {
2272 timer_switch(PROCESSOR_DATA(processor, current_state),
2273 ctime,
2274 PROCESSOR_DATA(processor, current_state));
2275 }
2276
2277 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2278 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2279 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2280
2281 if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
2282 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
2283 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2284 }
2285
2286 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2287
2288 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2289
2290 /*
2291 * This is where we actually switch register context,
2292 * and address space if required. We will next run
2293 * as a result of a subsequent context switch.
2294 *
2295 * Once registers are switched and the processor is running "thread",
2296 * the stack variables and non-volatile registers will contain whatever
2297 * was there the last time that thread blocked. No local variables should
2298 * be used after this point, except for the special case of "thread", which
2299 * the platform layer returns as the previous thread running on the processor
2300 * via the function call ABI as a return register, and "self", which may have
2301 * been stored on the stack or in a non-volatile register. That saved value is
2302 * a stale notion of which thread was on the CPU, but it is accurate again
2303 * because the same thread is once more running on the CPU.
2304 */
2305 assert(continuation == self->continuation);
2306 thread = machine_switch_context(self, continuation, thread);
2307 assert(self == current_thread());
2308 TLOG(1,"thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
2309
2310 DTRACE_SCHED(on__cpu);
2311
2312#if KPERF
2313 kperf_on_cpu(self, NULL, __builtin_frame_address(0));
2314#endif /* KPERF */
2315
2316 /*
2317 * We have been resumed and are set to run.
2318 */
2319 thread_dispatch(thread, self);
2320
2321 if (continuation) {
2322 self->continuation = self->parameter = NULL;
2323
2324 (void) spllo();
2325
2326 call_continuation(continuation, parameter, self->wait_result);
2327 /*NOTREACHED*/
2328 }
2329
2330 return (TRUE);
2331}
2332
2333#if defined(CONFIG_SCHED_DEFERRED_AST)
2334/*
2335 * pset_cancel_deferred_dispatch:
2336 *
2337 * Cancels all ASTs that we can cancel for the given processor set
2338 * if the current processor is running the last runnable thread in the
2339 * system.
2340 *
2341 * This function assumes the current thread is runnable. This must
2342 * be called with the pset unlocked.
2343 */
2344static void
2345pset_cancel_deferred_dispatch(
2346 processor_set_t pset,
2347 processor_t processor)
2348{
2349 processor_t active_processor = NULL;
2350 uint32_t sampled_sched_run_count;
2351
2352 pset_lock(pset);
2353 sampled_sched_run_count = (volatile uint32_t) sched_run_buckets[TH_BUCKET_RUN];
2354
2355 /*
2356 * If we have emptied the run queue, and our current thread is runnable, we
2357 * should tell any processors that are still DISPATCHING that they will
2358 * probably not have any work to do. In the event that there are no
2359 * pending signals that we can cancel, this is also uninteresting.
2360 *
2361 * In the unlikely event that another thread becomes runnable while we are
2362 * doing this (sched_run_count is atomically updated, not guarded), the
2363 * codepath making it runnable SHOULD (a dangerous word) need the pset lock
2364 * in order to dispatch it to a processor in our pset. So, the other
2365 * codepath will wait while we squash all cancelable ASTs, get the pset
2366 * lock, and then dispatch the freshly runnable thread. So this should be
2367 * correct (we won't accidentally have a runnable thread that hasn't been
2368 * dispatched to an idle processor), if not ideal (we may be restarting the
2369 * dispatch process, which could have some overhead).
2370 *
2371 */
2372 if ((sampled_sched_run_count == 1) &&
2373 (pset->pending_deferred_AST_cpu_mask)) {
2374 qe_foreach_element_safe(active_processor, &pset->active_queue, processor_queue) {
2375 /*
2376 * If a processor is DISPATCHING, it could be because of
2377 * a cancelable signal.
2378 *
2379 * IF the processor is not our
2380 * current processor (the current processor should not
2381 * be DISPATCHING, so this is a bit paranoid), AND there
2382 * is a cancelable signal pending on the processor, AND
2383 * there is no non-cancelable signal pending (as there is
2384 * no point trying to backtrack on bringing the processor
2385 * up if a signal we cannot cancel is outstanding), THEN
2386 * it should make sense to roll back the processor state
2387 * to the IDLE state.
2388 *
2389 * If the racy nature of this approach (as the signal
2390 * will be arbitrated by hardware, and can fire as we
2391 * roll back state) results in the core responding
2392 * despite being pushed back to the IDLE state, it
2393 * should be no different than if the core took some
2394 * interrupt while IDLE.
2395 */
2396 if ((active_processor->state == PROCESSOR_DISPATCHING) &&
2397 (pset->pending_deferred_AST_cpu_mask & (1ULL << active_processor->cpu_id)) &&
2398 (!(pset->pending_AST_cpu_mask & (1ULL << active_processor->cpu_id))) &&
2399 (active_processor != processor)) {
2400 /*
2401 * Squash all of the processor state back to some
2402 * reasonable facsimile of PROCESSOR_IDLE.
2403 *
2404 * TODO: What queue policy do we actually want here?
2405 * We want to promote selection of a good processor
2406 * to run on. Do we want to enqueue at the head?
2407 * The tail? At the (relative) old position in the
2408 * queue? Or something else entirely?
2409 */
2410 re_queue_head(&pset->idle_queue, &active_processor->processor_queue);
2411
2412 assert(active_processor->next_thread == THREAD_NULL);
2413
2414 active_processor->current_pri = IDLEPRI;
2415 active_processor->current_thmode = TH_MODE_FIXED;
2416 active_processor->current_sfi_class = SFI_CLASS_KERNEL;
2417 active_processor->deadline = UINT64_MAX;
2418 active_processor->state = PROCESSOR_IDLE;
2419 pset->pending_deferred_AST_cpu_mask &= ~(1ULL << active_processor->cpu_id);
2420 machine_signal_idle_cancel(active_processor);
2421 }
2422
2423 }
2424 }
2425
2426 pset_unlock(pset);
2427}
2428#else
2429/* We don't support deferred ASTs; everything is candycanes and sunshine. */
2430#endif
2431
2432/*
2433 * thread_dispatch:
2434 *
2435 * Handle threads at context switch. Re-dispatch other thread
2436 * if still running, otherwise update run state and perform
2437 * special actions. Update quantum for other thread and begin
2438 * the quantum for ourselves.
2439 *
2440 * "thread" is the old thread that we have switched away from.
2441 * "self" is the new current thread that we have context switched to
2442 *
2443 * Called at splsched.
2444 */
2445void
2446thread_dispatch(
2447 thread_t thread,
2448 thread_t self)
2449{
2450 processor_t processor = self->last_processor;
2451
2452 assert(processor == current_processor());
2453 assert(self == current_thread());
2454 assert(thread != self);
2455
2456 if (thread != THREAD_NULL) {
2457 /*
2458 * If blocked at a continuation, discard
2459 * the stack.
2460 */
2461 if (thread->continuation != NULL && thread->kernel_stack != 0)
2462 stack_free(thread);
2463
2464 if (thread->state & TH_IDLE) {
2465 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2466 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2467 (uintptr_t)thread_tid(thread), 0, thread->state,
2468 sched_run_buckets[TH_BUCKET_RUN], 0);
2469 } else {
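			/*
			 * Compute how much of its quantum the outgoing thread actually
			 * consumed before coming off core.
			 */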
2470 int64_t consumed;
2471 int64_t remainder = 0;
2472
2473 if (processor->quantum_end > processor->last_dispatch)
2474 remainder = processor->quantum_end -
2475 processor->last_dispatch;
2476
2477 consumed = thread->quantum_remaining - remainder;
2478
2479 if ((thread->reason & AST_LEDGER) == 0) {
2480 /*
2481 * Bill CPU time to both the task and
2482 * the individual thread.
2483 */
2484 ledger_credit(thread->t_ledger,
2485 task_ledgers.cpu_time, consumed);
2486 ledger_credit(thread->t_threadledger,
2487 thread_ledgers.cpu_time, consumed);
2488#ifdef CONFIG_BANK
2489 if (thread->t_bankledger) {
2490 ledger_credit(thread->t_bankledger,
2491 bank_ledgers.cpu_time,
2492 (consumed - thread->t_deduct_bank_ledger_time));
2493
2494 }
2495 thread->t_deduct_bank_ledger_time = 0;
2496#endif
2497 }
2498
2499 wake_lock(thread);
2500 thread_lock(thread);
2501
2502 /*
2503 * Apply a priority floor if the thread holds a kernel resource.
2504 * Do this before checking starting_pri to avoid overpenalizing
2505 * repeated rwlock blockers.
2506 */
2507 if (__improbable(thread->rwlock_count != 0))
2508 lck_rw_set_promotion_locked(thread);
2509
2510 boolean_t keep_quantum = processor->first_timeslice;
2511
2512 /*
2513 * Treat a thread which has dropped priority since it got on core
2514 * as having expired its quantum.
2515 */
2516 if (processor->starting_pri > thread->sched_pri)
2517 keep_quantum = FALSE;
2518
2519 /* Compute remainder of current quantum. */
2520 if (keep_quantum &&
2521 processor->quantum_end > processor->last_dispatch)
2522 thread->quantum_remaining = (uint32_t)remainder;
2523 else
2524 thread->quantum_remaining = 0;
2525
2526 if (thread->sched_mode == TH_MODE_REALTIME) {
2527 /*
2528 * Cancel the deadline if the thread has
2529 * consumed the entire quantum.
2530 */
2531 if (thread->quantum_remaining == 0) {
2532 thread->realtime.deadline = UINT64_MAX;
2533 }
2534 } else {
2535#if defined(CONFIG_SCHED_TIMESHARE_CORE)
2536 /*
2537 * For non-realtime threads treat a tiny
2538 * remaining quantum as an expired quantum
2539 * but include what's left next time.
2540 */
2541 if (thread->quantum_remaining < min_std_quantum) {
2542 thread->reason |= AST_QUANTUM;
2543 thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
2544 }
2545#endif /* CONFIG_SCHED_TIMESHARE_CORE */
2546 }
2547
2548 /*
2549 * If we are doing a direct handoff then
2550 * take the remainder of the quantum.
2551 */
2552 if ((thread->reason & (AST_HANDOFF|AST_QUANTUM)) == AST_HANDOFF) {
2553 self->quantum_remaining = thread->quantum_remaining;
2554 thread->reason |= AST_QUANTUM;
2555 thread->quantum_remaining = 0;
2556 } else {
2557#if defined(CONFIG_SCHED_MULTIQ)
2558 if (SCHED(sched_groups_enabled) &&
2559 thread->sched_group == self->sched_group) {
2560 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2561 MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
2562 self->reason, (uintptr_t)thread_tid(thread),
2563 self->quantum_remaining, thread->quantum_remaining, 0);
2564
2565 self->quantum_remaining = thread->quantum_remaining;
2566 thread->quantum_remaining = 0;
2567 /* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
2568 }
2569#endif /* defined(CONFIG_SCHED_MULTIQ) */
2570 }
2571
2572 thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
2573
2574 if (!(thread->state & TH_WAIT)) {
2575 /*
2576 * Still runnable.
2577 */
2578 thread->last_made_runnable_time = mach_approximate_time();
2579
2580 machine_thread_going_off_core(thread, FALSE, processor->last_dispatch);
2581
2582 if (thread->reason & AST_QUANTUM)
2583 thread_setrun(thread, SCHED_TAILQ);
2584 else if (thread->reason & AST_PREEMPT)
2585 thread_setrun(thread, SCHED_HEADQ);
2586 else
2587 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
2588
2589 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2590 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2591 (uintptr_t)thread_tid(thread), thread->reason, thread->state,
2592 sched_run_buckets[TH_BUCKET_RUN], 0);
2593
2594 if (thread->wake_active) {
2595 thread->wake_active = FALSE;
2596 thread_unlock(thread);
2597
2598 thread_wakeup(&thread->wake_active);
2599 } else {
2600 thread_unlock(thread);
2601 }
2602
2603 wake_unlock(thread);
2604 } else {
2605 /*
2606 * Waiting.
2607 */
2608 boolean_t should_terminate = FALSE;
2609 uint32_t new_run_count;
2610
2611 /* Only the first call to thread_dispatch
2612 * after explicit termination should add
2613 * the thread to the termination queue.
2614 */
2615 if ((thread->state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) {
2616 should_terminate = TRUE;
2617 thread->state |= TH_TERMINATE2;
2618 }
2619
2620 thread->state &= ~TH_RUN;
2621 thread->last_made_runnable_time = ~0ULL;
2622 thread->chosen_processor = PROCESSOR_NULL;
2623
2624 new_run_count = sched_run_decr(thread);
2625
2626#if CONFIG_SCHED_SFI
2627 if ((thread->state & (TH_WAIT | TH_TERMINATE)) == TH_WAIT) {
2628 if (thread->reason & AST_SFI) {
2629 thread->wait_sfi_begin_time = processor->last_dispatch;
2630 }
2631 }
2632#endif
2633
2634 machine_thread_going_off_core(thread, should_terminate, processor->last_dispatch);
2635
2636 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2637 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2638 (uintptr_t)thread_tid(thread), thread->reason, thread->state,
2639 new_run_count, 0);
2640
2641 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
2642
2643 if (thread->wake_active) {
2644 thread->wake_active = FALSE;
2645 thread_unlock(thread);
2646
2647 thread_wakeup(&thread->wake_active);
2648 } else {
2649 thread_unlock(thread);
2650 }
2651
2652 wake_unlock(thread);
2653
2654 if (should_terminate)
2655 thread_terminate_enqueue(thread);
2656 }
2657 }
2658 }
2659
2660 /* Update (new) current thread and reprogram quantum timer */
2661 thread_lock(self);
2662 if (!(self->state & TH_IDLE)) {
2663 uint64_t arg1, arg2;
2664 int urgency;
2665 uint64_t latency;
2666
2667#if CONFIG_SCHED_SFI
2668 ast_t new_ast;
2669
2670 new_ast = sfi_thread_needs_ast(self, NULL);
2671
2672 if (new_ast != AST_NONE) {
2673 ast_on(new_ast);
2674 }
2675#endif
2676
2677 assertf(processor->last_dispatch >= self->last_made_runnable_time, "Non-monotonic time? dispatch at 0x%llx, runnable at 0x%llx", processor->last_dispatch, self->last_made_runnable_time);
2678 latency = processor->last_dispatch - self->last_made_runnable_time;
2679
2680 urgency = thread_get_urgency(self, &arg1, &arg2);
2681
2682 thread_tell_urgency(urgency, arg1, arg2, latency, self);
2683
2684 machine_thread_going_on_core(self, urgency, latency, processor->last_dispatch);
2685
2686 /*
2687 * Get a new quantum if none remaining.
2688 */
2689 if (self->quantum_remaining == 0) {
2690 thread_quantum_init(self);
2691 }
2692
2693 /*
2694 * Set up quantum timer and timeslice.
2695 */
2696 processor->quantum_end = processor->last_dispatch + self->quantum_remaining;
2697 timer_call_enter1(&processor->quantum_timer, self, processor->quantum_end, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
2698
2699 processor->first_timeslice = TRUE;
2700 } else {
2701 timer_call_cancel(&processor->quantum_timer);
2702 processor->first_timeslice = FALSE;
2703
2704 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
2705 machine_thread_going_on_core(self, THREAD_URGENCY_NONE, 0, processor->last_dispatch);
2706 }
2707
2708 assert(self->block_hint == kThreadWaitNone);
2709 self->computation_epoch = processor->last_dispatch;
2710 self->reason = AST_NONE;
2711 processor->starting_pri = self->sched_pri;
2712
2713 thread_unlock(self);
2714
2715#if defined(CONFIG_SCHED_DEFERRED_AST)
2716 /*
2717 * TODO: Can we state that redispatching our old thread is also
2718 * uninteresting?
2719 */
2720 if ((((volatile uint32_t)sched_run_buckets[TH_BUCKET_RUN]) == 1) &&
2721 !(self->state & TH_IDLE)) {
2722 pset_cancel_deferred_dispatch(processor->processor_set, processor);
2723 }
2724#endif
2725
2726}
2727
2728/*
2729 * thread_block_reason:
2730 *
2731 * Forces a reschedule, blocking the caller if a wait
2732 * has been asserted.
2733 *
2734 * If a continuation is specified, then thread_invoke will
2735 * attempt to discard the thread's kernel stack. When the
2736 * thread resumes, it will execute the continuation function
2737 * on a new kernel stack.
2738 */
2739counter(mach_counter_t c_thread_block_calls = 0;)
2740
2741wait_result_t
2742thread_block_reason(
2743 thread_continue_t continuation,
2744 void *parameter,
2745 ast_t reason)
2746{
2747 thread_t self = current_thread();
2748 processor_t processor;
2749 thread_t new_thread;
2750 spl_t s;
2751
2752 counter(++c_thread_block_calls);
2753
2754 s = splsched();
2755
2756 processor = current_processor();
2757
2758 /* If we're explicitly yielding, force a subsequent quantum */
2759 if (reason & AST_YIELD)
2760 processor->first_timeslice = FALSE;
2761
2762 /* We're handling all scheduling ASTs */
2763 ast_off(AST_SCHEDULING);
2764
2765#if PROC_REF_DEBUG
2766 if ((continuation != NULL) && (self->task != kernel_task)) {
2767 if (uthread_get_proc_refcount(self->uthread) != 0) {
2768 panic("thread_block_reason with continuation uthread %p with uu_proc_refcount != 0", self->uthread);
2769 }
2770 }
2771#endif
2772
2773 self->continuation = continuation;
2774 self->parameter = parameter;
2775
2776 if (self->state & ~(TH_RUN | TH_IDLE)) {
2777 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2778 MACHDBG_CODE(DBG_MACH_SCHED,MACH_BLOCK),
2779 reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
2780 }
2781
2782 do {
2783 thread_lock(self);
2784 new_thread = thread_select(self, processor, reason);
2785 thread_unlock(self);
2786 } while (!thread_invoke(self, new_thread, reason));
2787
2788 splx(s);
2789
2790 return (self->wait_result);
2791}
2792
2793/*
2794 * thread_block:
2795 *
2796 * Block the current thread if a wait has been asserted.
2797 */
2798wait_result_t
2799thread_block(
2800 thread_continue_t continuation)
2801{
2802 return thread_block_reason(continuation, NULL, AST_NONE);
2803}
2804
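/*
 *	thread_block_parameter:
 *
 *	Block the current thread if a wait has been asserted,
 *	passing "parameter" to the continuation when the thread resumes.
 */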
2805wait_result_t
2806thread_block_parameter(
2807 thread_continue_t continuation,
2808 void *parameter)
2809{
2810 return thread_block_reason(continuation, parameter, AST_NONE);
2811}
2812
2813/*
2814 * thread_run:
2815 *
2816 * Switch directly from the current thread to the
2817 * new thread, handing off our quantum if appropriate.
2818 *
2819 * New thread must be runnable, and not on a run queue.
2820 *
2821 * Called at splsched.
2822 */
2823int
2824thread_run(
2825 thread_t self,
2826 thread_continue_t continuation,
2827 void *parameter,
2828 thread_t new_thread)
2829{
2830 ast_t handoff = AST_HANDOFF;
2831
2832 self->continuation = continuation;
2833 self->parameter = parameter;
2834
2835 while (!thread_invoke(self, new_thread, handoff)) {
2836 processor_t processor = current_processor();
2837
2838 thread_lock(self);
2839 new_thread = thread_select(self, processor, AST_NONE);
2840 thread_unlock(self);
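		/* Only the first attempt is a handoff; retries go through normal selection. */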
2841 handoff = AST_NONE;
2842 }
2843
2844 return (self->wait_result);
2845}
2846
2847/*
2848 * thread_continue:
2849 *
2850 * Called at splsched when a thread first receives
2851 * a new stack after a continuation.
2852 */
2853void
2854thread_continue(
2855 thread_t thread)
2856{
2857 thread_t self = current_thread();
2858 thread_continue_t continuation;
2859 void *parameter;
2860
2861 DTRACE_SCHED(on__cpu);
2862
2863 continuation = self->continuation;
2864 parameter = self->parameter;
2865
2866#if KPERF
2867 kperf_on_cpu(self, continuation, NULL);
2868#endif
2869
2870 thread_dispatch(thread, self);
2871
2872 self->continuation = self->parameter = NULL;
2873
2874 if (thread != THREAD_NULL)
2875 (void)spllo();
2876
2877 TLOG(1, "thread_continue: calling call_continuation \n");
2878 call_continuation(continuation, parameter, self->wait_result);
2879 /*NOTREACHED*/
2880}
2881
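/*
 *	thread_quantum_init:
 *
 *	Reset the thread's remaining quantum: realtime threads get their
 *	requested computation time, all others get the scheduler's initial
 *	quantum size.
 */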
2882void
2883thread_quantum_init(thread_t thread)
2884{
2885 if (thread->sched_mode == TH_MODE_REALTIME) {
2886 thread->quantum_remaining = thread->realtime.computation;
2887 } else {
2888 thread->quantum_remaining = SCHED(initial_quantum_size)(thread);
2889 }
2890}
2891
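/*
 *	sched_timeshare_initial_quantum_size:
 *
 *	Background timeshare threads receive the background quantum;
 *	all other threads receive the standard quantum.
 */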
2892uint32_t
2893sched_timeshare_initial_quantum_size(thread_t thread)
2894{
2895 if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG)
2896 return bg_quantum;
2897 else
2898 return std_quantum;
2899}
2900
2901/*
2902 * run_queue_init:
2903 *
2904 * Initialize a run queue before first use.
2905 */
2906void
2907run_queue_init(
2908 run_queue_t rq)
2909{
2910 rq->highq = NOPRI;
2911 for (u_int i = 0; i < BITMAP_LEN(NRQS); i++)
2912 rq->bitmap[i] = 0;
2913 rq->urgency = rq->count = 0;
2914 for (int i = 0; i < NRQS; i++)
2915 queue_init(&rq->queues[i]);
2916}
2917
2918/*
2919 * run_queue_dequeue:
2920 *
2921 * Perform a dequeue operation on a run queue,
2922 * and return the resulting thread.
2923 *
2924 * The run queue must be locked (see thread_run_queue_remove()
2925 * for more info), and not empty.
2926 */
2927thread_t
2928run_queue_dequeue(
2929 run_queue_t rq,
2930 integer_t options)
2931{
2932 thread_t thread;
2933 queue_t queue = &rq->queues[rq->highq];
2934
2935 if (options & SCHED_HEADQ) {
2936 thread = qe_dequeue_head(queue, struct thread, runq_links);
2937 } else {
2938 thread = qe_dequeue_tail(queue, struct thread, runq_links);
2939 }
2940
2941 assert(thread != THREAD_NULL);
2942 assert_thread_magic(thread);
2943
2944 thread->runq = PROCESSOR_NULL;
2945 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2946 rq->count--;
2947 if (SCHED(priority_is_urgent)(rq->highq)) {
2948 rq->urgency--; assert(rq->urgency >= 0);
2949 }
2950 if (queue_empty(queue)) {
2951 bitmap_clear(rq->bitmap, rq->highq);
2952 rq->highq = bitmap_first(rq->bitmap, NRQS);
2953 }
2954
2955 return thread;
2956}
2957
2958/*
2959 * run_queue_enqueue:
2960 *
2961 * Perform an enqueue operation on a run queue.
2962 *
2963 * The run queue must be locked (see thread_run_queue_remove()
2964 * for more info).
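 *
 *	Returns TRUE if the enqueued thread became the new highest
 *	priority entry in the run queue.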
2965 */
2966boolean_t
2967run_queue_enqueue(
2968 run_queue_t rq,
2969 thread_t thread,
2970 integer_t options)
2971{
2972 queue_t queue = &rq->queues[thread->sched_pri];
2973 boolean_t result = FALSE;
2974
2975 assert_thread_magic(thread);
2976
2977 if (queue_empty(queue)) {
2978 enqueue_tail(queue, &thread->runq_links);
2979
2980 rq_bitmap_set(rq->bitmap, thread->sched_pri);
2981 if (thread->sched_pri > rq->highq) {
2982 rq->highq = thread->sched_pri;
2983 result = TRUE;
2984 }
2985 } else {
2986 if (options & SCHED_TAILQ)
2987 enqueue_tail(queue, &thread->runq_links);
2988 else
2989 enqueue_head(queue, &thread->runq_links);
2990 }
2991 if (SCHED(priority_is_urgent)(thread->sched_pri))
2992 rq->urgency++;
2993 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2994 rq->count++;
2995
2996 return (result);
2997}
2998
2999/*
3000 * run_queue_remove:
3001 *
3002 * Remove a specific thread from a runqueue.
3003 *
3004 * The run queue must be locked.
3005 */
3006void
3007run_queue_remove(
3008 run_queue_t rq,
3009 thread_t thread)
3010{
3011 assert(thread->runq != PROCESSOR_NULL);
3012 assert_thread_magic(thread);
3013
3014 remqueue(&thread->runq_links);
3015 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3016 rq->count--;
3017 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
3018 rq->urgency--; assert(rq->urgency >= 0);
3019 }
3020
3021 if (queue_empty(&rq->queues[thread->sched_pri])) {
3022 /* update run queue status */
3023 bitmap_clear(rq->bitmap, thread->sched_pri);
3024 rq->highq = bitmap_first(rq->bitmap, NRQS);
3025 }
3026
3027 thread->runq = PROCESSOR_NULL;
3028}
3029
3030/* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
3031void
3032rt_runq_scan(sched_update_scan_context_t scan_context)
3033{
3034 spl_t s;
3035 thread_t thread;
3036
3037 s = splsched();
3038 rt_lock_lock();
3039
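	/* Track the earliest make-runnable time among queued realtime threads. */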
3040 qe_foreach_element_safe(thread, &rt_runq.queue, runq_links) {
3041 if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
3042 scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
3043 }
3044 }
3045
3046 rt_lock_unlock();
3047 splx(s);
3048}
3049
3050
3051/*
3052 * realtime_queue_insert:
3053 *
3054 * Enqueue a thread for realtime execution.
3055 */
3056static boolean_t
3057realtime_queue_insert(thread_t thread)
3058{
3059 queue_t queue = &rt_runq.queue;
3060 uint64_t deadline = thread->realtime.deadline;
3061 boolean_t preempt = FALSE;
3062
3063 rt_lock_lock();
3064
3065 if (queue_empty(queue)) {
3066 enqueue_tail(queue, &thread->runq_links);
3067 preempt = TRUE;
3068 } else {
3069 /* Insert into rt_runq in thread deadline order */
3070 queue_entry_t iter;
3071 qe_foreach(iter, queue) {
3072 thread_t iter_thread = qe_element(iter, struct thread, runq_links);
3073 assert_thread_magic(iter_thread);
3074
3075 if (deadline < iter_thread->realtime.deadline) {
3076 if (iter == queue_first(queue))
3077 preempt = TRUE;
3078 insque(&thread->runq_links, queue_prev(iter));
3079 break;
3080 } else if (iter == queue_last(queue)) {
3081 enqueue_tail(queue, &thread->runq_links);
3082 break;
3083 }
3084 }
3085 }
3086
3087 thread->runq = THREAD_ON_RT_RUNQ;
3088 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
3089 rt_runq.count++;
3090
3091 rt_lock_unlock();
3092
3093 return (preempt);
3094}
3095
3096/*
3097 * realtime_setrun:
3098 *
3099 * Dispatch a thread for realtime execution.
3100 *
3101 * Thread must be locked. Associated pset must
3102 * be locked, and is returned unlocked.
3103 */
3104static void
3105realtime_setrun(
3106 processor_t processor,
3107 thread_t thread)
3108{
3109 processor_set_t pset = processor->processor_set;
3110 ast_t preempt;
3111
3112 boolean_t do_signal_idle = FALSE, do_cause_ast = FALSE;
3113
3114 thread->chosen_processor = processor;
3115
3116 /* <rdar://problem/15102234> */
3117 assert(thread->bound_processor == PROCESSOR_NULL);
3118
3119 /*
3120 * Dispatch directly onto idle processor.
3121 */
3122 if ( (thread->bound_processor == processor)
3123 && processor->state == PROCESSOR_IDLE) {
3124 re_queue_tail(&pset->active_queue, &processor->processor_queue);
3125
3126 processor->next_thread = thread;
3127 processor->current_pri = thread->sched_pri;
3128 processor->current_thmode = thread->sched_mode;
3129 processor->current_sfi_class = thread->sfi_class;
3130 processor->deadline = thread->realtime.deadline;
3131 processor->state = PROCESSOR_DISPATCHING;
3132
3133 if (processor != current_processor()) {
3134 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3135 /* cleared on exit from main processor_idle() loop */
3136 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3137 do_signal_idle = TRUE;
3138 }
3139 }
3140 pset_unlock(pset);
3141
3142 if (do_signal_idle) {
3143 machine_signal_idle(processor);
3144 }
3145 return;
3146 }
3147
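	/*
	 * Preempt if the processor is currently running a non-realtime
	 * thread, or a realtime thread with a later deadline than this one.
	 */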
3148 if (processor->current_pri < BASEPRI_RTQUEUES)
3149 preempt = (AST_PREEMPT | AST_URGENT);
3150 else if (thread->realtime.deadline < processor->deadline)
3151 preempt = (AST_PREEMPT | AST_URGENT);
3152 else
3153 preempt = AST_NONE;
3154
3155 realtime_queue_insert(thread);
3156
3157 if (preempt != AST_NONE) {
3158 if (processor->state == PROCESSOR_IDLE) {
3159 re_queue_tail(&pset->active_queue, &processor->processor_queue);
3160
3161 processor->next_thread = THREAD_NULL;
3162 processor->current_pri = thread->sched_pri;
3163 processor->current_thmode = thread->sched_mode;
3164 processor->current_sfi_class = thread->sfi_class;
3165 processor->deadline = thread->realtime.deadline;
3166 processor->state = PROCESSOR_DISPATCHING;
3167 if (processor == current_processor()) {
3168 ast_on(preempt);
3169 } else {
3170 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3171 /* cleared on exit from main processor_idle() loop */
3172 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3173 do_signal_idle = TRUE;
3174 }
3175 }
3176 } else if (processor->state == PROCESSOR_DISPATCHING) {
3177 if ((processor->next_thread == THREAD_NULL) && ((processor->current_pri < thread->sched_pri) || (processor->deadline > thread->realtime.deadline))) {
3178 processor->current_pri = thread->sched_pri;
3179 processor->current_thmode = thread->sched_mode;
3180 processor->current_sfi_class = thread->sfi_class;
3181 processor->deadline = thread->realtime.deadline;
3182 }
3183 } else {
3184 if (processor == current_processor()) {
3185 ast_on(preempt);
3186 } else {
3187 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3188 /* cleared after IPI causes csw_check() to be called */
3189 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3190 do_cause_ast = TRUE;
3191 }
3192 }
3193 }
3194 } else {
3195 /* Selected processor was too busy; just keep the thread enqueued and let other processors drain it naturally. */
3196 }
3197
3198 pset_unlock(pset);
3199
3200 if (do_signal_idle) {
3201 machine_signal_idle(processor);
3202 } else if (do_cause_ast) {
3203 cause_ast_check(processor);
3204 }
3205}
3206
3207
3208#if defined(CONFIG_SCHED_TIMESHARE_CORE)
3209
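/*
 *	priority_is_urgent:
 *
 *	A priority is urgent if it is set in the global sched_preempt_pri
 *	bitmap; dispatching a thread at an urgent priority warrants an
 *	immediate preemption check.
 */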
3210boolean_t
3211priority_is_urgent(int priority)
3212{
3213 return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE;
3214}
3215
3216#endif /* CONFIG_SCHED_TIMESHARE_CORE */
3217
3218/*
3219 * processor_setrun:
3220 *
3221 * Dispatch a thread for execution on a
3222 * processor.
3223 *
3224 * Thread must be locked. Associated pset must
3225 * be locked, and is returned unlocked.
3226 */
3227static void
3228processor_setrun(
3229 processor_t processor,
3230 thread_t thread,
3231 integer_t options)
3232{
3233 processor_set_t pset = processor->processor_set;
3234 ast_t preempt;
3235 enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
3236 enum { eNoSignal, eDoSignal, eDoDeferredSignal } do_signal_idle = eNoSignal;
3237
3238 boolean_t do_cause_ast = FALSE;
3239
3240 thread->chosen_processor = processor;
3241
3242 /*
3243 * Dispatch directly onto idle processor.
3244 */
3245 if ( (SCHED(direct_dispatch_to_idle_processors) ||
3246 thread->bound_processor == processor)
3247 && processor->state == PROCESSOR_IDLE) {
3248
3249 re_queue_tail(&pset->active_queue, &processor->processor_queue);
3250
3251 processor->next_thread = thread;
3252 processor->current_pri = thread->sched_pri;
3253 processor->current_thmode = thread->sched_mode;
3254 processor->current_sfi_class = thread->sfi_class;
3255 processor->deadline = UINT64_MAX;
3256 processor->state = PROCESSOR_DISPATCHING;
3257
3258 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3259 /* cleared on exit from main processor_idle() loop */
3260 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3261 do_signal_idle = eDoSignal;
3262 }
3263
3264 pset_unlock(pset);
3265
3266 if (do_signal_idle == eDoSignal) {
3267 machine_signal_idle(processor);
3268 }
3269
3270 return;
3271 }
3272
3273 /*
3274 * Set preemption mode.
3275 */
3276#if defined(CONFIG_SCHED_DEFERRED_AST)
3277 /* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
3278#endif
3279 if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri)
3280 preempt = (AST_PREEMPT | AST_URGENT);
3281 else if (processor->active_thread && thread_eager_preemption(processor->active_thread))
3282 preempt = (AST_PREEMPT | AST_URGENT);
3283 else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
3284 if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
3285 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
3286 } else {
3287 preempt = AST_NONE;
3288 }
3289 } else
3290 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
3291
3292 SCHED(processor_enqueue)(processor, thread, options);
3293
3294 if (preempt != AST_NONE) {
3295 if (processor->state == PROCESSOR_IDLE) {
3296 re_queue_tail(&pset->active_queue, &processor->processor_queue);
3297
3298 processor->next_thread = THREAD_NULL;
3299 processor->current_pri = thread->sched_pri;
3300 processor->current_thmode = thread->sched_mode;
3301 processor->current_sfi_class = thread->sfi_class;
3302 processor->deadline = UINT64_MAX;
3303 processor->state = PROCESSOR_DISPATCHING;
3304
3305 ipi_action = eExitIdle;
3306 } else if ( processor->state == PROCESSOR_DISPATCHING) {
3307 if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) {
3308 processor->current_pri = thread->sched_pri;
3309 processor->current_thmode = thread->sched_mode;
3310 processor->current_sfi_class = thread->sfi_class;
3311 processor->deadline = UINT64_MAX;
3312 }
3313 } else if ( (processor->state == PROCESSOR_RUNNING ||
3314 processor->state == PROCESSOR_SHUTDOWN) &&
3315 (thread->sched_pri >= processor->current_pri)) {
3316 ipi_action = eInterruptRunning;
3317 }
3318 } else {
3319 /*
3320 * New thread is not important enough to preempt what is running, but
3321 * special processor states may need special handling
3322 */
3323 if (processor->state == PROCESSOR_SHUTDOWN &&
3324 thread->sched_pri >= processor->current_pri ) {
3325 ipi_action = eInterruptRunning;
3326 } else if (processor->state == PROCESSOR_IDLE) {
3327 re_queue_tail(&pset->active_queue, &processor->processor_queue);
3328
3329 processor->next_thread = THREAD_NULL;
3330 processor->current_pri = thread->sched_pri;
3331 processor->current_thmode = thread->sched_mode;
3332 processor->current_sfi_class = thread->sfi_class;
3333 processor->deadline = UINT64_MAX;
3334 processor->state = PROCESSOR_DISPATCHING;
3335
3336 ipi_action = eExitIdle;
3337 }
3338 }
3339
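	/*
	 * Act on the chosen ipi_action: check for a local AST if the target
	 * is the current processor, otherwise arm an idle wakeup or AST-check
	 * IPI to be delivered once the pset lock is dropped below.
	 */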
3340 switch (ipi_action) {
3341 case eDoNothing:
3342 break;
3343 case eExitIdle:
3344 if (processor == current_processor()) {
3345 if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
3346 ast_on(preempt);
3347 } else {
3348#if defined(CONFIG_SCHED_DEFERRED_AST)
3349 if (!(pset->pending_deferred_AST_cpu_mask & (1ULL << processor->cpu_id)) &&
3350 !(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3351 /* cleared on exit from main processor_idle() loop */
3352 pset->pending_deferred_AST_cpu_mask |= (1ULL << processor->cpu_id);
3353 do_signal_idle = eDoDeferredSignal;
3354 }
3355#else
3356 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3357 /* cleared on exit from main processor_idle() loop */
3358 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3359 do_signal_idle = eDoSignal;
3360 }
3361#endif
3362 }
3363 break;
3364 case eInterruptRunning:
3365 if (processor == current_processor()) {
3366 if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
3367 ast_on(preempt);
3368 } else {
3369 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3370 /* cleared after IPI causes csw_check() to be called */
3371 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3372 do_cause_ast = TRUE;
3373 }
3374 }
3375 break;
3376 }
3377
3378 pset_unlock(pset);
3379
3380 if (do_signal_idle == eDoSignal) {
3381 machine_signal_idle(processor);
3382 }
3383#if defined(CONFIG_SCHED_DEFERRED_AST)
3384 else if (do_signal_idle == eDoDeferredSignal) {
3385 /*
3386 * TODO: The ability to cancel this signal could make
3387 * sending it outside of the pset lock an issue. Do
3388 * we need to address this? Or would the only fallout
3389 * be that the core takes a signal? As long as we do
3390 * not run the risk of having a core marked as signal
3391 * outstanding, with no real signal outstanding, the
3392 * only result should be that we fail to cancel some
3393 * signals.
3394 */
3395 machine_signal_idle_deferred(processor);
3396 }
3397#endif
3398 else if (do_cause_ast) {
3399 cause_ast_check(processor);
3400 }
3401}
3402
3403/*
3404 * choose_next_pset:
3405 *
3406 * Return the next sibling pset containing
3407 * available processors.
3408 *
3409 * Returns the original pset if none other is
3410 * suitable.
3411 */
3412static processor_set_t
3413choose_next_pset(
3414 processor_set_t pset)
3415{
3416 processor_set_t nset = pset;
3417
3418 do {
3419 nset = next_pset(nset);
3420 } while (nset->online_processor_count < 1 && nset != pset);
3421
3422 return (nset);
3423}
3424
3425/*
3426 * choose_processor:
3427 *
3428 * Choose a processor for the thread, beginning at
3429 * the pset. Accepts an optional processor hint in
3430 * the pset.
3431 *
3432 * Returns a processor, possibly from a different pset.
3433 *
3434 * The thread must be locked. The pset must be locked,
3435 * and the resulting pset is locked on return.
3436 */
3437processor_t
3438choose_processor(
3439 processor_set_t pset,
3440 processor_t processor,
3441 thread_t thread)
3442{
3443 processor_set_t nset, cset = pset;
3444
3445 assert(thread->sched_pri <= BASEPRI_RTQUEUES);
3446
3447 /*
3448 * Prefer the hinted processor, when appropriate.
3449 */
3450
3451 /* Fold last processor hint from secondary processor to its primary */
3452 if (processor != PROCESSOR_NULL) {
3453 processor = processor->processor_primary;
3454 }
3455
3456 /*
3457 * Only consult platform layer if pset is active, which
3458 * it may not be in some cases when a multi-set system
3459 * is going to sleep.
3460 */
3461 if (pset->online_processor_count) {
3462 if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
3463 processor_t mc_processor = machine_choose_processor(pset, processor);
3464 if (mc_processor != PROCESSOR_NULL)
3465 processor = mc_processor->processor_primary;
3466 }
3467 }
3468
3469 /*
3470 * At this point, we may have a processor hint, and we may have
3471 * an initial starting pset. If the hint is not in the pset, or
3472 * if the hint is for a processor in an invalid state, discard
3473 * the hint.
3474 */
3475 if (processor != PROCESSOR_NULL) {
3476 if (processor->processor_set != pset) {
3477 processor = PROCESSOR_NULL;
3478 } else if (!processor->is_recommended) {
3479 processor = PROCESSOR_NULL;
3480 } else {
3481 switch (processor->state) {
3482 case PROCESSOR_START:
3483 case PROCESSOR_SHUTDOWN:
3484 case PROCESSOR_OFF_LINE:
3485 /*
3486 * Hint is for a processor that cannot support running new threads.
3487 */
3488 processor = PROCESSOR_NULL;
3489 break;
3490 case PROCESSOR_IDLE:
3491 /*
3492 * Hint is for an idle processor. Assume it is no worse than any other
3493 * idle processor. The platform layer had an opportunity to provide
3494 * the "least cost idle" processor above.
3495 */
3496 return (processor);
3497 case PROCESSOR_RUNNING:
3498 case PROCESSOR_DISPATCHING:
3499 /*
3500 * Hint is for an active CPU. This fast-path allows
3501 * realtime threads to preempt non-realtime threads
3502 * to regain their previous executing processor.
3503 */
3504 if ((thread->sched_pri >= BASEPRI_RTQUEUES) &&
3505 (processor->current_pri < BASEPRI_RTQUEUES))
3506 return (processor);
3507
3508 /* Otherwise, use hint as part of search below */
3509 break;
3510 default:
3511 processor = PROCESSOR_NULL;
3512 break;
3513 }
3514 }
3515 }
3516
3517 /*
3518 * Iterate through the processor sets to locate
3519 * an appropriate processor. Seed results with
3520 * a last-processor hint, if available, so that
3521 * a search must find something strictly better
3522 * to replace it.
3523 *
3524 * A primary/secondary pair of SMT processors is
3525 * "unpaired" if the primary is busy but its
3526 * corresponding secondary is idle (so the physical
3527 * core has full use of its resources).
3528 */
3529
3530 integer_t lowest_priority = MAXPRI + 1;
3531 integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
3532 integer_t lowest_count = INT_MAX;
3533 uint64_t furthest_deadline = 1;
3534 processor_t lp_processor = PROCESSOR_NULL;
3535 processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
3536 processor_t lp_unpaired_secondary_processor = PROCESSOR_NULL;
3537 processor_t lc_processor = PROCESSOR_NULL;
3538 processor_t fd_processor = PROCESSOR_NULL;
3539
3540 if (processor != PROCESSOR_NULL) {
3541 /* All other states should be enumerated above. */
3542 assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
3543
3544 lowest_priority = processor->current_pri;
3545 lp_processor = processor;
3546
3547 if (processor->current_pri >= BASEPRI_RTQUEUES) {
3548 furthest_deadline = processor->deadline;
3549 fd_processor = processor;
3550 }
3551
3552 lowest_count = SCHED(processor_runq_count)(processor);
3553 lc_processor = processor;
3554 }
3555
3556 do {
3557
3558 /*
3559 * Choose an idle processor, in pset traversal order
3560 */
3561 qe_foreach_element(processor, &cset->idle_queue, processor_queue) {
3562 if (processor->is_recommended)
3563 return processor;
3564 }
3565
3566 /*
3567 * Otherwise, enumerate active and idle processors to find candidates
3568 * with lower priority/etc.
3569 */
3570
3571 qe_foreach_element(processor, &cset->active_queue, processor_queue) {
3572
3573 if (!processor->is_recommended) {
3574 continue;
3575 }
3576
3577 integer_t cpri = processor->current_pri;
3578 if (cpri < lowest_priority) {
3579 lowest_priority = cpri;
3580 lp_processor = processor;
3581 }
3582
3583 if ((cpri >= BASEPRI_RTQUEUES) && (processor->deadline > furthest_deadline)) {
3584 furthest_deadline = processor->deadline;
3585 fd_processor = processor;
3586 }
3587
3588 integer_t ccount = SCHED(processor_runq_count)(processor);
3589 if (ccount < lowest_count) {
3590 lowest_count = ccount;
3591 lc_processor = processor;
3592 }
3593 }
3594
3595 /*
3596 * For SMT configs, these idle secondary processors must have an active primary. Otherwise
3597 * the idle primary would have short-circuited the loop above.
3598 */
3599 qe_foreach_element(processor, &cset->idle_secondary_queue, processor_queue) {
3600
3601 if (!processor->is_recommended) {
3602 continue;
3603 }
3604
3605 processor_t cprimary = processor->processor_primary;
3606
3607 /* If the primary processor is offline or starting up, it's not a candidate for this path */
3608 if (cprimary->state == PROCESSOR_RUNNING || cprimary->state == PROCESSOR_DISPATCHING) {
3609 integer_t primary_pri = cprimary->current_pri;
3610
3611 if (primary_pri < lowest_unpaired_primary_priority) {
3612 lowest_unpaired_primary_priority = primary_pri;
3613 lp_unpaired_primary_processor = cprimary;
3614 lp_unpaired_secondary_processor = processor;
3615 }
3616 }
3617 }
3618
3619
3620 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
3621
3622 /*
3623 * For realtime threads, the most important aspect is
3624 * scheduling latency, so we attempt to assign threads
3625 * to good preemption candidates (assuming an idle primary
3626 * processor was not available above).
3627 */
3628
3629 if (thread->sched_pri > lowest_unpaired_primary_priority) {
3630 /* Move to end of active queue so that the next thread doesn't also pick it */
3631 re_queue_tail(&cset->active_queue, &lp_unpaired_primary_processor->processor_queue);
3632 return lp_unpaired_primary_processor;
3633 }
3634 if (thread->sched_pri > lowest_priority) {
3635 /* Move to end of active queue so that the next thread doesn't also pick it */
3636 re_queue_tail(&cset->active_queue, &lp_processor->processor_queue);
3637 return lp_processor;
3638 }
3639 if (thread->realtime.deadline < furthest_deadline)
3640 return fd_processor;
3641
3642 /*
3643 * If all primary and secondary CPUs are busy with realtime
3644 * threads with deadlines earlier than ours, move on to the next
3645 * pset.
3646 */
3647 }
3648 else {
3649
3650 if (thread->sched_pri > lowest_unpaired_primary_priority) {
3651 /* Move to end of active queue so that the next thread doesn't also pick it */
3652 re_queue_tail(&cset->active_queue, &lp_unpaired_primary_processor->processor_queue);
3653 return lp_unpaired_primary_processor;
3654 }
3655 if (thread->sched_pri > lowest_priority) {
3656 /* Move to end of active queue so that the next thread doesn't also pick it */
3657 re_queue_tail(&cset->active_queue, &lp_processor->processor_queue);
3658 return lp_processor;
3659 }
3660
3661 /*
3662 * If all primary processors in this pset are running higher
3663 * priority threads, move on to the next pset. Only when we have
3664 * exhausted this search do we fall back to other heuristics.
3665 */
3666 }
3667
3668 /*
3669 * Move onto the next processor set.
3670 */
3671 nset = next_pset(cset);
3672
3673 if (nset != pset) {
3674 pset_unlock(cset);
3675
3676 cset = nset;
3677 pset_lock(cset);
3678 }
3679 } while (nset != pset);
3680
3681 /*
3682 * Make sure that we pick a running processor,
3683 * and that the correct processor set is locked.
3684 * Since we may have unlocked the candidate processor's
3685 * pset, it may have changed state.
3686 *
3687 * All primary processors are running higher priority
3688 * threads, so the only options left are enqueueing on
3689 * the secondary processor that would perturb the lowest-priority
3690 * primary, or on the least busy primary.
3691 */
3692 do {
3693
3694 /* lowest_priority is evaluated in the main loops above */
3695 if (lp_unpaired_secondary_processor != PROCESSOR_NULL) {
3696 processor = lp_unpaired_secondary_processor;
3697 lp_unpaired_secondary_processor = PROCESSOR_NULL;
3698 } else if (lc_processor != PROCESSOR_NULL) {
3699 processor = lc_processor;
3700 lc_processor = PROCESSOR_NULL;
3701 } else {
3702 /*
3703 * All processors are executing higher
3704 * priority threads, and the lowest_count
3705 * candidate was not usable
3706 */
3707 processor = master_processor;
3708 }
3709
3710 /*
3711 * Check that the correct processor set is
3712 * returned locked.
3713 */
3714 if (cset != processor->processor_set) {
3715 pset_unlock(cset);
3716 cset = processor->processor_set;
3717 pset_lock(cset);
3718 }
3719
3720 /*
3721 * We must verify that the chosen processor is still available.
3722 * master_processor is an exception, since we may need to preempt
3723 * a running thread on it during processor shutdown (for sleep),
3724 * and that thread needs to be enqueued on its runqueue to run
3725 * when the processor is restarted.
3726 */
3727 if (processor != master_processor && (processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE))
3728 processor = PROCESSOR_NULL;
3729
3730 } while (processor == PROCESSOR_NULL);
3731
3732 return (processor);
3733}
3734
3735/*
3736 * thread_setrun:
3737 *
3738 * Dispatch thread for execution, onto an idle
3739 * processor or run queue, and signal a preemption
3740 * as appropriate.
3741 *
3742 * Thread must be locked.
3743 */
3744void
3745thread_setrun(
3746 thread_t thread,
3747 integer_t options)
3748{
3749 processor_t processor;
3750 processor_set_t pset;
3751
3752 assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN);
3753 assert(thread->runq == PROCESSOR_NULL);
3754
3755 /*
3756 * Update priority if needed.
3757 */
3758 if (SCHED(can_update_priority)(thread))
3759 SCHED(update_priority)(thread);
3760
3761 thread->sfi_class = sfi_thread_classify(thread);
3762
3763 assert(thread->runq == PROCESSOR_NULL);
3764
3765#if __SMP__
3766 if (thread->bound_processor == PROCESSOR_NULL) {
3767 /*
3768 * Unbound case.
3769 */
3770 if (thread->affinity_set != AFFINITY_SET_NULL) {
3771 /*
3772 * Use affinity set policy hint.
3773 */
3774 pset = thread->affinity_set->aset_pset;
3775 pset_lock(pset);
3776
3777 processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
3778
3779 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3780 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
3781 } else if (thread->last_processor != PROCESSOR_NULL) {
3782 /*
3783 * Simple (last processor) affinity case.
3784 */
3785 processor = thread->last_processor;
3786 pset = processor->processor_set;
3787 pset_lock(pset);
3788 processor = SCHED(choose_processor)(pset, processor, thread);
3789
3790 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3791 (uintptr_t)thread_tid(thread), thread->last_processor->cpu_id, processor->cpu_id, processor->state, 0);
3792 } else {
3793 /*
3794 * No Affinity case:
3795 *
3796 * Utilize a per-task hint to spread threads
3797 * among the available processor sets.
3798 */
3799 task_t task = thread->task;
3800
3801 pset = task->pset_hint;
3802 if (pset == PROCESSOR_SET_NULL)
3803 pset = current_processor()->processor_set;
3804
3805 pset = choose_next_pset(pset);
3806 pset_lock(pset);
3807
3808 processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
3809 task->pset_hint = processor->processor_set;
3810
3811 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3812 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
3813 }
3814 } else {
3815 /*
3816 * Bound case:
3817 *
3818 * Unconditionally dispatch on the processor.
3819 */
3820 processor = thread->bound_processor;
3821 pset = processor->processor_set;
3822 pset_lock(pset);
3823
3824 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3825 (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
3826 }
3827#else /* !__SMP__ */
3828 /* Only one processor to choose */
3829 assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == master_processor);
3830 processor = master_processor;
3831 pset = processor->processor_set;
3832 pset_lock(pset);
3833#endif /* !__SMP__ */
3834
3835 /*
3836 * Dispatch the thread on the chosen processor.
3837 * TODO: This should be based on sched_mode, not sched_pri
3838 */
3839 if (thread->sched_pri >= BASEPRI_RTQUEUES)
3840 realtime_setrun(processor, thread);
3841 else
3842 processor_setrun(processor, thread, options);
3843}
3844
3845processor_set_t
3846task_choose_pset(
3847 task_t task)
3848{
3849 processor_set_t pset = task->pset_hint;
3850
3851 if (pset != PROCESSOR_SET_NULL)
3852 pset = choose_next_pset(pset);
3853
3854 return (pset);
3855}
3856
3857/*
3858 * Check for a preemption point in
3859 * the current context.
3860 *
3861 * Called at splsched with thread locked.
3862 */
3863ast_t
3864csw_check(
3865 processor_t processor,
3866 ast_t check_reason)
3867{
3868 processor_set_t pset = processor->processor_set;
3869 ast_t result;
3870
3871 pset_lock(pset);
3872
3873 /* If we were sent a remote AST and interrupted a running processor, acknowledge it here with pset lock held */
3874 pset->pending_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
3875
3876 result = csw_check_locked(processor, pset, check_reason);
3877
3878 pset_unlock(pset);
3879
3880 return result;
3881}
3882
3883/*
3884 * Check for preemption at splsched with
3885 * pset and thread locked
3886 */
3887ast_t
3888csw_check_locked(
3889 processor_t processor,
3890 processor_set_t pset __unused,
3891 ast_t check_reason)
3892{
3893 ast_t result;
3894 thread_t thread = processor->active_thread;
3895
3896 if (processor->first_timeslice) {
3897 if (rt_runq.count > 0)
3898 return (check_reason | AST_PREEMPT | AST_URGENT);
3899 }
3900 else {
3901 if (rt_runq.count > 0) {
3902 if (BASEPRI_RTQUEUES > processor->current_pri)
3903 return (check_reason | AST_PREEMPT | AST_URGENT);
3904 else
3905 return (check_reason | AST_PREEMPT);
3906 }
3907 }
3908
3909 result = SCHED(processor_csw_check)(processor);
3910 if (result != AST_NONE)
3911 return (check_reason | result | (thread_eager_preemption(thread) ? AST_URGENT : AST_NONE));
3912
3913#if __SMP__
3914
3915 /*
3916 * If the current thread is running on a processor that is no longer recommended, gently
3917 * (non-urgently) get it to a point where it can block, at which point thread_select() should
3918 * try to idle the processor and re-dispatch the thread to a recommended processor.
3919 */
3920 if (!processor->is_recommended)
3921 return (check_reason | AST_PREEMPT);
3922
3923 /*
3924 * Even though we could continue executing on this processor, a
3925 * secondary SMT core should try to shed load to another primary core.
3926 *
3927 * TODO: Should this do the same check that thread_select does? i.e.
3928 * if no bound threads target this processor, and idle primaries exist, preempt
3929 * The case of RT threads existing is already taken care of above
3930 * Consider Capri in this scenario.
3931 *
3932 * if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue))
3933 *
3934 * TODO: Alternatively - check if only primary is idle, or check if primary's pri is lower than mine.
3935 */
3936
3937 if (processor->current_pri < BASEPRI_RTQUEUES &&
3938 processor->processor_primary != processor)
3939 return (check_reason | AST_PREEMPT);
3940#endif
3941
3942 if (thread->state & TH_SUSP)
3943 return (check_reason | AST_PREEMPT);
3944
3945#if CONFIG_SCHED_SFI
3946 /*
3947 * The current thread may not need to be preempted, but it may need
3948 * an SFI wait.
3949 */
3950 result = sfi_thread_needs_ast(thread, NULL);
3951 if (result != AST_NONE)
3952 return (check_reason | result);
3953#endif
3954
3955 return (AST_NONE);
3956}
3957
3958/*
3959 * set_sched_pri:
3960 *
3961 * Set the scheduled priority of the specified thread.
3962 *
3963 * This may cause the thread to change queues.
3964 *
3965 * Thread must be locked.
3966 */
3967void
3968set_sched_pri(
3969 thread_t thread,
3970 int priority)
3971{
3972 thread_t cthread = current_thread();
3973 boolean_t is_current_thread = (thread == cthread) ? TRUE : FALSE;
3974 int curgency, nurgency;
3975 uint64_t urgency_param1, urgency_param2;
3976 boolean_t removed_from_runq = FALSE;
3977
3978 /* If we're already at this priority, no need to mess with the runqueue */
3979 if (priority == thread->sched_pri)
3980 return;
3981
3982 if (is_current_thread) {
3983 assert(thread->runq == PROCESSOR_NULL);
3984 curgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
3985 } else {
3986 removed_from_runq = thread_run_queue_remove(thread);
3987 }
3988
3989 thread->sched_pri = priority;
3990
3991 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
3992 (uintptr_t)thread_tid(thread),
3993 thread->base_pri,
3994 thread->sched_pri,
3995 0, /* eventually, 'reason' */
3996 0);
3997
3998 if (is_current_thread) {
3999 nurgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
4000 /*
4001 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
4002 * class alterations from user space to occur relatively infrequently, hence
4003 * those are lazily handled. QoS classes have distinct priority bands, and QoS
4004 * inheritance is expected to involve priority changes.
4005 */
4006 if (nurgency != curgency) {
4007 thread_tell_urgency(nurgency, urgency_param1, urgency_param2, 0, thread);
4008 machine_thread_going_on_core(thread, nurgency, 0, 0);
4009 }
4010 }
4011
4012 /* TODO: Should this be TAILQ if it went down, HEADQ if it went up? */
4013 if (removed_from_runq)
4014 thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
4015 else if (thread->state & TH_RUN) {
4016 processor_t processor = thread->last_processor;
4017
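		/*
		 * The thread is runnable but not on a run queue, i.e. it is on a
		 * processor: refresh that processor's cached priority state and
		 * re-check preemption locally, or poke the remote processor.
		 */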
4018 if (is_current_thread) {
4019 ast_t preempt;
4020
4021 processor->current_pri = priority;
4022 processor->current_thmode = thread->sched_mode;
4023 processor->current_sfi_class = thread->sfi_class = sfi_thread_classify(thread);
4024 if ((preempt = csw_check(processor, AST_NONE)) != AST_NONE)
4025 ast_on(preempt);
4026 } else if (processor != PROCESSOR_NULL && processor->active_thread == thread)
4027 cause_ast_check(processor);
4028 }
4029}
4030
4031/*
4032 * thread_run_queue_remove_for_handoff
4033 *
4034 * Pull a thread or its (recursive) push target out of the runqueue
4035 * so that it is ready for thread_run()
4036 *
4037 * Called at splsched
4038 *
4039 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
4040 * This may be different than the thread that was passed in.
4041 */
4042thread_t
4043thread_run_queue_remove_for_handoff(thread_t thread) {
4044
4045 thread_t pulled_thread = THREAD_NULL;
4046
4047 thread_lock(thread);
4048
4049 /*
4050 * Check that the thread is not bound
4051 * to a different processor, and that realtime
4052 * is not involved.
4053 *
4054 * Next, pull it off its run queue. If it
4055 * cannot be removed, it is not eligible for handoff.
4056 */
4057
4058 processor_t processor = current_processor();
4059 if (processor->current_pri < BASEPRI_RTQUEUES && thread->sched_pri < BASEPRI_RTQUEUES &&
4060 (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)) {
4061
4062 if (thread_run_queue_remove(thread))
4063 pulled_thread = thread;
4064 }
4065
4066 thread_unlock(thread);
4067
4068 return pulled_thread;
4069}
4070
4071/*
4072 * thread_run_queue_remove:
4073 *
4074 * Remove a thread from its current run queue and
4075 * return TRUE if successful.
4076 *
4077 * Thread must be locked.
4078 *
4079 * If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
4080 * run queues because the caller locked the thread. Otherwise
4081 * the thread is on a run queue, but could be chosen for dispatch
4082 * and removed by another processor under a different lock, which
4083 * will set thread->runq to PROCESSOR_NULL.
4084 *
4085 * Hence the thread select path must not rely on anything that could
4086 * be changed under the thread lock after calling this function,
4087 * most importantly thread->sched_pri.
4088 */
4089boolean_t
4090thread_run_queue_remove(
4091 thread_t thread)
4092{
4093 boolean_t removed = FALSE;
4094 processor_t processor = thread->runq;
4095
4096 if ((thread->state & (TH_RUN|TH_WAIT)) == TH_WAIT) {
4097 /* Thread isn't runnable */
4098 assert(thread->runq == PROCESSOR_NULL);
4099 return FALSE;
4100 }
4101
4102 if (processor == PROCESSOR_NULL) {
4103 /*
4104 * The thread is either not on the runq,
4105 * or is in the midst of being removed from the runq.
4106 *
4107 * runq is set to NULL under the pset lock, not the thread
4108 * lock, so the thread may still be in the process of being dequeued
4109 * from the runq. It will wait in invoke for the thread lock to be
4110 * dropped.
4111 */
4112
4113 return FALSE;
4114 }
4115
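	/* Below the realtime band, removal is delegated to the scheduler's per-processor run queue */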
4116 if (thread->sched_pri < BASEPRI_RTQUEUES) {
4117 return SCHED(processor_queue_remove)(processor, thread);
4118 }
4119
4120 rt_lock_lock();
4121
4122 if (thread->runq != PROCESSOR_NULL) {
4123 /*
4124 * Thread is on the RT run queue and we have a lock on
4125 * that run queue.
4126 */
4127
4128 assert(thread->runq == THREAD_ON_RT_RUNQ);
4129
4130 remqueue(&thread->runq_links);
4131 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
4132 rt_runq.count--;
4133
4134 thread->runq = PROCESSOR_NULL;
4135
4136 removed = TRUE;
4137 }
4138
4139 rt_lock_unlock();
4140
4141 return (removed);
4142}
4143
4144/*
4145 * Put the thread back on a run queue after a thread_run_queue_remove
4146 *
4147 * The thread must have been removed under the same thread lock hold
4148 *
4149 * Called with the thread locked, at splsched
4150 */
4151void
4152thread_run_queue_reinsert(thread_t thread, integer_t options)
4153{
4154 assert(thread->runq == PROCESSOR_NULL);
4155
4156 assert(thread->state & (TH_RUN));
4157 thread_setrun(thread, options);
4158
4159}
4160
4161void
4162sys_override_cpu_throttle(int flag)
4163{
4164 if (flag == CPU_THROTTLE_ENABLE)
4165 cpu_throttle_enabled = 1;
4166 if (flag == CPU_THROTTLE_DISABLE)
4167 cpu_throttle_enabled = 0;
4168}
4169
4170int
4171thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
4172{
4173 if (thread == NULL || (thread->state & TH_IDLE)) {
4174 *arg1 = 0;
4175 *arg2 = 0;
4176
4177 return (THREAD_URGENCY_NONE);
4178 } else if (thread->sched_mode == TH_MODE_REALTIME) {
4179 *arg1 = thread->realtime.period;
4180 *arg2 = thread->realtime.deadline;
4181
4182 return (THREAD_URGENCY_REAL_TIME);
4183 } else if (cpu_throttle_enabled &&
4184 ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
4185 /*
4186 * Background urgency applied when thread priority is MAXPRI_THROTTLE or lower and thread is not promoted
4187 */
4188 *arg1 = thread->sched_pri;
4189 *arg2 = thread->base_pri;
4190
4191 return (THREAD_URGENCY_BACKGROUND);
4192 } else {
4193 /* For otherwise unclassified threads, report throughput QoS
4194 * parameters
4195 */
4196 *arg1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
4197 *arg2 = proc_get_effective_task_policy(thread->task, TASK_POLICY_THROUGH_QOS);
4198
4199 return (THREAD_URGENCY_NORMAL);
4200 }
4201}
4202
4203
4204/*
4205 * This is the processor idle loop, which just looks for other threads
4206 * to execute. Processor idle threads invoke this without supplying a
4207 * current thread; a thread may also be passed in to idle in place without an asserted wait state.
4208 *
4209 * Returns the next thread to execute if dispatched directly.
4210 */
4211
4212#if 0
4213#define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
4214#else
4215#define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
4216#endif
4217
4218thread_t
4219processor_idle(
4220 thread_t thread,
4221 processor_t processor)
4222{
4223 processor_set_t pset = processor->processor_set;
4224 thread_t new_thread;
4225 int state;
4226 (void)splsched();
4227
4228 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4229 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_START,
4230 (uintptr_t)thread_tid(thread), 0, 0, 0, 0);
4231
4232 SCHED_STATS_CPU_IDLE_START(processor);
4233
4234 timer_switch(&PROCESSOR_DATA(processor, system_state),
4235 mach_absolute_time(), &PROCESSOR_DATA(processor, idle_state));
4236 PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state);
4237
4238 while (1) {
4239 if (processor->state != PROCESSOR_IDLE) /* unsafe, but worst case we loop around once */
4240 break;
4241 if (pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))
4242 break;
4243 if (processor->is_recommended) {
4244 if (rt_runq.count)
4245 break;
4246 } else {
4247 if (SCHED(processor_bound_count)(processor))
4248 break;
4249 }
4250
4251#if CONFIG_SCHED_IDLE_IN_PLACE
4252 if (thread != THREAD_NULL) {
4253 /* Did the idle-in-place thread wake up? */
4254 if ((thread->state & (TH_WAIT|TH_SUSP)) != TH_WAIT || thread->wake_active)
4255 break;
4256 }
4257#endif
4258
4259 IDLE_KERNEL_DEBUG_CONSTANT(
4260 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -1, 0);
4261
4262 machine_track_platform_idle(TRUE);
4263
4264 machine_idle();
4265
4266 machine_track_platform_idle(FALSE);
4267
4268 (void)splsched();
4269
4270 IDLE_KERNEL_DEBUG_CONSTANT(
4271 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -2, 0);
4272
4273 if (!SCHED(processor_queue_empty)(processor)) {
4274 /* Secondary SMT processors respond to directed wakeups
4275 * exclusively. Some platforms induce 'spurious' SMT wakeups.
4276 */
4277 if (processor->processor_primary == processor)
4278 break;
4279 }
4280 }
4281
4282 timer_switch(&PROCESSOR_DATA(processor, idle_state),
4283 mach_absolute_time(), &PROCESSOR_DATA(processor, system_state));
4284 PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state);
4285
4286 pset_lock(pset);
4287
4288 /* If we were sent a remote AST and came out of idle, acknowledge it here with pset lock held */
4289 pset->pending_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
4290#if defined(CONFIG_SCHED_DEFERRED_AST)
4291 pset->pending_deferred_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
4292#endif
4293
4294 state = processor->state;
4295 if (state == PROCESSOR_DISPATCHING) {
4296 /*
4297 * Common case -- cpu dispatched.
4298 */
4299 new_thread = processor->next_thread;
4300 processor->next_thread = THREAD_NULL;
4301 processor->state = PROCESSOR_RUNNING;
4302
4303 if ((new_thread != THREAD_NULL) && (SCHED(processor_queue_has_priority)(processor, new_thread->sched_pri, FALSE) ||
4304 (rt_runq.count > 0)) ) {
4305 /* Something higher priority has popped up on the runqueue - redispatch this thread elsewhere */
4306 processor->current_pri = IDLEPRI;
4307 processor->current_thmode = TH_MODE_FIXED;
4308 processor->current_sfi_class = SFI_CLASS_KERNEL;
4309 processor->deadline = UINT64_MAX;
4310
4311 pset_unlock(pset);
4312
4313 thread_lock(new_thread);
4314 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REDISPATCH), (uintptr_t)thread_tid(new_thread), new_thread->sched_pri, rt_runq.count, 0, 0);
4315 thread_setrun(new_thread, SCHED_HEADQ);
4316 thread_unlock(new_thread);
4317
4318 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4319 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4320 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4321
4322 return (THREAD_NULL);
4323 }
4324
4325 pset_unlock(pset);
4326
4327 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4328 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4329 (uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0);
4330
4331 return (new_thread);
4332
4333 } else if (state == PROCESSOR_IDLE) {
4334 re_queue_tail(&pset->active_queue, &processor->processor_queue);
4335
4336 processor->state = PROCESSOR_RUNNING;
4337 processor->current_pri = IDLEPRI;
4338 processor->current_thmode = TH_MODE_FIXED;
4339 processor->current_sfi_class = SFI_CLASS_KERNEL;
4340 processor->deadline = UINT64_MAX;
4341
4342 } else if (state == PROCESSOR_SHUTDOWN) {
4343 /*
4344 * Going off-line. Force a
4345 * reschedule.
4346 */
4347 if ((new_thread = processor->next_thread) != THREAD_NULL) {
4348 processor->next_thread = THREAD_NULL;
4349 processor->current_pri = IDLEPRI;
4350 processor->current_thmode = TH_MODE_FIXED;
4351 processor->current_sfi_class = SFI_CLASS_KERNEL;
4352 processor->deadline = UINT64_MAX;
4353
4354 pset_unlock(pset);
4355
4356 thread_lock(new_thread);
4357 thread_setrun(new_thread, SCHED_HEADQ);
4358 thread_unlock(new_thread);
4359
4360 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4361 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4362 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4363
4364 return (THREAD_NULL);
4365 }
4366 }
4367
4368 pset_unlock(pset);
4369
4370 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4371 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4372 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4373
4374 return (THREAD_NULL);
4375}
4376
4377/*
4378 * Each processor has a dedicated thread which
4379 * executes the idle loop when there is no suitable
4380 * previous context.
4381 */
4382void
4383idle_thread(void)
4384{
4385 processor_t processor = current_processor();
4386 thread_t new_thread;
4387
4388 new_thread = processor_idle(THREAD_NULL, processor);
4389 if (new_thread != THREAD_NULL) {
4390 thread_run(processor->idle_thread, (thread_continue_t)idle_thread, NULL, new_thread);
4391 /*NOTREACHED*/
4392 }
4393
4394 thread_block((thread_continue_t)idle_thread);
4395 /*NOTREACHED*/
4396}
4397
4398kern_return_t
4399idle_thread_create(
4400 processor_t processor)
4401{
4402 kern_return_t result;
4403 thread_t thread;
4404 spl_t s;
4405
4406 result = kernel_thread_create((thread_continue_t)idle_thread, NULL, MAXPRI_KERNEL, &thread);
4407 if (result != KERN_SUCCESS)
4408 return (result);
4409
4410 s = splsched();
4411 thread_lock(thread);
4412 thread->bound_processor = processor;
4413 processor->idle_thread = thread;
4414 thread->sched_pri = thread->base_pri = IDLEPRI;
4415 thread->state = (TH_RUN | TH_IDLE);
4416 thread->options |= TH_OPT_IDLE_THREAD;
4417 thread_unlock(thread);
4418 splx(s);
4419
4420 thread_deallocate(thread);
4421
4422 return (KERN_SUCCESS);
4423}
4424
4425/*
4426 * sched_startup:
4427 *
4428 * Kicks off scheduler services.
4429 *
4430 * Called at splsched.
4431 */
4432void
4433sched_startup(void)
4434{
4435 kern_return_t result;
4436 thread_t thread;
4437
4438 simple_lock_init(&sched_vm_group_list_lock, 0);
4439
4440
4441 result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
4442 (void *)SCHED(maintenance_continuation), MAXPRI_KERNEL, &thread);
4443 if (result != KERN_SUCCESS)
4444 panic("sched_startup");
4445
4446 thread_deallocate(thread);
4447
4448 assert_thread_magic(thread);
4449
4450 /*
4451 * Yield to the sched_init_thread once so that it can
4452 * initialize; our own thread completes initialization
4453 * once we are switched back to.
4454 *
4455 * The current thread is the only other thread
4456 * active at this point.
4457 */
4458 thread_block(THREAD_CONTINUE_NULL);
4459}
4460
4461#if defined(CONFIG_SCHED_TIMESHARE_CORE)
4462
4463static volatile uint64_t sched_maintenance_deadline;
4464static uint64_t sched_tick_last_abstime;
4465static uint64_t sched_tick_delta;
4466uint64_t sched_tick_max_delta;
4467/*
4468 * sched_timeshare_maintenance_continue:
4469 *
4470 * Perform periodic bookkeeping functions about ten
4471 * times per second.
4472 */
4473void
4474sched_timeshare_maintenance_continue(void)
4475{
4476 uint64_t sched_tick_ctime, late_time;
4477
4478 struct sched_update_scan_context scan_context = {
4479 .earliest_bg_make_runnable_time = UINT64_MAX,
4480 .earliest_normal_make_runnable_time = UINT64_MAX,
4481 .earliest_rt_make_runnable_time = UINT64_MAX
4482 };
4483
4484 sched_tick_ctime = mach_absolute_time();
4485
4486 if (__improbable(sched_tick_last_abstime == 0)) {
4487 sched_tick_last_abstime = sched_tick_ctime;
4488 late_time = 0;
4489 sched_tick_delta = 1;
4490 } else {
4491 late_time = sched_tick_ctime - sched_tick_last_abstime;
4492 sched_tick_delta = late_time / sched_tick_interval;
4493 /* Ensure a delta of at least 1, since the elapsed interval could be
4494 * slightly smaller than the sched_tick_interval due to dispatch
4495 * latencies.
4496 */
4497 sched_tick_delta = MAX(sched_tick_delta, 1);
4498
4499 /* In the event that interrupt latencies or platform
4500 * idle events advancing the timebase resulted
4501 * in periods where no threads were dispatched,
4502 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
4503 * iterations.
4504 */
4505 sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);
4506
4507 sched_tick_last_abstime = sched_tick_ctime;
4508 sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
4509 }
4510
4511 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)|DBG_FUNC_START,
4512 sched_tick_delta, late_time, 0, 0, 0);
4513
4514 /* Add a number of pseudo-ticks corresponding to the elapsed interval.
4515 * This could be greater than 1 if substantial intervals during which
4516 * all processors were idle have occurred, which is rare in practice.
4517 */
4518
4519 sched_tick += sched_tick_delta;
4520
4521 /*
4522 * Compute various averages.
4523 */
4524 compute_averages(sched_tick_delta);
4525
4526 /*
4527 * Scan the run queues for threads which
4528 * may need to be updated, and find the earliest runnable thread on the runqueue
4529 * to report its latency.
4530 */
4531 SCHED(thread_update_scan)(&scan_context);
4532
4533 rt_runq_scan(&scan_context);
4534
4535 uint64_t ctime = mach_absolute_time();
4536
4537 uint64_t bg_max_latency = (ctime > scan_context.earliest_bg_make_runnable_time) ?
4538 ctime - scan_context.earliest_bg_make_runnable_time : 0;
4539
4540 uint64_t default_max_latency = (ctime > scan_context.earliest_normal_make_runnable_time) ?
4541 ctime - scan_context.earliest_normal_make_runnable_time : 0;
4542
4543 uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
4544 ctime - scan_context.earliest_rt_make_runnable_time : 0;
4545
4546 machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);
4547
4548 /*
4549 * Check to see if the special sched VM group needs attention.
4550 */
4551 sched_vm_group_maintenance();
4552
4553
4554 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END,
4555 sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
4556 sched_pri_shifts[TH_BUCKET_SHARE_UT], 0, 0);
4557
4558 assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
4559 thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
4560 /*NOTREACHED*/
4561}
4562
4563static uint64_t sched_maintenance_wakeups;
4564
4565/*
4566 * Determine if the set of routines formerly driven by a maintenance timer
4567 * must be invoked, based on a deadline comparison. Signals the scheduler
4568 * maintenance thread on deadline expiration. Must be invoked at an interval
4569 * lower than the "sched_tick_interval", currently accomplished by
4570 * invocation via the quantum expiration timer and at context switch time.
4571 * Performance matters: this routine reuses a timestamp approximating the
4572 * current absolute time received from the caller, and should perform
4573 * no more than a comparison against the deadline in the common case.
4574 */
4575void
4576sched_timeshare_consider_maintenance(uint64_t ctime) {
4577 uint64_t ndeadline, deadline = sched_maintenance_deadline;
4578
4579 if (__improbable(ctime >= deadline)) {
4580 if (__improbable(current_thread() == sched_maintenance_thread))
4581 return;
4582 OSMemoryBarrier();
4583
4584 ndeadline = ctime + sched_tick_interval;
4585
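		/*
		 * Only the caller that successfully advances the deadline issues
		 * the wakeup, so racing callers in the same interval wake the
		 * maintenance thread at most once.
		 */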
4586 if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) {
4587 thread_wakeup((event_t)sched_timeshare_maintenance_continue);
4588 sched_maintenance_wakeups++;
4589 }
4590 }
4591}
4592
4593#endif /* CONFIG_SCHED_TIMESHARE_CORE */
4594
4595void
4596sched_init_thread(void (*continuation)(void))
4597{
4598 thread_block(THREAD_CONTINUE_NULL);
4599
4600 thread_t thread = current_thread();
4601
4602 thread_set_thread_name(thread, "sched_maintenance_thread");
4603
4604 sched_maintenance_thread = thread;
4605
4606 continuation();
4607
4608 /*NOTREACHED*/
4609}
4610
4611#if defined(CONFIG_SCHED_TIMESHARE_CORE)
4612
4613/*
4614 * thread_update_scan / runq_scan:
4615 *
4616 * Scan the run queues to account for timesharing threads
4617 * which need to be updated.
4618 *
4619 * Scanner runs in two passes. Pass one squirrels likely
4620 * threads away in an array, pass two does the update.
4621 *
4622 * This is necessary because the run queue is locked for
4623 * the candidate scan, but the thread is locked for the update.
4624 *
4625 * Array should be sized to make forward progress, without
4626 * disabling preemption for long periods.
4627 */
4628
4629#define THREAD_UPDATE_SIZE 128
4630
4631static thread_t thread_update_array[THREAD_UPDATE_SIZE];
4632static uint32_t thread_update_count = 0;
4633
4634/* Returns TRUE if thread was added, FALSE if thread_update_array is full */
4635boolean_t
4636thread_update_add_thread(thread_t thread)
4637{
4638 if (thread_update_count == THREAD_UPDATE_SIZE)
4639 return (FALSE);
4640
4641 thread_update_array[thread_update_count++] = thread;
4642 thread_reference_internal(thread);
4643 return (TRUE);
4644}
4645
4646void
4647thread_update_process_threads(void)
4648{
4649 assert(thread_update_count <= THREAD_UPDATE_SIZE);
4650
4651 for (uint32_t i = 0 ; i < thread_update_count ; i++) {
4652 thread_t thread = thread_update_array[i];
4653 assert_thread_magic(thread);
4654 thread_update_array[i] = THREAD_NULL;
4655
4656 spl_t s = splsched();
4657 thread_lock(thread);
4658 if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) {
4659 SCHED(update_priority)(thread);
4660 }
4661 thread_unlock(thread);
4662 splx(s);
4663
4664 thread_deallocate(thread);
4665 }
4666
4667 thread_update_count = 0;
4668}
4669
4670/*
4671 * Scan a runq for candidate threads.
4672 *
4673 * Returns TRUE if retry is needed.
4674 */
4675boolean_t
4676runq_scan(
4677 run_queue_t runq,
4678 sched_update_scan_context_t scan_context)
4679{
4680 int count = runq->count;
4681 int queue_index;
4682
4683 assert(count >= 0);
4684
4685 if (count == 0)
4686 return FALSE;
4687
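	/* Visit only priority levels with runnable threads, as tracked by the runq bitmap */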
4688 for (queue_index = bitmap_first(runq->bitmap, NRQS);
4689 queue_index >= 0;
4690 queue_index = bitmap_next(runq->bitmap, queue_index)) {
4691
4692 thread_t thread;
4693 queue_t queue = &runq->queues[queue_index];
4694
4695 qe_foreach_element(thread, queue, runq_links) {
4696 assert(count > 0);
4697 assert_thread_magic(thread);
4698
4699 if (thread->sched_stamp != sched_tick &&
4700 thread->sched_mode == TH_MODE_TIMESHARE) {
4701 if (thread_update_add_thread(thread) == FALSE)
4702 return TRUE;
4703 }
4704
4705 if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
4706 if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
4707 scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
4708 }
4709 } else {
4710 if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
4711 scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
4712 }
4713 }
4714 count--;
4715 }
4716 }
4717
4718 return FALSE;
4719}
4720
4721#endif /* CONFIG_SCHED_TIMESHARE_CORE */
4722
4723boolean_t
4724thread_eager_preemption(thread_t thread)
4725{
4726 return ((thread->sched_flags & TH_SFLAG_EAGERPREEMPT) != 0);
4727}
4728
4729void
4730thread_set_eager_preempt(thread_t thread)
4731{
4732 spl_t x;
4733 processor_t p;
4734 ast_t ast = AST_NONE;
4735
4736 x = splsched();
4737 p = current_processor();
4738
4739 thread_lock(thread);
4740 thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;
4741
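	/*
	 * If marking the current thread, re-evaluate preemption here and
	 * block if required; otherwise send an AST check to the processor
	 * the thread is running on so it notices the flag.
	 */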
4742 if (thread == current_thread()) {
4743
4744 ast = csw_check(p, AST_NONE);
4745 thread_unlock(thread);
4746 if (ast != AST_NONE) {
4747 (void) thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
4748 }
4749 } else {
4750 p = thread->last_processor;
4751
4752 if (p != PROCESSOR_NULL && p->state == PROCESSOR_RUNNING &&
4753 p->active_thread == thread) {
4754 cause_ast_check(p);
4755 }
4756
4757 thread_unlock(thread);
4758 }
4759
4760 splx(x);
4761}
4762
4763void
4764thread_clear_eager_preempt(thread_t thread)
4765{
4766 spl_t x;
4767
4768 x = splsched();
4769 thread_lock(thread);
4770
4771 thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
4772
4773 thread_unlock(thread);
4774 splx(x);
4775}
4776
4777/*
4778 * Scheduling statistics
4779 */
4780void
4781sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
4782{
4783 struct processor_sched_statistics *stats;
4784 boolean_t to_realtime = FALSE;
4785
4786 stats = &processor->processor_data.sched_stats;
4787 stats->csw_count++;
4788
4789 if (otherpri >= BASEPRI_REALTIME) {
4790 stats->rt_sched_count++;
4791 to_realtime = TRUE;
4792 }
4793
4794 if ((reasons & AST_PREEMPT) != 0) {
4795 stats->preempt_count++;
4796
4797 if (selfpri >= BASEPRI_REALTIME) {
4798 stats->preempted_rt_count++;
4799 }
4800
4801 if (to_realtime) {
4802 stats->preempted_by_rt_count++;
4803 }
4804
4805 }
4806}
4807
4808void
4809sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
4810{
4811 uint64_t timestamp = mach_absolute_time();
4812
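	/* Accumulate the time-weighted run queue depth: old depth times the interval it was held */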
4813 stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
4814 stats->last_change_timestamp = timestamp;
4815}
4816
4817/*
4818 * For calls from assembly code
4819 */
4820#undef thread_wakeup
4821void
4822thread_wakeup(
4823 event_t x);
4824
4825void
4826thread_wakeup(
4827 event_t x)
4828{
4829 thread_wakeup_with_result(x, THREAD_AWAKENED);
4830}
4831
4832boolean_t
4833preemption_enabled(void)
4834{
4835 return (get_preemption_level() == 0 && ml_get_interrupts_enabled());
4836}
4837
4838static void
4839sched_timer_deadline_tracking_init(void) {
4840 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
4841 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
4842}
4843
4844
4845kern_return_t
4846sched_work_interval_notify(thread_t thread, uint64_t work_interval_id, uint64_t start, uint64_t finish, uint64_t deadline, uint64_t next_start, uint32_t flags)
4847{
4848 int urgency;
4849 uint64_t urgency_param1, urgency_param2;
4850 spl_t s;
4851
4852 if (work_interval_id == 0) {
4853 return (KERN_INVALID_ARGUMENT);
4854 }
4855
4856 assert(thread == current_thread());
4857
4858 thread_mtx_lock(thread);
4859 if (thread->work_interval_id != work_interval_id) {
4860 thread_mtx_unlock(thread);
4861 return (KERN_INVALID_ARGUMENT);
4862 }
4863 thread_mtx_unlock(thread);
4864
4865 s = splsched();
4866 thread_lock(thread);
4867 urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
4868 thread_unlock(thread);
4869 splx(s);
4870
4871 machine_work_interval_notify(thread, work_interval_id, start, finish, deadline, next_start, urgency, flags);
4872 return (KERN_SUCCESS);
4873}
4874
4875void thread_set_options(uint32_t thopt) {
4876 spl_t x;
4877 thread_t t = current_thread();
4878
4879 x = splsched();
4880 thread_lock(t);
4881
4882 t->options |= thopt;
4883
4884 thread_unlock(t);
4885 splx(x);
4886}
4887
4888void thread_set_pending_block_hint(thread_t thread, block_hint_t block_hint) {
4889 thread->pending_block_hint = block_hint;
4890}