git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2016 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	/*
	29	* @OSF_FREE_COPYRIGHT@
	30	*/
	31	/*
	32	* Mach Operating System
	33	* Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
	34	* All Rights Reserved.
	35	*
	36	* Permission to use, copy, modify and distribute this software and its
	37	* documentation is hereby granted, provided that both the copyright
	38	* notice and this permission notice appear in all copies of the
	39	* software, derivative works or modified versions, and any portions
	40	* thereof, and that both notices appear in supporting documentation.
	41	*
	42	* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
	43	* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
	44	* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
	45	*
	46	* Carnegie Mellon requests users of this software to return to
	47	*
	48	* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
	49	* School of Computer Science
	50	* Carnegie Mellon University
	51	* Pittsburgh PA 15213-3890
	52	*
	53	* any improvements or extensions that they make and grant Carnegie Mellon
	54	* the rights to redistribute these changes.
	55	*/
	56	/*
	57	*/
	58	/*
	59	* File: sched_prim.c
	60	* Author: Avadis Tevanian, Jr.
	61	* Date: 1986
	62	*
	63	* Scheduling primitives
	64	*
	65	*/
	66
	67	#include <debug.h>
	68
	69	#include <mach/mach_types.h>
	70	#include <mach/machine.h>
	71	#include <mach/policy.h>
	72	#include <mach/sync_policy.h>
	73	#include <mach/thread_act.h>
	74
	75	#include <machine/machine_routines.h>
	76	#include <machine/sched_param.h>
	77	#include <machine/machine_cpu.h>
	78	#include <machine/machlimits.h>
	79
	80	#ifdef CONFIG_MACH_APPROXIMATE_TIME
	81	#include <machine/commpage.h>
	82	#endif
	83
	84	#include <kern/kern_types.h>
	85	#include <kern/backtrace.h>
	86	#include <kern/clock.h>
	87	#include <kern/counters.h>
	88	#include <kern/cpu_number.h>
	89	#include <kern/cpu_data.h>
	90	#include <kern/smp.h>
	91	#include <kern/debug.h>
	92	#include <kern/macro_help.h>
	93	#include <kern/machine.h>
	94	#include <kern/misc_protos.h>
	95	#include <kern/processor.h>
	96	#include <kern/queue.h>
	97	#include <kern/sched.h>
	98	#include <kern/sched_prim.h>
	99	#include <kern/sfi.h>
	100	#include <kern/syscall_subr.h>
	101	#include <kern/task.h>
	102	#include <kern/thread.h>
	103	#include <kern/ledger.h>
	104	#include <kern/timer_queue.h>
	105	#include <kern/waitq.h>
	106	#include <kern/policy_internal.h>
	107
	108	#include <vm/pmap.h>
	109	#include <vm/vm_kern.h>
	110	#include <vm/vm_map.h>
	111
	112	#include <mach/sdt.h>
	113
	114	#include <sys/kdebug.h>
	115	#include <kperf/kperf.h>
	116	#include <kern/kpc.h>
	117
	118	#include <kern/pms.h>
	119
	/* Global real-time run queue; sched_realtime_init() zeroes its count and initializes its queue. */
	120	struct rt_queue rt_runq;
	121
	/* Marker constant. NOTE(review): presumably stands in for a processor pointer on threads queued on rt_runq -- confirm against its users, which are outside this view. */
	122	uintptr_t sched_thread_on_rt_queue = (uintptr_t)0xDEAFBEE0;
	123
	124	/* Lock RT runq, must be done with interrupts disabled (under splsched()) */
	/* With __SMP__ the rt_lock_*() macros operate on the static simple lock rt_lock; otherwise they expand to empty statements. */
	125	#if __SMP__
	126	decl_simple_lock_data(static,rt_lock);
	127	#define rt_lock_init() simple_lock_init(&rt_lock, 0)
	128	#define rt_lock_lock() simple_lock(&rt_lock)
	129	#define rt_lock_unlock() simple_unlock(&rt_lock)
	130	#else
	131	#define rt_lock_init() do { } while(0)
	132	#define rt_lock_lock() do { } while(0)
	133	#define rt_lock_unlock() do { } while(0)
	134	#endif
	135
	/* Preemption rate for standard threads; sched_timeshare_init() derives std_quantum_us as 1000000 / rate. */
	136	#define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */
	137	int default_preemption_rate = DEFAULT_PREEMPTION_RATE;
	138
	/* Preemption rate for background threads; sched_timeshare_init() derives bg_quantum_us as 1000000 / rate. */
	139	#define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */
	140	int default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
	141
	/* Multiplied by std_quantum in sched_timeshare_timebase_init() to obtain max_unsafe_computation and sched_safe_duration. */
	142	#define MAX_UNSAFE_QUANTA 800
	143	int max_unsafe_quanta = MAX_UNSAFE_QUANTA;
	144
	/* Multiplied by std_quantum in sched_timeshare_timebase_init() to obtain max_poll_computation. */
	145	#define MAX_POLL_QUANTA 2
	146	int max_poll_quanta = MAX_POLL_QUANTA;
	147
	148	#define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */
	149	int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
	150
	/* max_poll_quanta * std_quantum (absolute-time units). */
	151	uint64_t max_poll_computation;
	152
	/* max_unsafe_quanta * std_quantum, and twice that value, respectively (absolute-time units). */
	153	uint64_t max_unsafe_computation;
	154	uint64_t sched_safe_duration;
	155
	156	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	157
	/* Quanta in absolute-time units, computed by sched_timeshare_timebase_init(); min_std_quantum corresponds to 250 us. */
	158	uint32_t std_quantum;
	159	uint32_t min_std_quantum;
	160	uint32_t bg_quantum;
	161
	/* Quanta in microseconds, computed by sched_timeshare_init() from the preemption rates above. */
	162	uint32_t std_quantum_us;
	163	uint32_t bg_quantum_us;
	164
	165	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	166
	/* Set by sched_timeshare_timebase_init() to std_quantum, std_quantum / 2 and std_quantum respectively. */
	167	uint32_t thread_depress_time;
	168	uint32_t default_timeshare_computation;
	169	uint32_t default_timeshare_constraint;
	170
	/* Real-time computation bounds set by sched_realtime_timebase_init(): 50 ms maximum, 50 us minimum. */
	171	uint32_t max_rt_quantum;
	172	uint32_t min_rt_quantum;
	173
	174	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	175
	/* sched_tick is reset to 0 by sched_timeshare_init(); sched_tick_interval is USEC_PER_SEC >> SCHED_TICK_SHIFT microseconds converted to absolute time. */
	176	unsigned sched_tick;
	177	uint32_t sched_tick_interval;
	178
	/* sched_timeshare_timebase_init() initializes every sched_pri_shifts[] entry to INT8_MAX and computes sched_fixed_shift from the tick interval. */
	179	uint32_t sched_pri_shifts[TH_BUCKET_MAX];
	180	uint32_t sched_fixed_shift;
	181
	/* Can be overridden with the "sched_decay_usage_age_factor" boot-arg (see load_shift_init()). */
	182	uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */
	183
	184	/* Allow foreground to decay past default to resolve inversions */
	185	#define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
	/* Can be overridden with the "sched_pri_decay_limit" boot-arg or the "kern.sched_pri_decay_limit" default (see sched_init()). */
	186	int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
	187
	188	/* Defaults for timer deadline profiling */
	189	#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
	190	* 2ms */
	191	#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
	192	<= 5ms */
	193
	/* Thresholds that thread_unblock() compares a processor's timer_call_ttd against when binning interrupt wakeups. */
	194	uint64_t timer_deadline_tracking_bin_1;
	195	uint64_t timer_deadline_tracking_bin_2;
	196
	197	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	198
	199	thread_t sched_maintenance_thread;
	200
	201
	/* One second expressed in absolute-time units; computed by sched_timebase_init(). */
	202	uint64_t sched_one_second_interval;
	203
	204	/* Forwards */
	205
	206	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	207
	/* Both are called from sched_timeshare_init(). */
	208	static void load_shift_init(void);
	209	static void preempt_pri_init(void);
	210
	211	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	212
	213	static thread_t thread_select(
	214	thread_t thread,
	215	processor_t processor,
	216	ast_t reason);
	217
	218	#if CONFIG_SCHED_IDLE_IN_PLACE
	219	static thread_t thread_select_idle(
	220	thread_t thread,
	221	processor_t processor);
	222	#endif
	223
	224	thread_t processor_idle(
	225	thread_t thread,
	226	processor_t processor);
	227
	228	ast_t
	229	csw_check_locked( processor_t processor,
	230	processor_set_t pset,
	231	ast_t check_reason);
	232
	233	static void processor_setrun(
	234	processor_t processor,
	235	thread_t thread,
	236	integer_t options);
	237
	/* Called from sched_init(). */
	238	static void
	239	sched_realtime_init(void);
	240
	/* Called from sched_timebase_init(). */
	241	static void
	242	sched_realtime_timebase_init(void);
	243
	/* Called from sched_init(). */
	244	static void
	245	sched_timer_deadline_tracking_init(void);
	246
	/*
	 * TLOG(a, fmt, ...): on DEBUG builds, kprintf the message when any bit
	 * of "a" is set in debug_task; on other builds it expands to nothing.
	 *
	 * The DEBUG variant is wrapped in do { } while (0) so that it is a
	 * single statement (it cannot capture a following "else"), and the
	 * mask argument is parenthesized so that compound expressions are
	 * evaluated as a unit.
	 */
	#if DEBUG
	extern int debug_task;
	#define TLOG(a, fmt, args...) do { if (debug_task & (a)) kprintf(fmt, ## args); } while (0)
	#else
	#define TLOG(a, fmt, args...) do {} while (0)
	#endif
	253
	254	static processor_t
	255	thread_bind_internal(
	256	thread_t thread,
	257	processor_t processor);
	258
	259	static void
	260	sched_vm_group_maintenance(void);
	261
	262	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	/* Table of NRQS load-shift values filled in by load_shift_init(). */
	263	int8_t sched_load_shifts[NRQS];
	/* Priority bitmap populated by preempt_pri_init(). */
	264	bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS)];
	265	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	266
	/* Dispatch table of the active scheduler; chosen once by sched_init(). */
	267	const struct sched_dispatch_table *sched_current_dispatch = NULL;
	268
	269	/*
	270	* Statically allocate a buffer to hold the longest possible
	271	* scheduler description string, as currently implemented.
	272	* bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
	273	* to export to userspace via sysctl(3). If either version
	274	* changes, update the other.
	275	*
	276	* Note that in addition to being an upper bound on the strings
	277	* in the kernel, it's also an exact parameter to PE_get_default(),
	278	* which interrogates the device tree on some platforms. That
	279	* API requires the caller know the exact size of the device tree
	280	* property, so we need both a legacy size (32) and the current size
	281	* (48) to deal with old and new device trees. The device tree property
	282	* is similarly padded to a fixed size so that the same kernel image
	283	* can run on multiple devices with different schedulers configured
	284	* in the device tree.
	285	*/
	/* sched_init() copies the selected scheduler's sched_name into this buffer. */
	286	char sched_string[SCHED_STRING_MAX_LENGTH];
	287
	/* Filled from the "sched_debug" boot-arg in sched_init(). */
	288	uint32_t sched_debug_flags;
	289
	290	/* Global flag which indicates whether Background Stepper Context is enabled */
	291	static int cpu_throttle_enabled = 1;
	292
	293	void
	294	sched_init(void)
	295	{
	296	char sched_arg[SCHED_STRING_MAX_LENGTH] = { '\0' };
	297
	298	/* Check for runtime selection of the scheduler algorithm */
	299	if (!PE_parse_boot_argn("sched", sched_arg, sizeof (sched_arg))) {
	300	/* If no boot-args override, look in device tree */
	301	if (!PE_get_default("kern.sched", sched_arg,
	302	SCHED_STRING_MAX_LENGTH)) {
	303	sched_arg[0] = '\0';
	304	}
	305	}
	306
	307
	308	if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
	309	/* No boot-args, check in device tree */
	310	if (!PE_get_default("kern.sched_pri_decay_limit",
	311	&sched_pri_decay_band_limit,
	312	sizeof(sched_pri_decay_band_limit))) {
	313	/* Allow decay all the way to normal limits */
	314	sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
	315	}
	316	}
	317
	318	kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);
	319
	320	if (strlen(sched_arg) > 0) {
	321	if (0) {
	322	/* Allow pattern below */
	323	#if defined(CONFIG_SCHED_TRADITIONAL)
	324	} else if (0 == strcmp(sched_arg, sched_traditional_dispatch.sched_name)) {
	325	sched_current_dispatch = &sched_traditional_dispatch;
	326	} else if (0 == strcmp(sched_arg, sched_traditional_with_pset_runqueue_dispatch.sched_name)) {
	327	sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
	328	#endif
	329	#if defined(CONFIG_SCHED_PROTO)
	330	} else if (0 == strcmp(sched_arg, sched_proto_dispatch.sched_name)) {
	331	sched_current_dispatch = &sched_proto_dispatch;
	332	#endif
	333	#if defined(CONFIG_SCHED_GRRR)
	334	} else if (0 == strcmp(sched_arg, sched_grrr_dispatch.sched_name)) {
	335	sched_current_dispatch = &sched_grrr_dispatch;
	336	#endif
	337	#if defined(CONFIG_SCHED_MULTIQ)
	338	} else if (0 == strcmp(sched_arg, sched_multiq_dispatch.sched_name)) {
	339	sched_current_dispatch = &sched_multiq_dispatch;
	340	} else if (0 == strcmp(sched_arg, sched_dualq_dispatch.sched_name)) {
	341	sched_current_dispatch = &sched_dualq_dispatch;
	342	#endif
	343	} else {
	344	#if defined(CONFIG_SCHED_TRADITIONAL)
	345	printf("Unrecognized scheduler algorithm: %s\n", sched_arg);
	346	printf("Scheduler: Using instead: %s\n", sched_traditional_with_pset_runqueue_dispatch.sched_name);
	347	sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
	348	#else
	349	panic("Unrecognized scheduler algorithm: %s", sched_arg);
	350	#endif
	351	}
	352	kprintf("Scheduler: Runtime selection of %s\n", SCHED(sched_name));
	353	} else {
	354	#if defined(CONFIG_SCHED_MULTIQ)
	355	sched_current_dispatch = &sched_multiq_dispatch;
	356	#elif defined(CONFIG_SCHED_TRADITIONAL)
	357	sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
	358	#elif defined(CONFIG_SCHED_PROTO)
	359	sched_current_dispatch = &sched_proto_dispatch;
	360	#elif defined(CONFIG_SCHED_GRRR)
	361	sched_current_dispatch = &sched_grrr_dispatch;
	362	#else
	363	#error No default scheduler implementation
	364	#endif
	365	kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
	366	}
	367
	368	strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));
	369
	370	if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
	371	kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
	372	}
	373
	374	SCHED(init)();
	375	sched_realtime_init();
	376	ast_init();
	377	sched_timer_deadline_tracking_init();
	378
	379	SCHED(pset_init)(&pset0);
	380	SCHED(processor_init)(master_processor);
	381	}
	382
	383	void
	384	sched_timebase_init(void)
	385	{
	386	uint64_t abstime;
	387
	388	clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
	389	sched_one_second_interval = abstime;
	390
	391	SCHED(timebase_init)();
	392	sched_realtime_timebase_init();
	393	}
	394
	395	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	396
	397	void
	398	sched_timeshare_init(void)
	399	{
	400	/*
	401	* Calculate the timeslicing quantum
	402	* in us.
	403	*/
	404	if (default_preemption_rate < 1)
	405	default_preemption_rate = DEFAULT_PREEMPTION_RATE;
	406	std_quantum_us = (1000 * 1000) / default_preemption_rate;
	407
	408	printf("standard timeslicing quantum is %d us\n", std_quantum_us);
	409
	410	if (default_bg_preemption_rate < 1)
	411	default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
	412	bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
	413
	414	printf("standard background quantum is %d us\n", bg_quantum_us);
	415
	416	load_shift_init();
	417	preempt_pri_init();
	418	sched_tick = 0;
	419	}
	420
	421	void
	422	sched_timeshare_timebase_init(void)
	423	{
	424	uint64_t abstime;
	425	uint32_t shift;
	426
	427	/* standard timeslicing quantum */
	428	clock_interval_to_absolutetime_interval(
	429	std_quantum_us, NSEC_PER_USEC, &abstime);
	430	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	431	std_quantum = (uint32_t)abstime;
	432
	433	/* smallest remaining quantum (250 us) */
	434	clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
	435	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	436	min_std_quantum = (uint32_t)abstime;
	437
	438	/* quantum for background tasks */
	439	clock_interval_to_absolutetime_interval(
	440	bg_quantum_us, NSEC_PER_USEC, &abstime);
	441	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	442	bg_quantum = (uint32_t)abstime;
	443
	444	/* scheduler tick interval */
	445	clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
	446	NSEC_PER_USEC, &abstime);
	447	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	448	sched_tick_interval = (uint32_t)abstime;
	449
	450	/*
	451	* Compute conversion factor from usage to
	452	* timesharing priorities with 5/8 ** n aging.
	453	*/
	454	abstime = (abstime * 5) / 3;
	455	for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift)
	456	abstime >>= 1;
	457	sched_fixed_shift = shift;
	458
	459	for (uint32_t i = 0 ; i < TH_BUCKET_MAX ; i++)
	460	sched_pri_shifts[i] = INT8_MAX;
	461
	462	max_unsafe_computation = ((uint64_t)max_unsafe_quanta) * std_quantum;
	463	sched_safe_duration = 2 * ((uint64_t)max_unsafe_quanta) * std_quantum;
	464
	465	max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
	466	thread_depress_time = 1 * std_quantum;
	467	default_timeshare_computation = std_quantum / 2;
	468	default_timeshare_constraint = std_quantum;
	469
	470	}
	471
	472	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	473
	474	static void
	475	sched_realtime_init(void)
	476	{
	477	rt_lock_init();
	478
	479	rt_runq.count = 0;
	480	queue_init(&rt_runq.queue);
	481	}
	482
	483	static void
	484	sched_realtime_timebase_init(void)
	485	{
	486	uint64_t abstime;
	487
	488	/* smallest rt computaton (50 us) */
	489	clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
	490	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	491	min_rt_quantum = (uint32_t)abstime;
	492
	493	/* maximum rt computation (50 ms) */
	494	clock_interval_to_absolutetime_interval(
	495	50, 1000*NSEC_PER_USEC, &abstime);
	496	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	497	max_rt_quantum = (uint32_t)abstime;
	498
	499	}
	500
	501	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	502
	503	/*
	504	* Set up values for timeshare
	505	* loading factors.
	506	*/
	507	static void
	508	load_shift_init(void)
	509	{
	510	int8_t k, *p = sched_load_shifts;
	511	uint32_t i, j;
	512
	513	uint32_t sched_decay_penalty = 1;
	514
	515	if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof (sched_decay_penalty))) {
	516	kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
	517	}
	518
	519	if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof (sched_decay_usage_age_factor))) {
	520	kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
	521	}
	522
	523	if (sched_decay_penalty == 0) {
	524	/*
	525	* There is no penalty for timeshare threads for using too much
	526	* CPU, so set all load shifts to INT8_MIN. Even under high load,
	527	* sched_pri_shift will be >INT8_MAX, and there will be no
	528	* penalty applied to threads (nor will sched_usage be updated per
	529	* thread).
	530	*/
	531	for (i = 0; i < NRQS; i++) {
	532	sched_load_shifts[i] = INT8_MIN;
	533	}
	534
	535	return;
	536	}
	537
	538	p++ = INT8_MIN; p++ = 0;
	539
	540	/*
	541	* For a given system load "i", the per-thread priority
	542	* penalty per quantum of CPU usage is ~2^k priority
	543	* levels. "sched_decay_penalty" can cause more
	544	* array entries to be filled with smaller "k" values
	545	*/
	546	for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
	547	for (j <<= 1; (i < j) && (i < NRQS); ++i)
	548	*p++ = k;
	549	}
	550	}
	551
	552	static void
	553	preempt_pri_init(void)
	554	{
	555	bitmap_t *p = sched_preempt_pri;
	556
	557	for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i)
	558	bitmap_set(p, i);
	559
	560	for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i)
	561	bitmap_set(p, i);
	562	}
	563
	564	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	565
	566	/*
	567	* Thread wait timer expiration.
	568	*/
	569	void
	570	thread_timer_expire(
	571	void *p0,
	572	__unused void *p1)
	573	{
	574	thread_t thread = p0;
	575	spl_t s;
	576
	577	assert_thread_magic(thread);
	578
	579	s = splsched();
	580	thread_lock(thread);
	581	if (--thread->wait_timer_active == 0) {
	582	if (thread->wait_timer_is_set) {
	583	thread->wait_timer_is_set = FALSE;
	584	clear_wait_internal(thread, THREAD_TIMED_OUT);
	585	}
	586	}
	587	thread_unlock(thread);
	588	splx(s);
	589	}
	590
	591	/*
	592	* thread_unblock:
	593	*
	594	* Unblock thread on wake up.
	595	*
	596	* Returns TRUE if the thread should now be placed on the runqueue.
	597	*
	598	* Thread must be locked.
	599	*
	600	* Called at splsched().
	601	*/
	602	boolean_t
	603	thread_unblock(
	604	thread_t thread,
	605	wait_result_t wresult)
	606	{
	607	boolean_t ready_for_runq = FALSE;
	608	thread_t cthread = current_thread();
	609	uint32_t new_run_count;
	610
	611	/*
	612	* Set wait_result.
	613	*/
	614	thread->wait_result = wresult;
	615
	616	/*
	617	* Cancel pending wait timer.
	618	*/
	619	if (thread->wait_timer_is_set) {
	620	if (timer_call_cancel(&thread->wait_timer))
	621	thread->wait_timer_active--;
	622	thread->wait_timer_is_set = FALSE;
	623	}
	624
	625	/*
	626	* Update scheduling state: not waiting,
	627	* set running.
	628	*/
	629	thread->state &= ~(TH_WAIT\|TH_UNINT);
	630
	631	if (!(thread->state & TH_RUN)) {
	632	thread->state \|= TH_RUN;
	633	thread->last_made_runnable_time = mach_approximate_time();
	634
	635	ready_for_runq = TRUE;
	636
	637	(*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
	638
	639	/* Update the runnable thread count */
	640	new_run_count = sched_run_incr(thread);
	641	} else {
	642	/*
	643	* Either the thread is idling in place on another processor,
	644	* or it hasn't finished context switching yet.
	645	*/
	646	#if CONFIG_SCHED_IDLE_IN_PLACE
	647	if (thread->state & TH_IDLE) {
	648	processor_t processor = thread->last_processor;
	649
	650	if (processor != current_processor())
	651	machine_signal_idle(processor);
	652	}
	653	#else
	654	assert((thread->state & TH_IDLE) == 0);
	655	#endif
	656	/*
	657	* The run count is only dropped after the context switch completes
	658	* and the thread is still waiting, so we should not run_incr here
	659	*/
	660	new_run_count = sched_run_buckets[TH_BUCKET_RUN];
	661	}
	662
	663
	664	/*
	665	* Calculate deadline for real-time threads.
	666	*/
	667	if (thread->sched_mode == TH_MODE_REALTIME) {
	668	uint64_t ctime;
	669
	670	ctime = mach_absolute_time();
	671	thread->realtime.deadline = thread->realtime.constraint + ctime;
	672	}
	673
	674	/*
	675	* Clear old quantum, fail-safe computation, etc.
	676	*/
	677	thread->quantum_remaining = 0;
	678	thread->computation_metered = 0;
	679	thread->reason = AST_NONE;
	680
	681	/* Obtain power-relevant interrupt and "platform-idle exit" statistics.
	682	* We also account for "double hop" thread signaling via
	683	* the thread callout infrastructure.
	684	* DRK: consider removing the callout wakeup counters in the future
	685	* they're present for verification at the moment.
	686	*/
	687	boolean_t aticontext, pidle;
	688	ml_get_power_state(&aticontext, &pidle);
	689
	690	if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
	691	ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
	692	DTRACE_SCHED2(iwakeup, struct thread , thread, struct proc , thread->task->bsd_info);
	693
	694	uint64_t ttd = PROCESSOR_DATA(current_processor(), timer_call_ttd);
	695
	696	if (ttd) {
	697	if (ttd <= timer_deadline_tracking_bin_1)
	698	thread->thread_timer_wakeups_bin_1++;
	699	else
	700	if (ttd <= timer_deadline_tracking_bin_2)
	701	thread->thread_timer_wakeups_bin_2++;
	702	}
	703
	704	if (pidle) {
	705	ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
	706	}
	707
	708	} else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
	709	if (cthread->callout_woken_from_icontext) {
	710	ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
	711	thread->thread_callout_interrupt_wakeups++;
	712	if (cthread->callout_woken_from_platform_idle) {
	713	ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
	714	thread->thread_callout_platform_idle_wakeups++;
	715	}
	716
	717	cthread->callout_woke_thread = TRUE;
	718	}
	719	}
	720
	721	if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
	722	thread->callout_woken_from_icontext = aticontext;
	723	thread->callout_woken_from_platform_idle = pidle;
	724	thread->callout_woke_thread = FALSE;
	725	}
	726
	727	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	728	MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) \| DBG_FUNC_NONE,
	729	(uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
	730	sched_run_buckets[TH_BUCKET_RUN], 0);
	731
	732	DTRACE_SCHED2(wakeup, struct thread , thread, struct proc , thread->task->bsd_info);
	733
	734	return (ready_for_runq);
	735	}
	736
	737	/*
	738	* Routine: thread_go
	739	* Purpose:
	740	* Unblock and dispatch thread.
	741	* Conditions:
	742	* thread lock held, IPC locks may be held.
	743	* thread must have been pulled from wait queue under same lock hold.
	744	* thread must have been waiting
	745	* Returns:
	746	* KERN_SUCCESS - Thread was set running
	747	*
	748	* TODO: This should return void
	749	*/
	750	kern_return_t
	751	thread_go(
	752	thread_t thread,
	753	wait_result_t wresult)
	754	{
	755	assert_thread_magic(thread);
	756
	757	assert(thread->at_safe_point == FALSE);
	758	assert(thread->wait_event == NO_EVENT64);
	759	assert(thread->waitq == NULL);
	760
	761	assert(!(thread->state & (TH_TERMINATE\|TH_TERMINATE2)));
	762	assert(thread->state & TH_WAIT);
	763
	764
	765	if (thread_unblock(thread, wresult)) {
	766	#if SCHED_TRACE_THREAD_WAKEUPS
	767	backtrace(&thread->thread_wakeup_bt[0],
	768	(sizeof(thread->thread_wakeup_bt)/sizeof(uintptr_t)));
	769	#endif
	770	thread_setrun(thread, SCHED_PREEMPT \| SCHED_TAILQ);
	771	}
	772
	773	return (KERN_SUCCESS);
	774	}
	775
	776	/*
	777	* Routine: thread_mark_wait_locked
	778	* Purpose:
	779	* Mark a thread as waiting. If, given the circumstances,
	780	* it doesn't want to wait (i.e. already aborted), then
	781	* indicate that in the return value.
	782	* Conditions:
	783	* at splsched() and thread is locked.
	784	*/
	785	__private_extern__
	786	wait_result_t
	787	thread_mark_wait_locked(
	788	thread_t thread,
	789	wait_interrupt_t interruptible)
	790	{
	791	boolean_t at_safe_point;
	792
	793	assert(!(thread->state & (TH_WAIT\|TH_IDLE\|TH_UNINT\|TH_TERMINATE2)));
	794
	795	/*
	796	* The thread may have certain types of interrupts/aborts masked
	797	* off. Even if the wait location says these types of interrupts
	798	* are OK, we have to honor mask settings (outer-scoped code may
	799	* not be able to handle aborts at the moment).
	800	*/
	801	if (interruptible > (thread->options & TH_OPT_INTMASK))
	802	interruptible = thread->options & TH_OPT_INTMASK;
	803
	804	at_safe_point = (interruptible == THREAD_ABORTSAFE);
	805
	806	if ( interruptible == THREAD_UNINT \|\|
	807	!(thread->sched_flags & TH_SFLAG_ABORT) \|\|
	808	(!at_safe_point &&
	809	(thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
	810
	811	if ( !(thread->state & TH_TERMINATE))
	812	DTRACE_SCHED(sleep);
	813
	814	thread->state \|= (interruptible) ? TH_WAIT : (TH_WAIT \| TH_UNINT);
	815	thread->at_safe_point = at_safe_point;
	816	return (thread->wait_result = THREAD_WAITING);
	817	}
	818	else
	819	if (thread->sched_flags & TH_SFLAG_ABORTSAFELY)
	820	thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
	821
	822	return (thread->wait_result = THREAD_INTERRUPTED);
	823	}
	824
	825	/*
	826	* Routine: thread_interrupt_level
	827	* Purpose:
	828	* Set the maximum interruptible state for the
	829	* current thread. The effective value of any
	830	* interruptible flag passed into assert_wait
	831	* will never exceed this.
	832	*
	833	* Useful for code that must not be interrupted,
	834	* but which calls code that doesn't know that.
	835	* Returns:
	836	* The old interrupt level for the thread.
	837	*/
	838	__private_extern__
	839	wait_interrupt_t
	840	thread_interrupt_level(
	841	wait_interrupt_t new_level)
	842	{
	843	thread_t thread = current_thread();
	844	wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
	845
	846	thread->options = (thread->options & ~TH_OPT_INTMASK) \| (new_level & TH_OPT_INTMASK);
	847
	848	return result;
	849	}
	850
	851	/*
	852	* Check to see if an assert wait is possible, without actually doing one.
	853	* This is used by debug code in locks and elsewhere to verify that it is
	854	* always OK to block when trying to take a blocking lock (since waiting
	855	* for the actual assert_wait to catch the case may make it hard to detect
	856	* this case.
	857	*/
	858	boolean_t
	859	assert_wait_possible(void)
	860	{
	861
	862	thread_t thread;
	863
	864	#if DEBUG
	865	if(debug_mode) return TRUE; /* Always succeed in debug mode */
	866	#endif
	867
	868	thread = current_thread();
	869
	870	return (thread == NULL \|\| waitq_wait_possible(thread));
	871	}
	872
	873	/*
	874	* assert_wait:
	875	*
	876	* Assert that the current thread is about to go to
	877	* sleep until the specified event occurs.
	878	*/
	879	wait_result_t
	880	assert_wait(
	881	event_t event,
	882	wait_interrupt_t interruptible)
	883	{
	884	if (__improbable(event == NO_EVENT))
	885	panic("%s() called with NO_EVENT", __func__);
	886
	887	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	888	MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)\|DBG_FUNC_NONE,
	889	VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
	890
	891	struct waitq *waitq;
	892	waitq = global_eventq(event);
	893	return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
	894	}
	895
	896	/*
	897	* assert_wait_queue:
	898	*
	899	* Return the global waitq for the specified event
	900	*/
	901	struct waitq *
	902	assert_wait_queue(
	903	event_t event)
	904	{
	905	return global_eventq(event);
	906	}
	907
	908	wait_result_t
	909	assert_wait_timeout(
	910	event_t event,
	911	wait_interrupt_t interruptible,
	912	uint32_t interval,
	913	uint32_t scale_factor)
	914	{
	915	thread_t thread = current_thread();
	916	wait_result_t wresult;
	917	uint64_t deadline;
	918	spl_t s;
	919
	920	if (__improbable(event == NO_EVENT))
	921	panic("%s() called with NO_EVENT", __func__);
	922
	923	struct waitq *waitq;
	924	waitq = global_eventq(event);
	925
	926	s = splsched();
	927	waitq_lock(waitq);
	928
	929	clock_interval_to_deadline(interval, scale_factor, &deadline);
	930
	931	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	932	MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)\|DBG_FUNC_NONE,
	933	VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
	934
	935	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	936	interruptible,
	937	TIMEOUT_URGENCY_SYS_NORMAL,
	938	deadline, TIMEOUT_NO_LEEWAY,
	939	thread);
	940
	941	waitq_unlock(waitq);
	942	splx(s);
	943	return wresult;
	944	}
	945
	946	wait_result_t
	947	assert_wait_timeout_with_leeway(
	948	event_t event,
	949	wait_interrupt_t interruptible,
	950	wait_timeout_urgency_t urgency,
	951	uint32_t interval,
	952	uint32_t leeway,
	953	uint32_t scale_factor)
	954	{
	955	thread_t thread = current_thread();
	956	wait_result_t wresult;
	957	uint64_t deadline;
	958	uint64_t abstime;
	959	uint64_t slop;
	960	uint64_t now;
	961	spl_t s;
	962
	963	if (__improbable(event == NO_EVENT))
	964	panic("%s() called with NO_EVENT", __func__);
	965
	966	now = mach_absolute_time();
	967	clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
	968	deadline = now + abstime;
	969
	970	clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
	971
	972	struct waitq *waitq;
	973	waitq = global_eventq(event);
	974
	975	s = splsched();
	976	waitq_lock(waitq);
	977
	978	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	979	MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)\|DBG_FUNC_NONE,
	980	VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
	981
	982	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	983	interruptible,
	984	urgency, deadline, slop,
	985	thread);
	986
	987	waitq_unlock(waitq);
	988	splx(s);
	989	return wresult;
	990	}
	991
	992	wait_result_t
	993	assert_wait_deadline(
	994	event_t event,
	995	wait_interrupt_t interruptible,
	996	uint64_t deadline)
	997	{
	998	thread_t thread = current_thread();
	999	wait_result_t wresult;
	1000	spl_t s;
	1001
	1002	if (__improbable(event == NO_EVENT))
	1003	panic("%s() called with NO_EVENT", __func__);
	1004
	1005	struct waitq *waitq;
	1006	waitq = global_eventq(event);
	1007
	1008	s = splsched();
	1009	waitq_lock(waitq);
	1010
	1011	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	1012	MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)\|DBG_FUNC_NONE,
	1013	VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
	1014
	1015	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	1016	interruptible,
	1017	TIMEOUT_URGENCY_SYS_NORMAL, deadline,
	1018	TIMEOUT_NO_LEEWAY, thread);
	1019	waitq_unlock(waitq);
	1020	splx(s);
	1021	return wresult;
	1022	}
	1023
	1024	wait_result_t
	1025	assert_wait_deadline_with_leeway(
	1026	event_t event,
	1027	wait_interrupt_t interruptible,
	1028	wait_timeout_urgency_t urgency,
	1029	uint64_t deadline,
	1030	uint64_t leeway)
	1031	{
	1032	thread_t thread = current_thread();
	1033	wait_result_t wresult;
	1034	spl_t s;
	1035
	1036	if (__improbable(event == NO_EVENT))
	1037	panic("%s() called with NO_EVENT", __func__);
	1038
	1039	struct waitq *waitq;
	1040	waitq = global_eventq(event);
	1041
	1042	s = splsched();
	1043	waitq_lock(waitq);
	1044
	1045	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	1046	MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)\|DBG_FUNC_NONE,
	1047	VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
	1048
	1049	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
	1050	interruptible,
	1051	urgency, deadline, leeway,
	1052	thread);
	1053	waitq_unlock(waitq);
	1054	splx(s);
	1055	return wresult;
	1056	}
	1057
	1058	/*
	1059	* thread_isoncpu:
	1060	*
	1061	* Return TRUE if a thread is running on a processor such that an AST
	1062	* is needed to pull it out of userspace execution, or if executing in
	1063	* the kernel, bring to a context switch boundary that would cause
	1064	* thread state to be serialized in the thread PCB.
	1065	*
	1066	* Thread locked, returns the same way. While locked, fields
	1067	* like "state" cannot change. "runq" can change only from set to unset.
	1068	*/
	1069	static inline boolean_t
	1070	thread_isoncpu(thread_t thread)
	1071	{
	1072	/* Not running or runnable */
	1073	if (!(thread->state & TH_RUN))
	1074	return (FALSE);
	1075
	1076	/* Waiting on a runqueue, not currently running */
	1077	/* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
	1078	if (thread->runq != PROCESSOR_NULL)
	1079	return (FALSE);
	1080
	1081	/*
	1082	* Thread does not have a stack yet
	1083	* It could be on the stack alloc queue or preparing to be invoked
	1084	*/
	1085	if (!thread->kernel_stack)
	1086	return (FALSE);
	1087
	1088	/*
	1089	* Thread must be running on a processor, or
	1090	* about to run, or just did run. In all these
	1091	* cases, an AST to the processor is needed
	1092	* to guarantee that the thread is kicked out
	1093	* of userspace and the processor has
	1094	* context switched (and saved register state).
	1095	*/
	1096	return (TRUE);
	1097	}
	1098
	1099	/*
	1100	* thread_stop:
	1101	*
	1102	* Force a preemption point for a thread and wait
	1103	* for it to stop running on a CPU. If a stronger
	1104	* guarantee is requested, wait until no longer
	1105	* runnable. Arbitrates access among
	1106	* multiple stop requests. (released by unstop)
	1107	*
	1108	* The thread must enter a wait state and stop via a
	1109	* separate means.
	1110	*
	1111	* Returns FALSE if interrupted.
	1112	*/
	1113	boolean_t
	1114	thread_stop(
	1115	thread_t thread,
	1116	boolean_t until_not_runnable)
	1117	{
	1118	wait_result_t wresult;
	1119	spl_t s = splsched();
	1120	boolean_t oncpu;
	1121
	1122	wake_lock(thread);
	1123	thread_lock(thread);
	1124
	1125	while (thread->state & TH_SUSP) {
	1126	thread->wake_active = TRUE;
	1127	thread_unlock(thread);
	1128
	1129	wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
	1130	wake_unlock(thread);
	1131	splx(s);
	1132
	1133	if (wresult == THREAD_WAITING)
	1134	wresult = thread_block(THREAD_CONTINUE_NULL);
	1135
	1136	if (wresult != THREAD_AWAKENED)
	1137	return (FALSE);
	1138
	1139	s = splsched();
	1140	wake_lock(thread);
	1141	thread_lock(thread);
	1142	}
	1143
	1144	thread->state \|= TH_SUSP;
	1145
	1146	while ((oncpu = thread_isoncpu(thread)) \|\|
	1147	(until_not_runnable && (thread->state & TH_RUN))) {
	1148	processor_t processor;
	1149
	1150	if (oncpu) {
	1151	assert(thread->state & TH_RUN);
	1152	processor = thread->chosen_processor;
	1153	cause_ast_check(processor);
	1154	}
	1155
	1156	thread->wake_active = TRUE;
	1157	thread_unlock(thread);
	1158
	1159	wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
	1160	wake_unlock(thread);
	1161	splx(s);
	1162
	1163	if (wresult == THREAD_WAITING)
	1164	wresult = thread_block(THREAD_CONTINUE_NULL);
	1165
	1166	if (wresult != THREAD_AWAKENED) {
	1167	thread_unstop(thread);
	1168	return (FALSE);
	1169	}
	1170
	1171	s = splsched();
	1172	wake_lock(thread);
	1173	thread_lock(thread);
	1174	}
	1175
	1176	thread_unlock(thread);
	1177	wake_unlock(thread);
	1178	splx(s);
	1179
	1180	/*
	1181	* We return with the thread unlocked. To prevent it from
	1182	* transitioning to a runnable state (or from TH_RUN to
	1183	* being on the CPU), the caller must ensure the thread
	1184	* is stopped via an external means (such as an AST)
	1185	*/
	1186
	1187	return (TRUE);
	1188	}
	1189
	1190	/*
	1191	* thread_unstop:
	1192	*
	1193	* Release a previous stop request and set
	1194	* the thread running if appropriate.
	1195	*
	1196	* Use only after a successful stop operation.
	1197	*/
	1198	void
	1199	thread_unstop(
	1200	thread_t thread)
	1201	{
	1202	spl_t s = splsched();
	1203
	1204	wake_lock(thread);
	1205	thread_lock(thread);
	1206
	1207	assert((thread->state & (TH_RUN\|TH_WAIT\|TH_SUSP)) != TH_SUSP);
	1208
	1209	if (thread->state & TH_SUSP) {
	1210	thread->state &= ~TH_SUSP;
	1211
	1212	if (thread->wake_active) {
	1213	thread->wake_active = FALSE;
	1214	thread_unlock(thread);
	1215
	1216	thread_wakeup(&thread->wake_active);
	1217	wake_unlock(thread);
	1218	splx(s);
	1219
	1220	return;
	1221	}
	1222	}
	1223
	1224	thread_unlock(thread);
	1225	wake_unlock(thread);
	1226	splx(s);
	1227	}
	1228
	1229	/*
	1230	* thread_wait:
	1231	*
	1232	* Wait for a thread to stop running. (non-interruptible)
	1233	*
	1234	*/
	1235	void
	1236	thread_wait(
	1237	thread_t thread,
	1238	boolean_t until_not_runnable)
	1239	{
	1240	wait_result_t wresult;
	1241	boolean_t oncpu;
	1242	processor_t processor;
	1243	spl_t s = splsched();
	1244
	1245	wake_lock(thread);
	1246	thread_lock(thread);
	1247
	1248	/*
	1249	* Wait until not running on a CPU. If stronger requirement
	1250	* desired, wait until not runnable. Assumption: if thread is
	1251	* on CPU, then TH_RUN is set, so we're not waiting in any case
	1252	* where the original, pure "TH_RUN" check would have let us
	1253	* finish.
	1254	*/
	1255	while ((oncpu = thread_isoncpu(thread)) \|\|
	1256	(until_not_runnable && (thread->state & TH_RUN))) {
	1257
	1258	if (oncpu) {
	1259	assert(thread->state & TH_RUN);
	1260	processor = thread->chosen_processor;
	1261	cause_ast_check(processor);
	1262	}
	1263
	1264	thread->wake_active = TRUE;
	1265	thread_unlock(thread);
	1266
	1267	wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
	1268	wake_unlock(thread);
	1269	splx(s);
	1270
	1271	if (wresult == THREAD_WAITING)
	1272	thread_block(THREAD_CONTINUE_NULL);
	1273
	1274	s = splsched();
	1275	wake_lock(thread);
	1276	thread_lock(thread);
	1277	}
	1278
	1279	thread_unlock(thread);
	1280	wake_unlock(thread);
	1281	splx(s);
	1282	}
	1283
	1284	/*
	1285	* Routine: clear_wait_internal
	1286	*
	1287	* Clear the wait condition for the specified thread.
	1288	* Start the thread executing if that is appropriate.
	1289	* Arguments:
	1290	* thread thread to awaken
	1291	* result Wakeup result the thread should see
	1292	* Conditions:
	1293	* At splsched
	1294	* the thread is locked.
	1295	* Returns:
	1296	* KERN_SUCCESS thread was rousted out a wait
	1297	* KERN_FAILURE thread was waiting but could not be rousted
	1298	* KERN_NOT_WAITING thread was not waiting
	1299	*/
	1300	__private_extern__ kern_return_t
	1301	clear_wait_internal(
	1302	thread_t thread,
	1303	wait_result_t wresult)
	1304	{
	1305	uint32_t i = LockTimeOutUsec;
	1306	struct waitq *waitq = thread->waitq;
	1307
	1308	do {
	1309	if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT))
	1310	return (KERN_FAILURE);
	1311
	1312	if (waitq != NULL) {
	1313	if (!waitq_pull_thread_locked(waitq, thread)) {
	1314	thread_unlock(thread);
	1315	delay(1);
	1316	if (i > 0 && !machine_timeout_suspended())
	1317	i--;
	1318	thread_lock(thread);
	1319	if (waitq != thread->waitq)
	1320	return KERN_NOT_WAITING;
	1321	continue;
	1322	}
	1323	}
	1324
	1325	/* TODO: Can we instead assert TH_TERMINATE is not set? */
	1326	if ((thread->state & (TH_WAIT\|TH_TERMINATE)) == TH_WAIT)
	1327	return (thread_go(thread, wresult));
	1328	else
	1329	return (KERN_NOT_WAITING);
	1330	} while (i > 0);
	1331
	1332	panic("clear_wait_internal: deadlock: thread=%p, wq=%p, cpu=%d\n",
	1333	thread, waitq, cpu_number());
	1334
	1335	return (KERN_FAILURE);
	1336	}
	1337
	1338
	1339	/*
	1340	* clear_wait:
	1341	*
	1342	* Clear the wait condition for the specified thread. Start the thread
	1343	* executing if that is appropriate.
	1344	*
	1345	* parameters:
	1346	* thread thread to awaken
	1347	* result Wakeup result the thread should see
	1348	*/
	1349	kern_return_t
	1350	clear_wait(
	1351	thread_t thread,
	1352	wait_result_t result)
	1353	{
	1354	kern_return_t ret;
	1355	spl_t s;
	1356
	1357	s = splsched();
	1358	thread_lock(thread);
	1359	ret = clear_wait_internal(thread, result);
	1360	thread_unlock(thread);
	1361	splx(s);
	1362	return ret;
	1363	}
	1364
	1365
	1366	/*
	1367	* thread_wakeup_prim:
	1368	*
	1369	* Common routine for thread_wakeup, thread_wakeup_with_result,
	1370	* and thread_wakeup_one.
	1371	*
	1372	*/
	1373	kern_return_t
	1374	thread_wakeup_prim(
	1375	event_t event,
	1376	boolean_t one_thread,
	1377	wait_result_t result)
	1378	{
	1379	if (__improbable(event == NO_EVENT))
	1380	panic("%s() called with NO_EVENT", __func__);
	1381
	1382	struct waitq *wq = global_eventq(event);
	1383
	1384	if (one_thread)
	1385	return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
	1386	else
	1387	return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
	1388	}
	1389
	1390	/*
	1391	* Wakeup a specified thread if and only if it's waiting for this event
	1392	*/
	1393	kern_return_t
	1394	thread_wakeup_thread(
	1395	event_t event,
	1396	thread_t thread)
	1397	{
	1398	if (__improbable(event == NO_EVENT))
	1399	panic("%s() called with NO_EVENT", __func__);
	1400
	1401	struct waitq *wq = global_eventq(event);
	1402
	1403	return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
	1404	}
	1405
	1406	/*
	1407	* Wakeup a thread waiting on an event and promote it to a priority.
	1408	*
	1409	* Requires woken thread to un-promote itself when done.
	1410	*/
	1411	kern_return_t
	1412	thread_wakeup_one_with_pri(
	1413	event_t event,
	1414	int priority)
	1415	{
	1416	if (__improbable(event == NO_EVENT))
	1417	panic("%s() called with NO_EVENT", __func__);
	1418
	1419	struct waitq *wq = global_eventq(event);
	1420
	1421	return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
	1422	}
	1423
	1424	/*
	1425	* Wakeup a thread waiting on an event,
	1426	* promote it to a priority,
	1427	* and return a reference to the woken thread.
	1428	*
	1429	* Requires woken thread to un-promote itself when done.
	1430	*/
	1431	thread_t
	1432	thread_wakeup_identify(event_t event,
	1433	int priority)
	1434	{
	1435	if (__improbable(event == NO_EVENT))
	1436	panic("%s() called with NO_EVENT", __func__);
	1437
	1438	struct waitq *wq = global_eventq(event);
	1439
	1440	return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
	1441	}
	1442
	1443	/*
	1444	* thread_bind:
	1445	*
	1446	* Force the current thread to execute on the specified processor.
	1447	* Takes effect after the next thread_block().
	1448	*
	1449	* Returns the previous binding. PROCESSOR_NULL means
	1450	* not bound.
	1451	*
	1452	* XXX - DO NOT export this to users - XXX
	1453	*/
	1454	processor_t
	1455	thread_bind(
	1456	processor_t processor)
	1457	{
	1458	thread_t self = current_thread();
	1459	processor_t prev;
	1460	spl_t s;
	1461
	1462	s = splsched();
	1463	thread_lock(self);
	1464
	1465	prev = thread_bind_internal(self, processor);
	1466
	1467	thread_unlock(self);
	1468	splx(s);
	1469
	1470	return (prev);
	1471	}
	1472
	1473	/*
	1474	* thread_bind_internal:
	1475	*
	1476	* If the specified thread is not the current thread, and it is currently
	1477	* running on another CPU, a remote AST must be sent to that CPU to cause
	1478	* the thread to migrate to its bound processor. Otherwise, the migration
	1479	* will occur at the next quantum expiration or blocking point.
	1480	*
	1481	* When the thread is the current thread, and explicit thread_block() should
	1482	* be used to force the current processor to context switch away and
	1483	* let the thread migrate to the bound processor.
	1484	*
	1485	* Thread must be locked, and at splsched.
	1486	*/
	1487
	1488	static processor_t
	1489	thread_bind_internal(
	1490	thread_t thread,
	1491	processor_t processor)
	1492	{
	1493	processor_t prev;
	1494
	1495	/* <rdar://problem/15102234> */
	1496	assert(thread->sched_pri < BASEPRI_RTQUEUES);
	1497	/* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
	1498	assert(thread->runq == PROCESSOR_NULL);
	1499
	1500	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND), thread_tid(thread), processor ? (uintptr_t)processor->cpu_id : (uintptr_t)-1, 0, 0, 0);
	1501
	1502	prev = thread->bound_processor;
	1503	thread->bound_processor = processor;
	1504
	1505	return (prev);
	1506	}
	1507
	1508	/*
	1509	* thread_vm_bind_group_add:
	1510	*
	1511	* The "VM bind group" is a special mechanism to mark a collection
	1512	* of threads from the VM subsystem that, in general, should be scheduled
	1513	* with only one CPU of parallelism. To accomplish this, we initially
	1514	* bind all the threads to the master processor, which has the effect
	1515	* that only one of the threads in the group can execute at once, including
	1516	* preempting threads in the group that are a lower priority. Future
	1517	* mechanisms may use more dynamic mechanisms to prevent the collection
	1518	* of VM threads from using more CPU time than desired.
	1519	*
	1520	* The current implementation can result in priority inversions where
	1521	* compute-bound priority 95 or realtime threads that happen to have
	1522	* landed on the master processor prevent the VM threads from running.
	1523	* When this situation is detected, we unbind the threads for one
	1524	* scheduler tick to allow the scheduler to run the threads on
	1525	* additional CPUs, before restoring the binding (assuming high latency
	1526	* is no longer a problem).
	1527	*/
	1528
	1529	/*
	1530	* The current max is provisioned for:
	1531	* vm_compressor_swap_trigger_thread (92)
	1532	* 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
	1533	* vm_pageout_continue (92)
	1534	* memorystatus_thread (95)
	1535	*/
	1536	#define MAX_VM_BIND_GROUP_COUNT (5)
	/* Taken around updates to, and enumeration of, the group list and its count. */
	1537	decl_simple_lock_data(static,sched_vm_group_list_lock);
	/* Threads registered through thread_vm_bind_group_add(); the first sched_vm_group_thread_count slots are in use. */
	1538	static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
	/* Number of valid entries in sched_vm_group_thread_list. */
	1539	static int sched_vm_group_thread_count;
	/* TRUE while sched_vm_group_maintenance() has the group unbound (or has a rebind still pending). */
	1540	static boolean_t sched_vm_group_temporarily_unbound = FALSE;
	1541
	1542	void
	1543	thread_vm_bind_group_add(void)
	1544	{
	1545	thread_t self = current_thread();
	1546
	1547	thread_reference_internal(self);
	1548	self->options \|= TH_OPT_SCHED_VM_GROUP;
	1549
	1550	simple_lock(&sched_vm_group_list_lock);
	1551	assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
	1552	sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
	1553	simple_unlock(&sched_vm_group_list_lock);
	1554
	1555	thread_bind(master_processor);
	1556
	1557	/* Switch to bound processor if not already there */
	1558	thread_block(THREAD_CONTINUE_NULL);
	1559	}
	1560
	1561	static void
	1562	sched_vm_group_maintenance(void)
	1563	{
	1564	uint64_t ctime = mach_absolute_time();
	1565	uint64_t longtime = ctime - sched_tick_interval;
	1566	int i;
	1567	spl_t s;
	1568	boolean_t high_latency_observed = FALSE;
	1569	boolean_t runnable_and_not_on_runq_observed = FALSE;
	1570	boolean_t bind_target_changed = FALSE;
	1571	processor_t bind_target = PROCESSOR_NULL;
	1572
	1573	/* Make sure nobody attempts to add new threads while we are enumerating them */
	1574	simple_lock(&sched_vm_group_list_lock);
	1575
	1576	s = splsched();
	1577
	1578	for (i=0; i < sched_vm_group_thread_count; i++) {
	1579	thread_t thread = sched_vm_group_thread_list[i];
	1580	assert(thread != THREAD_NULL);
	1581	thread_lock(thread);
	1582	if ((thread->state & (TH_RUN\|TH_WAIT)) == TH_RUN) {
	1583	if (thread->runq != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
	1584	high_latency_observed = TRUE;
	1585	} else if (thread->runq == PROCESSOR_NULL) {
	1586	/* There are some cases where a thread be transitiong that also fall into this case */
	1587	runnable_and_not_on_runq_observed = TRUE;
	1588	}
	1589	}
	1590	thread_unlock(thread);
	1591
	1592	if (high_latency_observed && runnable_and_not_on_runq_observed) {
	1593	/* All the things we are looking for are true, stop looking */
	1594	break;
	1595	}
	1596	}
	1597
	1598	splx(s);
	1599
	1600	if (sched_vm_group_temporarily_unbound) {
	1601	/* If we turned off binding, make sure everything is OK before rebinding */
	1602	if (!high_latency_observed) {
	1603	/* rebind */
	1604	bind_target_changed = TRUE;
	1605	bind_target = master_processor;
	1606	sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
	1607	}
	1608	} else {
	1609	/*
	1610	* Check if we're in a bad state, which is defined by high
	1611	* latency with no core currently executing a thread. If a
	1612	* single thread is making progress on a CPU, that means the
	1613	* binding concept to reduce parallelism is working as
	1614	* designed.
	1615	*/
	1616	if (high_latency_observed && !runnable_and_not_on_runq_observed) {
	1617	/* unbind */
	1618	bind_target_changed = TRUE;
	1619	bind_target = PROCESSOR_NULL;
	1620	sched_vm_group_temporarily_unbound = TRUE;
	1621	}
	1622	}
	1623
	1624	if (bind_target_changed) {
	1625	s = splsched();
	1626	for (i=0; i < sched_vm_group_thread_count; i++) {
	1627	thread_t thread = sched_vm_group_thread_list[i];
	1628	boolean_t removed;
	1629	assert(thread != THREAD_NULL);
	1630
	1631	thread_lock(thread);
	1632	removed = thread_run_queue_remove(thread);
	1633	if (removed \|\| ((thread->state & (TH_RUN \| TH_WAIT)) == TH_WAIT)) {
	1634	thread_bind_internal(thread, bind_target);
	1635	} else {
	1636	/*
	1637	* Thread was in the middle of being context-switched-to,
	1638	* or was in the process of blocking. To avoid switching the bind
	1639	* state out mid-flight, defer the change if possible.
	1640	*/
	1641	if (bind_target == PROCESSOR_NULL) {
	1642	thread_bind_internal(thread, bind_target);
	1643	} else {
	1644	sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
	1645	}
	1646	}
	1647
	1648	if (removed) {
	1649	thread_run_queue_reinsert(thread, SCHED_PREEMPT \| SCHED_TAILQ);
	1650	}
	1651	thread_unlock(thread);
	1652	}
	1653	splx(s);
	1654	}
	1655
	1656	simple_unlock(&sched_vm_group_list_lock);
	1657	}
	1658
	/*
	 * SMT rebalancing.
	 *
	 * Invoked prior to idle entry to determine if, on SMT capable processors, an
	 * SMT rebalancing opportunity exists when a core is (instantaneously) idle,
	 * but other SMT-capable cores may be over-committed.
	 *
	 * TODO: some possible negatives:
	 *  - IPI thrash if this core does not remain idle following the load
	 *    balancing ASTs.
	 *  - Idle "thrash", when IPI issue is followed by idle entry/core power
	 *    down followed by a wakeup shortly thereafter.
	 */

	#if (DEVELOPMENT || DEBUG)
	/* Setting this to 0 makes sched_SMT_balance() skip its search. */
	int sched_smt_balance = 1;
	#endif

	#if __SMP__
	/*
	 * Invoked with pset locked, returns with pset unlocked.
	 *
	 * If the calling processor is SMT and its sibling is idle, look through
	 * the pset's active queue for a running secondary whose primary is also
	 * running, which is below realtime priority and has no AST already
	 * pending, and send that processor an AST check.
	 */
	static void
	sched_SMT_balance(processor_t cprocessor, processor_set_t cpset)
	{
		processor_t target = NULL;
		processor_t sibling;
		processor_t candidate;

	#if (DEVELOPMENT || DEBUG)
		if (__improbable(sched_smt_balance == 0)) {
			goto unlock_pset;
		}
	#endif

		assert(cprocessor == current_processor());
		if (cprocessor->is_SMT == FALSE) {
			goto unlock_pset;
		}

		sibling = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;

		/*
		 * Only when this processor's sibling is idle as well is there
		 * an SMT rebalancing opportunity.
		 */
		if (sibling->state != PROCESSOR_IDLE) {
			goto unlock_pset;
		}

		qe_foreach_element(candidate, &cpset->active_queue, processor_queue) {
			if ((candidate->state == PROCESSOR_RUNNING) &&
			    (candidate->processor_primary != candidate) &&
			    (candidate->processor_primary->state == PROCESSOR_RUNNING) &&
			    (candidate->current_pri < BASEPRI_RTQUEUES) &&
			    ((cpset->pending_AST_cpu_mask & (1ULL << candidate->cpu_id)) == 0)) {
				assert(candidate != cprocessor);
				target = candidate;
				break;
			}
		}

	unlock_pset:
		pset_unlock(cpset);

		/* The AST is sent only after the pset lock has been dropped. */
		if (target) {
			KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), target->cpu_id, target->state, target->processor_primary->state, 0, 0);
			cause_ast_check(target);
		}
	}
	#endif /* __SMP__ */
	1717
	1718	/*
	1719	* thread_select:
	1720	*
	1721	* Select a new thread for the current processor to execute.
	1722	*
	1723	* May select the current thread, which must be locked.
		*
		* Each pass of the loop takes the pset lock and drops it again before
		* returning or looping. The realtime run queue lock is taken inside
		* the pset lock and is always released first.
		*
		* Returns the thread to run next: the current thread, a thread taken
		* from the realtime run queue, a thread from SCHED(choose_thread) or
		* SCHED(steal_thread), the processor's idle thread, or (with
		* CONFIG_SCHED_IDLE_IN_PLACE) whatever thread_select_idle() returned.
	1724	*/
	1725	static thread_t
	1726	thread_select(
	1727	thread_t thread,
	1728	processor_t processor,
	1729	ast_t reason)
	1730	{
	1731	processor_set_t pset = processor->processor_set;
	1732	thread_t new_thread = THREAD_NULL;
	1733
	1734	assert(processor == current_processor());
	1735	assert((thread->state & (TH_RUN\|TH_TERMINATE2)) == TH_RUN);
	1736
	1737	do {
	1738	/*
	1739	* Update the priority.
	1740	*/
	1741	if (SCHED(can_update_priority)(thread))
	1742	SCHED(update_priority)(thread);
	1743
		/* Publish the current thread's scheduling attributes on the processor. */
	1744	processor->current_pri = thread->sched_pri;
	1745	processor->current_thmode = thread->sched_mode;
	1746	processor->current_sfi_class = thread->sfi_class;
	1747
	1748	pset_lock(pset);
	1749
	1750	assert(processor->state != PROCESSOR_OFF_LINE);
	1751
	1752	if (!processor->is_recommended) {
	1753	/*
	1754	* The performance controller has provided a hint to not dispatch more threads,
	1755	* unless they are bound to us (and thus we are the only option)
	1756	*/
	1757	if (!SCHED(processor_bound_count)(processor)) {
	1758	goto idle;
	1759	}
	1760	} else if (processor->processor_primary != processor) {
	1761	/*
	1762	* Should this secondary SMT processor attempt to find work? For pset runqueue systems,
	1763	* we should look for work only under the same conditions that choose_processor()
	1764	* would have assigned work, which is when all primary processors have been assigned work.
	1765	*
	1766	* An exception is that bound threads are dispatched to a processor without going through
	1767	* choose_processor(), so in those cases we should continue trying to dequeue work.
	1768	*/
	1769	if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue) && !rt_runq.count) {
	1770	goto idle;
	1771	}
	1772	}
	1773
	1774	rt_lock_lock();
	1775
	1776	/*
	1777	* Test to see if the current thread should continue
	1778	* to run on this processor. Must not be attempting to wait, and not
	1779	* bound to a different processor, nor be in the wrong
	1780	* processor set, nor be forced to context switch by TH_SUSP.
	1781	*
	1782	* Note that there are never any RT threads in the regular runqueue.
	1783	*
	1784	* This code is very insanely tricky.
	1785	*/
	1786
	1787	if (((thread->state & (TH_TERMINATE\|TH_IDLE\|TH_WAIT\|TH_RUN\|TH_SUSP)) == TH_RUN) &&
	1788	(thread->sched_pri >= BASEPRI_RTQUEUES \|\| processor->processor_primary == processor) &&
	1789	(thread->bound_processor == PROCESSOR_NULL \|\| thread->bound_processor == processor) &&
	1790	(thread->affinity_set == AFFINITY_SET_NULL \|\| thread->affinity_set->aset_pset == pset)) {
	1791	/*
	1792	* RT threads with un-expired quantum stay on processor,
	1793	* unless there's a valid RT thread with an earlier deadline.
	1794	*/
	1795	if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
	1796	if (rt_runq.count > 0) {
	1797	thread_t next_rt = qe_queue_first(&rt_runq.queue, struct thread, runq_links);
	1798
	1799	assert(next_rt->runq == THREAD_ON_RT_RUNQ);
	1800
	1801	if (next_rt->realtime.deadline < processor->deadline &&
	1802	(next_rt->bound_processor == PROCESSOR_NULL \|\|
	1803	next_rt->bound_processor == processor)) {
	1804	/* The next RT thread is better, so pick it off the runqueue. */
	1805	goto pick_new_rt_thread;
	1806	}
	1807	}
	1808
	1809	/* This is still the best RT thread to run. */
	1810	processor->deadline = thread->realtime.deadline;
	1811
	1812	rt_lock_unlock();
	1813	pset_unlock(pset);
	1814
	1815	return (thread);
	1816	}
	1817
	1818	if ((rt_runq.count == 0) &&
	1819	SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
	1820	/* This thread is still the highest priority runnable (non-idle) thread */
	1821	processor->deadline = UINT64_MAX;
	1822
	1823	rt_lock_unlock();
	1824	pset_unlock(pset);
	1825
	1826	return (thread);
	1827	}
	1828	}
	1829
	1830	/* OK, so we're not going to run the current thread. Look at the RT queue. */
	1831	if (rt_runq.count > 0) {
	1832	thread_t next_rt = qe_queue_first(&rt_runq.queue, struct thread, runq_links);
	1833
	1834	assert(next_rt->runq == THREAD_ON_RT_RUNQ);
	1835
	1836	if (__probable((next_rt->bound_processor == PROCESSOR_NULL \|\|
	1837	(next_rt->bound_processor == processor)))) {
		/*
		* Also entered by goto from the un-expired RT quantum case above;
		* on both routes the pset lock and the RT lock are held here.
		*/
	1838	pick_new_rt_thread:
	1839	new_thread = qe_dequeue_head(&rt_runq.queue, struct thread, runq_links);
	1840
	1841	new_thread->runq = PROCESSOR_NULL;
	1842	SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
	1843	rt_runq.count--;
	1844
	1845	processor->deadline = new_thread->realtime.deadline;
	1846
	1847	rt_lock_unlock();
	1848	pset_unlock(pset);
	1849
	1850	return (new_thread);
	1851	}
	1852	}
	1853
	1854	processor->deadline = UINT64_MAX;
	1855	rt_lock_unlock();
	1856
	1857	/* No RT threads, so let's look at the regular threads. */
	1858	if ((new_thread = SCHED(choose_thread)(processor, MINPRI, reason)) != THREAD_NULL) {
	1859	pset_unlock(pset);
	1860	return (new_thread);
	1861	}
	1862
	1863	#if __SMP__
	1864	if (SCHED(steal_thread_enabled)) {
	1865	/*
	1866	* No runnable threads, attempt to steal
	1867	* from other processors. Returns with pset lock dropped.
	1868	*/
	1869
	1870	if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
	1871	return (new_thread);
	1872	}
	1873
	1874	/*
	1875	* If other threads have appeared, shortcut
	1876	* around again.
		*
		* new_thread is THREAD_NULL at this point, so the "continue"
		* passes the loop condition and restarts from the top, where
		* the pset lock is taken again.
	1877	*/
	1878	if (!SCHED(processor_queue_empty)(processor) \|\| rt_runq.count > 0)
	1879	continue;
	1880
		/* Re-take the pset lock for the idle path below. */
	1881	pset_lock(pset);
	1882	}
	1883	#endif
	1884
		/* Every route to this label holds the pset lock and does not hold the RT lock. */
	1885	idle:
	1886	/*
	1887	* Nothing is runnable, so set this processor idle if it
	1888	* was running.
	1889	*/
	1890	if (processor->state == PROCESSOR_RUNNING) {
	1891	processor->state = PROCESSOR_IDLE;
	1892
		/* Primaries and SMT secondaries are kept on separate idle queues. */
	1893	if (processor->processor_primary == processor) {
	1894	re_queue_head(&pset->idle_queue, &processor->processor_queue);
	1895	} else {
	1896	re_queue_head(&pset->idle_secondary_queue, &processor->processor_queue);
	1897	}
	1898	}
	1899
	1900	#if __SMP__
	1901	/* Invoked with pset locked, returns with pset unlocked */
	1902	sched_SMT_balance(processor, pset);
	1903	#else
	1904	pset_unlock(pset);
	1905	#endif
	1906
	1907	#if CONFIG_SCHED_IDLE_IN_PLACE
	1908	/*
	1909	* Choose idle thread if fast idle is not possible.
	1910	*/
	1911	if (processor->processor_primary != processor)
	1912	return (processor->idle_thread);
	1913
	1914	if ((thread->state & (TH_IDLE\|TH_TERMINATE\|TH_SUSP)) \|\| !(thread->state & TH_WAIT) \|\| thread->wake_active \|\| thread->sched_pri >= BASEPRI_RTQUEUES)
	1915	return (processor->idle_thread);
	1916
	1917	/*
	1918	* Perform idling activities directly without a
	1919	* context switch. Return dispatched thread,
	1920	* else check again for a runnable thread.
	1921	*/
	1922	new_thread = thread_select_idle(thread, processor);
	1923
	1924	#else /* !CONFIG_SCHED_IDLE_IN_PLACE */
	1925
	1926	/*
	1927	* Do a full context switch to idle so that the current
	1928	* thread can start running on another processor without
	1929	* waiting for the fast-idled processor to wake up.
	1930	*/
	1931	new_thread = processor->idle_thread;
	1932
	1933	#endif /* !CONFIG_SCHED_IDLE_IN_PLACE */
	1934
		/* Go around again whenever no thread has been chosen yet. */
	1935	} while (new_thread == THREAD_NULL);
	1936
	1937	return (new_thread);
	1938	}
	1939
	1940	#if CONFIG_SCHED_IDLE_IN_PLACE
	1941	/*
	1942	* thread_select_idle:
	1943	*
	1944	* Idle the processor using the current thread context.
	1945	*
	1946	* Called with thread locked, then dropped and relocked.
		*
		* The thread is marked TH_IDLE for the duration and the processor's
		* current priority/mode/SFI class are set to idle values. Timing is
		* charged to the processor's idle thread while idling, and the
		* quantum timer is cancelled.
		*
		* Returns the value produced by processor_idle().
	1947	*/
	1948	static thread_t
	1949	thread_select_idle(
	1950	thread_t thread,
	1951	processor_t processor)
	1952	{
	1953	thread_t new_thread;
	1954	uint64_t arg1, arg2;
	1955	int urgency;
	1956
		/* Paired with the sched_run_incr() call just before returning. */
	1957	sched_run_decr(thread);
	1958
	1959	thread->state \|= TH_IDLE;
	1960	processor->current_pri = IDLEPRI;
	1961	processor->current_thmode = TH_MODE_NONE;
	1962	processor->current_sfi_class = SFI_CLASS_KERNEL;
	1963
	1964	/* Reload precise timing global policy to thread-local policy */
	1965	thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
	1966
	1967	thread_unlock(thread);
	1968
	1969	/*
	1970	* Switch execution timing to processor idle thread.
	1971	*/
	1972	processor->last_dispatch = mach_absolute_time();
	1973
	1974	#ifdef CONFIG_MACH_APPROXIMATE_TIME
	1975	commpage_update_mach_approximate_time(processor->last_dispatch);
	1976	#endif
	1977
	1978	thread->last_run_time = processor->last_dispatch;
	1979	thread_timer_event(processor->last_dispatch, &processor->idle_thread->system_timer);
	1980	PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer;
	1981
	1982	/*
	1983	* Cancel the quantum timer while idling.
	1984	*/
	1985	timer_call_cancel(&processor->quantum_timer);
	1986	processor->first_timeslice = FALSE;
	1987
		/* Notify the thread's scheduler callout that it is blocking; the matching UNBLOCK follows the idle loop. */
	1988	(*thread->sched_call)(SCHED_CALL_BLOCK, thread);
	1989
	1990	thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, NULL);
	1991
	1992	/*
	1993	* Enable interrupts and perform idling activities. No
	1994	* preemption due to TH_IDLE being set.
	1995	*/
	1996	spllo(); new_thread = processor_idle(thread, processor);
	1997
	1998	/*
	1999	* Return at splsched.
	2000	*/
	2001	(*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
	2002
	2003	thread_lock(thread);
	2004
	2005	/*
	2006	* If awakened, switch to thread timer and start a new quantum.
	2007	* Otherwise skip; we will context switch to another thread or return here.
	2008	*/
	2009	if (!(thread->state & TH_WAIT)) {
	2010	processor->last_dispatch = mach_absolute_time();
	2011	thread_timer_event(processor->last_dispatch, &thread->system_timer);
	2012	PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
	2013
		/* Fresh quantum: re-arm the quantum timer to expire quantum_remaining after last_dispatch. */
	2014	thread_quantum_init(thread);
	2015	processor->quantum_end = processor->last_dispatch + thread->quantum_remaining;
	2016	timer_call_enter1(&processor->quantum_timer, thread, processor->quantum_end, TIMER_CALL_SYS_CRITICAL \| TIMER_CALL_LOCAL);
	2017	processor->first_timeslice = TRUE;
	2018
	2019	thread->computation_epoch = processor->last_dispatch;
	2020	}
	2021
	2022	thread->state &= ~TH_IDLE;
	2023
		/* Report this thread's urgency again, replacing the THREAD_URGENCY_NONE reported before idling. */
	2024	urgency = thread_get_urgency(thread, &arg1, &arg2);
	2025
	2026	thread_tell_urgency(urgency, arg1, arg2, 0, new_thread);
	2027
	2028	sched_run_incr(thread);
	2029
	2030	return (new_thread);
	2031	}
	2032	#endif /* CONFIG_SCHED_IDLE_IN_PLACE */
	2033
	2034	/*
	2035	* thread_invoke
	2036	*
	2037	* Called at splsched with neither thread locked.
	2038	*
	2039	* Perform a context switch and start executing the new thread.
	2040	*
	2041	* Returns FALSE when the context switch didn't happen.
	2042	* The reference to the new thread is still consumed.
	2043	*
	2044	* "self" is what is currently running on the processor,
	2045	* "thread" is the new thread to context switch to
	2046	* (which may be the same thread in some cases)
		*
		* "reason" is stored in self->reason before the switch.
		*
		* Three ways out are visible in the body:
		*  - stack handoff: self has a continuation and "thread" has no
		*    kernel stack, so stack_handoff() is used and the new thread's
		*    continuation is called directly (does not return);
		*  - same thread: thread == self, either calling self's continuation
		*    (does not return) or returning TRUE;
		*  - full switch through machine_switch_context().
		*
		* FALSE is returned only when stack_alloc_try() fails for "thread";
		* the thread is then passed to thread_stack_enqueue().
	2047	*/
	2048	static boolean_t
	2049	thread_invoke(
	2050	thread_t self,
	2051	thread_t thread,
	2052	ast_t reason)
	2053	{
		/* Any non-zero preemption level here is a caller bug: panic with a hint. */
	2054	if (__improbable(get_preemption_level() != 0)) {
	2055	int pl = get_preemption_level();
	2056	panic("thread_invoke: preemption_level %d, possible cause: %s",
	2057	pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
	2058	"blocking while holding a spinlock, or within interrupt context"));
	2059	}
	2060
		/*
		* Local copies of self's continuation state; the stack handoff path
		* below overwrites them with the new thread's values.
		*/
	2061	thread_continue_t continuation = self->continuation;
	2062	void *parameter = self->parameter;
	2063	processor_t processor;
	2064
		/* Single timestamp used for all accounting in this invocation. */
	2065	uint64_t ctime = mach_absolute_time();
	2066
	2067	#ifdef CONFIG_MACH_APPROXIMATE_TIME
	2068	commpage_update_mach_approximate_time(ctime);
	2069	#endif
	2070
	2071	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	2072	sched_timeshare_consider_maintenance(ctime);
	2073	#endif
	2074
	2075	assert_thread_magic(self);
	2076	assert(self == current_thread());
	2077	assert(self->runq == PROCESSOR_NULL);
	2078	assert((self->state & (TH_RUN\|TH_TERMINATE2)) == TH_RUN);
	2079
	2080	thread_lock(thread);
	2081
	2082	assert_thread_magic(thread);
	2083	assert((thread->state & (TH_RUN\|TH_WAIT\|TH_UNINT\|TH_TERMINATE\|TH_TERMINATE2)) == TH_RUN);
	2084	assert(thread->bound_processor == PROCESSOR_NULL \|\| thread->bound_processor == current_processor());
	2085	assert(thread->runq == PROCESSOR_NULL);
	2086
	2087	/* Reload precise timing global policy to thread-local policy */
	2088	thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
	2089
	2090	/* Update SFI class based on other factors */
	2091	thread->sfi_class = sfi_thread_classify(thread);
	2092
	2093	/* Allow realtime threads to hang onto a stack. */
	2094	if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack)
	2095	self->reserved_stack = self->kernel_stack;
	2096
	2097	if (continuation != NULL) {
	2098	if (!thread->kernel_stack) {
	2099	/*
	2100	* If we are using a privileged stack,
	2101	* check to see whether we can exchange it with
	2102	* that of the other thread.
	2103	*/
	2104	if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack)
	2105	goto need_stack;
	2106
	2107	/*
	2108	* Context switch by performing a stack handoff.
	2109	*/
		/* From here on the locals describe the NEW thread's continuation. */
	2110	continuation = thread->continuation;
	2111	parameter = thread->parameter;
	2112
	2113	processor = current_processor();
	2114	processor->active_thread = thread;
	2115	processor->current_pri = thread->sched_pri;
	2116	processor->current_thmode = thread->sched_mode;
	2117	processor->current_sfi_class = thread->sfi_class;
		/* Count processor / processor-set migrations for the incoming thread. */
	2118	if (thread->last_processor != processor && thread->last_processor != NULL) {
	2119	if (thread->last_processor->processor_set != processor->processor_set)
	2120	thread->ps_switch++;
	2121	thread->p_switch++;
	2122	}
	2123	thread->last_processor = processor;
	2124	thread->c_switch++;
	2125	ast_context(thread);
	2126
	2127	thread_unlock(thread);
	2128
	2129	self->reason = reason;
	2130
	2131	processor->last_dispatch = ctime;
	2132	self->last_run_time = ctime;
	2133	thread_timer_event(ctime, &thread->system_timer);
	2134	PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
	2135
	2136	/*
	2137	* Since non-precise user/kernel time doesn't update the state timer
	2138	* during privilege transitions, synthesize an event now.
	2139	*/
	2140	if (!thread->precise_user_kernel_time) {
	2141	timer_switch(PROCESSOR_DATA(processor, current_state),
	2142	ctime,
	2143	PROCESSOR_DATA(processor, current_state));
	2144	}
	2145
	2146	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2147	MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)\|DBG_FUNC_NONE,
	2148	self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
	2149
	2150	if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
	2151	SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)\|DBG_FUNC_NONE,
	2152	(uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
	2153	}
	2154
	2155	DTRACE_SCHED2(off__cpu, struct thread , thread, struct proc , thread->task->bsd_info);
	2156
	2157	SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
	2158
	2159	TLOG(1, "thread_invoke: calling stack_handoff\n");
	2160	stack_handoff(self, thread);
	2161
	2162	/* 'self' is now off core */
	2163	assert(thread == current_thread());
	2164
	2165	DTRACE_SCHED(on__cpu);
	2166
	2167	#if KPERF
	2168	kperf_on_cpu(thread, continuation, NULL);
	2169	#endif /* KPERF */
	2170
		/* Argument order: (old thread, new current thread). */
	2171	thread_dispatch(self, thread);
	2172
	2173	thread->continuation = thread->parameter = NULL;
	2174
	2175	counter(c_thread_invoke_hits++);
	2176
	2177	(void) spllo();
	2178
	2179	assert(continuation);
	2180	call_continuation(continuation, parameter, thread->wait_result);
	2181	/NOTREACHED/
	2182	}
	2183	else if (thread == self) {
	2184	/* same thread but with continuation */
	2185	ast_context(self);
	2186	counter(++c_thread_invoke_same);
	2187
	2188	thread_unlock(self);
	2189
	2190	#if KPERF
	2191	kperf_on_cpu(thread, continuation, NULL);
	2192	#endif /* KPERF */
	2193
	2194	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2195	MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) \| DBG_FUNC_NONE,
	2196	self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
	2197
	2198	self->continuation = self->parameter = NULL;
	2199
	2200	(void) spllo();
	2201
	2202	call_continuation(continuation, parameter, self->wait_result);
	2203	/NOTREACHED/
	2204	}
	2205	} else {
	2206	/*
	2207	* Check that the other thread has a stack
	2208	*/
	2209	if (!thread->kernel_stack) {
		/* Also entered by goto from the handoff path when stacks cannot be exchanged. */
	2210	need_stack:
	2211	if (!stack_alloc_try(thread)) {
		/* No stack obtained: hand the thread to thread_stack_enqueue() and report no switch. */
	2212	counter(c_thread_invoke_misses++);
	2213	thread_unlock(thread);
	2214	thread_stack_enqueue(thread);
	2215	return (FALSE);
	2216	}
	2217	} else if (thread == self) {
		/* Selected ourselves with no continuation: nothing to switch. */
	2218	ast_context(self);
	2219	counter(++c_thread_invoke_same);
	2220	thread_unlock(self);
	2221
	2222	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2223	MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) \| DBG_FUNC_NONE,
	2224	self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
	2225
	2226	return (TRUE);
	2227	}
	2228	}
	2229
	2230	/*
	2231	* Context switch by full context save.
	2232	*/
	2233	processor = current_processor();
	2234	processor->active_thread = thread;
	2235	processor->current_pri = thread->sched_pri;
	2236	processor->current_thmode = thread->sched_mode;
	2237	processor->current_sfi_class = thread->sfi_class;
	2238	if (thread->last_processor != processor && thread->last_processor != NULL) {
	2239	if (thread->last_processor->processor_set != processor->processor_set)
	2240	thread->ps_switch++;
	2241	thread->p_switch++;
	2242	}
	2243	thread->last_processor = processor;
	2244	thread->c_switch++;
	2245	ast_context(thread);
	2246
	2247	thread_unlock(thread);
	2248
	2249	counter(c_thread_invoke_csw++);
	2250
	2251	self->reason = reason;
	2252
	2253	processor->last_dispatch = ctime;
	2254	self->last_run_time = ctime;
	2255	thread_timer_event(ctime, &thread->system_timer);
	2256	PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
	2257
	2258	/*
	2259	* Since non-precise user/kernel time doesn't update the state timer
	2260	* during privilege transitions, synthesize an event now.
	2261	*/
	2262	if (!thread->precise_user_kernel_time) {
	2263	timer_switch(PROCESSOR_DATA(processor, current_state),
	2264	ctime,
	2265	PROCESSOR_DATA(processor, current_state));
	2266	}
	2267
	2268	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2269	MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) \| DBG_FUNC_NONE,
	2270	self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
	2271
	2272	if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
	2273	SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)\|DBG_FUNC_NONE,
	2274	(uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
	2275	}
	2276
	2277	DTRACE_SCHED2(off__cpu, struct thread , thread, struct proc , thread->task->bsd_info);
	2278
	2279	SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
	2280
	2281	/*
	2282	* This is where we actually switch register context,
	2283	* and address space if required. We will next run
	2284	* as a result of a subsequent context switch.
	2285	*
	2286	* Once registers are switched and the processor is running "thread",
	2287	* the stack variables and non-volatile registers will contain whatever
	2288	* was there the last time that thread blocked. No local variables should
	2289	* be used after this point, except for the special case of "thread", which
	2290	* the platform layer returns as the previous thread running on the processor
	2291	* via the function call ABI as a return register, and "self", which may have
	2292	* been stored on the stack or a non-volatile register, but a stale idea of
	2293	* what was on the CPU is newly-accurate because that thread is again
	2294	* running on the CPU.
	2295	*/
	2296	assert(continuation == self->continuation);
		/* After this call "thread" is whatever machine_switch_context() returned, not the thread passed in. */
	2297	thread = machine_switch_context(self, continuation, thread);
	2298	assert(self == current_thread());
	2299	TLOG(1,"thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
	2300
	2301	DTRACE_SCHED(on__cpu);
	2302
	2303	#if KPERF
	2304	kperf_on_cpu(self, NULL, __builtin_frame_address(0));
	2305	#endif /* KPERF */
	2306
	2307	/*
	2308	* We have been resumed and are set to run.
	2309	*/
	2310	thread_dispatch(thread, self);
	2311
	2312	if (continuation) {
	2313	self->continuation = self->parameter = NULL;
	2314
	2315	(void) spllo();
	2316
	2317	call_continuation(continuation, parameter, self->wait_result);
	2318	/NOTREACHED/
	2319	}
	2320
	2321	return (TRUE);
	2322	}
	2323
	2324	#if defined(CONFIG_SCHED_DEFERRED_AST)
	2325	/*
	2326	* pset_cancel_deferred_dispatch:
	2327	*
	2328	* Cancels all ASTs that we can cancel for the given processor set
	2329	* if the current processor is running the last runnable thread in the
	2330	* system.
	2331	*
	2332	* This function assumes the current thread is runnable. This must
	2333	* be called with the pset unlocked.
	2334	*/
	2335	static void
	2336	pset_cancel_deferred_dispatch(
	2337	processor_set_t pset,
	2338	processor_t processor)
	2339	{
	2340	processor_t active_processor = NULL;
	2341	uint32_t sampled_sched_run_count;
	2342
	2343	pset_lock(pset);
	2344	sampled_sched_run_count = (volatile uint32_t) sched_run_buckets[TH_BUCKET_RUN];
	2345
	2346	/*
	2347	* If we have emptied the run queue, and our current thread is runnable, we
	2348	* should tell any processors that are still DISPATCHING that they will
	2349	* probably not have any work to do. In the event that there are no
	2350	* pending signals that we can cancel, this is also uninteresting.
	2351	*
	2352	* In the unlikely event that another thread becomes runnable while we are
	2353	* doing this (sched_run_count is atomically updated, not guarded), the
	2354	* codepath making it runnable SHOULD (a dangerous word) need the pset lock
	2355	* in order to dispatch it to a processor in our pset. So, the other
	2356	* codepath will wait while we squash all cancelable ASTs, get the pset
	2357	* lock, and then dispatch the freshly runnable thread. So this should be
	2358	* correct (we won't accidentally have a runnable thread that hasn't been
	2359	* dispatched to an idle processor), if not ideal (we may be restarting the
	2360	* dispatch process, which could have some overhead).
	2361	*
	2362	*/
	2363	if ((sampled_sched_run_count == 1) &&
	2364	(pset->pending_deferred_AST_cpu_mask)) {
	2365	qe_foreach_element_safe(active_processor, &pset->active_queue, processor_queue) {
	2366	/*
	2367	* If a processor is DISPATCHING, it could be because of
	2368	* a cancelable signal.
	2369	*
	2370	* IF the processor is not our
	2371	* current processor (the current processor should not
	2372	* be DISPATCHING, so this is a bit paranoid), AND there
	2373	* is a cancelable signal pending on the processor, AND
	2374	* there is no non-cancelable signal pending (as there is
	2375	* no point trying to backtrack on bringing the processor
	2376	* up if a signal we cannot cancel is outstanding), THEN
	2377	* it should make sense to roll back the processor state
	2378	* to the IDLE state.
	2379	*
	2380	* If the racey nature of this approach (as the signal
	2381	* will be arbitrated by hardware, and can fire as we
	2382	* roll back state) results in the core responding
	2383	* despite being pushed back to the IDLE state, it
	2384	* should be no different than if the core took some
	2385	* interrupt while IDLE.
	2386	*/
	2387	if ((active_processor->state == PROCESSOR_DISPATCHING) &&
	2388	(pset->pending_deferred_AST_cpu_mask & (1ULL << active_processor->cpu_id)) &&
	2389	(!(pset->pending_AST_cpu_mask & (1ULL << active_processor->cpu_id))) &&
	2390	(active_processor != processor)) {
	2391	/*
	2392	* Squash all of the processor state back to some
	2393	* reasonable facsimile of PROCESSOR_IDLE.
	2394	*
	2395	* TODO: What queue policy do we actually want here?
	2396	* We want to promote selection of a good processor
	2397	* to run on. Do we want to enqueue at the head?
	2398	* The tail? At the (relative) old position in the
	2399	* queue? Or something else entirely?
	2400	*/
	2401	re_queue_head(&pset->idle_queue, &active_processor->processor_queue);
	2402
	2403	assert(active_processor->next_thread == THREAD_NULL);
	2404
	2405	active_processor->current_pri = IDLEPRI;
	2406	active_processor->current_thmode = TH_MODE_FIXED;
	2407	active_processor->current_sfi_class = SFI_CLASS_KERNEL;
	2408	active_processor->deadline = UINT64_MAX;
	2409	active_processor->state = PROCESSOR_IDLE;
	2410	pset->pending_deferred_AST_cpu_mask &= ~(1U << active_processor->cpu_id);
	2411	machine_signal_idle_cancel(active_processor);
	2412	}
	2413
	2414	}
	2415	}
	2416
	2417	pset_unlock(pset);
	2418	}
	2419	#else
	2420	/* We don't support deferred ASTs; everything is candycanes and sunshine. */
	2421	#endif
	2422
	2423	/*
	2424	* thread_dispatch:
	2425	*
	2426	* Handle threads at context switch. Re-dispatch other thread
	2427	* if still running, otherwise update run state and perform
	2428	* special actions. Update quantum for other thread and begin
	2429	* the quantum for ourselves.
	2430	*
	2431	* "thread" is the old thread that we have switched away from.
	2432	* "self" is the new current thread that we have context switched to
	2433	*
		* "thread" may be THREAD_NULL, in which case only the bookkeeping
		* for "self" is performed. The processor is taken from
		* self->last_processor and asserted to be the current processor.
		*
	2434	* Called at splsched.
	2435	*/
	2436	void
	2437	thread_dispatch(
	2438	thread_t thread,
	2439	thread_t self)
	2440	{
	2441	processor_t processor = self->last_processor;
	2442
	2443	assert(processor == current_processor());
	2444	assert(self == current_thread());
	2445	assert(thread != self);
	2446
	2447	if (thread != THREAD_NULL) {
	2448	/*
	2449	* If blocked at a continuation, discard
	2450	* the stack.
	2451	*/
	2452	if (thread->continuation != NULL && thread->kernel_stack != 0)
	2453	stack_free(thread);
	2454
		/* A thread flagged TH_IDLE is only traced; no quantum or run-state accounting is done for it. */
	2455	if (thread->state & TH_IDLE) {
	2456	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2457	MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) \| DBG_FUNC_NONE,
	2458	(uintptr_t)thread_tid(thread), 0, thread->state,
	2459	sched_run_buckets[TH_BUCKET_RUN], 0);
	2460	} else {
	2461	int64_t consumed;
	2462	int64_t remainder = 0;
	2463
		/* remainder: unexpired part of the processor's quantum at last_dispatch (0 if already past quantum_end). */
	2464	if (processor->quantum_end > processor->last_dispatch)
	2465	remainder = processor->quantum_end -
	2466	processor->last_dispatch;
	2467
		/* consumed: the quantum the thread held minus what was left unexpired. */
	2468	consumed = thread->quantum_remaining - remainder;
	2469
		/* Billing is skipped when the old thread's reason carries AST_LEDGER. */
	2470	if ((thread->reason & AST_LEDGER) == 0) {
	2471	/*
	2472	* Bill CPU time to both the task and
	2473	* the individual thread.
	2474	*/
	2475	ledger_credit(thread->t_ledger,
	2476	task_ledgers.cpu_time, consumed);
	2477	ledger_credit(thread->t_threadledger,
	2478	thread_ledgers.cpu_time, consumed);
	2479	#ifdef CONFIG_BANK
	2480	if (thread->t_bankledger) {
	2481	ledger_credit(thread->t_bankledger,
	2482	bank_ledgers.cpu_time,
	2483	(consumed - thread->t_deduct_bank_ledger_time));
	2484
	2485	}
	2486	thread->t_deduct_bank_ledger_time =0;
	2487	#endif
	2488	}
	2489
		/* Lock order used here: wake lock first, then the thread lock. */
	2490	wake_lock(thread);
	2491	thread_lock(thread);
	2492
	2493	/*
	2494	* Apply a priority floor if the thread holds a kernel resource
	2495	* Do this before checking starting_pri to avoid overpenalizing
	2496	* repeated rwlock blockers.
	2497	*/
	2498	if (__improbable(thread->rwlock_count != 0))
	2499	lck_rw_set_promotion_locked(thread);
	2500
	2501	boolean_t keep_quantum = processor->first_timeslice;
	2502
	2503	/*
	2504	* Treat a thread which has dropped priority since it got on core
	2505	* as having expired its quantum.
	2506	*/
	2507	if (processor->starting_pri > thread->sched_pri)
	2508	keep_quantum = FALSE;
	2509
	2510	/* Compute remainder of current quantum. */
	2511	if (keep_quantum &&
	2512	processor->quantum_end > processor->last_dispatch)
	2513	thread->quantum_remaining = (uint32_t)remainder;
	2514	else
	2515	thread->quantum_remaining = 0;
	2516
	2517	if (thread->sched_mode == TH_MODE_REALTIME) {
	2518	/*
	2519	* Cancel the deadline if the thread has
	2520	* consumed the entire quantum.
	2521	*/
	2522	if (thread->quantum_remaining == 0) {
	2523	thread->realtime.deadline = UINT64_MAX;
	2524	}
	2525	} else {
	2526	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	2527	/*
	2528	* For non-realtime threads treat a tiny
	2529	* remaining quantum as an expired quantum
	2530	* but include what's left next time.
	2531	*/
	2532	if (thread->quantum_remaining < min_std_quantum) {
	2533	thread->reason \|= AST_QUANTUM;
	2534	thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
	2535	}
	2536	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	2537	}
	2538
	2539	/*
	2540	* If we are doing a direct handoff then
	2541	* take the remainder of the quantum.
	2542	*/
		/* Taken only when AST_HANDOFF is set and AST_QUANTUM is not. */
	2543	if ((thread->reason & (AST_HANDOFF\|AST_QUANTUM)) == AST_HANDOFF) {
	2544	self->quantum_remaining = thread->quantum_remaining;
	2545	thread->reason \|= AST_QUANTUM;
	2546	thread->quantum_remaining = 0;
	2547	} else {
	2548	#if defined(CONFIG_SCHED_MULTIQ)
	2549	if (SCHED(sched_groups_enabled) &&
	2550	thread->sched_group == self->sched_group) {
	2551	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2552	MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
	2553	self->reason, (uintptr_t)thread_tid(thread),
	2554	self->quantum_remaining, thread->quantum_remaining, 0);
	2555
	2556	self->quantum_remaining = thread->quantum_remaining;
	2557	thread->quantum_remaining = 0;
	2558	/* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
	2559	}
	2560	#endif /* defined(CONFIG_SCHED_MULTIQ) */
	2561	}
	2562
	2563	thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
	2564
	2565	if (!(thread->state & TH_WAIT)) {
	2566	/*
	2567	* Still runnable.
	2568	*/
	2569	thread->last_made_runnable_time = mach_approximate_time();
	2570
	2571	machine_thread_going_off_core(thread, FALSE);
	2572
		/*
		* Requeue options depend on why the thread came off core:
		* AST_QUANTUM -> tail, AST_PREEMPT -> head,
		* anything else -> tail with SCHED_PREEMPT.
		*/
	2573	if (thread->reason & AST_QUANTUM)
	2574	thread_setrun(thread, SCHED_TAILQ);
	2575	else if (thread->reason & AST_PREEMPT)
	2576	thread_setrun(thread, SCHED_HEADQ);
	2577	else
	2578	thread_setrun(thread, SCHED_PREEMPT \| SCHED_TAILQ);
	2579
	2580	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2581	MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) \| DBG_FUNC_NONE,
	2582	(uintptr_t)thread_tid(thread), thread->reason, thread->state,
	2583	sched_run_buckets[TH_BUCKET_RUN], 0);
	2584
		/* The thread lock is dropped before thread_wakeup() is called on &thread->wake_active. */
	2585	if (thread->wake_active) {
	2586	thread->wake_active = FALSE;
	2587	thread_unlock(thread);
	2588
	2589	thread_wakeup(&thread->wake_active);
	2590	} else {
	2591	thread_unlock(thread);
	2592	}
	2593
	2594	wake_unlock(thread);
	2595	} else {
	2596	/*
	2597	* Waiting.
	2598	*/
	2599	boolean_t should_terminate = FALSE;
	2600	uint32_t new_run_count;
	2601
	2602	/* Only the first call to thread_dispatch
	2603	* after explicit termination should add
	2604	* the thread to the termination queue
	2605	*/
	2606	if ((thread->state & (TH_TERMINATE\|TH_TERMINATE2)) == TH_TERMINATE) {
	2607	should_terminate = TRUE;
	2608	thread->state \|= TH_TERMINATE2;
	2609	}
	2610
		/* No longer runnable: clear TH_RUN, set last_made_runnable_time to all-ones, forget the chosen processor. */
	2611	thread->state &= ~TH_RUN;
	2612	thread->last_made_runnable_time = ~0ULL;
	2613	thread->chosen_processor = PROCESSOR_NULL;
	2614
	2615	new_run_count = sched_run_decr(thread);
	2616
	2617	#if CONFIG_SCHED_SFI
		/* Record the SFI wait start only for a waiting, non-terminating thread that came off core for AST_SFI. */
	2618	if ((thread->state & (TH_WAIT \| TH_TERMINATE)) == TH_WAIT) {
	2619	if (thread->reason & AST_SFI) {
	2620	thread->wait_sfi_begin_time = processor->last_dispatch;
	2621	}
	2622	}
	2623	#endif
	2624
	2625	machine_thread_going_off_core(thread, should_terminate);
	2626
	2627	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2628	MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) \| DBG_FUNC_NONE,
	2629	(uintptr_t)thread_tid(thread), thread->reason, thread->state,
	2630	new_run_count, 0);
	2631
	2632	(*thread->sched_call)(SCHED_CALL_BLOCK, thread);
	2633
	2634	if (thread->wake_active) {
	2635	thread->wake_active = FALSE;
	2636	thread_unlock(thread);
	2637
	2638	thread_wakeup(&thread->wake_active);
	2639	} else {
	2640	thread_unlock(thread);
	2641	}
	2642
	2643	wake_unlock(thread);
	2644
		/* Enqueue for termination only after both locks have been released. */
	2645	if (should_terminate)
	2646	thread_terminate_enqueue(thread);
	2647	}
	2648	}
	2649	}
	2650
	2651	/* Update (new) current thread and reprogram quantum timer */
	2652	thread_lock(self);
	2653	if (!(self->state & TH_IDLE)) {
	2654	uint64_t arg1, arg2;
	2655	int urgency;
	2656	uint64_t latency;
	2657
	2658	#if CONFIG_SCHED_SFI
	2659	ast_t new_ast;
	2660
	2661	new_ast = sfi_thread_needs_ast(self, NULL);
	2662
	2663	if (new_ast != AST_NONE) {
	2664	ast_on(new_ast);
	2665	}
	2666	#endif
	2667
		/* latency: time from being made runnable to this dispatch. */
	2668	assertf(processor->last_dispatch >= self->last_made_runnable_time, "Non-monotonic time? dispatch at 0x%llx, runnable at 0x%llx", processor->last_dispatch, self->last_made_runnable_time);
	2669	latency = processor->last_dispatch - self->last_made_runnable_time;
	2670
	2671	urgency = thread_get_urgency(self, &arg1, &arg2);
	2672
	2673	thread_tell_urgency(urgency, arg1, arg2, latency, self);
	2674
	2675	machine_thread_going_on_core(self, urgency, latency);
	2676
	2677	/*
	2678	* Get a new quantum if none remaining.
	2679	*/
	2680	if (self->quantum_remaining == 0) {
	2681	thread_quantum_init(self);
	2682	}
	2683
	2684	/*
	2685	* Set up quantum timer and timeslice.
	2686	*/
	2687	processor->quantum_end = processor->last_dispatch + self->quantum_remaining;
	2688	timer_call_enter1(&processor->quantum_timer, self, processor->quantum_end, TIMER_CALL_SYS_CRITICAL \| TIMER_CALL_LOCAL);
	2689
	2690	processor->first_timeslice = TRUE;
	2691	} else {
		/* New current thread is flagged TH_IDLE: cancel the quantum timer and report no urgency. */
	2692	timer_call_cancel(&processor->quantum_timer);
	2693	processor->first_timeslice = FALSE;
	2694
	2695	thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
	2696	machine_thread_going_on_core(self, THREAD_URGENCY_NONE, 0);
	2697	}
	2698
	2699	self->computation_epoch = processor->last_dispatch;
	2700	self->reason = AST_NONE;
		/* starting_pri is compared against sched_pri at the next dispatch to detect a priority drop. */
	2701	processor->starting_pri = self->sched_pri;
	2702
	2703	thread_unlock(self);
	2704
	2705	#if defined(CONFIG_SCHED_DEFERRED_AST)
	2706	/*
	2707	* TODO: Can we state that redispatching our old thread is also
	2708	* uninteresting?
	2709	*/
	2710	if ((((volatile uint32_t)sched_run_buckets[TH_BUCKET_RUN]) == 1) &&
	2711	!(self->state & TH_IDLE)) {
	2712	pset_cancel_deferred_dispatch(processor->processor_set, processor);
	2713	}
	2714	#endif
	2715
	2716	}
	2717
	2718	/*
	2719	* thread_block_reason:
	2720	*
	2721	* Forces a reschedule, blocking the caller if a wait
	2722	* has been asserted.
	2723	*
	2724	* If a continuation is specified, then thread_invoke will
	2725	* attempt to discard the thread's kernel stack. When the
	2726	* thread resumes, it will execute the continuation function
	2727	* on a new kernel stack.
	2728	*/
	2729	counter(mach_counter_t c_thread_block_calls = 0;)
	2730
	2731	wait_result_t
	2732	thread_block_reason(
	2733	thread_continue_t continuation,
	2734	void *parameter,
	2735	ast_t reason)
	2736	{
	2737	thread_t self = current_thread();
	2738	processor_t processor;
	2739	thread_t new_thread;
	2740	spl_t s;
	2741
	2742	counter(++c_thread_block_calls);
	2743
	2744	s = splsched();
	2745
	2746	processor = current_processor();
	2747
	2748	/* If we're explicitly yielding, force a subsequent quantum */
	2749	if (reason & AST_YIELD)
	2750	processor->first_timeslice = FALSE;
	2751
	2752	/* We're handling all scheduling AST's */
	2753	ast_off(AST_SCHEDULING);
	2754
	2755	#if PROC_REF_DEBUG
	2756	if ((continuation != NULL) && (self->task != kernel_task)) {
	2757	if (uthread_get_proc_refcount(self->uthread) != 0) {
	2758	panic("thread_block_reason with continuation uthread %p with uu_proc_refcount != 0", self->uthread);
	2759	}
	2760	}
	2761	#endif
	2762
	2763	self->continuation = continuation;
	2764	self->parameter = parameter;
	2765
	2766	if (self->state & ~(TH_RUN \| TH_IDLE)) {
	2767	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	2768	MACHDBG_CODE(DBG_MACH_SCHED,MACH_BLOCK),
	2769	reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
	2770	}
	2771
	2772	do {
	2773	thread_lock(self);
	2774	new_thread = thread_select(self, processor, reason);
	2775	thread_unlock(self);
	2776	} while (!thread_invoke(self, new_thread, reason));
	2777
	2778	splx(s);
	2779
	2780	return (self->wait_result);
	2781	}
	2782
	2783	/*
	2784	* thread_block:
	2785	*
	2786	* Block the current thread if a wait has been asserted.
	2787	*/
	2788	wait_result_t
	2789	thread_block(
	2790	thread_continue_t continuation)
	2791	{
	2792	return thread_block_reason(continuation, NULL, AST_NONE);
	2793	}
	2794
	2795	wait_result_t
	2796	thread_block_parameter(
	2797	thread_continue_t continuation,
	2798	void *parameter)
	2799	{
	2800	return thread_block_reason(continuation, parameter, AST_NONE);
	2801	}
	2802
	2803	/*
	2804	* thread_run:
	2805	*
	2806	* Switch directly from the current thread to the
	2807	* new thread, handing off our quantum if appropriate.
	2808	*
	2809	* New thread must be runnable, and not on a run queue.
	2810	*
	2811	* Called at splsched.
	2812	*/
	2813	int
	2814	thread_run(
	2815	thread_t self,
	2816	thread_continue_t continuation,
	2817	void *parameter,
	2818	thread_t new_thread)
	2819	{
	2820	ast_t handoff = AST_HANDOFF;
	2821
	2822	self->continuation = continuation;
	2823	self->parameter = parameter;
	2824
	2825	while (!thread_invoke(self, new_thread, handoff)) {
	2826	processor_t processor = current_processor();
	2827
	2828	thread_lock(self);
	2829	new_thread = thread_select(self, processor, AST_NONE);
	2830	thread_unlock(self);
	2831	handoff = AST_NONE;
	2832	}
	2833
	2834	return (self->wait_result);
	2835	}
	2836
	2837	/*
	2838	* thread_continue:
	2839	*
	2840	* Called at splsched when a thread first receives
	2841	* a new stack after a continuation.
	2842	*/
	2843	void
	2844	thread_continue(
	2845	thread_t thread)
	2846	{
	2847	thread_t self = current_thread();
	2848	thread_continue_t continuation;
	2849	void *parameter;
	2850
	2851	DTRACE_SCHED(on__cpu);
	2852
	2853	continuation = self->continuation;
	2854	parameter = self->parameter;
	2855
	2856	#if KPERF
	2857	kperf_on_cpu(self, continuation, NULL);
	2858	#endif
	2859
	2860	thread_dispatch(thread, self);
	2861
	2862	self->continuation = self->parameter = NULL;
	2863
	2864	if (thread != THREAD_NULL)
	2865	(void)spllo();
	2866
	2867	TLOG(1, "thread_continue: calling call_continuation \n");
	2868	call_continuation(continuation, parameter, self->wait_result);
	2869	/NOTREACHED/
	2870	}
	2871
	2872	void
	2873	thread_quantum_init(thread_t thread)
	2874	{
	2875	if (thread->sched_mode == TH_MODE_REALTIME) {
	2876	thread->quantum_remaining = thread->realtime.computation;
	2877	} else {
	2878	thread->quantum_remaining = SCHED(initial_quantum_size)(thread);
	2879	}
	2880	}
	2881
	2882	uint32_t
	2883	sched_timeshare_initial_quantum_size(thread_t thread)
	2884	{
	2885	if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG)
	2886	return bg_quantum;
	2887	else
	2888	return std_quantum;
	2889	}
	2890
	2891	/*
	2892	* run_queue_init:
	2893	*
	2894	* Initialize a run queue before first use.
	2895	*/
	2896	void
	2897	run_queue_init(
	2898	run_queue_t rq)
	2899	{
	2900	rq->highq = NOPRI;
	2901	for (u_int i = 0; i < BITMAP_LEN(NRQS); i++)
	2902	rq->bitmap[i] = 0;
	2903	rq->urgency = rq->count = 0;
	2904	for (int i = 0; i < NRQS; i++)
	2905	queue_init(&rq->queues[i]);
	2906	}
	2907
	2908	/*
	2909	* run_queue_dequeue:
	2910	*
	2911	* Perform a dequeue operation on a run queue,
	2912	* and return the resulting thread.
	2913	*
	2914	* The run queue must be locked (see thread_run_queue_remove()
	2915	* for more info), and not empty.
	2916	*/
	2917	thread_t
	2918	run_queue_dequeue(
	2919	run_queue_t rq,
	2920	integer_t options)
	2921	{
	2922	thread_t thread;
	2923	queue_t queue = &rq->queues[rq->highq];
	2924
	2925	if (options & SCHED_HEADQ) {
	2926	thread = qe_dequeue_head(queue, struct thread, runq_links);
	2927	} else {
	2928	thread = qe_dequeue_tail(queue, struct thread, runq_links);
	2929	}
	2930
	2931	assert(thread != THREAD_NULL);
	2932	assert_thread_magic(thread);
	2933
	2934	thread->runq = PROCESSOR_NULL;
	2935	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	2936	rq->count--;
	2937	if (SCHED(priority_is_urgent)(rq->highq)) {
	2938	rq->urgency--; assert(rq->urgency >= 0);
	2939	}
	2940	if (queue_empty(queue)) {
	2941	bitmap_clear(rq->bitmap, rq->highq);
	2942	rq->highq = bitmap_first(rq->bitmap, NRQS);
	2943	}
	2944
	2945	return thread;
	2946	}
	2947
	2948	/*
	2949	* run_queue_enqueue:
	2950	*
	2951	* Perform a enqueue operation on a run queue.
	2952	*
	2953	* The run queue must be locked (see thread_run_queue_remove()
	2954	* for more info).
	2955	*/
	2956	boolean_t
	2957	run_queue_enqueue(
	2958	run_queue_t rq,
	2959	thread_t thread,
	2960	integer_t options)
	2961	{
	2962	queue_t queue = &rq->queues[thread->sched_pri];
	2963	boolean_t result = FALSE;
	2964
	2965	assert_thread_magic(thread);
	2966
	2967	if (queue_empty(queue)) {
	2968	enqueue_tail(queue, &thread->runq_links);
	2969
	2970	rq_bitmap_set(rq->bitmap, thread->sched_pri);
	2971	if (thread->sched_pri > rq->highq) {
	2972	rq->highq = thread->sched_pri;
	2973	result = TRUE;
	2974	}
	2975	} else {
	2976	if (options & SCHED_TAILQ)
	2977	enqueue_tail(queue, &thread->runq_links);
	2978	else
	2979	enqueue_head(queue, &thread->runq_links);
	2980	}
	2981	if (SCHED(priority_is_urgent)(thread->sched_pri))
	2982	rq->urgency++;
	2983	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	2984	rq->count++;
	2985
	2986	return (result);
	2987	}
	2988
	2989	/*
	2990	* run_queue_remove:
	2991	*
	2992	* Remove a specific thread from a runqueue.
	2993	*
	2994	* The run queue must be locked.
	2995	*/
	2996	void
	2997	run_queue_remove(
	2998	run_queue_t rq,
	2999	thread_t thread)
	3000	{
	3001	assert(thread->runq != PROCESSOR_NULL);
	3002	assert_thread_magic(thread);
	3003
	3004	remqueue(&thread->runq_links);
	3005	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	3006	rq->count--;
	3007	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
	3008	rq->urgency--; assert(rq->urgency >= 0);
	3009	}
	3010
	3011	if (queue_empty(&rq->queues[thread->sched_pri])) {
	3012	/* update run queue status */
	3013	bitmap_clear(rq->bitmap, thread->sched_pri);
	3014	rq->highq = bitmap_first(rq->bitmap, NRQS);
	3015	}
	3016
	3017	thread->runq = PROCESSOR_NULL;
	3018	}
	3019
	3020	/* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
	3021	void
	3022	rt_runq_scan(sched_update_scan_context_t scan_context)
	3023	{
	3024	spl_t s;
	3025	thread_t thread;
	3026
	3027	s = splsched();
	3028	rt_lock_lock();
	3029
	3030	qe_foreach_element_safe(thread, &rt_runq.queue, runq_links) {
	3031	if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
	3032	scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
	3033	}
	3034	}
	3035
	3036	rt_lock_unlock();
	3037	splx(s);
	3038	}
	3039
	3040
	3041	/*
	3042	* realtime_queue_insert:
	3043	*
	3044	* Enqueue a thread for realtime execution.
	3045	*/
	3046	static boolean_t
	3047	realtime_queue_insert(thread_t thread)
	3048	{
	3049	queue_t queue = &rt_runq.queue;
	3050	uint64_t deadline = thread->realtime.deadline;
	3051	boolean_t preempt = FALSE;
	3052
	3053	rt_lock_lock();
	3054
	3055	if (queue_empty(queue)) {
	3056	enqueue_tail(queue, &thread->runq_links);
	3057	preempt = TRUE;
	3058	} else {
	3059	/* Insert into rt_runq in thread deadline order */
	3060	queue_entry_t iter;
	3061	qe_foreach(iter, queue) {
	3062	thread_t iter_thread = qe_element(iter, struct thread, runq_links);
	3063	assert_thread_magic(iter_thread);
	3064
	3065	if (deadline < iter_thread->realtime.deadline) {
	3066	if (iter == queue_first(queue))
	3067	preempt = TRUE;
	3068	insque(&thread->runq_links, queue_prev(iter));
	3069	break;
	3070	} else if (iter == queue_last(queue)) {
	3071	enqueue_tail(queue, &thread->runq_links);
	3072	break;
	3073	}
	3074	}
	3075	}
	3076
	3077	thread->runq = THREAD_ON_RT_RUNQ;
	3078	SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
	3079	rt_runq.count++;
	3080
	3081	rt_lock_unlock();
	3082
	3083	return (preempt);
	3084	}
	3085
	3086	/*
	3087	* realtime_setrun:
	3088	*
	3089	* Dispatch a thread for realtime execution.
	3090	*
	3091	* Thread must be locked. Associated pset must
	3092	* be locked, and is returned unlocked.
	3093	*/
	3094	static void
	3095	realtime_setrun(
	3096	processor_t processor,
	3097	thread_t thread)
	3098	{
	3099	processor_set_t pset = processor->processor_set;
	3100	ast_t preempt;
	3101
	3102	boolean_t do_signal_idle = FALSE, do_cause_ast = FALSE;
	3103
	3104	thread->chosen_processor = processor;
	3105
	3106	/* <rdar://problem/15102234> */
	3107	assert(thread->bound_processor == PROCESSOR_NULL);
	3108
	3109	/*
	3110	* Dispatch directly onto idle processor.
	3111	*/
	3112	if ( (thread->bound_processor == processor)
	3113	&& processor->state == PROCESSOR_IDLE) {
	3114	re_queue_tail(&pset->active_queue, &processor->processor_queue);
	3115
	3116	processor->next_thread = thread;
	3117	processor->current_pri = thread->sched_pri;
	3118	processor->current_thmode = thread->sched_mode;
	3119	processor->current_sfi_class = thread->sfi_class;
	3120	processor->deadline = thread->realtime.deadline;
	3121	processor->state = PROCESSOR_DISPATCHING;
	3122
	3123	if (processor != current_processor()) {
	3124	if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
	3125	/* cleared on exit from main processor_idle() loop */
	3126	pset->pending_AST_cpu_mask \|= (1ULL << processor->cpu_id);
	3127	do_signal_idle = TRUE;
	3128	}
	3129	}
	3130	pset_unlock(pset);
	3131
	3132	if (do_signal_idle) {
	3133	machine_signal_idle(processor);
	3134	}
	3135	return;
	3136	}
	3137
	3138	if (processor->current_pri < BASEPRI_RTQUEUES)
	3139	preempt = (AST_PREEMPT \| AST_URGENT);
	3140	else if (thread->realtime.deadline < processor->deadline)
	3141	preempt = (AST_PREEMPT \| AST_URGENT);
	3142	else
	3143	preempt = AST_NONE;
	3144
	3145	realtime_queue_insert(thread);
	3146
	3147	if (preempt != AST_NONE) {
	3148	if (processor->state == PROCESSOR_IDLE) {
	3149	re_queue_tail(&pset->active_queue, &processor->processor_queue);
	3150
	3151	processor->next_thread = THREAD_NULL;
	3152	processor->current_pri = thread->sched_pri;
	3153	processor->current_thmode = thread->sched_mode;
	3154	processor->current_sfi_class = thread->sfi_class;
	3155	processor->deadline = thread->realtime.deadline;
	3156	processor->state = PROCESSOR_DISPATCHING;
	3157	if (processor == current_processor()) {
	3158	ast_on(preempt);
	3159	} else {
	3160	if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
	3161	/* cleared on exit from main processor_idle() loop */
	3162	pset->pending_AST_cpu_mask \|= (1ULL << processor->cpu_id);
	3163	do_signal_idle = TRUE;
	3164	}
	3165	}
	3166	} else if (processor->state == PROCESSOR_DISPATCHING) {
	3167	if ((processor->next_thread == THREAD_NULL) && ((processor->current_pri < thread->sched_pri) \|\| (processor->deadline > thread->realtime.deadline))) {
	3168	processor->current_pri = thread->sched_pri;
	3169	processor->current_thmode = thread->sched_mode;
	3170	processor->current_sfi_class = thread->sfi_class;
	3171	processor->deadline = thread->realtime.deadline;
	3172	}
	3173	} else {
	3174	if (processor == current_processor()) {
	3175	ast_on(preempt);
	3176	} else {
	3177	if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
	3178	/* cleared after IPI causes csw_check() to be called */
	3179	pset->pending_AST_cpu_mask \|= (1ULL << processor->cpu_id);
	3180	do_cause_ast = TRUE;
	3181	}
	3182	}
	3183	}
	3184	} else {
	3185	/* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
	3186	}
	3187
	3188	pset_unlock(pset);
	3189
	3190	if (do_signal_idle) {
	3191	machine_signal_idle(processor);
	3192	} else if (do_cause_ast) {
	3193	cause_ast_check(processor);
	3194	}
	3195	}
	3196
	3197
	3198	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	3199
	3200	boolean_t
	3201	priority_is_urgent(int priority)
	3202	{
	3203	return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE;
	3204	}
	3205
	3206	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	3207
	3208	/*
	3209	* processor_setrun:
	3210	*
	3211	* Dispatch a thread for execution on a
	3212	* processor.
	3213	*
	3214	* Thread must be locked. Associated pset must
	3215	* be locked, and is returned unlocked.
	3216	*/
	3217	static void
	3218	processor_setrun(
	3219	processor_t processor,
	3220	thread_t thread,
	3221	integer_t options)
	3222	{
	3223	processor_set_t pset = processor->processor_set;
	3224	ast_t preempt;
	3225	enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
	3226	enum { eNoSignal, eDoSignal, eDoDeferredSignal } do_signal_idle = eNoSignal;
	3227
	3228	boolean_t do_cause_ast = FALSE;
	3229
	3230	thread->chosen_processor = processor;
	3231
	3232	/*
	3233	* Dispatch directly onto idle processor.
	3234	*/
	3235	if ( (SCHED(direct_dispatch_to_idle_processors) \|\|
	3236	thread->bound_processor == processor)
	3237	&& processor->state == PROCESSOR_IDLE) {
	3238
	3239	re_queue_tail(&pset->active_queue, &processor->processor_queue);
	3240
	3241	processor->next_thread = thread;
	3242	processor->current_pri = thread->sched_pri;
	3243	processor->current_thmode = thread->sched_mode;
	3244	processor->current_sfi_class = thread->sfi_class;
	3245	processor->deadline = UINT64_MAX;
	3246	processor->state = PROCESSOR_DISPATCHING;
	3247
	3248	if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
	3249	/* cleared on exit from main processor_idle() loop */
	3250	pset->pending_AST_cpu_mask \|= (1ULL << processor->cpu_id);
	3251	do_signal_idle = eDoSignal;
	3252	}
	3253
	3254	pset_unlock(pset);
	3255
	3256	if (do_signal_idle == eDoSignal) {
	3257	machine_signal_idle(processor);
	3258	}
	3259
	3260	return;
	3261	}
	3262
	3263	/*
	3264	* Set preemption mode.
	3265	*/
	3266	#if defined(CONFIG_SCHED_DEFERRED_AST)
	3267	/* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
	3268	#endif
	3269	if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri)
	3270	preempt = (AST_PREEMPT \| AST_URGENT);
	3271	else if(processor->active_thread && thread_eager_preemption(processor->active_thread))
	3272	preempt = (AST_PREEMPT \| AST_URGENT);
	3273	else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
	3274	if(SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
	3275	preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
	3276	} else {
	3277	preempt = AST_NONE;
	3278	}
	3279	} else
	3280	preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
	3281
	3282	SCHED(processor_enqueue)(processor, thread, options);
	3283
	3284	if (preempt != AST_NONE) {
	3285	if (processor->state == PROCESSOR_IDLE) {
	3286	re_queue_tail(&pset->active_queue, &processor->processor_queue);
	3287
	3288	processor->next_thread = THREAD_NULL;
	3289	processor->current_pri = thread->sched_pri;
	3290	processor->current_thmode = thread->sched_mode;
	3291	processor->current_sfi_class = thread->sfi_class;
	3292	processor->deadline = UINT64_MAX;
	3293	processor->state = PROCESSOR_DISPATCHING;
	3294
	3295	ipi_action = eExitIdle;
	3296	} else if ( processor->state == PROCESSOR_DISPATCHING) {
	3297	if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) {
	3298	processor->current_pri = thread->sched_pri;
	3299	processor->current_thmode = thread->sched_mode;
	3300	processor->current_sfi_class = thread->sfi_class;
	3301	processor->deadline = UINT64_MAX;
	3302	}
	3303	} else if ( (processor->state == PROCESSOR_RUNNING \|\|
	3304	processor->state == PROCESSOR_SHUTDOWN) &&
	3305	(thread->sched_pri >= processor->current_pri)) {
	3306	ipi_action = eInterruptRunning;
	3307	}
	3308	} else {
	3309	/*
	3310	* New thread is not important enough to preempt what is running, but
	3311	* special processor states may need special handling
	3312	*/
	3313	if (processor->state == PROCESSOR_SHUTDOWN &&
	3314	thread->sched_pri >= processor->current_pri ) {
	3315	ipi_action = eInterruptRunning;
	3316	} else if ( processor->state == PROCESSOR_IDLE &&
	3317	processor != current_processor() ) {
	3318	re_queue_tail(&pset->active_queue, &processor->processor_queue);
	3319
	3320	processor->next_thread = THREAD_NULL;
	3321	processor->current_pri = thread->sched_pri;
	3322	processor->current_thmode = thread->sched_mode;
	3323	processor->current_sfi_class = thread->sfi_class;
	3324	processor->deadline = UINT64_MAX;
	3325	processor->state = PROCESSOR_DISPATCHING;
	3326
	3327	ipi_action = eExitIdle;
	3328	}
	3329	}
	3330
	3331	switch (ipi_action) {
	3332	case eDoNothing:
	3333	break;
	3334	case eExitIdle:
	3335	if (processor == current_processor()) {
	3336	if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
	3337	ast_on(preempt);
	3338	} else {
	3339	#if defined(CONFIG_SCHED_DEFERRED_AST)
	3340	if (!(pset->pending_deferred_AST_cpu_mask & (1ULL << processor->cpu_id)) &&
	3341	!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
	3342	/* cleared on exit from main processor_idle() loop */
	3343	pset->pending_deferred_AST_cpu_mask \|= (1ULL << processor->cpu_id);
	3344	do_signal_idle = eDoDeferredSignal;
	3345	}
	3346	#else
	3347	if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
	3348	/* cleared on exit from main processor_idle() loop */
	3349	pset->pending_AST_cpu_mask \|= (1ULL << processor->cpu_id);
	3350	do_signal_idle = eDoSignal;
	3351	}
	3352	#endif
	3353	}
	3354	break;
	3355	case eInterruptRunning:
	3356	if (processor == current_processor()) {
	3357	if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
	3358	ast_on(preempt);
	3359	} else {
	3360	if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
	3361	/* cleared after IPI causes csw_check() to be called */
	3362	pset->pending_AST_cpu_mask \|= (1ULL << processor->cpu_id);
	3363	do_cause_ast = TRUE;
	3364	}
	3365	}
	3366	break;
	3367	}
	3368
	3369	pset_unlock(pset);
	3370
	3371	if (do_signal_idle == eDoSignal) {
	3372	machine_signal_idle(processor);
	3373	}
	3374	#if defined(CONFIG_SCHED_DEFERRED_AST)
	3375	else if (do_signal_idle == eDoDeferredSignal) {
	3376	/*
	3377	* TODO: The ability to cancel this signal could make
	3378	* sending it outside of the pset lock an issue. Do
	3379	* we need to address this? Or would the only fallout
	3380	* be that the core takes a signal? As long as we do
	3381	* not run the risk of having a core marked as signal
	3382	* outstanding, with no real signal outstanding, the
	3383	* only result should be that we fail to cancel some
	3384	* signals.
	3385	*/
	3386	machine_signal_idle_deferred(processor);
	3387	}
	3388	#endif
	3389	else if (do_cause_ast) {
	3390	cause_ast_check(processor);
	3391	}
	3392	}
	3393
	3394	/*
	3395	* choose_next_pset:
	3396	*
	3397	* Return the next sibling pset containing
	3398	* available processors.
	3399	*
	3400	* Returns the original pset if none other is
	3401	* suitable.
	3402	*/
	3403	static processor_set_t
	3404	choose_next_pset(
	3405	processor_set_t pset)
	3406	{
	3407	processor_set_t nset = pset;
	3408
	3409	do {
	3410	nset = next_pset(nset);
	3411	} while (nset->online_processor_count < 1 && nset != pset);
	3412
	3413	return (nset);
	3414	}
	3415
	3416	/*
	3417	* choose_processor:
	3418	*
	3419	* Choose a processor for the thread, beginning at
	3420	* the pset. Accepts an optional processor hint in
	3421	* the pset.
	3422	*
	3423	* Returns a processor, possibly from a different pset.
	3424	*
	3425	* The thread must be locked. The pset must be locked,
	3426	* and the resulting pset is locked on return.
	3427	*/
	3428	processor_t
	3429	choose_processor(
	3430	processor_set_t pset,
	3431	processor_t processor,
	3432	thread_t thread)
	3433	{
	3434	processor_set_t nset, cset = pset;
	3435
	3436	assert(thread->sched_pri <= BASEPRI_RTQUEUES);
	3437
	3438	/*
	3439	* Prefer the hinted processor, when appropriate.
	3440	*/
	3441
	3442	/* Fold last processor hint from secondary processor to its primary */
	3443	if (processor != PROCESSOR_NULL) {
	3444	processor = processor->processor_primary;
	3445	}
	3446
	3447	/*
	3448	* Only consult platform layer if pset is active, which
	3449	* it may not be in some cases when a multi-set system
	3450	* is going to sleep.
	3451	*/
	3452	if (pset->online_processor_count) {
	3453	if ((processor == PROCESSOR_NULL) \|\| (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
	3454	processor_t mc_processor = machine_choose_processor(pset, processor);
	3455	if (mc_processor != PROCESSOR_NULL)
	3456	processor = mc_processor->processor_primary;
	3457	}
	3458	}
	3459
	3460	/*
	3461	* At this point, we may have a processor hint, and we may have
	3462	* an initial starting pset. If the hint is not in the pset, or
	3463	* if the hint is for a processor in an invalid state, discard
	3464	* the hint.
	3465	*/
	3466	if (processor != PROCESSOR_NULL) {
	3467	if (processor->processor_set != pset) {
	3468	processor = PROCESSOR_NULL;
	3469	} else if (!processor->is_recommended) {
	3470	processor = PROCESSOR_NULL;
	3471	} else {
	3472	switch (processor->state) {
	3473	case PROCESSOR_START:
	3474	case PROCESSOR_SHUTDOWN:
	3475	case PROCESSOR_OFF_LINE:
	3476	/*
	3477	* Hint is for a processor that cannot support running new threads.
	3478	*/
	3479	processor = PROCESSOR_NULL;
	3480	break;
	3481	case PROCESSOR_IDLE:
	3482	/*
	3483	* Hint is for an idle processor. Assume it is no worse than any other
	3484	* idle processor. The platform layer had an opportunity to provide
	3485	* the "least cost idle" processor above.
	3486	*/
	3487	return (processor);
	3488	case PROCESSOR_RUNNING:
	3489	case PROCESSOR_DISPATCHING:
	3490	/*
	3491	* Hint is for an active CPU. This fast-path allows
	3492	* realtime threads to preempt non-realtime threads
	3493	* to regain their previous executing processor.
	3494	*/
	3495	if ((thread->sched_pri >= BASEPRI_RTQUEUES) &&
	3496	(processor->current_pri < BASEPRI_RTQUEUES))
	3497	return (processor);
	3498
	3499	/* Otherwise, use hint as part of search below */
	3500	break;
	3501	default:
	3502	processor = PROCESSOR_NULL;
	3503	break;
	3504	}
	3505	}
	3506	}
	3507
	3508	/*
	3509	* Iterate through the processor sets to locate
	3510	* an appropriate processor. Seed results with
	3511	* a last-processor hint, if available, so that
	3512	* a search must find something strictly better
	3513	* to replace it.
	3514	*
	3515	* A primary/secondary pair of SMT processors are
	3516	* "unpaired" if the primary is busy but its
	3517	* corresponding secondary is idle (so the physical
	3518	* core has full use of its resources).
	3519	*/
	3520
	3521	integer_t lowest_priority = MAXPRI + 1;
	3522	integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
	3523	integer_t lowest_count = INT_MAX;
	3524	uint64_t furthest_deadline = 1;
	3525	processor_t lp_processor = PROCESSOR_NULL;
	3526	processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
	3527	processor_t lp_unpaired_secondary_processor = PROCESSOR_NULL;
	3528	processor_t lc_processor = PROCESSOR_NULL;
	3529	processor_t fd_processor = PROCESSOR_NULL;
	3530
	3531	if (processor != PROCESSOR_NULL) {
	3532	/* All other states should be enumerated above. */
	3533	assert(processor->state == PROCESSOR_RUNNING \|\| processor->state == PROCESSOR_DISPATCHING);
	3534
	3535	lowest_priority = processor->current_pri;
	3536	lp_processor = processor;
	3537
	3538	if (processor->current_pri >= BASEPRI_RTQUEUES) {
	3539	furthest_deadline = processor->deadline;
	3540	fd_processor = processor;
	3541	}
	3542
	3543	lowest_count = SCHED(processor_runq_count)(processor);
	3544	lc_processor = processor;
	3545	}
	3546
	3547	do {
	3548
	3549	/*
	3550	* Choose an idle processor, in pset traversal order
	3551	*/
	3552	qe_foreach_element(processor, &cset->idle_queue, processor_queue) {
	3553	if (processor->is_recommended)
	3554	return processor;
	3555	}
	3556
	3557	/*
	3558	* Otherwise, enumerate active and idle processors to find candidates
	3559	* with lower priority/etc.
	3560	*/
	3561
	3562	qe_foreach_element(processor, &cset->active_queue, processor_queue) {
	3563
	3564	if (!processor->is_recommended) {
	3565	continue;
	3566	}
	3567
	3568	integer_t cpri = processor->current_pri;
	3569	if (cpri < lowest_priority) {
	3570	lowest_priority = cpri;
	3571	lp_processor = processor;
	3572	}
	3573
	3574	if ((cpri >= BASEPRI_RTQUEUES) && (processor->deadline > furthest_deadline)) {
	3575	furthest_deadline = processor->deadline;
	3576	fd_processor = processor;
	3577	}
	3578
	3579	integer_t ccount = SCHED(processor_runq_count)(processor);
	3580	if (ccount < lowest_count) {
	3581	lowest_count = ccount;
	3582	lc_processor = processor;
	3583	}
	3584	}
	3585
	3586	/*
	3587	* For SMT configs, these idle secondary processors must have active primary. Otherwise
	3588	* the idle primary would have short-circuited the loop above
	3589	*/
	3590	qe_foreach_element(processor, &cset->idle_secondary_queue, processor_queue) {
	3591
	3592	if (!processor->is_recommended) {
	3593	continue;
	3594	}
	3595
	3596	processor_t cprimary = processor->processor_primary;
	3597
	3598	/* If the primary processor is offline or starting up, it's not a candidate for this path */
	3599	if (cprimary->state == PROCESSOR_RUNNING \|\| cprimary->state == PROCESSOR_DISPATCHING) {
	3600	integer_t primary_pri = cprimary->current_pri;
	3601
	3602	if (primary_pri < lowest_unpaired_primary_priority) {
	3603	lowest_unpaired_primary_priority = primary_pri;
	3604	lp_unpaired_primary_processor = cprimary;
	3605	lp_unpaired_secondary_processor = processor;
	3606	}
	3607	}
	3608	}
	3609
	3610
	3611	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
	3612
	3613	/*
	3614	* For realtime threads, the most important aspect is
	3615	* scheduling latency, so we attempt to assign threads
	3616	* to good preemption candidates (assuming an idle primary
	3617	* processor was not available above).
	3618	*/
	3619
	3620	if (thread->sched_pri > lowest_unpaired_primary_priority) {
	3621	/* Move to end of active queue so that the next thread doesn't also pick it */
	3622	re_queue_tail(&cset->active_queue, &lp_unpaired_primary_processor->processor_queue);
	3623	return lp_unpaired_primary_processor;
	3624	}
	3625	if (thread->sched_pri > lowest_priority) {
	3626	/* Move to end of active queue so that the next thread doesn't also pick it */
	3627	re_queue_tail(&cset->active_queue, &lp_processor->processor_queue);
	3628	return lp_processor;
	3629	}
	3630	if (thread->realtime.deadline < furthest_deadline)
	3631	return fd_processor;
	3632
	3633	/*
	3634	* If all primary and secondary CPUs are busy with realtime
	3635	* threads with deadlines earlier than us, move on to next
	3636	* pset.
	3637	*/
	3638	}
	3639	else {
	3640
	3641	if (thread->sched_pri > lowest_unpaired_primary_priority) {
	3642	/* Move to end of active queue so that the next thread doesn't also pick it */
	3643	re_queue_tail(&cset->active_queue, &lp_unpaired_primary_processor->processor_queue);
	3644	return lp_unpaired_primary_processor;
	3645	}
	3646	if (thread->sched_pri > lowest_priority) {
	3647	/* Move to end of active queue so that the next thread doesn't also pick it */
	3648	re_queue_tail(&cset->active_queue, &lp_processor->processor_queue);
	3649	return lp_processor;
	3650	}
	3651
	3652	/*
	3653	* If all primary processor in this pset are running a higher
	3654	* priority thread, move on to next pset. Only when we have
	3655	* exhausted this search do we fall back to other heuristics.
	3656	*/
	3657	}
	3658
	3659	/*
	3660	* Move onto the next processor set.
	3661	*/
	3662	nset = next_pset(cset);
	3663
	3664	if (nset != pset) {
	3665	pset_unlock(cset);
	3666
	3667	cset = nset;
	3668	pset_lock(cset);
	3669	}
	3670	} while (nset != pset);
	3671
	3672	/*
	3673	* Make sure that we pick a running processor,
	3674	* and that the correct processor set is locked.
	3675	* Since we may have unlock the candidate processor's
	3676	* pset, it may have changed state.
	3677	*
	3678	* All primary processors are running a higher priority
	3679	* thread, so the only options left are enqueuing on
	3680	* the secondary processor that would perturb the least priority
	3681	* primary, or the least busy primary.
	3682	*/
	3683	do {
	3684
	3685	/* lowest_priority is evaluated in the main loops above */
	3686	if (lp_unpaired_secondary_processor != PROCESSOR_NULL) {
	3687	processor = lp_unpaired_secondary_processor;
	3688	lp_unpaired_secondary_processor = PROCESSOR_NULL;
	3689	} else if (lc_processor != PROCESSOR_NULL) {
	3690	processor = lc_processor;
	3691	lc_processor = PROCESSOR_NULL;
	3692	} else {
	3693	/*
	3694	* All processors are executing higher
	3695	* priority threads, and the lowest_count
	3696	* candidate was not usable
	3697	*/
	3698	processor = master_processor;
	3699	}
	3700
	3701	/*
	3702	* Check that the correct processor set is
	3703	* returned locked.
	3704	*/
	3705	if (cset != processor->processor_set) {
	3706	pset_unlock(cset);
	3707	cset = processor->processor_set;
	3708	pset_lock(cset);
	3709	}
	3710
	3711	/*
	3712	* We must verify that the chosen processor is still available.
	3713	* master_processor is an exception, since we may need to preempt
	3714	* a running thread on it during processor shutdown (for sleep),
	3715	* and that thread needs to be enqueued on its runqueue to run
	3716	* when the processor is restarted.
	3717	*/
	3718	if (processor != master_processor && (processor->state == PROCESSOR_SHUTDOWN \|\| processor->state == PROCESSOR_OFF_LINE))
	3719	processor = PROCESSOR_NULL;
	3720
	3721	} while (processor == PROCESSOR_NULL);
	3722
	3723	return (processor);
	3724	}
	3725
	3726	/*
	3727	* thread_setrun:
	3728	*
	3729	* Dispatch thread for execution, onto an idle
	3730	* processor or run queue, and signal a preemption
	3731	* as appropriate.
	3732	*
	3733	* Thread must be locked.
	3734	*/
	3735	void
	3736	thread_setrun(
	3737	thread_t thread,
	3738	integer_t options)
	3739	{
	3740	processor_t processor;
	3741	processor_set_t pset;
	3742
	3743	assert((thread->state & (TH_RUN\|TH_WAIT\|TH_UNINT\|TH_TERMINATE\|TH_TERMINATE2)) == TH_RUN);
	3744	assert(thread->runq == PROCESSOR_NULL);
	3745
	3746	/*
	3747	* Update priority if needed.
	3748	*/
	3749	if (SCHED(can_update_priority)(thread))
	3750	SCHED(update_priority)(thread);
	3751
	3752	thread->sfi_class = sfi_thread_classify(thread);
	3753
	3754	assert(thread->runq == PROCESSOR_NULL);
	3755
	3756	#if __SMP__
	3757	if (thread->bound_processor == PROCESSOR_NULL) {
	3758	/*
	3759	* Unbound case.
	3760	*/
	3761	if (thread->affinity_set != AFFINITY_SET_NULL) {
	3762	/*
	3763	* Use affinity set policy hint.
	3764	*/
	3765	pset = thread->affinity_set->aset_pset;
	3766	pset_lock(pset);
	3767
	3768	processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
	3769
	3770	SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)\|DBG_FUNC_NONE,
	3771	(uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
	3772	} else if (thread->last_processor != PROCESSOR_NULL) {
	3773	/*
	3774	* Simple (last processor) affinity case.
	3775	*/
	3776	processor = thread->last_processor;
	3777	pset = processor->processor_set;
	3778	pset_lock(pset);
	3779	processor = SCHED(choose_processor)(pset, processor, thread);
	3780
	3781	SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)\|DBG_FUNC_NONE,
	3782	(uintptr_t)thread_tid(thread), thread->last_processor->cpu_id, processor->cpu_id, processor->state, 0);
	3783	} else {
	3784	/*
	3785	* No Affinity case:
	3786	*
	3787	* Utilitize a per task hint to spread threads
	3788	* among the available processor sets.
	3789	*/
	3790	task_t task = thread->task;
	3791
	3792	pset = task->pset_hint;
	3793	if (pset == PROCESSOR_SET_NULL)
	3794	pset = current_processor()->processor_set;
	3795
	3796	pset = choose_next_pset(pset);
	3797	pset_lock(pset);
	3798
	3799	processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
	3800	task->pset_hint = processor->processor_set;
	3801
	3802	SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)\|DBG_FUNC_NONE,
	3803	(uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
	3804	}
	3805	} else {
	3806	/*
	3807	* Bound case:
	3808	*
	3809	* Unconditionally dispatch on the processor.
	3810	*/
	3811	processor = thread->bound_processor;
	3812	pset = processor->processor_set;
	3813	pset_lock(pset);
	3814
	3815	SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)\|DBG_FUNC_NONE,
	3816	(uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
	3817	}
	3818	#else /* !__SMP__ */
	3819	/* Only one processor to choose */
	3820	assert(thread->bound_processor == PROCESSOR_NULL \|\| thread->bound_processor == master_processor);
	3821	processor = master_processor;
	3822	pset = processor->processor_set;
	3823	pset_lock(pset);
	3824	#endif /* !__SMP__ */
	3825
	3826	/*
	3827	* Dispatch the thread on the chosen processor.
	3828	* TODO: This should be based on sched_mode, not sched_pri
	3829	*/
	3830	if (thread->sched_pri >= BASEPRI_RTQUEUES)
	3831	realtime_setrun(processor, thread);
	3832	else
	3833	processor_setrun(processor, thread, options);
	3834	}
	3835
	3836	processor_set_t
	3837	task_choose_pset(
	3838	task_t task)
	3839	{
	3840	processor_set_t pset = task->pset_hint;
	3841
	3842	if (pset != PROCESSOR_SET_NULL)
	3843	pset = choose_next_pset(pset);
	3844
	3845	return (pset);
	3846	}
	3847
	3848	/*
	3849	* Check for a preemption point in
	3850	* the current context.
	3851	*
	3852	* Called at splsched with thread locked.
	3853	*/
	3854	ast_t
	3855	csw_check(
	3856	processor_t processor,
	3857	ast_t check_reason)
	3858	{
	3859	processor_set_t pset = processor->processor_set;
	3860	ast_t result;
	3861
	3862	pset_lock(pset);
	3863
	3864	/* If we were sent a remote AST and interrupted a running processor, acknowledge it here with pset lock held */
	3865	pset->pending_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
	3866
	3867	result = csw_check_locked(processor, pset, check_reason);
	3868
	3869	pset_unlock(pset);
	3870
	3871	return result;
	3872	}
	3873
	3874	/*
	3875	* Check for preemption at splsched with
	3876	* pset and thread locked
	3877	*/
	3878	ast_t
	3879	csw_check_locked(
	3880	processor_t processor,
	3881	processor_set_t pset __unused,
	3882	ast_t check_reason)
	3883	{
	3884	ast_t result;
	3885	thread_t thread = processor->active_thread;
	3886
	3887	if (processor->first_timeslice) {
	3888	if (rt_runq.count > 0)
	3889	return (check_reason \| AST_PREEMPT \| AST_URGENT);
	3890	}
	3891	else {
	3892	if (rt_runq.count > 0) {
	3893	if (BASEPRI_RTQUEUES > processor->current_pri)
	3894	return (check_reason \| AST_PREEMPT \| AST_URGENT);
	3895	else
	3896	return (check_reason \| AST_PREEMPT);
	3897	}
	3898	}
	3899
	3900	result = SCHED(processor_csw_check)(processor);
	3901	if (result != AST_NONE)
	3902	return (check_reason \| result \| (thread_eager_preemption(thread) ? AST_URGENT : AST_NONE));
	3903
	3904	#if __SMP__
	3905
	3906	/*
	3907	* If the current thread is running on a processor that is no longer recommended, gently
	3908	* (non-urgently) get to a point and then block, and which point thread_select() should
	3909	* try to idle the processor and re-dispatch the thread to a recommended processor.
	3910	*/
	3911	if (!processor->is_recommended)
	3912	return (check_reason \| AST_PREEMPT);
	3913
	3914	/*
	3915	* Even though we could continue executing on this processor, a
	3916	* secondary SMT core should try to shed load to another primary core.
	3917	*
	3918	* TODO: Should this do the same check that thread_select does? i.e.
	3919	* if no bound threads target this processor, and idle primaries exist, preempt
	3920	* The case of RT threads existing is already taken care of above
	3921	* Consider Capri in this scenario.
	3922	*
	3923	* if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue))
	3924	*
	3925	* TODO: Alternatively - check if only primary is idle, or check if primary's pri is lower than mine.
	3926	*/
	3927
	3928	if (processor->current_pri < BASEPRI_RTQUEUES &&
	3929	processor->processor_primary != processor)
	3930	return (check_reason \| AST_PREEMPT);
	3931	#endif
	3932
	3933	if (thread->state & TH_SUSP)
	3934	return (check_reason \| AST_PREEMPT);
	3935
	3936	#if CONFIG_SCHED_SFI
	3937	/*
	3938	* Current thread may not need to be preempted, but maybe needs
	3939	* an SFI wait?
	3940	*/
	3941	result = sfi_thread_needs_ast(thread, NULL);
	3942	if (result != AST_NONE)
	3943	return (check_reason \| result);
	3944	#endif
	3945
	3946	return (AST_NONE);
	3947	}
	3948
	3949	/*
	3950	* set_sched_pri:
	3951	*
	3952	* Set the scheduled priority of the specified thread.
	3953	*
	3954	* This may cause the thread to change queues.
	3955	*
	3956	* Thread must be locked.
	3957	*/
	3958	void
	3959	set_sched_pri(
	3960	thread_t thread,
	3961	int priority)
	3962	{
	3963	thread_t cthread = current_thread();
	3964	boolean_t is_current_thread = (thread == cthread) ? TRUE : FALSE;
	3965	int curgency, nurgency;
	3966	uint64_t urgency_param1, urgency_param2;
	3967	boolean_t removed_from_runq = FALSE;
	3968
	3969	/* If we're already at this priority, no need to mess with the runqueue */
	3970	if (priority == thread->sched_pri)
	3971	return;
	3972
	3973	if (is_current_thread) {
	3974	assert(thread->runq == PROCESSOR_NULL);
	3975	curgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
	3976	} else {
	3977	removed_from_runq = thread_run_queue_remove(thread);
	3978	}
	3979
	3980	thread->sched_pri = priority;
	3981
	3982	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
	3983	(uintptr_t)thread_tid(thread),
	3984	thread->base_pri,
	3985	thread->sched_pri,
	3986	0, /* eventually, 'reason' */
	3987	0);
	3988
	3989	if (is_current_thread) {
	3990	nurgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
	3991	/*
	3992	* set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
	3993	* class alterations from user space to occur relatively infrequently, hence
	3994	* those are lazily handled. QoS classes have distinct priority bands, and QoS
	3995	* inheritance is expected to involve priority changes.
	3996	*/
	3997	if (nurgency != curgency) {
	3998	thread_tell_urgency(nurgency, urgency_param1, urgency_param2, 0, thread);
	3999	machine_thread_going_on_core(thread, nurgency, 0);
	4000	}
	4001	}
	4002
	4003	/* TODO: Should this be TAILQ if it went down, HEADQ if it went up? */
	4004	if (removed_from_runq)
	4005	thread_run_queue_reinsert(thread, SCHED_PREEMPT \| SCHED_TAILQ);
	4006	else if (thread->state & TH_RUN) {
	4007	processor_t processor = thread->last_processor;
	4008
	4009	if (is_current_thread) {
	4010	ast_t preempt;
	4011
	4012	processor->current_pri = priority;
	4013	processor->current_thmode = thread->sched_mode;
	4014	processor->current_sfi_class = thread->sfi_class = sfi_thread_classify(thread);
	4015	if ((preempt = csw_check(processor, AST_NONE)) != AST_NONE)
	4016	ast_on(preempt);
	4017	} else if (processor != PROCESSOR_NULL && processor->active_thread == thread)
	4018	cause_ast_check(processor);
	4019	}
	4020	}
	4021
	4022	/*
	4023	* thread_run_queue_remove_for_handoff
	4024	*
	4025	* Pull a thread or its (recursive) push target out of the runqueue
	4026	* so that it is ready for thread_run()
	4027	*
	4028	* Called at splsched
	4029	*
	4030	* Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
	4031	* This may be different than the thread that was passed in.
	4032	*/
	4033	thread_t
	4034	thread_run_queue_remove_for_handoff(thread_t thread) {
	4035
	4036	thread_t pulled_thread = THREAD_NULL;
	4037
	4038	thread_lock(thread);
	4039
	4040	/*
	4041	* Check that the thread is not bound
	4042	* to a different processor, and that realtime
	4043	* is not involved.
	4044	*
	4045	* Next, pull it off its run queue. If it
	4046	* doesn't come, it's not eligible.
	4047	*/
	4048
	4049	processor_t processor = current_processor();
	4050	if (processor->current_pri < BASEPRI_RTQUEUES && thread->sched_pri < BASEPRI_RTQUEUES &&
	4051	(thread->bound_processor == PROCESSOR_NULL \|\| thread->bound_processor == processor)) {
	4052
	4053	if (thread_run_queue_remove(thread))
	4054	pulled_thread = thread;
	4055	}
	4056
	4057	thread_unlock(thread);
	4058
	4059	return pulled_thread;
	4060	}
	4061
	4062	/*
	4063	* thread_run_queue_remove:
	4064	*
	4065	* Remove a thread from its current run queue and
	4066	* return TRUE if successful.
	4067	*
	4068	* Thread must be locked.
	4069	*
	4070	* If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
	4071	* run queues because the caller locked the thread. Otherwise
	4072	* the thread is on a run queue, but could be chosen for dispatch
	4073	* and removed by another processor under a different lock, which
	4074	* will set thread->runq to PROCESSOR_NULL.
	4075	*
	4076	* Hence the thread select path must not rely on anything that could
	4077	* be changed under the thread lock after calling this function,
	4078	* most importantly thread->sched_pri.
	4079	*/
	4080	boolean_t
	4081	thread_run_queue_remove(
	4082	thread_t thread)
	4083	{
	4084	boolean_t removed = FALSE;
	4085	processor_t processor = thread->runq;
	4086
	4087	if ((thread->state & (TH_RUN\|TH_WAIT)) == TH_WAIT) {
	4088	/* Thread isn't runnable */
	4089	assert(thread->runq == PROCESSOR_NULL);
	4090	return FALSE;
	4091	}
	4092
	4093	if (processor == PROCESSOR_NULL) {
	4094	/*
	4095	* The thread is either not on the runq,
	4096	* or is in the midst of being removed from the runq.
	4097	*
	4098	* runq is set to NULL under the pset lock, not the thread
	4099	* lock, so the thread may still be in the process of being dequeued
	4100	* from the runq. It will wait in invoke for the thread lock to be
	4101	* dropped.
	4102	*/
	4103
	4104	return FALSE;
	4105	}
	4106
	4107	if (thread->sched_pri < BASEPRI_RTQUEUES) {
	4108	return SCHED(processor_queue_remove)(processor, thread);
	4109	}
	4110
	4111	rt_lock_lock();
	4112
	4113	if (thread->runq != PROCESSOR_NULL) {
	4114	/*
	4115	* Thread is on the RT run queue and we have a lock on
	4116	* that run queue.
	4117	*/
	4118
	4119	assert(thread->runq == THREAD_ON_RT_RUNQ);
	4120
	4121	remqueue(&thread->runq_links);
	4122	SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
	4123	rt_runq.count--;
	4124
	4125	thread->runq = PROCESSOR_NULL;
	4126
	4127	removed = TRUE;
	4128	}
	4129
	4130	rt_lock_unlock();
	4131
	4132	return (removed);
	4133	}
	4134
	4135	/*
	4136	* Put the thread back where it goes after a thread_run_queue_remove
	4137	*
	4138	* Thread must have been removed under the same thread lock hold
	4139	*
	4140	* thread locked, at splsched
	4141	*/
	4142	void
	4143	thread_run_queue_reinsert(thread_t thread, integer_t options)
	4144	{
	4145	assert(thread->runq == PROCESSOR_NULL);
	4146
	4147	assert(thread->state & (TH_RUN));
	4148	thread_setrun(thread, options);
	4149
	4150	}
	4151
	4152	void
	4153	sys_override_cpu_throttle(int flag)
	4154	{
	4155	if (flag == CPU_THROTTLE_ENABLE)
	4156	cpu_throttle_enabled = 1;
	4157	if (flag == CPU_THROTTLE_DISABLE)
	4158	cpu_throttle_enabled = 0;
	4159	}
	4160
	4161	int
	4162	thread_get_urgency(thread_t thread, uint64_t arg1, uint64_t arg2)
	4163	{
	4164	if (thread == NULL \|\| (thread->state & TH_IDLE)) {
	4165	*arg1 = 0;
	4166	*arg2 = 0;
	4167
	4168	return (THREAD_URGENCY_NONE);
	4169	} else if (thread->sched_mode == TH_MODE_REALTIME) {
	4170	*arg1 = thread->realtime.period;
	4171	*arg2 = thread->realtime.deadline;
	4172
	4173	return (THREAD_URGENCY_REAL_TIME);
	4174	} else if (cpu_throttle_enabled &&
	4175	((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
	4176	/*
	4177	* Background urgency applied when thread priority is MAXPRI_THROTTLE or lower and thread is not promoted
	4178	*/
	4179	*arg1 = thread->sched_pri;
	4180	*arg2 = thread->base_pri;
	4181
	4182	return (THREAD_URGENCY_BACKGROUND);
	4183	} else {
	4184	/* For otherwise unclassified threads, report throughput QoS
	4185	* parameters
	4186	*/
	4187	*arg1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
	4188	*arg2 = proc_get_effective_task_policy(thread->task, TASK_POLICY_THROUGH_QOS);
	4189
	4190	return (THREAD_URGENCY_NORMAL);
	4191	}
	4192	}
	4193
	4194
	4195	/*
	4196	* This is the processor idle loop, which just looks for other threads
	4197	* to execute. Processor idle threads invoke this without supplying a
	4198	* current thread to idle without an asserted wait state.
	4199	*
	4200	* Returns a the next thread to execute if dispatched directly.
	4201	*/
	4202
	4203	#if 0
	4204	#define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
	4205	#else
	4206	#define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
	4207	#endif
	4208
	4209	thread_t
	4210	processor_idle(
	4211	thread_t thread,
	4212	processor_t processor)
	4213	{
	4214	processor_set_t pset = processor->processor_set;
	4215	thread_t new_thread;
	4216	int state;
	4217	(void)splsched();
	4218
	4219	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	4220	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_START,
	4221	(uintptr_t)thread_tid(thread), 0, 0, 0, 0);
	4222
	4223	SCHED_STATS_CPU_IDLE_START(processor);
	4224
	4225	timer_switch(&PROCESSOR_DATA(processor, system_state),
	4226	mach_absolute_time(), &PROCESSOR_DATA(processor, idle_state));
	4227	PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state);
	4228
	4229	while (1) {
	4230	if (processor->state != PROCESSOR_IDLE) /* unsafe, but worst case we loop around once */
	4231	break;
	4232	if (pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))
	4233	break;
	4234	if (processor->is_recommended) {
	4235	if (rt_runq.count)
	4236	break;
	4237	} else {
	4238	if (SCHED(processor_bound_count)(processor))
	4239	break;
	4240	}
	4241
	4242	#if CONFIG_SCHED_IDLE_IN_PLACE
	4243	if (thread != THREAD_NULL) {
	4244	/* Did idle-in-place thread wake up */
	4245	if ((thread->state & (TH_WAIT\|TH_SUSP)) != TH_WAIT \|\| thread->wake_active)
	4246	break;
	4247	}
	4248	#endif
	4249
	4250	IDLE_KERNEL_DEBUG_CONSTANT(
	4251	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -1, 0);
	4252
	4253	machine_track_platform_idle(TRUE);
	4254
	4255	machine_idle();
	4256
	4257	machine_track_platform_idle(FALSE);
	4258
	4259	(void)splsched();
	4260
	4261	IDLE_KERNEL_DEBUG_CONSTANT(
	4262	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -2, 0);
	4263
	4264	if (!SCHED(processor_queue_empty)(processor)) {
	4265	/* Secondary SMT processors respond to directed wakeups
	4266	* exclusively. Some platforms induce 'spurious' SMT wakeups.
	4267	*/
	4268	if (processor->processor_primary == processor)
	4269	break;
	4270	}
	4271	}
	4272
	4273	timer_switch(&PROCESSOR_DATA(processor, idle_state),
	4274	mach_absolute_time(), &PROCESSOR_DATA(processor, system_state));
	4275	PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state);
	4276
	4277	pset_lock(pset);
	4278
	4279	/* If we were sent a remote AST and came out of idle, acknowledge it here with pset lock held */
	4280	pset->pending_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
	4281	#if defined(CONFIG_SCHED_DEFERRED_AST)
	4282	pset->pending_deferred_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
	4283	#endif
	4284
	4285	state = processor->state;
	4286	if (state == PROCESSOR_DISPATCHING) {
	4287	/*
	4288	* Commmon case -- cpu dispatched.
	4289	*/
	4290	new_thread = processor->next_thread;
	4291	processor->next_thread = THREAD_NULL;
	4292	processor->state = PROCESSOR_RUNNING;
	4293
	4294	if ((new_thread != THREAD_NULL) && (SCHED(processor_queue_has_priority)(processor, new_thread->sched_pri, FALSE) \|\|
	4295	(rt_runq.count > 0)) ) {
	4296	/* Something higher priority has popped up on the runqueue - redispatch this thread elsewhere */
	4297	processor->current_pri = IDLEPRI;
	4298	processor->current_thmode = TH_MODE_FIXED;
	4299	processor->current_sfi_class = SFI_CLASS_KERNEL;
	4300	processor->deadline = UINT64_MAX;
	4301
	4302	pset_unlock(pset);
	4303
	4304	thread_lock(new_thread);
	4305	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REDISPATCH), (uintptr_t)thread_tid(new_thread), new_thread->sched_pri, rt_runq.count, 0, 0);
	4306	thread_setrun(new_thread, SCHED_HEADQ);
	4307	thread_unlock(new_thread);
	4308
	4309	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	4310	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_END,
	4311	(uintptr_t)thread_tid(thread), state, 0, 0, 0);
	4312
	4313	return (THREAD_NULL);
	4314	}
	4315
	4316	pset_unlock(pset);
	4317
	4318	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	4319	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_END,
	4320	(uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0);
	4321
	4322	return (new_thread);
	4323
	4324	} else if (state == PROCESSOR_IDLE) {
	4325	re_queue_tail(&pset->active_queue, &processor->processor_queue);
	4326
	4327	processor->state = PROCESSOR_RUNNING;
	4328	processor->current_pri = IDLEPRI;
	4329	processor->current_thmode = TH_MODE_FIXED;
	4330	processor->current_sfi_class = SFI_CLASS_KERNEL;
	4331	processor->deadline = UINT64_MAX;
	4332
	4333	} else if (state == PROCESSOR_SHUTDOWN) {
	4334	/*
	4335	* Going off-line. Force a
	4336	* reschedule.
	4337	*/
	4338	if ((new_thread = processor->next_thread) != THREAD_NULL) {
	4339	processor->next_thread = THREAD_NULL;
	4340	processor->current_pri = IDLEPRI;
	4341	processor->current_thmode = TH_MODE_FIXED;
	4342	processor->current_sfi_class = SFI_CLASS_KERNEL;
	4343	processor->deadline = UINT64_MAX;
	4344
	4345	pset_unlock(pset);
	4346
	4347	thread_lock(new_thread);
	4348	thread_setrun(new_thread, SCHED_HEADQ);
	4349	thread_unlock(new_thread);
	4350
	4351	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	4352	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_END,
	4353	(uintptr_t)thread_tid(thread), state, 0, 0, 0);
	4354
	4355	return (THREAD_NULL);
	4356	}
	4357	}
	4358
	4359	pset_unlock(pset);
	4360
	4361	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	4362	MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) \| DBG_FUNC_END,
	4363	(uintptr_t)thread_tid(thread), state, 0, 0, 0);
	4364
	4365	return (THREAD_NULL);
	4366	}
	4367
	4368	/*
	4369	* Each processor has a dedicated thread which
	4370	* executes the idle loop when there is no suitable
	4371	* previous context.
	4372	*/
	4373	void
	4374	idle_thread(void)
	4375	{
	4376	processor_t processor = current_processor();
	4377	thread_t new_thread;
	4378
	4379	new_thread = processor_idle(THREAD_NULL, processor);
	4380	if (new_thread != THREAD_NULL) {
	4381	thread_run(processor->idle_thread, (thread_continue_t)idle_thread, NULL, new_thread);
	4382	/NOTREACHED/
	4383	}
	4384
	4385	thread_block((thread_continue_t)idle_thread);
	4386	/NOTREACHED/
	4387	}
	4388
	4389	kern_return_t
	4390	idle_thread_create(
	4391	processor_t processor)
	4392	{
	4393	kern_return_t result;
	4394	thread_t thread;
	4395	spl_t s;
	4396
	4397	result = kernel_thread_create((thread_continue_t)idle_thread, NULL, MAXPRI_KERNEL, &thread);
	4398	if (result != KERN_SUCCESS)
	4399	return (result);
	4400
	4401	s = splsched();
	4402	thread_lock(thread);
	4403	thread->bound_processor = processor;
	4404	processor->idle_thread = thread;
	4405	thread->sched_pri = thread->base_pri = IDLEPRI;
	4406	thread->state = (TH_RUN \| TH_IDLE);
	4407	thread->options \|= TH_OPT_IDLE_THREAD;
	4408	thread_unlock(thread);
	4409	splx(s);
	4410
	4411	thread_deallocate(thread);
	4412
	4413	return (KERN_SUCCESS);
	4414	}
	4415
	4416	/*
	4417	* sched_startup:
	4418	*
	4419	* Kicks off scheduler services.
	4420	*
	4421	* Called at splsched.
	4422	*/
	4423	void
	4424	sched_startup(void)
	4425	{
	4426	kern_return_t result;
	4427	thread_t thread;
	4428
	4429	simple_lock_init(&sched_vm_group_list_lock, 0);
	4430
	4431
	4432	result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
	4433	(void *)SCHED(maintenance_continuation), MAXPRI_KERNEL, &thread);
	4434	if (result != KERN_SUCCESS)
	4435	panic("sched_startup");
	4436
	4437	thread_deallocate(thread);
	4438
	4439	assert_thread_magic(thread);
	4440
	4441	/*
	4442	* Yield to the sched_init_thread once, to
	4443	* initialize our own thread after being switched
	4444	* back to.
	4445	*
	4446	* The current thread is the only other thread
	4447	* active at this point.
	4448	*/
	4449	thread_block(THREAD_CONTINUE_NULL);
	4450	}
	4451
	4452	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	4453
	4454	static volatile uint64_t sched_maintenance_deadline;
	4455	static uint64_t sched_tick_last_abstime;
	4456	static uint64_t sched_tick_delta;
	4457	uint64_t sched_tick_max_delta;
	4458	/*
	4459	* sched_init_thread:
	4460	*
	4461	* Perform periodic bookkeeping functions about ten
	4462	* times per second.
	4463	*/
	4464	void
	4465	sched_timeshare_maintenance_continue(void)
	4466	{
	4467	uint64_t sched_tick_ctime, late_time;
	4468
	4469	struct sched_update_scan_context scan_context = {
	4470	.earliest_bg_make_runnable_time = UINT64_MAX,
	4471	.earliest_normal_make_runnable_time = UINT64_MAX,
	4472	.earliest_rt_make_runnable_time = UINT64_MAX
	4473	};
	4474
	4475	sched_tick_ctime = mach_absolute_time();
	4476
	4477	if (__improbable(sched_tick_last_abstime == 0)) {
	4478	sched_tick_last_abstime = sched_tick_ctime;
	4479	late_time = 0;
	4480	sched_tick_delta = 1;
	4481	} else {
	4482	late_time = sched_tick_ctime - sched_tick_last_abstime;
	4483	sched_tick_delta = late_time / sched_tick_interval;
	4484	/* Ensure a delta of 1, since the interval could be slightly
	4485	* smaller than the sched_tick_interval due to dispatch
	4486	* latencies.
	4487	*/
	4488	sched_tick_delta = MAX(sched_tick_delta, 1);
	4489
	4490	/* In the event interrupt latencies or platform
	4491	* idle events that advanced the timebase resulted
	4492	* in periods where no threads were dispatched,
	4493	* cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
	4494	* iterations.
	4495	*/
	4496	sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);
	4497
	4498	sched_tick_last_abstime = sched_tick_ctime;
	4499	sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
	4500	}
	4501
	4502	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)\|DBG_FUNC_START,
	4503	sched_tick_delta, late_time, 0, 0, 0);
	4504
	4505	/* Add a number of pseudo-ticks corresponding to the elapsed interval
	4506	* This could be greater than 1 if substantial intervals where
	4507	* all processors are idle occur, which rarely occurs in practice.
	4508	*/
	4509
	4510	sched_tick += sched_tick_delta;
	4511
	4512	/*
	4513	* Compute various averages.
	4514	*/
	4515	compute_averages(sched_tick_delta);
	4516
	4517	/*
	4518	* Scan the run queues for threads which
	4519	* may need to be updated, and find the earliest runnable thread on the runqueue
	4520	* to report its latency.
	4521	*/
	4522	SCHED(thread_update_scan)(&scan_context);
	4523
	4524	rt_runq_scan(&scan_context);
	4525
	4526	uint64_t ctime = mach_absolute_time();
	4527
	4528	uint64_t bg_max_latency = (ctime > scan_context.earliest_bg_make_runnable_time) ?
	4529	ctime - scan_context.earliest_bg_make_runnable_time : 0;
	4530
	4531	uint64_t default_max_latency = (ctime > scan_context.earliest_normal_make_runnable_time) ?
	4532	ctime - scan_context.earliest_normal_make_runnable_time : 0;
	4533
	4534	uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
	4535	ctime - scan_context.earliest_rt_make_runnable_time : 0;
	4536
	4537	machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);
	4538
	4539	/*
	4540	* Check to see if the special sched VM group needs attention.
	4541	*/
	4542	sched_vm_group_maintenance();
	4543
	4544
	4545	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) \| DBG_FUNC_END,
	4546	sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
	4547	sched_pri_shifts[TH_BUCKET_SHARE_UT], 0, 0);
	4548
	4549	assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
	4550	thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
	4551	/NOTREACHED/
	4552	}
	4553
	4554	static uint64_t sched_maintenance_wakeups;
	4555
	4556	/*
	4557	* Determine if the set of routines formerly driven by a maintenance timer
	4558	* must be invoked, based on a deadline comparison. Signals the scheduler
	4559	* maintenance thread on deadline expiration. Must be invoked at an interval
	4560	* lower than the "sched_tick_interval", currently accomplished by
	4561	* invocation via the quantum expiration timer and at context switch time.
	4562	* Performance matters: this routine reuses a timestamp approximating the
	4563	* current absolute time received from the caller, and should perform
	4564	* no more than a comparison against the deadline in the common case.
	4565	*/
	4566	void
	4567	sched_timeshare_consider_maintenance(uint64_t ctime) {
	4568	uint64_t ndeadline, deadline = sched_maintenance_deadline;
	4569
	4570	if (__improbable(ctime >= deadline)) {
	4571	if (__improbable(current_thread() == sched_maintenance_thread))
	4572	return;
	4573	OSMemoryBarrier();
	4574
	4575	ndeadline = ctime + sched_tick_interval;
	4576
	4577	if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) {
	4578	thread_wakeup((event_t)sched_timeshare_maintenance_continue);
	4579	sched_maintenance_wakeups++;
	4580	}
	4581	}
	4582	}
	4583
	4584	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	4585
	4586	void
	4587	sched_init_thread(void (*continuation)(void))
	4588	{
	4589	thread_block(THREAD_CONTINUE_NULL);
	4590
	4591	thread_t thread = current_thread();
	4592
	4593	thread_set_thread_name(thread, "sched_maintenance_thread");
	4594
	4595	sched_maintenance_thread = thread;
	4596
	4597	continuation();
	4598
	4599	/NOTREACHED/
	4600	}
	4601
	4602	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	4603
	4604	/*
	4605	* thread_update_scan / runq_scan:
	4606	*
	4607	* Scan the run queues to account for timesharing threads
	4608	* which need to be updated.
	4609	*
	4610	* Scanner runs in two passes. Pass one squirrels likely
	4611	* threads away in an array, pass two does the update.
	4612	*
	4613	* This is necessary because the run queue is locked for
	4614	* the candidate scan, but the thread is locked for the update.
	4615	*
	4616	* Array should be sized to make forward progress, without
	4617	* disabling preemption for long periods.
	4618	*/
	4619
	4620	#define THREAD_UPDATE_SIZE 128
	4621
	4622	static thread_t thread_update_array[THREAD_UPDATE_SIZE];
	4623	static uint32_t thread_update_count = 0;
	4624
	4625	/* Returns TRUE if thread was added, FALSE if thread_update_array is full */
	4626	boolean_t
	4627	thread_update_add_thread(thread_t thread)
	4628	{
	4629	if (thread_update_count == THREAD_UPDATE_SIZE)
	4630	return (FALSE);
	4631
	4632	thread_update_array[thread_update_count++] = thread;
	4633	thread_reference_internal(thread);
	4634	return (TRUE);
	4635	}
	4636
	4637	void
	4638	thread_update_process_threads(void)
	4639	{
	4640	assert(thread_update_count <= THREAD_UPDATE_SIZE);
	4641
	4642	for (uint32_t i = 0 ; i < thread_update_count ; i++) {
	4643	thread_t thread = thread_update_array[i];
	4644	assert_thread_magic(thread);
	4645	thread_update_array[i] = THREAD_NULL;
	4646
	4647	spl_t s = splsched();
	4648	thread_lock(thread);
	4649	if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) {
	4650	SCHED(update_priority)(thread);
	4651	}
	4652	thread_unlock(thread);
	4653	splx(s);
	4654
	4655	thread_deallocate(thread);
	4656	}
	4657
	4658	thread_update_count = 0;
	4659	}
	4660
	4661	/*
	4662	* Scan a runq for candidate threads.
	4663	*
	4664	* Returns TRUE if retry is needed.
	4665	*/
	4666	boolean_t
	4667	runq_scan(
	4668	run_queue_t runq,
	4669	sched_update_scan_context_t scan_context)
	4670	{
	4671	int count = runq->count;
	4672	int queue_index;
	4673
	4674	assert(count >= 0);
	4675
	4676	if (count == 0)
	4677	return FALSE;
	4678
	4679	for (queue_index = bitmap_first(runq->bitmap, NRQS);
	4680	queue_index >= 0;
	4681	queue_index = bitmap_next(runq->bitmap, queue_index)) {
	4682
	4683	thread_t thread;
	4684	queue_t queue = &runq->queues[queue_index];
	4685
	4686	qe_foreach_element(thread, queue, runq_links) {
	4687	assert(count > 0);
	4688	assert_thread_magic(thread);
	4689
	4690	if (thread->sched_stamp != sched_tick &&
	4691	thread->sched_mode == TH_MODE_TIMESHARE) {
	4692	if (thread_update_add_thread(thread) == FALSE)
	4693	return TRUE;
	4694	}
	4695
	4696	if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
	4697	if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
	4698	scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
	4699	}
	4700	} else {
	4701	if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
	4702	scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
	4703	}
	4704	}
	4705	count--;
	4706	}
	4707	}
	4708
	4709	return FALSE;
	4710	}
	4711
	4712	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
	4713
	4714	boolean_t
	4715	thread_eager_preemption(thread_t thread)
	4716	{
	4717	return ((thread->sched_flags & TH_SFLAG_EAGERPREEMPT) != 0);
	4718	}
	4719
	4720	void
	4721	thread_set_eager_preempt(thread_t thread)
	4722	{
	4723	spl_t x;
	4724	processor_t p;
	4725	ast_t ast = AST_NONE;
	4726
	4727	x = splsched();
	4728	p = current_processor();
	4729
	4730	thread_lock(thread);
	4731	thread->sched_flags \|= TH_SFLAG_EAGERPREEMPT;
	4732
	4733	if (thread == current_thread()) {
	4734
	4735	ast = csw_check(p, AST_NONE);
	4736	thread_unlock(thread);
	4737	if (ast != AST_NONE) {
	4738	(void) thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
	4739	}
	4740	} else {
	4741	p = thread->last_processor;
	4742
	4743	if (p != PROCESSOR_NULL && p->state == PROCESSOR_RUNNING &&
	4744	p->active_thread == thread) {
	4745	cause_ast_check(p);
	4746	}
	4747
	4748	thread_unlock(thread);
	4749	}
	4750
	4751	splx(x);
	4752	}
	4753
	4754	void
	4755	thread_clear_eager_preempt(thread_t thread)
	4756	{
	4757	spl_t x;
	4758
	4759	x = splsched();
	4760	thread_lock(thread);
	4761
	4762	thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
	4763
	4764	thread_unlock(thread);
	4765	splx(x);
	4766	}
	4767
	4768	/*
	4769	* Scheduling statistics
	4770	*/
	4771	void
	4772	sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
	4773	{
	4774	struct processor_sched_statistics *stats;
	4775	boolean_t to_realtime = FALSE;
	4776
	4777	stats = &processor->processor_data.sched_stats;
	4778	stats->csw_count++;
	4779
	4780	if (otherpri >= BASEPRI_REALTIME) {
	4781	stats->rt_sched_count++;
	4782	to_realtime = TRUE;
	4783	}
	4784
	4785	if ((reasons & AST_PREEMPT) != 0) {
	4786	stats->preempt_count++;
	4787
	4788	if (selfpri >= BASEPRI_REALTIME) {
	4789	stats->preempted_rt_count++;
	4790	}
	4791
	4792	if (to_realtime) {
	4793	stats->preempted_by_rt_count++;
	4794	}
	4795
	4796	}
	4797	}
	4798
	4799	void
	4800	sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
	4801	{
	4802	uint64_t timestamp = mach_absolute_time();
	4803
	4804	stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
	4805	stats->last_change_timestamp = timestamp;
	4806	}
	4807
	4808	/*
	4809	* For calls from assembly code
	4810	*/
	4811	#undef thread_wakeup
	4812	void
	4813	thread_wakeup(
	4814	event_t x);
	4815
	4816	void
	4817	thread_wakeup(
	4818	event_t x)
	4819	{
	4820	thread_wakeup_with_result(x, THREAD_AWAKENED);
	4821	}
	4822
	4823	boolean_t
	4824	preemption_enabled(void)
	4825	{
	4826	return (get_preemption_level() == 0 && ml_get_interrupts_enabled());
	4827	}
	4828
	4829	static void
	4830	sched_timer_deadline_tracking_init(void) {
	4831	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
	4832	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
	4833	}
	4834
	4835
	4836	kern_return_t
	4837	sched_work_interval_notify(thread_t thread, uint64_t work_interval_id, uint64_t start, uint64_t finish, uint64_t deadline, uint64_t next_start, uint32_t flags)
	4838	{
	4839	int urgency;
	4840	uint64_t urgency_param1, urgency_param2;
	4841	spl_t s;
	4842
	4843	if (work_interval_id == 0) {
	4844	return (KERN_INVALID_ARGUMENT);
	4845	}
	4846
	4847	assert(thread == current_thread());
	4848
	4849	thread_mtx_lock(thread);
	4850	if (thread->work_interval_id != work_interval_id) {
	4851	thread_mtx_unlock(thread);
	4852	return (KERN_INVALID_ARGUMENT);
	4853	}
	4854	thread_mtx_unlock(thread);
	4855
	4856	s = splsched();
	4857	thread_lock(thread);
	4858	urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
	4859	thread_unlock(thread);
	4860	splx(s);
	4861
	4862	machine_work_interval_notify(thread, work_interval_id, start, finish, deadline, next_start, urgency, flags);
	4863	return (KERN_SUCCESS);
	4864	}
	4865
	4866	void thread_set_options(uint32_t thopt) {
	4867	spl_t x;
	4868	thread_t t = current_thread();
	4869
	4870	x = splsched();
	4871	thread_lock(t);
	4872
	4873	t->options \|= thopt;
	4874
	4875	thread_unlock(t);
	4876	splx(x);
	4877	}