/*
 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_FREE_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	sched_prim.c
 *	Author:	Avadis Tevanian, Jr.
 *	Date:	1986
 *
 *	Scheduling primitives
 *
 */

#include <debug.h>

#include <mach/mach_types.h>
#include <mach/machine.h>
#include <mach/policy.h>
#include <mach/sync_policy.h>
#include <mach/thread_act.h>

#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>
#include <machine/machlimits.h>

#ifdef CONFIG_MACH_APPROXIMATE_TIME
#include <machine/commpage.h>
#endif

#include <kern/kern_types.h>
#include <kern/backtrace.h>
#include <kern/clock.h>
#include <kern/counters.h>
#include <kern/cpu_number.h>
#include <kern/cpu_data.h>
#include <kern/smp.h>
#include <kern/debug.h>
#include <kern/macro_help.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/sfi.h>
#include <kern/syscall_subr.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/ledger.h>
#include <kern/timer_queue.h>
#include <kern/waitq.h>
#include <kern/policy_internal.h>

#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>

#include <mach/sdt.h>

#include <sys/kdebug.h>
#include <kperf/kperf.h>
#include <kern/kpc.h>

#include <kern/pms.h>

struct rt_queue	rt_runq;

uintptr_t sched_thread_on_rt_queue = (uintptr_t)0xDEAFBEE0;

/* Lock RT runq, must be done with interrupts disabled (under splsched()) */
#if __SMP__
decl_simple_lock_data(static,rt_lock);
#define rt_lock_init()		simple_lock_init(&rt_lock, 0)
#define rt_lock_lock()		simple_lock(&rt_lock)
#define rt_lock_unlock()	simple_unlock(&rt_lock)
#else
#define rt_lock_init()		do { } while(0)
#define rt_lock_lock()		do { } while(0)
#define rt_lock_unlock()	do { } while(0)
#endif

#define	DEFAULT_PREEMPTION_RATE		100		/* (1/s) */
int	default_preemption_rate = DEFAULT_PREEMPTION_RATE;

#define	DEFAULT_BG_PREEMPTION_RATE	400		/* (1/s) */
int	default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;

#define	MAX_UNSAFE_QUANTA		800
int	max_unsafe_quanta = MAX_UNSAFE_QUANTA;

#define	MAX_POLL_QUANTA			2
int	max_poll_quanta = MAX_POLL_QUANTA;

#define	SCHED_POLL_YIELD_SHIFT		4		/* 1/16 */
int	sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;

uint64_t	max_poll_computation;

uint64_t	max_unsafe_computation;
uint64_t	sched_safe_duration;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

uint32_t	std_quantum;
uint32_t	min_std_quantum;
uint32_t	bg_quantum;

uint32_t	std_quantum_us;
uint32_t	bg_quantum_us;

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

uint32_t	thread_depress_time;
uint32_t	default_timeshare_computation;
uint32_t	default_timeshare_constraint;

uint32_t	max_rt_quantum;
uint32_t	min_rt_quantum;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

unsigned	sched_tick;
uint32_t	sched_tick_interval;

uint32_t	sched_pri_shifts[TH_BUCKET_MAX];
uint32_t	sched_fixed_shift;

uint32_t	sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */

/* Allow foreground to decay past default to resolve inversions */
#define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;

/* Defaults for timer deadline profiling */
#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000	/* Timers with deadlines <=
							 * 2ms */
#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000	/* Timers with deadlines
							   <= 5ms */

uint64_t timer_deadline_tracking_bin_1;
uint64_t timer_deadline_tracking_bin_2;

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

thread_t sched_maintenance_thread;


uint64_t	sched_one_second_interval;

/* Forwards */

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

static void load_shift_init(void);
static void preempt_pri_init(void);

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

static thread_t	thread_select(
			thread_t	thread,
			processor_t	processor,
			ast_t		reason);

#if CONFIG_SCHED_IDLE_IN_PLACE
static thread_t	thread_select_idle(
			thread_t	thread,
			processor_t	processor);
#endif

thread_t	processor_idle(
			thread_t	thread,
			processor_t	processor);

ast_t
csw_check_locked(	processor_t	processor,
			processor_set_t	pset,
			ast_t		check_reason);

static void processor_setrun(
			processor_t	processor,
			thread_t	thread,
			integer_t	options);

static void
sched_realtime_init(void);

static void
sched_realtime_timebase_init(void);

static void
sched_timer_deadline_tracking_init(void);

#if	DEBUG
extern int debug_task;
#define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
#else
#define TLOG(a, fmt, args...) do {} while (0)
#endif

static processor_t
thread_bind_internal(
	thread_t	thread,
	processor_t	processor);

static void
sched_vm_group_maintenance(void);

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
int8_t		sched_load_shifts[NRQS];
bitmap_t	sched_preempt_pri[BITMAP_LEN(NRQS)];
#endif /* CONFIG_SCHED_TIMESHARE_CORE */

const struct sched_dispatch_table *sched_current_dispatch = NULL;

/*
 * Statically allocate a buffer to hold the longest possible
 * scheduler description string, as currently implemented.
 * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
 * to export to userspace via sysctl(3). If either version
 * changes, update the other.
 *
 * Note that in addition to being an upper bound on the strings
 * in the kernel, it's also an exact parameter to PE_get_default(),
 * which interrogates the device tree on some platforms. That
 * API requires the caller know the exact size of the device tree
 * property, so we need both a legacy size (32) and the current size
 * (48) to deal with old and new device trees. The device tree property
 * is similarly padded to a fixed size so that the same kernel image
 * can run on multiple devices with different schedulers configured
 * in the device tree.
 */
char sched_string[SCHED_STRING_MAX_LENGTH];

uint32_t sched_debug_flags;

/* Global flag which indicates whether Background Stepper Context is enabled */
static int cpu_throttle_enabled = 1;

void
sched_init(void)
{
	char sched_arg[SCHED_STRING_MAX_LENGTH] = { '\0' };

	/* Check for runtime selection of the scheduler algorithm */
	if (!PE_parse_boot_argn("sched", sched_arg, sizeof (sched_arg))) {
		/* If no boot-args override, look in device tree */
		if (!PE_get_default("kern.sched", sched_arg,
				    SCHED_STRING_MAX_LENGTH)) {
			sched_arg[0] = '\0';
		}
	}


	if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
		/* No boot-args, check in device tree */
		if (!PE_get_default("kern.sched_pri_decay_limit",
				    &sched_pri_decay_band_limit,
				    sizeof(sched_pri_decay_band_limit))) {
			/* Allow decay all the way to normal limits */
			sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
		}
	}

	kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);

	if (strlen(sched_arg) > 0) {
		if (0) {
			/* Allow pattern below */
#if defined(CONFIG_SCHED_TRADITIONAL)
		} else if (0 == strcmp(sched_arg, sched_traditional_dispatch.sched_name)) {
			sched_current_dispatch = &sched_traditional_dispatch;
		} else if (0 == strcmp(sched_arg, sched_traditional_with_pset_runqueue_dispatch.sched_name)) {
			sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
#endif
#if defined(CONFIG_SCHED_PROTO)
		} else if (0 == strcmp(sched_arg, sched_proto_dispatch.sched_name)) {
			sched_current_dispatch = &sched_proto_dispatch;
#endif
#if defined(CONFIG_SCHED_GRRR)
		} else if (0 == strcmp(sched_arg, sched_grrr_dispatch.sched_name)) {
			sched_current_dispatch = &sched_grrr_dispatch;
#endif
#if defined(CONFIG_SCHED_MULTIQ)
		} else if (0 == strcmp(sched_arg, sched_multiq_dispatch.sched_name)) {
			sched_current_dispatch = &sched_multiq_dispatch;
		} else if (0 == strcmp(sched_arg, sched_dualq_dispatch.sched_name)) {
			sched_current_dispatch = &sched_dualq_dispatch;
#endif
		} else {
#if defined(CONFIG_SCHED_TRADITIONAL)
			printf("Unrecognized scheduler algorithm: %s\n", sched_arg);
			printf("Scheduler: Using instead: %s\n", sched_traditional_with_pset_runqueue_dispatch.sched_name);
			sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
#else
			panic("Unrecognized scheduler algorithm: %s", sched_arg);
#endif
		}
		kprintf("Scheduler: Runtime selection of %s\n", SCHED(sched_name));
	} else {
#if   defined(CONFIG_SCHED_MULTIQ)
		sched_current_dispatch = &sched_multiq_dispatch;
#elif defined(CONFIG_SCHED_TRADITIONAL)
		sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
#elif defined(CONFIG_SCHED_PROTO)
		sched_current_dispatch = &sched_proto_dispatch;
#elif defined(CONFIG_SCHED_GRRR)
		sched_current_dispatch = &sched_grrr_dispatch;
#else
#error No default scheduler implementation
#endif
		kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
	}

	strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));

	if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
		kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
	}

	SCHED(init)();
	sched_realtime_init();
	ast_init();
	sched_timer_deadline_tracking_init();

	SCHED(pset_init)(&pset0);
	SCHED(processor_init)(master_processor);
}
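
/*
 * Usage note (illustrative, not part of the dispatch-table machinery above):
 * the scheduler is normally fixed at build time by the CONFIG_SCHED_* options,
 * but a kernel built with several of them can be steered at boot with a
 * boot-arg such as sched=<name>, or with a "kern.sched" device-tree property,
 * where <name> must match the sched_name field of one of the compiled-in
 * dispatch tables; an unrecognized name either panics or falls back to the
 * traditional pset-runqueue scheduler, as coded above.
 */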

void
sched_timebase_init(void)
{
	uint64_t	abstime;

	clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
	sched_one_second_interval = abstime;

	SCHED(timebase_init)();
	sched_realtime_timebase_init();
}

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

void
sched_timeshare_init(void)
{
	/*
	 * Calculate the timeslicing quantum
	 * in us.
	 */
	if (default_preemption_rate < 1)
		default_preemption_rate = DEFAULT_PREEMPTION_RATE;
	std_quantum_us = (1000 * 1000) / default_preemption_rate;

	printf("standard timeslicing quantum is %d us\n", std_quantum_us);

	if (default_bg_preemption_rate < 1)
		default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
	bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;

	printf("standard background quantum is %d us\n", bg_quantum_us);

	load_shift_init();
	preempt_pri_init();
	sched_tick = 0;
}
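
/*
 * Worked example (illustrative): with the defaults above,
 * default_preemption_rate = 100 gives
 *	std_quantum_us = 1000000 / 100 = 10000 us (a 10 ms timeslice),
 * and default_bg_preemption_rate = 400 gives
 *	bg_quantum_us = 1000000 / 400 = 2500 us (a 2.5 ms background timeslice).
 */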

void
sched_timeshare_timebase_init(void)
{
	uint64_t	abstime;
	uint32_t	shift;

	/* standard timeslicing quantum */
	clock_interval_to_absolutetime_interval(
				std_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	std_quantum = (uint32_t)abstime;

	/* smallest remaining quantum (250 us) */
	clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_std_quantum = (uint32_t)abstime;

	/* quantum for background tasks */
	clock_interval_to_absolutetime_interval(
				bg_quantum_us, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	bg_quantum = (uint32_t)abstime;

	/* scheduler tick interval */
	clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
						NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	sched_tick_interval = (uint32_t)abstime;

	/*
	 * Compute conversion factor from usage to
	 * timesharing priorities with 5/8 ** n aging.
	 */
	abstime = (abstime * 5) / 3;
	for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift)
		abstime >>= 1;
	sched_fixed_shift = shift;
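
	/*
	 * Worked example (illustrative, assuming a timebase where one abstime
	 * unit is 1 ns and the usual SCHED_TICK_SHIFT of 3): the tick interval
	 * is 1000000 >> 3 = 125000 us = 125,000,000 abstime units; scaled by
	 * 5/3 that is about 208,333,333, which the loop above shifts right 23
	 * times before it falls to BASEPRI_DEFAULT or below, so on such a
	 * machine sched_fixed_shift would be 23.
	 */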

	for (uint32_t i = 0 ; i < TH_BUCKET_MAX ; i++)
		sched_pri_shifts[i] = INT8_MAX;

	max_unsafe_computation = ((uint64_t)max_unsafe_quanta) * std_quantum;
	sched_safe_duration = 2 * ((uint64_t)max_unsafe_quanta) * std_quantum;

	max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
	thread_depress_time = 1 * std_quantum;
	default_timeshare_computation = std_quantum / 2;
	default_timeshare_constraint = std_quantum;

}

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

static void
sched_realtime_init(void)
{
	rt_lock_init();

	rt_runq.count = 0;
	queue_init(&rt_runq.queue);
}

static void
sched_realtime_timebase_init(void)
{
	uint64_t abstime;

	/* smallest rt computation (50 us) */
	clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	min_rt_quantum = (uint32_t)abstime;

	/* maximum rt computation (50 ms) */
	clock_interval_to_absolutetime_interval(
		50, 1000*NSEC_PER_USEC, &abstime);
	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
	max_rt_quantum = (uint32_t)abstime;

}

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

/*
 * Set up values for timeshare
 * loading factors.
 */
static void
load_shift_init(void)
{
	int8_t		k, *p = sched_load_shifts;
	uint32_t	i, j;

	uint32_t	sched_decay_penalty = 1;

	if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof (sched_decay_penalty))) {
		kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
	}

	if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof (sched_decay_usage_age_factor))) {
		kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
	}

	if (sched_decay_penalty == 0) {
		/*
		 * There is no penalty for timeshare threads for using too much
		 * CPU, so set all load shifts to INT8_MIN. Even under high load,
		 * sched_pri_shift will be >INT8_MAX, and there will be no
		 * penalty applied to threads (nor will sched_usage be updated per
		 * thread).
		 */
		for (i = 0; i < NRQS; i++) {
			sched_load_shifts[i] = INT8_MIN;
		}

		return;
	}

	*p++ = INT8_MIN; *p++ = 0;

	/*
	 * For a given system load "i", the per-thread priority
	 * penalty per quantum of CPU usage is ~2^k priority
	 * levels. "sched_decay_penalty" can cause more
	 * array entries to be filled with smaller "k" values
	 */
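	/*
	 * Illustrative expansion (assuming the default sched_decay_penalty
	 * of 1): the loop below fills entries 2..3 with k = 1, 4..7 with
	 * k = 2, 8..15 with k = 3, and so on, i.e. k is roughly log2(i),
	 * matching the note above that the penalty per quantum of CPU usage
	 * is ~2^k priority levels.
	 */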
	for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
		for (j <<= 1; (i < j) && (i < NRQS); ++i)
			*p++ = k;
	}
}

static void
preempt_pri_init(void)
{
	bitmap_t *p = sched_preempt_pri;

	for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i)
		bitmap_set(p, i);

	for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i)
		bitmap_set(p, i);
}

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

/*
 *	Thread wait timer expiration.
 */
void
thread_timer_expire(
	void			*p0,
	__unused void	*p1)
{
	thread_t		thread = p0;
	spl_t			s;

	assert_thread_magic(thread);

	s = splsched();
	thread_lock(thread);
	if (--thread->wait_timer_active == 0) {
		if (thread->wait_timer_is_set) {
			thread->wait_timer_is_set = FALSE;
			clear_wait_internal(thread, THREAD_TIMED_OUT);
		}
	}
	thread_unlock(thread);
	splx(s);
}

/*
 *	thread_unblock:
 *
 *	Unblock thread on wake up.
 *
 *	Returns TRUE if the thread should now be placed on the runqueue.
 *
 *	Thread must be locked.
 *
 *	Called at splsched().
 */
boolean_t
thread_unblock(
	thread_t		thread,
	wait_result_t	wresult)
{
	boolean_t		ready_for_runq = FALSE;
	thread_t		cthread = current_thread();
	uint32_t		new_run_count;

	/*
	 *	Set wait_result.
	 */
	thread->wait_result = wresult;

	/*
	 *	Cancel pending wait timer.
	 */
	if (thread->wait_timer_is_set) {
		if (timer_call_cancel(&thread->wait_timer))
			thread->wait_timer_active--;
		thread->wait_timer_is_set = FALSE;
	}

	/*
	 *	Update scheduling state: not waiting,
	 *	set running.
	 */
	thread->state &= ~(TH_WAIT|TH_UNINT);

	if (!(thread->state & TH_RUN)) {
		thread->state |= TH_RUN;
		thread->last_made_runnable_time = mach_approximate_time();

		ready_for_runq = TRUE;

		(*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);

		/* Update the runnable thread count */
		new_run_count = sched_run_incr(thread);
	} else {
		/*
		 * Either the thread is idling in place on another processor,
		 * or it hasn't finished context switching yet.
		 */
#if CONFIG_SCHED_IDLE_IN_PLACE
		if (thread->state & TH_IDLE) {
			processor_t	processor = thread->last_processor;

			if (processor != current_processor())
				machine_signal_idle(processor);
		}
#else
		assert((thread->state & TH_IDLE) == 0);
#endif
		/*
		 * The run count is only dropped after the context switch completes
		 * and the thread is still waiting, so we should not run_incr here
		 */
		new_run_count = sched_run_buckets[TH_BUCKET_RUN];
	}


	/*
	 * Calculate deadline for real-time threads.
	 */
	if (thread->sched_mode == TH_MODE_REALTIME) {
		uint64_t ctime;

		ctime = mach_absolute_time();
		thread->realtime.deadline = thread->realtime.constraint + ctime;
	}

	/*
	 * Clear old quantum, fail-safe computation, etc.
	 */
	thread->quantum_remaining = 0;
	thread->computation_metered = 0;
	thread->reason = AST_NONE;

	/* Obtain power-relevant interrupt and "platform-idle exit" statistics.
	 * We also account for "double hop" thread signaling via
	 * the thread callout infrastructure.
	 * DRK: consider removing the callout wakeup counters in the future;
	 * they're present for verification at the moment.
	 */
	boolean_t aticontext, pidle;
	ml_get_power_state(&aticontext, &pidle);

	if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
		ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
		DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);

		uint64_t ttd = PROCESSOR_DATA(current_processor(), timer_call_ttd);

		if (ttd) {
			if (ttd <= timer_deadline_tracking_bin_1)
				thread->thread_timer_wakeups_bin_1++;
			else
				if (ttd <= timer_deadline_tracking_bin_2)
					thread->thread_timer_wakeups_bin_2++;
		}

		if (pidle) {
			ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
		}

	} else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
		if (cthread->callout_woken_from_icontext) {
			ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
			thread->thread_callout_interrupt_wakeups++;
			if (cthread->callout_woken_from_platform_idle) {
				ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
				thread->thread_callout_platform_idle_wakeups++;
			}

			cthread->callout_woke_thread = TRUE;
		}
	}

	if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
		thread->callout_woken_from_icontext = aticontext;
		thread->callout_woken_from_platform_idle = pidle;
		thread->callout_woke_thread = FALSE;
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
		(uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
		sched_run_buckets[TH_BUCKET_RUN], 0);

	DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);

	return (ready_for_runq);
}

/*
 *	Routine:	thread_go
 *	Purpose:
 *		Unblock and dispatch thread.
 *	Conditions:
 *		thread lock held, IPC locks may be held.
 *		thread must have been pulled from wait queue under same lock hold.
 *		thread must have been waiting
 *	Returns:
 *		KERN_SUCCESS - Thread was set running
 *
 * TODO: This should return void
 */
kern_return_t
thread_go(
	thread_t		thread,
	wait_result_t	wresult)
{
	assert_thread_magic(thread);

	assert(thread->at_safe_point == FALSE);
	assert(thread->wait_event == NO_EVENT64);
	assert(thread->waitq == NULL);

	assert(!(thread->state & (TH_TERMINATE|TH_TERMINATE2)));
	assert(thread->state & TH_WAIT);


	if (thread_unblock(thread, wresult)) {
#if	SCHED_TRACE_THREAD_WAKEUPS
		backtrace(&thread->thread_wakeup_bt[0],
			  (sizeof(thread->thread_wakeup_bt)/sizeof(uintptr_t)));
#endif
		thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
	}

	return (KERN_SUCCESS);
}

/*
 *	Routine:	thread_mark_wait_locked
 *	Purpose:
 *		Mark a thread as waiting.  If, given the circumstances,
 *		it doesn't want to wait (i.e. already aborted), then
 *		indicate that in the return value.
 *	Conditions:
 *		at splsched() and thread is locked.
 */
__private_extern__
wait_result_t
thread_mark_wait_locked(
	thread_t			thread,
	wait_interrupt_t	interruptible)
{
	boolean_t		at_safe_point;

	assert(!(thread->state & (TH_WAIT|TH_IDLE|TH_UNINT|TH_TERMINATE2)));

	/*
	 *	The thread may have certain types of interrupts/aborts masked
	 *	off.  Even if the wait location says these types of interrupts
	 *	are OK, we have to honor mask settings (outer-scoped code may
	 *	not be able to handle aborts at the moment).
	 */
	if (interruptible > (thread->options & TH_OPT_INTMASK))
		interruptible = thread->options & TH_OPT_INTMASK;

	at_safe_point = (interruptible == THREAD_ABORTSAFE);

	if (	interruptible == THREAD_UNINT			||
		!(thread->sched_flags & TH_SFLAG_ABORT)	||
		(!at_safe_point &&
		 (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {

		if ( !(thread->state & TH_TERMINATE))
			DTRACE_SCHED(sleep);

		thread->state |= (interruptible) ? TH_WAIT : (TH_WAIT | TH_UNINT);
		thread->at_safe_point = at_safe_point;
		return (thread->wait_result = THREAD_WAITING);
	}
	else
	if (thread->sched_flags & TH_SFLAG_ABORTSAFELY)
		thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;

	return (thread->wait_result = THREAD_INTERRUPTED);
}

/*
 *	Routine:	thread_interrupt_level
 *	Purpose:
 *		Set the maximum interruptible state for the
 *		current thread.  The effective value of any
 *		interruptible flag passed into assert_wait
 *		will never exceed this.
 *
 *		Useful for code that must not be interrupted,
 *		but which calls code that doesn't know that.
 *	Returns:
 *		The old interrupt level for the thread.
 */
__private_extern__
wait_interrupt_t
thread_interrupt_level(
	wait_interrupt_t new_level)
{
	thread_t thread = current_thread();
	wait_interrupt_t result = thread->options & TH_OPT_INTMASK;

	thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);

	return result;
}

/*
 * Check to see if an assert wait is possible, without actually doing one.
 * This is used by debug code in locks and elsewhere to verify that it is
 * always OK to block when trying to take a blocking lock (since waiting
 * for the actual assert_wait to catch the case may make it hard to detect
 * this case).
 */
boolean_t
assert_wait_possible(void)
{

	thread_t thread;

#if	DEBUG
	if(debug_mode) return TRUE;	/* Always succeed in debug mode */
#endif

	thread = current_thread();

	return (thread == NULL || waitq_wait_possible(thread));
}

/*
 *	assert_wait:
 *
 *	Assert that the current thread is about to go to
 *	sleep until the specified event occurs.
 */
wait_result_t
assert_wait(
	event_t				event,
	wait_interrupt_t	interruptible)
{
	if (__improbable(event == NO_EVENT))
		panic("%s() called with NO_EVENT", __func__);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
		VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);

	struct waitq *waitq;
	waitq = global_eventq(event);
	return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
}
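
/*
 * Typical wait pattern (illustrative sketch only, mirroring the way
 * assert_wait()/thread_block() are paired elsewhere in this file, e.g. in
 * thread_stop(); "object->some_event" is a placeholder wait channel):
 *
 *	wait_result_t wres;
 *
 *	wres = assert_wait(&object->some_event, THREAD_UNINT);
 *	if (wres == THREAD_WAITING)
 *		wres = thread_block(THREAD_CONTINUE_NULL);
 *
 * The waker then calls thread_wakeup() (or one of the wakeup variants below)
 * on the same event address.
 */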

/*
 *	assert_wait_queue:
 *
 *	Return the global waitq for the specified event
 */
struct waitq *
assert_wait_queue(
	event_t				event)
{
	return global_eventq(event);
}

wait_result_t
assert_wait_timeout(
	event_t				event,
	wait_interrupt_t	interruptible,
	uint32_t			interval,
	uint32_t			scale_factor)
{
	thread_t			thread = current_thread();
	wait_result_t		wresult;
	uint64_t			deadline;
	spl_t				s;

	if (__improbable(event == NO_EVENT))
		panic("%s() called with NO_EVENT", __func__);

	struct waitq *waitq;
	waitq = global_eventq(event);

	s = splsched();
	waitq_lock(waitq);

	clock_interval_to_deadline(interval, scale_factor, &deadline);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
		VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);

	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
					     interruptible,
					     TIMEOUT_URGENCY_SYS_NORMAL,
					     deadline, TIMEOUT_NO_LEEWAY,
					     thread);

	waitq_unlock(waitq);
	splx(s);
	return wresult;
}

wait_result_t
assert_wait_timeout_with_leeway(
	event_t				event,
	wait_interrupt_t	interruptible,
	wait_timeout_urgency_t	urgency,
	uint32_t			interval,
	uint32_t			leeway,
	uint32_t			scale_factor)
{
	thread_t			thread = current_thread();
	wait_result_t		wresult;
	uint64_t			deadline;
	uint64_t			abstime;
	uint64_t			slop;
	uint64_t			now;
	spl_t				s;

	if (__improbable(event == NO_EVENT))
		panic("%s() called with NO_EVENT", __func__);

	now = mach_absolute_time();
	clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
	deadline = now + abstime;

	clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);

	struct waitq *waitq;
	waitq = global_eventq(event);

	s = splsched();
	waitq_lock(waitq);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
		VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);

	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
					     interruptible,
					     urgency, deadline, slop,
					     thread);

	waitq_unlock(waitq);
	splx(s);
	return wresult;
}

wait_result_t
assert_wait_deadline(
	event_t				event,
	wait_interrupt_t	interruptible,
	uint64_t			deadline)
{
	thread_t			thread = current_thread();
	wait_result_t		wresult;
	spl_t				s;

	if (__improbable(event == NO_EVENT))
		panic("%s() called with NO_EVENT", __func__);

	struct waitq *waitq;
	waitq = global_eventq(event);

	s = splsched();
	waitq_lock(waitq);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
		VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);

	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
					     interruptible,
					     TIMEOUT_URGENCY_SYS_NORMAL, deadline,
					     TIMEOUT_NO_LEEWAY, thread);
	waitq_unlock(waitq);
	splx(s);
	return wresult;
}

wait_result_t
assert_wait_deadline_with_leeway(
	event_t				event,
	wait_interrupt_t	interruptible,
	wait_timeout_urgency_t	urgency,
	uint64_t			deadline,
	uint64_t			leeway)
{
	thread_t			thread = current_thread();
	wait_result_t		wresult;
	spl_t				s;

	if (__improbable(event == NO_EVENT))
		panic("%s() called with NO_EVENT", __func__);

	struct waitq *waitq;
	waitq = global_eventq(event);

	s = splsched();
	waitq_lock(waitq);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
		VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);

	wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
					     interruptible,
					     urgency, deadline, leeway,
					     thread);
	waitq_unlock(waitq);
	splx(s);
	return wresult;
}

/*
 * thread_isoncpu:
 *
 * Return TRUE if a thread is running on a processor such that an AST
 * is needed to pull it out of userspace execution, or if executing in
 * the kernel, bring to a context switch boundary that would cause
 * thread state to be serialized in the thread PCB.
 *
 * Thread locked, returns the same way. While locked, fields
 * like "state" cannot change. "runq" can change only from set to unset.
 */
static inline boolean_t
thread_isoncpu(thread_t thread)
{
	/* Not running or runnable */
	if (!(thread->state & TH_RUN))
		return (FALSE);

	/* Waiting on a runqueue, not currently running */
	/* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
	if (thread->runq != PROCESSOR_NULL)
		return (FALSE);

	/*
	 * Thread does not have a stack yet
	 * It could be on the stack alloc queue or preparing to be invoked
	 */
	if (!thread->kernel_stack)
		return (FALSE);

	/*
	 * Thread must be running on a processor, or
	 * about to run, or just did run. In all these
	 * cases, an AST to the processor is needed
	 * to guarantee that the thread is kicked out
	 * of userspace and the processor has
	 * context switched (and saved register state).
	 */
	return (TRUE);
}

/*
 *	thread_stop:
 *
 *	Force a preemption point for a thread and wait
 *	for it to stop running on a CPU. If a stronger
 *	guarantee is requested, wait until no longer
 *	runnable. Arbitrates access among
 *	multiple stop requests. (released by unstop)
 *
 *	The thread must enter a wait state and stop via a
 *	separate means.
 *
 *	Returns FALSE if interrupted.
 */
boolean_t
thread_stop(
	thread_t	thread,
	boolean_t	until_not_runnable)
{
	wait_result_t	wresult;
	spl_t			s = splsched();
	boolean_t		oncpu;

	wake_lock(thread);
	thread_lock(thread);

	while (thread->state & TH_SUSP) {
		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING)
			wresult = thread_block(THREAD_CONTINUE_NULL);

		if (wresult != THREAD_AWAKENED)
			return (FALSE);

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread->state |= TH_SUSP;

	while ((oncpu = thread_isoncpu(thread)) ||
	       (until_not_runnable && (thread->state & TH_RUN))) {
		processor_t processor;

		if (oncpu) {
			assert(thread->state & TH_RUN);
			processor = thread->chosen_processor;
			cause_ast_check(processor);
		}

		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING)
			wresult = thread_block(THREAD_CONTINUE_NULL);

		if (wresult != THREAD_AWAKENED) {
			thread_unstop(thread);
			return (FALSE);
		}

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);

	/*
	 * We return with the thread unlocked. To prevent it from
	 * transitioning to a runnable state (or from TH_RUN to
	 * being on the CPU), the caller must ensure the thread
	 * is stopped via an external means (such as an AST)
	 */

	return (TRUE);
}


/*
 *	thread_unstop:
 *
 *	Release a previous stop request and set
 *	the thread running if appropriate.
 *
 *	Use only after a successful stop operation.
 */
void
thread_unstop(
	thread_t	thread)
{
	spl_t		s = splsched();

	wake_lock(thread);
	thread_lock(thread);

	assert((thread->state & (TH_RUN|TH_WAIT|TH_SUSP)) != TH_SUSP);

	if (thread->state & TH_SUSP) {
		thread->state &= ~TH_SUSP;

		if (thread->wake_active) {
			thread->wake_active = FALSE;
			thread_unlock(thread);

			thread_wakeup(&thread->wake_active);

			wake_unlock(thread);
			splx(s);

			return;
		}
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);
}
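
/*
 * Illustrative pairing (sketch, not taken from this file): a caller that
 * needs a target thread quiesced brackets its work with
 * thread_stop()/thread_unstop():
 *
 *	if (thread_stop(thread, TRUE)) {
 *		... inspect or manipulate the stopped thread ...
 *		thread_unstop(thread);
 *	}
 *
 * Per the comments above, thread_unstop() is only legal after a successful
 * stop; on a FALSE return thread_stop() has already cleaned up after itself.
 */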

/*
 *	thread_wait:
 *
 *	Wait for a thread to stop running. (non-interruptible)
 *
 */
void
thread_wait(
	thread_t	thread,
	boolean_t	until_not_runnable)
{
	wait_result_t	wresult;
	boolean_t	oncpu;
	processor_t	processor;
	spl_t		s = splsched();

	wake_lock(thread);
	thread_lock(thread);

	/*
	 * Wait until not running on a CPU.  If stronger requirement
	 * desired, wait until not runnable.  Assumption: if thread is
	 * on CPU, then TH_RUN is set, so we're not waiting in any case
	 * where the original, pure "TH_RUN" check would have let us
	 * finish.
	 */
	while ((oncpu = thread_isoncpu(thread)) ||
	       (until_not_runnable && (thread->state & TH_RUN))) {

		if (oncpu) {
			assert(thread->state & TH_RUN);
			processor = thread->chosen_processor;
			cause_ast_check(processor);
		}

		thread->wake_active = TRUE;
		thread_unlock(thread);

		wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
		wake_unlock(thread);
		splx(s);

		if (wresult == THREAD_WAITING)
			thread_block(THREAD_CONTINUE_NULL);

		s = splsched();
		wake_lock(thread);
		thread_lock(thread);
	}

	thread_unlock(thread);
	wake_unlock(thread);
	splx(s);
}

/*
 *	Routine: clear_wait_internal
 *
 *		Clear the wait condition for the specified thread.
 *		Start the thread executing if that is appropriate.
 *	Arguments:
 *		thread		thread to awaken
 *		result		Wakeup result the thread should see
 *	Conditions:
 *		At splsched
 *		the thread is locked.
 *	Returns:
 *		KERN_SUCCESS		thread was rousted out a wait
 *		KERN_FAILURE		thread was waiting but could not be rousted
 *		KERN_NOT_WAITING	thread was not waiting
 */
__private_extern__ kern_return_t
clear_wait_internal(
	thread_t		thread,
	wait_result_t	wresult)
{
	uint32_t	i = LockTimeOutUsec;
	struct waitq *waitq = thread->waitq;

	do {
		if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT))
			return (KERN_FAILURE);

		if (waitq != NULL) {
			if (!waitq_pull_thread_locked(waitq, thread)) {
				thread_unlock(thread);
				delay(1);
				if (i > 0 && !machine_timeout_suspended())
					i--;
				thread_lock(thread);
				if (waitq != thread->waitq)
					return KERN_NOT_WAITING;
				continue;
			}
		}

		/* TODO: Can we instead assert TH_TERMINATE is not set? */
		if ((thread->state & (TH_WAIT|TH_TERMINATE)) == TH_WAIT)
			return (thread_go(thread, wresult));
		else
			return (KERN_NOT_WAITING);
	} while (i > 0);

	panic("clear_wait_internal: deadlock: thread=%p, wq=%p, cpu=%d\n",
	      thread, waitq, cpu_number());

	return (KERN_FAILURE);
}


/*
 *	clear_wait:
 *
 *	Clear the wait condition for the specified thread.  Start the thread
 *	executing if that is appropriate.
 *
 *	parameters:
 *	  thread		thread to awaken
 *	  result		Wakeup result the thread should see
 */
kern_return_t
clear_wait(
	thread_t		thread,
	wait_result_t	result)
{
	kern_return_t ret;
	spl_t		s;

	s = splsched();
	thread_lock(thread);
	ret = clear_wait_internal(thread, result);
	thread_unlock(thread);
	splx(s);
	return ret;
}


/*
 *	thread_wakeup_prim:
 *
 *	Common routine for thread_wakeup, thread_wakeup_with_result,
 *	and thread_wakeup_one.
 *
 */
kern_return_t
thread_wakeup_prim(
	event_t		event,
	boolean_t	one_thread,
	wait_result_t	result)
{
	if (__improbable(event == NO_EVENT))
		panic("%s() called with NO_EVENT", __func__);

	struct waitq *wq = global_eventq(event);

	if (one_thread)
		return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
	else
		return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
}
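
/*
 * Note (assumption, not verified against sched_prim.h here): the
 * thread_wakeup(), thread_wakeup_with_result() and thread_wakeup_one()
 * callers mentioned above are expected to be thin macros that expand to
 * thread_wakeup_prim() with the appropriate one_thread/result arguments.
 */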

/*
 * Wakeup a specified thread if and only if it's waiting for this event
 */
kern_return_t
thread_wakeup_thread(
	event_t		event,
	thread_t	thread)
{
	if (__improbable(event == NO_EVENT))
		panic("%s() called with NO_EVENT", __func__);

	struct waitq *wq = global_eventq(event);

	return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
}

/*
 * Wakeup a thread waiting on an event and promote it to a priority.
 *
 * Requires woken thread to un-promote itself when done.
 */
kern_return_t
thread_wakeup_one_with_pri(
	event_t	event,
	int	priority)
{
	if (__improbable(event == NO_EVENT))
		panic("%s() called with NO_EVENT", __func__);

	struct waitq *wq = global_eventq(event);

	return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
}

/*
 * Wakeup a thread waiting on an event,
 * promote it to a priority,
 * and return a reference to the woken thread.
 *
 * Requires woken thread to un-promote itself when done.
 */
thread_t
thread_wakeup_identify(event_t	event,
		       int	priority)
{
	if (__improbable(event == NO_EVENT))
		panic("%s() called with NO_EVENT", __func__);

	struct waitq *wq = global_eventq(event);

	return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
}

/*
 *	thread_bind:
 *
 *	Force the current thread to execute on the specified processor.
 *	Takes effect after the next thread_block().
 *
 *	Returns the previous binding.  PROCESSOR_NULL means
 *	not bound.
 *
 *	XXX - DO NOT export this to users - XXX
 */
processor_t
thread_bind(
	processor_t		processor)
{
	thread_t		self = current_thread();
	processor_t		prev;
	spl_t			s;

	s = splsched();
	thread_lock(self);

	prev = thread_bind_internal(self, processor);

	thread_unlock(self);
	splx(s);

	return (prev);
}

/*
 * thread_bind_internal:
 *
 * If the specified thread is not the current thread, and it is currently
 * running on another CPU, a remote AST must be sent to that CPU to cause
 * the thread to migrate to its bound processor. Otherwise, the migration
 * will occur at the next quantum expiration or blocking point.
 *
 * When the thread is the current thread, an explicit thread_block() should
 * be used to force the current processor to context switch away and
 * let the thread migrate to the bound processor.
 *
 * Thread must be locked, and at splsched.
 */

static processor_t
thread_bind_internal(
	thread_t		thread,
	processor_t		processor)
{
	processor_t		prev;

	/* <rdar://problem/15102234> */
	assert(thread->sched_pri < BASEPRI_RTQUEUES);
	/* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
	assert(thread->runq == PROCESSOR_NULL);

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND), thread_tid(thread), processor ? (uintptr_t)processor->cpu_id : (uintptr_t)-1, 0, 0, 0);

	prev = thread->bound_processor;
	thread->bound_processor = processor;

	return (prev);
}

/*
 * thread_vm_bind_group_add:
 *
 * The "VM bind group" is a special mechanism to mark a collection
 * of threads from the VM subsystem that, in general, should be scheduled
 * with only one CPU of parallelism. To accomplish this, we initially
 * bind all the threads to the master processor, which has the effect
 * that only one of the threads in the group can execute at once, including
 * preempting threads in the group that are a lower priority. Future
 * mechanisms may use more dynamic mechanisms to prevent the collection
 * of VM threads from using more CPU time than desired.
 *
 * The current implementation can result in priority inversions where
 * compute-bound priority 95 or realtime threads that happen to have
 * landed on the master processor prevent the VM threads from running.
 * When this situation is detected, we unbind the threads for one
 * scheduler tick to allow the scheduler to run the threads on
 * additional CPUs, before restoring the binding (assuming high latency
 * is no longer a problem).
 */

/*
 * The current max is provisioned for:
 * vm_compressor_swap_trigger_thread (92)
 * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
 * vm_pageout_continue (92)
 * memorystatus_thread (95)
 */
#define MAX_VM_BIND_GROUP_COUNT (5)
decl_simple_lock_data(static,sched_vm_group_list_lock);
static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
static int sched_vm_group_thread_count;
static boolean_t sched_vm_group_temporarily_unbound = FALSE;

void
thread_vm_bind_group_add(void)
{
	thread_t self = current_thread();

	thread_reference_internal(self);
	self->options |= TH_OPT_SCHED_VM_GROUP;

	simple_lock(&sched_vm_group_list_lock);
	assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
	sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
	simple_unlock(&sched_vm_group_list_lock);

	thread_bind(master_processor);

	/* Switch to bound processor if not already there */
	thread_block(THREAD_CONTINUE_NULL);
}

static void
sched_vm_group_maintenance(void)
{
	uint64_t ctime = mach_absolute_time();
	uint64_t longtime = ctime - sched_tick_interval;
	int i;
	spl_t s;
	boolean_t high_latency_observed = FALSE;
	boolean_t runnable_and_not_on_runq_observed = FALSE;
	boolean_t bind_target_changed = FALSE;
	processor_t bind_target = PROCESSOR_NULL;

	/* Make sure nobody attempts to add new threads while we are enumerating them */
	simple_lock(&sched_vm_group_list_lock);

	s = splsched();

	for (i=0; i < sched_vm_group_thread_count; i++) {
		thread_t thread = sched_vm_group_thread_list[i];
		assert(thread != THREAD_NULL);
		thread_lock(thread);
		if ((thread->state & (TH_RUN|TH_WAIT)) == TH_RUN) {
			if (thread->runq != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
				high_latency_observed = TRUE;
			} else if (thread->runq == PROCESSOR_NULL) {
				/* There are some cases where a thread that is transitioning also falls into this case */
				runnable_and_not_on_runq_observed = TRUE;
			}
		}
		thread_unlock(thread);

		if (high_latency_observed && runnable_and_not_on_runq_observed) {
			/* All the things we are looking for are true, stop looking */
			break;
		}
	}

	splx(s);

	if (sched_vm_group_temporarily_unbound) {
		/* If we turned off binding, make sure everything is OK before rebinding */
		if (!high_latency_observed) {
			/* rebind */
			bind_target_changed = TRUE;
			bind_target = master_processor;
			sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
		}
	} else {
		/*
		 * Check if we're in a bad state, which is defined by high
		 * latency with no core currently executing a thread. If a
		 * single thread is making progress on a CPU, that means the
		 * binding concept to reduce parallelism is working as
		 * designed.
		 */
		if (high_latency_observed && !runnable_and_not_on_runq_observed) {
			/* unbind */
			bind_target_changed = TRUE;
			bind_target = PROCESSOR_NULL;
			sched_vm_group_temporarily_unbound = TRUE;
		}
	}

	if (bind_target_changed) {
		s = splsched();
		for (i=0; i < sched_vm_group_thread_count; i++) {
			thread_t thread = sched_vm_group_thread_list[i];
			boolean_t removed;
			assert(thread != THREAD_NULL);

			thread_lock(thread);
			removed = thread_run_queue_remove(thread);
			if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
				thread_bind_internal(thread, bind_target);
			} else {
				/*
				 * Thread was in the middle of being context-switched-to,
				 * or was in the process of blocking. To avoid switching the bind
				 * state out mid-flight, defer the change if possible.
				 */
				if (bind_target == PROCESSOR_NULL) {
					thread_bind_internal(thread, bind_target);
				} else {
					sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
				}
			}

			if (removed) {
				thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
			}
			thread_unlock(thread);
		}
		splx(s);
	}

	simple_unlock(&sched_vm_group_list_lock);
}

/* Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
 * rebalancing opportunity exists when a core is (instantaneously) idle, but
 * other SMT-capable cores may be over-committed. TODO: some possible negatives:
 * IPI thrash if this core does not remain idle following the load balancing ASTs
 * Idle "thrash", when IPI issue is followed by idle entry/core power down
 * followed by a wakeup shortly thereafter.
 */

#if (DEVELOPMENT || DEBUG)
int sched_smt_balance = 1;
#endif

#if __SMP__
/* Invoked with pset locked, returns with pset unlocked */
static void
sched_SMT_balance(processor_t cprocessor, processor_set_t cpset) {
	processor_t ast_processor = NULL;

#if (DEVELOPMENT || DEBUG)
	if (__improbable(sched_smt_balance == 0))
		goto smt_balance_exit;
#endif

	assert(cprocessor == current_processor());
	if (cprocessor->is_SMT == FALSE)
		goto smt_balance_exit;

	processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;

	/* Determine if both this processor and its sibling are idle,
	 * indicating an SMT rebalancing opportunity.
	 */
	if (sib_processor->state != PROCESSOR_IDLE)
		goto smt_balance_exit;

	processor_t sprocessor;

	qe_foreach_element(sprocessor, &cpset->active_queue, processor_queue) {
		if ((sprocessor->state == PROCESSOR_RUNNING) &&
		    (sprocessor->processor_primary != sprocessor) &&
		    (sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
		    (sprocessor->current_pri < BASEPRI_RTQUEUES) &&
		    ((cpset->pending_AST_cpu_mask & (1ULL << sprocessor->cpu_id)) == 0)) {
			assert(sprocessor != cprocessor);
			ast_processor = sprocessor;
			break;
		}
	}

smt_balance_exit:
	pset_unlock(cpset);

	if (ast_processor) {
		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
		cause_ast_check(ast_processor);
	}
}
#endif /* __SMP__ */

1c79356b 1718/*
2d21ac55
A
1719 * thread_select:
1720 *
1721 * Select a new thread for the current processor to execute.
55e303ae
A
1722 *
1723 * May select the current thread, which must be locked.
1c79356b 1724 */
2d21ac55 1725static thread_t
1c79356b 1726thread_select(
2d21ac55 1727 thread_t thread,
fe8ab488
A
1728 processor_t processor,
1729 ast_t reason)
1c79356b 1730{
2d21ac55 1731 processor_set_t pset = processor->processor_set;
cf7d32b8 1732 thread_t new_thread = THREAD_NULL;
1c79356b 1733
6d2010ae 1734 assert(processor == current_processor());
3e170ce0 1735 assert((thread->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN);
6d2010ae 1736
2d21ac55
A
1737 do {
1738 /*
1739 * Update the priority.
1740 */
6d2010ae
A
1741 if (SCHED(can_update_priority)(thread))
1742 SCHED(update_priority)(thread);
1743
2d21ac55 1744 processor->current_pri = thread->sched_pri;
6d2010ae 1745 processor->current_thmode = thread->sched_mode;
fe8ab488 1746 processor->current_sfi_class = thread->sfi_class;
1c79356b 1747
2d21ac55
A
1748 pset_lock(pset);
1749
fe8ab488 1750 assert(processor->state != PROCESSOR_OFF_LINE);
6d2010ae 1751
3e170ce0
A
1752 if (!processor->is_recommended) {
1753 /*
1754 * The performance controller has provided a hint to not dispatch more threads,
1755 * unless they are bound to us (and thus we are the only option
1756 */
1757 if (!SCHED(processor_bound_count)(processor)) {
1758 goto idle;
1759 }
1760 } else if (processor->processor_primary != processor) {
39236c6e
A
1761 /*
1762 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
1763 * we should look for work only under the same conditions that choose_processor()
1764 * would have assigned work, which is when all primary processors have been assigned work.
1765 *
1766 * An exception is that bound threads are dispatched to a processor without going through
1767 * choose_processor(), so in those cases we should continue trying to dequeue work.
1768 */
fe8ab488 1769 if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue) && !rt_runq.count) {
39236c6e
A
1770 goto idle;
1771 }
1772 }
1773
3e170ce0 1774 rt_lock_lock();
2d21ac55 1775
2d21ac55
A
1776 /*
1777 * Test to see if the current thread should continue
3e170ce0 1778 * to run on this processor. Must not be attempting to wait, and not
2d21ac55 1779 * bound to a different processor, nor be in the wrong
3e170ce0
A
1780 * processor set, nor be forced to context switch by TH_SUSP.
1781 *
1782 * Note that there are never any RT threads in the regular runqueue.
1783 *
1784 * This code is extremely tricky; change it with care.
2d21ac55 1785 */
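	/* The mask test below reads: TH_RUN is set and none of TH_TERMINATE,
	 * TH_IDLE, TH_WAIT or TH_SUSP are set, i.e. the thread is still plainly
	 * runnable.  The remaining clauses keep it on this processor only if it
	 * is allowed to run here (realtime, or this is a primary), is not bound
	 * to a different processor, and its affinity set (if any) maps to this
	 * pset.
	 */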
3e170ce0
A
1786
1787 if (((thread->state & (TH_TERMINATE|TH_IDLE|TH_WAIT|TH_RUN|TH_SUSP)) == TH_RUN) &&
fe8ab488
A
1788 (thread->sched_pri >= BASEPRI_RTQUEUES || processor->processor_primary == processor) &&
1789 (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor) &&
1790 (thread->affinity_set == AFFINITY_SET_NULL || thread->affinity_set->aset_pset == pset)) {
3e170ce0
A
1791 /*
1792 * RT threads with un-expired quantum stay on processor,
1793 * unless there's a valid RT thread with an earlier deadline.
1794 */
1795 if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
6d2010ae 1796 if (rt_runq.count > 0) {
39037602 1797 thread_t next_rt = qe_queue_first(&rt_runq.queue, struct thread, runq_links);
3e170ce0
A
1798
1799 assert(next_rt->runq == THREAD_ON_RT_RUNQ);
1800
fe8ab488 1801 if (next_rt->realtime.deadline < processor->deadline &&
3e170ce0
A
1802 (next_rt->bound_processor == PROCESSOR_NULL ||
1803 next_rt->bound_processor == processor)) {
1804 /* The next RT thread is better, so pick it off the runqueue. */
1805 goto pick_new_rt_thread;
55e303ae
A
1806 }
1807 }
2d21ac55 1808
3e170ce0 1809 /* This is still the best RT thread to run. */
2d21ac55
A
1810 processor->deadline = thread->realtime.deadline;
1811
3e170ce0 1812 rt_lock_unlock();
2d21ac55
A
1813 pset_unlock(pset);
1814
1815 return (thread);
55e303ae
A
1816 }
1817
3e170ce0
A
1818 if ((rt_runq.count == 0) &&
1819 SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
fe8ab488 1820 /* This thread is still the highest priority runnable (non-idle) thread */
2d21ac55 1821 processor->deadline = UINT64_MAX;
55e303ae 1822
3e170ce0 1823 rt_lock_unlock();
2d21ac55 1824 pset_unlock(pset);
55e303ae 1825
2d21ac55
A
1826 return (thread);
1827 }
1828 }
1829
3e170ce0
A
1830 /* OK, so we're not going to run the current thread. Look at the RT queue. */
1831 if (rt_runq.count > 0) {
39037602 1832 thread_t next_rt = qe_queue_first(&rt_runq.queue, struct thread, runq_links);
c910b4d9 1833
3e170ce0 1834 assert(next_rt->runq == THREAD_ON_RT_RUNQ);
fe8ab488 1835
3e170ce0
A
1836 if (__probable((next_rt->bound_processor == PROCESSOR_NULL ||
1837 (next_rt->bound_processor == processor)))) {
1838pick_new_rt_thread:
39037602 1839 new_thread = qe_dequeue_head(&rt_runq.queue, struct thread, runq_links);
6d2010ae 1840
3e170ce0 1841 new_thread->runq = PROCESSOR_NULL;
39236c6e
A
1842 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
1843 rt_runq.count--;
6d2010ae 1844
3e170ce0 1845 processor->deadline = new_thread->realtime.deadline;
c910b4d9 1846
3e170ce0 1847 rt_lock_unlock();
39236c6e 1848 pset_unlock(pset);
c910b4d9 1849
3e170ce0 1850 return (new_thread);
39236c6e 1851 }
c910b4d9 1852 }
2d21ac55 1853
3e170ce0
A
1854 processor->deadline = UINT64_MAX;
1855 rt_lock_unlock();
6d2010ae 1856
3e170ce0
A
1857 /* No RT threads, so let's look at the regular threads. */
1858 if ((new_thread = SCHED(choose_thread)(processor, MINPRI, reason)) != THREAD_NULL) {
6d2010ae 1859 pset_unlock(pset);
6d2010ae
A
1860 return (new_thread);
1861 }
c910b4d9 1862
3e170ce0
A
1863#if __SMP__
1864 if (SCHED(steal_thread_enabled)) {
1865 /*
1866 * No runnable threads, attempt to steal
1867 * from other processors. Returns with pset lock dropped.
1868 */
2d21ac55 1869
3e170ce0
A
1870 if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
1871 return (new_thread);
1872 }
cf7d32b8 1873
3e170ce0
A
1874 /*
1875 * If other threads have appeared, shortcut
1876 * around again.
1877 */
1878 if (!SCHED(processor_queue_empty)(processor) || rt_runq.count > 0)
1879 continue;
1880
1881 pset_lock(pset);
1882 }
1883#endif
55e303ae 1884
39236c6e 1885 idle:
1c79356b
A
1886 /*
1887 * Nothing is runnable, so set this processor idle if it
2d21ac55 1888 * was running.
1c79356b 1889 */
55e303ae 1890 if (processor->state == PROCESSOR_RUNNING) {
55e303ae 1891 processor->state = PROCESSOR_IDLE;
1c79356b 1892
fe8ab488 1893 if (processor->processor_primary == processor) {
39037602
A
1894 re_queue_head(&pset->idle_queue, &processor->processor_queue);
1895 } else {
1896 re_queue_head(&pset->idle_secondary_queue, &processor->processor_queue);
b0d623f7 1897 }
1c79356b 1898 }
1c79356b 1899
3e170ce0 1900#if __SMP__
fe8ab488
A
1901 /* Invoked with pset locked, returns with pset unlocked */
1902 sched_SMT_balance(processor, pset);
3e170ce0
A
1903#else
1904 pset_unlock(pset);
1905#endif
2d21ac55 1906
6d2010ae 1907#if CONFIG_SCHED_IDLE_IN_PLACE
2d21ac55
A
1908 /*
1909 * Choose idle thread if fast idle is not possible.
1910 */
fe8ab488
A
1911 if (processor->processor_primary != processor)
1912 return (processor->idle_thread);
1913
6d2010ae 1914 if ((thread->state & (TH_IDLE|TH_TERMINATE|TH_SUSP)) || !(thread->state & TH_WAIT) || thread->wake_active || thread->sched_pri >= BASEPRI_RTQUEUES)
2d21ac55
A
1915 return (processor->idle_thread);
1916
1917 /*
1918 * Perform idling activities directly without a
1919 * context switch. Return dispatched thread,
1920 * else check again for a runnable thread.
1921 */
1922 new_thread = thread_select_idle(thread, processor);
1923
6d2010ae
A
1924#else /* !CONFIG_SCHED_IDLE_IN_PLACE */
1925
1926 /*
1927 * Do a full context switch to idle so that the current
1928 * thread can start running on another processor without
1929 * waiting for the fast-idled processor to wake up.
1930 */
3e170ce0 1931 new_thread = processor->idle_thread;
6d2010ae
A
1932
1933#endif /* !CONFIG_SCHED_IDLE_IN_PLACE */
1934
2d21ac55
A
1935 } while (new_thread == THREAD_NULL);
1936
1937 return (new_thread);
1938}
1939
6d2010ae 1940#if CONFIG_SCHED_IDLE_IN_PLACE
2d21ac55
A
1941/*
1942 * thread_select_idle:
1943 *
1944 * Idle the processor using the current thread context.
1945 *
1946 * Called with thread locked, then dropped and relocked.
1947 */
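/* Summary of the fast-idle path below: the current thread is marked TH_IDLE,
 * timing is charged to the idle thread's timers, the quantum timer is
 * cancelled, and processor_idle() is entered directly in this thread's
 * context.  On wakeup the thread's timers, quantum, and run state are
 * restored, avoiding a full context switch for a brief idle period.
 */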
1948static thread_t
1949thread_select_idle(
1950 thread_t thread,
1951 processor_t processor)
1952{
1953 thread_t new_thread;
39236c6e
A
1954 uint64_t arg1, arg2;
1955 int urgency;
1956
fe8ab488 1957 sched_run_decr(thread);
2d21ac55
A
1958
1959 thread->state |= TH_IDLE;
1960 processor->current_pri = IDLEPRI;
6d2010ae 1961 processor->current_thmode = TH_MODE_NONE;
fe8ab488 1962 processor->current_sfi_class = SFI_CLASS_KERNEL;
2d21ac55 1963
316670eb
A
1964 /* Reload precise timing global policy to thread-local policy */
1965 thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
1966
2d21ac55
A
1967 thread_unlock(thread);
1968
1969 /*
1970 * Switch execution timing to processor idle thread.
1971 */
1972 processor->last_dispatch = mach_absolute_time();
fe8ab488
A
1973
1974#ifdef CONFIG_MACH_APPROXIMATE_TIME
1975 commpage_update_mach_approximate_time(processor->last_dispatch);
1976#endif
1977
6d2010ae 1978 thread->last_run_time = processor->last_dispatch;
2d21ac55
A
1979 thread_timer_event(processor->last_dispatch, &processor->idle_thread->system_timer);
1980 PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer;
1981
1982 /*
1983 * Cancel the quantum timer while idling.
1984 */
1985 timer_call_cancel(&processor->quantum_timer);
3e170ce0 1986 processor->first_timeslice = FALSE;
2d21ac55
A
1987
1988 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
1989
3e170ce0 1990 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, NULL);
6d2010ae 1991
2d21ac55
A
1992 /*
1993 * Enable interrupts and perform idling activities. No
1994 * preemption due to TH_IDLE being set.
1995 */
1996 spllo(); new_thread = processor_idle(thread, processor);
1997
cf7d32b8
A
1998 /*
1999 * Return at splsched.
2000 */
2d21ac55
A
2001 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
2002
2003 thread_lock(thread);
2004
2005 /*
2006 * If awakened, switch to thread timer and start a new quantum.
2007 * Otherwise skip; we will context switch to another thread or return here.
2008 */
2009 if (!(thread->state & TH_WAIT)) {
2010 processor->last_dispatch = mach_absolute_time();
2011 thread_timer_event(processor->last_dispatch, &thread->system_timer);
2012 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2013
2014 thread_quantum_init(thread);
fe8ab488
A
2015 processor->quantum_end = processor->last_dispatch + thread->quantum_remaining;
2016 timer_call_enter1(&processor->quantum_timer, thread, processor->quantum_end, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
3e170ce0 2017 processor->first_timeslice = TRUE;
2d21ac55
A
2018
2019 thread->computation_epoch = processor->last_dispatch;
1c79356b
A
2020 }
2021
2d21ac55 2022 thread->state &= ~TH_IDLE;
55e303ae 2023
39236c6e
A
2024 urgency = thread_get_urgency(thread, &arg1, &arg2);
2025
3e170ce0 2026 thread_tell_urgency(urgency, arg1, arg2, 0, new_thread);
39236c6e 2027
fe8ab488 2028 sched_run_incr(thread);
39236c6e 2029
2d21ac55 2030 return (new_thread);
1c79356b 2031}
6d2010ae
A
2032#endif /* CONFIG_SCHED_IDLE_IN_PLACE */
2033
b0d623f7 2034/*
3e170ce0 2035 * thread_invoke
b0d623f7 2036 *
3e170ce0 2037 * Called at splsched with neither thread locked.
b0d623f7 2038 *
3e170ce0 2039 * Perform a context switch and start executing the new thread.
55e303ae 2040 *
3e170ce0
A
2041 * Returns FALSE when the context switch didn't happen.
2042 * The reference to the new thread is still consumed.
39236c6e
A
2043 *
2044 * "self" is what is currently running on the processor,
2045 * "thread" is the new thread to context switch to
2046 * (which may be the same thread in some cases)
2047 */
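/* thread_invoke proceeds down one of three paths:
 *   1. Stack handoff: "self" has a continuation and "thread" has no kernel
 *      stack, so "thread" inherits self's stack via stack_handoff() and we
 *      jump straight to the continuation without a full register save.
 *   2. Same-thread shortcut: "thread" == "self", so no switch is needed.
 *   3. Full context switch via machine_switch_context().
 * If "thread" needs a kernel stack and one cannot be allocated without
 * blocking, the thread is queued for the stack-allocation thread and FALSE
 * is returned.
 */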
2d21ac55 2048static boolean_t
1c79356b 2049thread_invoke(
39236c6e
A
2050 thread_t self,
2051 thread_t thread,
91447636 2052 ast_t reason)
1c79356b 2053{
39236c6e 2054 if (__improbable(get_preemption_level() != 0)) {
b0d623f7
A
2055 int pl = get_preemption_level();
2056 panic("thread_invoke: preemption_level %d, possible cause: %s",
2057 pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
2058 "blocking while holding a spinlock, or within interrupt context"));
2059 }
0b4e3aa0 2060
3e170ce0
A
2061 thread_continue_t continuation = self->continuation;
2062 void *parameter = self->parameter;
2063 processor_t processor;
2064
2065 uint64_t ctime = mach_absolute_time();
2066
2067#ifdef CONFIG_MACH_APPROXIMATE_TIME
2068 commpage_update_mach_approximate_time(ctime);
2069#endif
2070
2071#if defined(CONFIG_SCHED_TIMESHARE_CORE)
2072 sched_timeshare_consider_maintenance(ctime);
2073#endif
2074
39037602 2075 assert_thread_magic(self);
2d21ac55 2076 assert(self == current_thread());
fe8ab488 2077 assert(self->runq == PROCESSOR_NULL);
3e170ce0 2078 assert((self->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN);
91447636 2079
2d21ac55 2080 thread_lock(thread);
1c79356b 2081
39037602 2082 assert_thread_magic(thread);
3e170ce0 2083 assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN);
fe8ab488
A
2084 assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor());
2085 assert(thread->runq == PROCESSOR_NULL);
1c79356b 2086
316670eb
A
2087 /* Reload precise timing global policy to thread-local policy */
2088 thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
3e170ce0 2089
fe8ab488
A
2090 /* Update SFI class based on other factors */
2091 thread->sfi_class = sfi_thread_classify(thread);
2092
3e170ce0 2093 /* Allow realtime threads to hang onto a stack. */
6d2010ae 2094 if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack)
2d21ac55 2095 self->reserved_stack = self->kernel_stack;
1c79356b 2096
91447636 2097 if (continuation != NULL) {
2d21ac55 2098 if (!thread->kernel_stack) {
9bccf70c 2099 /*
2d21ac55 2100 * If we are using a privileged stack,
9bccf70c 2101 * check to see whether we can exchange it with
2d21ac55 2102 * that of the other thread.
9bccf70c 2103 */
2d21ac55 2104 if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack)
9bccf70c 2105 goto need_stack;
1c79356b 2106
91447636
A
2107 /*
2108 * Context switch by performing a stack handoff.
2109 */
2d21ac55
A
2110 continuation = thread->continuation;
2111 parameter = thread->parameter;
1c79356b 2112
9bccf70c 2113 processor = current_processor();
2d21ac55
A
2114 processor->active_thread = thread;
2115 processor->current_pri = thread->sched_pri;
6d2010ae 2116 processor->current_thmode = thread->sched_mode;
fe8ab488 2117 processor->current_sfi_class = thread->sfi_class;
2d21ac55
A
2118 if (thread->last_processor != processor && thread->last_processor != NULL) {
2119 if (thread->last_processor->processor_set != processor->processor_set)
2120 thread->ps_switch++;
2121 thread->p_switch++;
2122 }
2123 thread->last_processor = processor;
2124 thread->c_switch++;
2125 ast_context(thread);
3e170ce0 2126
2d21ac55 2127 thread_unlock(thread);
1c79356b 2128
2d21ac55 2129 self->reason = reason;
91447636 2130
39236c6e
A
2131 processor->last_dispatch = ctime;
2132 self->last_run_time = ctime;
2133 thread_timer_event(ctime, &thread->system_timer);
2d21ac55 2134 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
316670eb
A
2135
2136 /*
2137 * Since non-precise user/kernel time doesn't update the state timer
2138 * during privilege transitions, synthesize an event now.
2139 */
2140 if (!thread->precise_user_kernel_time) {
2141 timer_switch(PROCESSOR_DATA(processor, current_state),
39236c6e 2142 ctime,
316670eb
A
2143 PROCESSOR_DATA(processor, current_state));
2144 }
2d21ac55 2145
316670eb
A
2146 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2147 MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)|DBG_FUNC_NONE,
2148 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
1c79356b 2149
39236c6e 2150 if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
3e170ce0 2151 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
6d2010ae
A
2152 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2153 }
2154
b0d623f7
A
2155 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2156
6d2010ae
A
2157 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2158
2159 TLOG(1, "thread_invoke: calling stack_handoff\n");
2160 stack_handoff(self, thread);
9bccf70c 2161
3e170ce0
A
2162 /* 'self' is now off core */
2163 assert(thread == current_thread());
2164
b0d623f7
A
2165 DTRACE_SCHED(on__cpu);
2166
39037602
A
2167#if KPERF
2168 kperf_on_cpu(thread, continuation, NULL);
2169#endif /* KPERF */
2170
2d21ac55 2171 thread_dispatch(self, thread);
1c79356b 2172
2d21ac55 2173 thread->continuation = thread->parameter = NULL;
1c79356b 2174
2d21ac55 2175 counter(c_thread_invoke_hits++);
1c79356b 2176
9bccf70c 2177 (void) spllo();
1c79356b 2178
2d21ac55
A
2179 assert(continuation);
2180 call_continuation(continuation, parameter, thread->wait_result);
9bccf70c 2181 /*NOTREACHED*/
9bccf70c 2182 }
2d21ac55 2183 else if (thread == self) {
9bccf70c 2184 /* same thread but with continuation */
2d21ac55 2185 ast_context(self);
9bccf70c 2186 counter(++c_thread_invoke_same);
3e170ce0 2187
2d21ac55 2188 thread_unlock(self);
9bccf70c 2189
39037602
A
2190#if KPERF
2191 kperf_on_cpu(thread, continuation, NULL);
2192#endif /* KPERF */
2193
316670eb
A
2194 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2195 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2196 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
6d2010ae 2197
2d21ac55
A
2198 self->continuation = self->parameter = NULL;
2199
9bccf70c 2200 (void) spllo();
55e303ae 2201
2d21ac55 2202 call_continuation(continuation, parameter, self->wait_result);
9bccf70c
A
2203 /*NOTREACHED*/
2204 }
3e170ce0 2205 } else {
9bccf70c 2206 /*
2d21ac55 2207 * Check that the other thread has a stack
9bccf70c 2208 */
2d21ac55 2209 if (!thread->kernel_stack) {
9bccf70c 2210need_stack:
2d21ac55
A
2211 if (!stack_alloc_try(thread)) {
2212 counter(c_thread_invoke_misses++);
2213 thread_unlock(thread);
2214 thread_stack_enqueue(thread);
9bccf70c
A
2215 return (FALSE);
2216 }
3e170ce0 2217 } else if (thread == self) {
2d21ac55 2218 ast_context(self);
9bccf70c 2219 counter(++c_thread_invoke_same);
2d21ac55 2220 thread_unlock(self);
6d2010ae 2221
316670eb
A
2222 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2223 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2224 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
6d2010ae 2225
9bccf70c
A
2226 return (TRUE);
2227 }
2228 }
1c79356b
A
2229
2230 /*
91447636 2231 * Context switch by full context save.
1c79356b 2232 */
9bccf70c 2233 processor = current_processor();
2d21ac55
A
2234 processor->active_thread = thread;
2235 processor->current_pri = thread->sched_pri;
6d2010ae 2236 processor->current_thmode = thread->sched_mode;
fe8ab488 2237 processor->current_sfi_class = thread->sfi_class;
2d21ac55
A
2238 if (thread->last_processor != processor && thread->last_processor != NULL) {
2239 if (thread->last_processor->processor_set != processor->processor_set)
2240 thread->ps_switch++;
2241 thread->p_switch++;
2242 }
2243 thread->last_processor = processor;
2244 thread->c_switch++;
2245 ast_context(thread);
3e170ce0 2246
2d21ac55 2247 thread_unlock(thread);
1c79356b 2248
2d21ac55 2249 counter(c_thread_invoke_csw++);
1c79356b 2250
2d21ac55 2251 self->reason = reason;
1c79356b 2252
39236c6e
A
2253 processor->last_dispatch = ctime;
2254 self->last_run_time = ctime;
2255 thread_timer_event(ctime, &thread->system_timer);
2d21ac55 2256 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
91447636 2257
316670eb
A
2258 /*
2259 * Since non-precise user/kernel time doesn't update the state timer
2260 * during privilege transitions, synthesize an event now.
2261 */
2262 if (!thread->precise_user_kernel_time) {
2263 timer_switch(PROCESSOR_DATA(processor, current_state),
39236c6e 2264 ctime,
316670eb
A
2265 PROCESSOR_DATA(processor, current_state));
2266 }
3e170ce0 2267
316670eb
A
2268 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2269 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2270 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
b0d623f7 2271
6d2010ae 2272 if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
3e170ce0 2273 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
6d2010ae
A
2274 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2275 }
2276
b0d623f7 2277 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
1c79356b 2278
6d2010ae
A
2279 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2280
1c79356b 2281 /*
91447636 2282 * This is where we actually switch register context,
2d21ac55
A
2283 * and address space if required. We will next run
2284 * as a result of a subsequent context switch.
3e170ce0
A
2285 *
2286 * Once registers are switched and the processor is running "thread",
2287 * the stack variables and non-volatile registers will contain whatever
2288 * was there the last time that thread blocked. No local variables should
2289 * be used after this point, except for the special case of "thread", which
2290 * the platform layer returns as the previous thread running on the processor
2291 * via the function call ABI as a return register, and "self", which may have
2292 * been stored on the stack or in a non-volatile register; that saved value
2293 * is stale, but it becomes accurate again because this thread is once more
2294 * running on the CPU.
91447636 2295 */
316670eb 2296 assert(continuation == self->continuation);
2d21ac55 2297 thread = machine_switch_context(self, continuation, thread);
316670eb 2298 assert(self == current_thread());
b0d623f7
A
2299 TLOG(1,"thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
2300
2301 DTRACE_SCHED(on__cpu);
1c79356b 2302
39037602
A
2303#if KPERF
2304 kperf_on_cpu(self, NULL, __builtin_frame_address(0));
2305#endif /* KPERF */
2306
1c79356b 2307 /*
2d21ac55 2308 * We have been resumed and are set to run.
1c79356b 2309 */
2d21ac55 2310 thread_dispatch(thread, self);
9bccf70c 2311
91447636 2312 if (continuation) {
2d21ac55
A
2313 self->continuation = self->parameter = NULL;
2314
9bccf70c 2315 (void) spllo();
55e303ae 2316
2d21ac55 2317 call_continuation(continuation, parameter, self->wait_result);
9bccf70c 2318 /*NOTREACHED*/
1c79356b
A
2319 }
2320
9bccf70c 2321 return (TRUE);
1c79356b
A
2322}
2323
3e170ce0
A
2324#if defined(CONFIG_SCHED_DEFERRED_AST)
2325/*
2326 * pset_cancel_deferred_dispatch:
2327 *
2328 * Cancels all ASTs that we can cancel for the given processor set
2329 * if the current processor is running the last runnable thread in the
2330 * system.
2331 *
2332 * This function assumes the current thread is runnable. This must
2333 * be called with the pset unlocked.
2334 */
2335static void
2336pset_cancel_deferred_dispatch(
2337 processor_set_t pset,
2338 processor_t processor)
2339{
2340 processor_t active_processor = NULL;
2341 uint32_t sampled_sched_run_count;
2342
2343 pset_lock(pset);
39037602 2344 sampled_sched_run_count = (volatile uint32_t) sched_run_buckets[TH_BUCKET_RUN];
3e170ce0
A
2345
2346 /*
2347 * If we have emptied the run queue, and our current thread is runnable, we
2348 * should tell any processors that are still DISPATCHING that they will
2349 * probably not have any work to do. In the event that there are no
2350 * pending signals that we can cancel, this is also uninteresting.
2351 *
2352 * In the unlikely event that another thread becomes runnable while we are
2353 * doing this (sched_run_count is atomically updated, not guarded), the
2354 * codepath making it runnable SHOULD (a dangerous word) need the pset lock
2355 * in order to dispatch it to a processor in our pset. So, the other
2356 * codepath will wait while we squash all cancelable ASTs, get the pset
2357 * lock, and then dispatch the freshly runnable thread. So this should be
2358 * correct (we won't accidentally have a runnable thread that hasn't been
2359 * dispatched to an idle processor), if not ideal (we may be restarting the
2360 * dispatch process, which could have some overhead).
2361 *
2362 */
2363 if ((sampled_sched_run_count == 1) &&
2364 (pset->pending_deferred_AST_cpu_mask)) {
2365 qe_foreach_element_safe(active_processor, &pset->active_queue, processor_queue) {
2366 /*
2367 * If a processor is DISPATCHING, it could be because of
2368 * a cancelable signal.
2369 *
2370 * IF the processor is not our
2371 * current processor (the current processor should not
2372 * be DISPATCHING, so this is a bit paranoid), AND there
2373 * is a cancelable signal pending on the processor, AND
2374 * there is no non-cancelable signal pending (as there is
2375 * no point trying to backtrack on bringing the processor
2376 * up if a signal we cannot cancel is outstanding), THEN
2377 * it should make sense to roll back the processor state
2378 * to the IDLE state.
2379 *
2380 * If the racy nature of this approach (as the signal
2381 * will be arbitrated by hardware, and can fire as we
2382 * roll back state) results in the core responding
2383 * despite being pushed back to the IDLE state, it
2384 * should be no different than if the core took some
2385 * interrupt while IDLE.
2386 */
2387 if ((active_processor->state == PROCESSOR_DISPATCHING) &&
2388 (pset->pending_deferred_AST_cpu_mask & (1ULL << active_processor->cpu_id)) &&
2389 (!(pset->pending_AST_cpu_mask & (1ULL << active_processor->cpu_id))) &&
2390 (active_processor != processor)) {
2391 /*
2392 * Squash all of the processor state back to some
2393 * reasonable facsimile of PROCESSOR_IDLE.
2394 *
2395 * TODO: What queue policy do we actually want here?
2396 * We want to promote selection of a good processor
2397 * to run on. Do we want to enqueue at the head?
2398 * The tail? At the (relative) old position in the
2399 * queue? Or something else entirely?
2400 */
39037602 2401 re_queue_head(&pset->idle_queue, &active_processor->processor_queue);
3e170ce0
A
2402
2403 assert(active_processor->next_thread == THREAD_NULL);
2404
2405 active_processor->current_pri = IDLEPRI;
2406 active_processor->current_thmode = TH_MODE_FIXED;
2407 active_processor->current_sfi_class = SFI_CLASS_KERNEL;
2408 active_processor->deadline = UINT64_MAX;
2409 active_processor->state = PROCESSOR_IDLE;
2410 pset->pending_deferred_AST_cpu_mask &= ~(1U << active_processor->cpu_id);
2411 machine_signal_idle_cancel(active_processor);
2412 }
2413
2414 }
2415 }
2416
2417 pset_unlock(pset);
2418}
2419#else
2420/* We don't support deferred ASTs; everything is candycanes and sunshine. */
2421#endif
2422
1c79356b 2423/*
2d21ac55 2424 * thread_dispatch:
1c79356b 2425 *
2d21ac55
A
2426 * Handle threads at context switch. Re-dispatch other thread
2427 * if still running, otherwise update run state and perform
2428 * special actions. Update quantum for other thread and begin
2429 * the quantum for ourselves.
91447636 2430 *
3e170ce0
A
2431 * "thread" is the old thread that we have switched away from.
2432 * "self" is the new current thread that we have context switched to
39236c6e 2433 *
91447636 2434 * Called at splsched.
1c79356b
A
2435 */
2436void
2d21ac55
A
2437thread_dispatch(
2438 thread_t thread,
2439 thread_t self)
1c79356b 2440{
2d21ac55
A
2441 processor_t processor = self->last_processor;
2442
3e170ce0
A
2443 assert(processor == current_processor());
2444 assert(self == current_thread());
2445 assert(thread != self);
2446
2d21ac55 2447 if (thread != THREAD_NULL) {
91447636 2448 /*
2d21ac55
A
2449 * If blocked at a continuation, discard
2450 * the stack.
91447636 2451 */
2d21ac55
A
2452 if (thread->continuation != NULL && thread->kernel_stack != 0)
2453 stack_free(thread);
2454
3e170ce0
A
2455 if (thread->state & TH_IDLE) {
2456 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
39037602
A
2457 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2458 (uintptr_t)thread_tid(thread), 0, thread->state,
2459 sched_run_buckets[TH_BUCKET_RUN], 0);
3e170ce0 2460 } else {
316670eb
A
2461 int64_t consumed;
2462 int64_t remainder = 0;
2463
2464 if (processor->quantum_end > processor->last_dispatch)
2465 remainder = processor->quantum_end -
2466 processor->last_dispatch;
2467
fe8ab488 2468 consumed = thread->quantum_remaining - remainder;
316670eb 2469
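			/*
			 * Example (illustrative numbers only): a thread that began its
			 * quantum with quantum_remaining = 10ms and blocked after running
			 * for 4ms leaves remainder = quantum_end - last_dispatch = 6ms,
			 * so consumed = 10ms - 6ms = 4ms of CPU time, which is what gets
			 * billed to the task/thread (and bank) ledgers below.
			 */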
39236c6e 2470 if ((thread->reason & AST_LEDGER) == 0) {
316670eb 2471 /*
39236c6e
A
2472 * Bill CPU time to both the task and
2473 * the individual thread.
316670eb
A
2474 */
2475 ledger_credit(thread->t_ledger,
2476 task_ledgers.cpu_time, consumed);
2477 ledger_credit(thread->t_threadledger,
2478 thread_ledgers.cpu_time, consumed);
fe8ab488
A
2479#ifdef CONFIG_BANK
2480 if (thread->t_bankledger) {
2481 ledger_credit(thread->t_bankledger,
2482 bank_ledgers.cpu_time,
2483 (consumed - thread->t_deduct_bank_ledger_time));
2484
2485 }
2486 thread->t_deduct_bank_ledger_time = 0;
2487#endif
39236c6e 2488 }
316670eb 2489
2d21ac55
A
2490 wake_lock(thread);
2491 thread_lock(thread);
9bccf70c 2492
91447636 2493 /*
39037602
A
2494 * Apply a priority floor if the thread holds a kernel resource
2495 * Do this before checking starting_pri to avoid overpenalizing
2496 * repeated rwlock blockers.
2497 */
2498 if (__improbable(thread->rwlock_count != 0))
2499 lck_rw_set_promotion_locked(thread);
2500
2501 boolean_t keep_quantum = processor->first_timeslice;
2502
2503 /*
2504 * Treat a thread which has dropped priority since it got on core
2505 * as having expired its quantum.
91447636 2506 */
39037602
A
2507 if (processor->starting_pri > thread->sched_pri)
2508 keep_quantum = FALSE;
2509
2510 /* Compute remainder of current quantum. */
2511 if (keep_quantum &&
316670eb 2512 processor->quantum_end > processor->last_dispatch)
fe8ab488 2513 thread->quantum_remaining = (uint32_t)remainder;
2d21ac55 2514 else
fe8ab488 2515 thread->quantum_remaining = 0;
2d21ac55 2516
6d2010ae 2517 if (thread->sched_mode == TH_MODE_REALTIME) {
2d21ac55
A
2518 /*
2519 * Cancel the deadline if the thread has
2520 * consumed the entire quantum.
2521 */
fe8ab488 2522 if (thread->quantum_remaining == 0) {
2d21ac55 2523 thread->realtime.deadline = UINT64_MAX;
2d21ac55 2524 }
b7266188 2525 } else {
3e170ce0 2526#if defined(CONFIG_SCHED_TIMESHARE_CORE)
2d21ac55
A
2527 /*
2528 * For non-realtime threads treat a tiny
2529 * remaining quantum as an expired quantum
2530 * but include what's left next time.
2531 */
fe8ab488 2532 if (thread->quantum_remaining < min_std_quantum) {
2d21ac55 2533 thread->reason |= AST_QUANTUM;
fe8ab488 2534 thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
2d21ac55 2535 }
3e170ce0 2536#endif /* CONFIG_SCHED_TIMESHARE_CORE */
2d21ac55
A
2537 }
2538
91447636 2539 /*
2d21ac55
A
2540 * If we are doing a direct handoff then
2541 * take the remainder of the quantum.
91447636 2542 */
2d21ac55 2543 if ((thread->reason & (AST_HANDOFF|AST_QUANTUM)) == AST_HANDOFF) {
fe8ab488 2544 self->quantum_remaining = thread->quantum_remaining;
2d21ac55 2545 thread->reason |= AST_QUANTUM;
fe8ab488
A
2546 thread->quantum_remaining = 0;
2547 } else {
2548#if defined(CONFIG_SCHED_MULTIQ)
3e170ce0
A
2549 if (SCHED(sched_groups_enabled) &&
2550 thread->sched_group == self->sched_group) {
fe8ab488 2551 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3e170ce0 2552 MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
fe8ab488
A
2553 self->reason, (uintptr_t)thread_tid(thread),
2554 self->quantum_remaining, thread->quantum_remaining, 0);
2555
2556 self->quantum_remaining = thread->quantum_remaining;
2557 thread->quantum_remaining = 0;
3e170ce0 2558 /* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
fe8ab488
A
2559 }
2560#endif /* defined(CONFIG_SCHED_MULTIQ) */
91447636 2561 }
91447636 2562
b0d623f7 2563 thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
2d21ac55
A
2564
2565 if (!(thread->state & TH_WAIT)) {
2566 /*
3e170ce0 2567 * Still runnable.
2d21ac55 2568 */
3e170ce0
A
2569 thread->last_made_runnable_time = mach_approximate_time();
2570
2571 machine_thread_going_off_core(thread, FALSE);
2572
2d21ac55
A
2573 if (thread->reason & AST_QUANTUM)
2574 thread_setrun(thread, SCHED_TAILQ);
3e170ce0 2575 else if (thread->reason & AST_PREEMPT)
2d21ac55
A
2576 thread_setrun(thread, SCHED_HEADQ);
2577 else
2578 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
2579
fe8ab488 2580 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
39037602
A
2581 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2582 (uintptr_t)thread_tid(thread), thread->reason, thread->state,
2583 sched_run_buckets[TH_BUCKET_RUN], 0);
3e170ce0 2584
316670eb
A
2585 if (thread->wake_active) {
2586 thread->wake_active = FALSE;
2587 thread_unlock(thread);
2588
2589 thread_wakeup(&thread->wake_active);
3e170ce0 2590 } else {
316670eb 2591 thread_unlock(thread);
3e170ce0 2592 }
316670eb 2593
2d21ac55 2594 wake_unlock(thread);
3e170ce0 2595 } else {
2d21ac55
A
2596 /*
2597 * Waiting.
2598 */
b7266188 2599 boolean_t should_terminate = FALSE;
fe8ab488 2600 uint32_t new_run_count;
b7266188
A
2601
2602 /* Only the first call to thread_dispatch
2603 * after explicit termination should add
2604 * the thread to the termination queue
2605 */
2606 if ((thread->state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) {
2607 should_terminate = TRUE;
2608 thread->state |= TH_TERMINATE2;
2609 }
2610
2d21ac55 2611 thread->state &= ~TH_RUN;
3e170ce0 2612 thread->last_made_runnable_time = ~0ULL;
39236c6e
A
2613 thread->chosen_processor = PROCESSOR_NULL;
2614
fe8ab488 2615 new_run_count = sched_run_decr(thread);
2d21ac55 2616
3e170ce0 2617#if CONFIG_SCHED_SFI
fe8ab488
A
2618 if ((thread->state & (TH_WAIT | TH_TERMINATE)) == TH_WAIT) {
2619 if (thread->reason & AST_SFI) {
2620 thread->wait_sfi_begin_time = processor->last_dispatch;
2621 }
39236c6e 2622 }
3e170ce0
A
2623#endif
2624
2625 machine_thread_going_off_core(thread, should_terminate);
fe8ab488
A
2626
2627 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
39037602
A
2628 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2629 (uintptr_t)thread_tid(thread), thread->reason, thread->state,
2630 new_run_count, 0);
2d21ac55 2631
b7266188
A
2632 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
2633
2d21ac55
A
2634 if (thread->wake_active) {
2635 thread->wake_active = FALSE;
2636 thread_unlock(thread);
2637
2638 thread_wakeup(&thread->wake_active);
3e170ce0 2639 } else {
2d21ac55 2640 thread_unlock(thread);
3e170ce0 2641 }
91447636 2642
2d21ac55 2643 wake_unlock(thread);
91447636 2644
b7266188 2645 if (should_terminate)
2d21ac55
A
2646 thread_terminate_enqueue(thread);
2647 }
2648 }
91447636 2649 }
91447636 2650
3e170ce0
A
2651 /* Update (new) current thread and reprogram quantum timer */
2652 thread_lock(self);
2d21ac55 2653 if (!(self->state & TH_IDLE)) {
39236c6e
A
2654 uint64_t arg1, arg2;
2655 int urgency;
3e170ce0
A
2656 uint64_t latency;
2657
2658#if CONFIG_SCHED_SFI
fe8ab488
A
2659 ast_t new_ast;
2660
fe8ab488 2661 new_ast = sfi_thread_needs_ast(self, NULL);
fe8ab488
A
2662
2663 if (new_ast != AST_NONE) {
2664 ast_on(new_ast);
2665 }
3e170ce0
A
2666#endif
2667
39037602 2668 assertf(processor->last_dispatch >= self->last_made_runnable_time, "Non-monotonic time? dispatch at 0x%llx, runnable at 0x%llx", processor->last_dispatch, self->last_made_runnable_time);
3e170ce0 2669 latency = processor->last_dispatch - self->last_made_runnable_time;
6d2010ae 2670
39236c6e
A
2671 urgency = thread_get_urgency(self, &arg1, &arg2);
2672
3e170ce0
A
2673 thread_tell_urgency(urgency, arg1, arg2, latency, self);
2674
2675 machine_thread_going_on_core(self, urgency, latency);
39236c6e 2676
91447636 2677 /*
2d21ac55 2678 * Get a new quantum if none remaining.
91447636 2679 */
fe8ab488 2680 if (self->quantum_remaining == 0) {
2d21ac55 2681 thread_quantum_init(self);
6d2010ae 2682 }
91447636
A
2683
2684 /*
2d21ac55 2685 * Set up quantum timer and timeslice.
91447636 2686 */
fe8ab488
A
2687 processor->quantum_end = processor->last_dispatch + self->quantum_remaining;
2688 timer_call_enter1(&processor->quantum_timer, self, processor->quantum_end, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
91447636 2689
3e170ce0
A
2690 processor->first_timeslice = TRUE;
2691 } else {
2692 timer_call_cancel(&processor->quantum_timer);
2693 processor->first_timeslice = FALSE;
91447636 2694
3e170ce0
A
2695 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
2696 machine_thread_going_on_core(self, THREAD_URGENCY_NONE, 0);
91447636 2697 }
6d2010ae 2698
3e170ce0
A
2699 self->computation_epoch = processor->last_dispatch;
2700 self->reason = AST_NONE;
39037602 2701 processor->starting_pri = self->sched_pri;
3e170ce0
A
2702
2703 thread_unlock(self);
2704
2705#if defined(CONFIG_SCHED_DEFERRED_AST)
2706 /*
2707 * TODO: Can we state that redispatching our old thread is also
2708 * uninteresting?
2709 */
39037602 2710 if ((((volatile uint32_t)sched_run_buckets[TH_BUCKET_RUN]) == 1) &&
3e170ce0
A
2711 !(self->state & TH_IDLE)) {
2712 pset_cancel_deferred_dispatch(processor->processor_set, processor);
91447636 2713 }
3e170ce0
A
2714#endif
2715
91447636
A
2716}
2717
2718/*
2d21ac55 2719 * thread_block_reason:
91447636 2720 *
2d21ac55
A
2721 * Forces a reschedule, blocking the caller if a wait
2722 * has been asserted.
91447636 2723 *
2d21ac55
A
2724 * If a continuation is specified, then thread_invoke will
2725 * attempt to discard the thread's kernel stack. When the
2726 * thread resumes, it will execute the continuation function
2727 * on a new kernel stack.
91447636 2728 */
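/* The do/while loop below keeps pairing thread_select() with thread_invoke()
 * until a switch (or same-thread shortcut) actually happens: thread_invoke()
 * can refuse the switch, e.g. when the chosen thread has no kernel stack and
 * one cannot be allocated without blocking, so selection is retried.
 */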
2d21ac55
A
2729counter(mach_counter_t c_thread_block_calls = 0;)
2730
2731wait_result_t
2732thread_block_reason(
2733 thread_continue_t continuation,
2734 void *parameter,
2735 ast_t reason)
91447636 2736{
3e170ce0
A
2737 thread_t self = current_thread();
2738 processor_t processor;
2739 thread_t new_thread;
2740 spl_t s;
1c79356b
A
2741
2742 counter(++c_thread_block_calls);
2743
1c79356b
A
2744 s = splsched();
2745
55e303ae 2746 processor = current_processor();
1c79356b 2747
9bccf70c
A
2748 /* If we're explicitly yielding, force a subsequent quantum */
2749 if (reason & AST_YIELD)
3e170ce0 2750 processor->first_timeslice = FALSE;
0b4e3aa0 2751
9bccf70c
A
2752 /* We're handling all scheduling AST's */
2753 ast_off(AST_SCHEDULING);
1c79356b 2754
490019cf
A
2755#if PROC_REF_DEBUG
2756 if ((continuation != NULL) && (self->task != kernel_task)) {
2757 if (uthread_get_proc_refcount(self->uthread) != 0) {
2758 panic("thread_block_reason with continuation uthread %p with uu_proc_refcount != 0", self->uthread);
2759 }
2760 }
2761#endif
2762
91447636
A
2763 self->continuation = continuation;
2764 self->parameter = parameter;
2765
fe8ab488 2766 if (self->state & ~(TH_RUN | TH_IDLE)) {
316670eb
A
2767 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2768 MACHDBG_CODE(DBG_MACH_SCHED,MACH_BLOCK),
2769 reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
b0d623f7
A
2770 }
2771
2d21ac55 2772 do {
91447636 2773 thread_lock(self);
fe8ab488 2774 new_thread = thread_select(self, processor, reason);
91447636 2775 thread_unlock(self);
2d21ac55 2776 } while (!thread_invoke(self, new_thread, reason));
1c79356b 2777
1c79356b
A
2778 splx(s);
2779
91447636 2780 return (self->wait_result);
1c79356b
A
2781}
2782
2783/*
2784 * thread_block:
2785 *
9bccf70c 2786 * Block the current thread if a wait has been asserted.
1c79356b 2787 */
91447636 2788wait_result_t
1c79356b 2789thread_block(
9bccf70c 2790 thread_continue_t continuation)
1c79356b 2791{
91447636
A
2792 return thread_block_reason(continuation, NULL, AST_NONE);
2793}
2794
2795wait_result_t
2796thread_block_parameter(
2797 thread_continue_t continuation,
2798 void *parameter)
2799{
2800 return thread_block_reason(continuation, parameter, AST_NONE);
1c79356b
A
2801}
2802
2803/*
2804 * thread_run:
2805 *
91447636 2806 * Switch directly from the current thread to the
55e303ae 2807 * new thread, handing off our quantum if appropriate.
9bccf70c
A
2808 *
2809 * New thread must be runnable, and not on a run queue.
1c79356b 2810 *
55e303ae 2811 * Called at splsched.
1c79356b
A
2812 */
2813int
2814thread_run(
91447636 2815 thread_t self,
9bccf70c 2816 thread_continue_t continuation,
91447636 2817 void *parameter,
9bccf70c 2818 thread_t new_thread)
1c79356b 2819{
9bccf70c
A
2820 ast_t handoff = AST_HANDOFF;
2821
91447636
A
2822 self->continuation = continuation;
2823 self->parameter = parameter;
9bccf70c 2824
91447636 2825 while (!thread_invoke(self, new_thread, handoff)) {
2d21ac55 2826 processor_t processor = current_processor();
9bccf70c 2827
91447636 2828 thread_lock(self);
fe8ab488 2829 new_thread = thread_select(self, processor, AST_NONE);
91447636 2830 thread_unlock(self);
9bccf70c
A
2831 handoff = AST_NONE;
2832 }
2833
91447636 2834 return (self->wait_result);
1c79356b
A
2835}
2836
2837/*
91447636 2838 * thread_continue:
55e303ae 2839 *
91447636
A
2840 * Called at splsched when a thread first receives
2841 * a new stack after a continuation.
1c79356b
A
2842 */
2843void
91447636 2844thread_continue(
3e170ce0 2845 thread_t thread)
1c79356b 2846{
3e170ce0
A
2847 thread_t self = current_thread();
2848 thread_continue_t continuation;
2849 void *parameter;
b0d623f7
A
2850
2851 DTRACE_SCHED(on__cpu);
2852
91447636 2853 continuation = self->continuation;
91447636 2854 parameter = self->parameter;
9bccf70c 2855
39037602
A
2856#if KPERF
2857 kperf_on_cpu(self, continuation, NULL);
2858#endif
2859
2d21ac55 2860 thread_dispatch(thread, self);
9bccf70c 2861
2d21ac55 2862 self->continuation = self->parameter = NULL;
1c79356b 2863
2d21ac55 2864 if (thread != THREAD_NULL)
91447636 2865 (void)spllo();
9bccf70c 2866
2d21ac55 2867 TLOG(1, "thread_continue: calling call_continuation \n");
91447636
A
2868 call_continuation(continuation, parameter, self->wait_result);
2869 /*NOTREACHED*/
1c79356b
A
2870}
2871
2d21ac55 2872void
6d2010ae 2873thread_quantum_init(thread_t thread)
2d21ac55 2874{
6d2010ae 2875 if (thread->sched_mode == TH_MODE_REALTIME) {
fe8ab488 2876 thread->quantum_remaining = thread->realtime.computation;
6d2010ae 2877 } else {
fe8ab488 2878 thread->quantum_remaining = SCHED(initial_quantum_size)(thread);
6d2010ae
A
2879 }
2880}
2d21ac55 2881
fe8ab488 2882uint32_t
3e170ce0 2883sched_timeshare_initial_quantum_size(thread_t thread)
6d2010ae 2884{
39037602 2885 if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG)
316670eb 2886 return bg_quantum;
39037602
A
2887 else
2888 return std_quantum;
6d2010ae
A
2889}
2890
6d2010ae
A
2891/*
2892 * run_queue_init:
2893 *
2894 * Initialize a run queue before first use.
2895 */
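/* A run_queue is an array of per-priority FIFO queues plus a bitmap with one
 * bit per priority level that currently has at least one queued thread.
 * "highq" caches the highest set bit so dequeue can go straight to the right
 * queue, while "count" and "urgency" track the total number of queued threads
 * and how many of them are at preemption-urgent priorities.
 */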
2896void
2897run_queue_init(
2898 run_queue_t rq)
2899{
39037602
A
2900 rq->highq = NOPRI;
2901 for (u_int i = 0; i < BITMAP_LEN(NRQS); i++)
2d21ac55 2902 rq->bitmap[i] = 0;
2d21ac55 2903 rq->urgency = rq->count = 0;
39037602 2904 for (int i = 0; i < NRQS; i++)
2d21ac55
A
2905 queue_init(&rq->queues[i]);
2906}
1c79356b 2907
2d21ac55
A
2908/*
2909 * run_queue_dequeue:
2910 *
2911 * Perform a dequeue operation on a run queue,
2912 * and return the resulting thread.
2913 *
6d2010ae 2914 * The run queue must be locked (see thread_run_queue_remove()
2d21ac55
A
2915 * for more info), and not empty.
2916 */
6d2010ae 2917thread_t
2d21ac55 2918run_queue_dequeue(
39037602
A
2919 run_queue_t rq,
2920 integer_t options)
2d21ac55 2921{
39037602
A
2922 thread_t thread;
2923 queue_t queue = &rq->queues[rq->highq];
9bccf70c 2924
2d21ac55 2925 if (options & SCHED_HEADQ) {
39037602
A
2926 thread = qe_dequeue_head(queue, struct thread, runq_links);
2927 } else {
2928 thread = qe_dequeue_tail(queue, struct thread, runq_links);
9bccf70c 2929 }
1c79356b 2930
39037602
A
2931 assert(thread != THREAD_NULL);
2932 assert_thread_magic(thread);
2933
2d21ac55 2934 thread->runq = PROCESSOR_NULL;
6d2010ae 2935 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2d21ac55 2936 rq->count--;
6d2010ae 2937 if (SCHED(priority_is_urgent)(rq->highq)) {
4a3eedf9
A
2938 rq->urgency--; assert(rq->urgency >= 0);
2939 }
2d21ac55 2940 if (queue_empty(queue)) {
39037602
A
2941 bitmap_clear(rq->bitmap, rq->highq);
2942 rq->highq = bitmap_first(rq->bitmap, NRQS);
2d21ac55 2943 }
1c79356b 2944
39037602 2945 return thread;
1c79356b
A
2946}
2947
6d2010ae
A
2948/*
2949 * run_queue_enqueue:
2950 *
2951 * Perform an enqueue operation on a run queue.
2952 *
2953 * The run queue must be locked (see thread_run_queue_remove()
2954 * for more info).
2955 */
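/* Returns TRUE only when the thread lands on a previously empty queue whose
 * priority is above the current highq, i.e. it becomes the new highest-priority
 * runnable thread in this run queue.  For example, enqueueing a priority-80
 * thread while highq is 31 sets bit 80 in the bitmap, raises highq to 80, and
 * returns TRUE; enqueueing a second priority-80 thread afterwards returns FALSE.
 */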
2956boolean_t
2957run_queue_enqueue(
39037602
A
2958 run_queue_t rq,
2959 thread_t thread,
2960 integer_t options)
6d2010ae 2961{
39037602
A
2962 queue_t queue = &rq->queues[thread->sched_pri];
2963 boolean_t result = FALSE;
2964
2965 assert_thread_magic(thread);
2966
6d2010ae 2967 if (queue_empty(queue)) {
39037602
A
2968 enqueue_tail(queue, &thread->runq_links);
2969
2970 rq_bitmap_set(rq->bitmap, thread->sched_pri);
6d2010ae
A
2971 if (thread->sched_pri > rq->highq) {
2972 rq->highq = thread->sched_pri;
2973 result = TRUE;
2974 }
fe8ab488 2975 } else {
6d2010ae 2976 if (options & SCHED_TAILQ)
39037602 2977 enqueue_tail(queue, &thread->runq_links);
6d2010ae 2978 else
39037602 2979 enqueue_head(queue, &thread->runq_links);
fe8ab488 2980 }
6d2010ae
A
2981 if (SCHED(priority_is_urgent)(thread->sched_pri))
2982 rq->urgency++;
2983 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2984 rq->count++;
39037602 2985
6d2010ae 2986 return (result);
6d2010ae
A
2987}
2988
2989/*
2990 * run_queue_remove:
2991 *
2992 * Remove a specific thread from a runqueue.
2993 *
2994 * The run queue must be locked.
2995 */
2996void
2997run_queue_remove(
39037602
A
2998 run_queue_t rq,
2999 thread_t thread)
6d2010ae 3000{
39037602
A
3001 assert(thread->runq != PROCESSOR_NULL);
3002 assert_thread_magic(thread);
6d2010ae 3003
39037602 3004 remqueue(&thread->runq_links);
6d2010ae
A
3005 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3006 rq->count--;
3007 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
3008 rq->urgency--; assert(rq->urgency >= 0);
3009 }
39037602
A
3010
3011 if (queue_empty(&rq->queues[thread->sched_pri])) {
6d2010ae 3012 /* update run queue status */
39037602
A
3013 bitmap_clear(rq->bitmap, thread->sched_pri);
3014 rq->highq = bitmap_first(rq->bitmap, NRQS);
6d2010ae 3015 }
39037602 3016
6d2010ae
A
3017 thread->runq = PROCESSOR_NULL;
3018}
3019
3e170ce0
A
3020/* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
3021void
3022rt_runq_scan(sched_update_scan_context_t scan_context)
6d2010ae 3023{
3e170ce0
A
3024 spl_t s;
3025 thread_t thread;
fe8ab488 3026
3e170ce0
A
3027 s = splsched();
3028 rt_lock_lock();
6d2010ae 3029
39037602 3030 qe_foreach_element_safe(thread, &rt_runq.queue, runq_links) {
3e170ce0
A
3031 if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
3032 scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
3033 }
3034 }
6d2010ae 3035
3e170ce0
A
3036 rt_lock_unlock();
3037 splx(s);
6d2010ae
A
3038}
3039
3e170ce0 3040
1c79356b 3041/*
2d21ac55
A
3042 * realtime_queue_insert:
3043 *
3044 * Enqueue a thread for realtime execution.
1c79356b 3045 */
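/* The RT run queue is kept sorted by ascending realtime.deadline (earliest
 * deadline first): the new thread is inserted in front of the first queued
 * thread with a later deadline, or appended at the tail if none is later.
 * Returns TRUE ("preempt") only when the thread becomes the new queue head,
 * i.e. it now has the earliest deadline.  Illustrative example: with
 * deadlines {5, 9, 12} already queued, inserting deadline 7 places it second
 * and returns FALSE, while inserting deadline 3 goes to the head and
 * returns TRUE.
 */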
2d21ac55 3046static boolean_t
39037602 3047realtime_queue_insert(thread_t thread)
1c79356b 3048{
39037602
A
3049 queue_t queue = &rt_runq.queue;
3050 uint64_t deadline = thread->realtime.deadline;
3051 boolean_t preempt = FALSE;
1c79356b 3052
3e170ce0 3053 rt_lock_lock();
1c79356b 3054
55e303ae 3055 if (queue_empty(queue)) {
39037602 3056 enqueue_tail(queue, &thread->runq_links);
2d21ac55 3057 preempt = TRUE;
39037602
A
3058 } else {
3059 /* Insert into rt_runq in thread deadline order */
3060 queue_entry_t iter;
3061 qe_foreach(iter, queue) {
3062 thread_t iter_thread = qe_element(iter, struct thread, runq_links);
3063 assert_thread_magic(iter_thread);
3064
3065 if (deadline < iter_thread->realtime.deadline) {
3066 if (iter == queue_first(queue))
3067 preempt = TRUE;
3068 insque(&thread->runq_links, queue_prev(iter));
3069 break;
3070 } else if (iter == queue_last(queue)) {
3071 enqueue_tail(queue, &thread->runq_links);
55e303ae
A
3072 break;
3073 }
55e303ae 3074 }
55e303ae
A
3075 }
3076
3e170ce0 3077 thread->runq = THREAD_ON_RT_RUNQ;
6d2010ae
A
3078 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
3079 rt_runq.count++;
55e303ae 3080
3e170ce0 3081 rt_lock_unlock();
55e303ae 3082
2d21ac55
A
3083 return (preempt);
3084}
55e303ae 3085
2d21ac55
A
3086/*
3087 * realtime_setrun:
3088 *
3089 * Dispatch a thread for realtime execution.
3090 *
3091 * Thread must be locked. Associated pset must
3092 * be locked, and is returned unlocked.
3093 */
3094static void
3095realtime_setrun(
3096 processor_t processor,
3097 thread_t thread)
3098{
3099 processor_set_t pset = processor->processor_set;
39236c6e 3100 ast_t preempt;
55e303ae 3101
fe8ab488
A
3102 boolean_t do_signal_idle = FALSE, do_cause_ast = FALSE;
3103
6d2010ae
A
3104 thread->chosen_processor = processor;
3105
fe8ab488
A
3106 /* <rdar://problem/15102234> */
3107 assert(thread->bound_processor == PROCESSOR_NULL);
3108
2d21ac55
A
3109 /*
3110 * Dispatch directly onto idle processor.
3111 */
6d2010ae
A
3112 if ( (thread->bound_processor == processor)
3113 && processor->state == PROCESSOR_IDLE) {
39037602 3114 re_queue_tail(&pset->active_queue, &processor->processor_queue);
55e303ae 3115
2d21ac55 3116 processor->next_thread = thread;
39236c6e
A
3117 processor->current_pri = thread->sched_pri;
3118 processor->current_thmode = thread->sched_mode;
fe8ab488 3119 processor->current_sfi_class = thread->sfi_class;
2d21ac55
A
3120 processor->deadline = thread->realtime.deadline;
3121 processor->state = PROCESSOR_DISPATCHING;
55e303ae 3122
39236c6e 3123 if (processor != current_processor()) {
3e170ce0 3124 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
39236c6e 3125 /* cleared on exit from main processor_idle() loop */
3e170ce0 3126 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
fe8ab488 3127 do_signal_idle = TRUE;
39236c6e
A
3128 }
3129 }
39236c6e 3130 pset_unlock(pset);
fe8ab488
A
3131
3132 if (do_signal_idle) {
3133 machine_signal_idle(processor);
3134 }
2d21ac55
A
3135 return;
3136 }
55e303ae 3137
39236c6e
A
3138 if (processor->current_pri < BASEPRI_RTQUEUES)
3139 preempt = (AST_PREEMPT | AST_URGENT);
3140 else if (thread->realtime.deadline < processor->deadline)
3141 preempt = (AST_PREEMPT | AST_URGENT);
3142 else
3143 preempt = AST_NONE;
3144
3145 realtime_queue_insert(thread);
3146
3147 if (preempt != AST_NONE) {
3148 if (processor->state == PROCESSOR_IDLE) {
39037602
A
3149 re_queue_tail(&pset->active_queue, &processor->processor_queue);
3150
39236c6e
A
3151 processor->next_thread = THREAD_NULL;
3152 processor->current_pri = thread->sched_pri;
3153 processor->current_thmode = thread->sched_mode;
fe8ab488 3154 processor->current_sfi_class = thread->sfi_class;
39236c6e
A
3155 processor->deadline = thread->realtime.deadline;
3156 processor->state = PROCESSOR_DISPATCHING;
3157 if (processor == current_processor()) {
3158 ast_on(preempt);
3159 } else {
3e170ce0 3160 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
39236c6e 3161 /* cleared on exit from main processor_idle() loop */
3e170ce0 3162 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
fe8ab488 3163 do_signal_idle = TRUE;
39236c6e 3164 }
fe8ab488 3165 }
39236c6e
A
3166 } else if (processor->state == PROCESSOR_DISPATCHING) {
3167 if ((processor->next_thread == THREAD_NULL) && ((processor->current_pri < thread->sched_pri) || (processor->deadline > thread->realtime.deadline))) {
3168 processor->current_pri = thread->sched_pri;
3169 processor->current_thmode = thread->sched_mode;
fe8ab488 3170 processor->current_sfi_class = thread->sfi_class;
39236c6e
A
3171 processor->deadline = thread->realtime.deadline;
3172 }
3173 } else {
3174 if (processor == current_processor()) {
3175 ast_on(preempt);
3176 } else {
3e170ce0 3177 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
39236c6e 3178 /* cleared after IPI causes csw_check() to be called */
3e170ce0 3179 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
fe8ab488 3180 do_cause_ast = TRUE;
39236c6e
A
3181 }
3182 }
3183 }
3184 } else {
3185 /* Selected processor was too busy; just keep the thread enqueued and let other processors drain it naturally. */
2d21ac55
A
3186 }
3187
3188 pset_unlock(pset);
fe8ab488
A
3189
3190 if (do_signal_idle) {
3191 machine_signal_idle(processor);
3192 } else if (do_cause_ast) {
3193 cause_ast_check(processor);
3194 }
2d21ac55
A
3195}
3196
6d2010ae 3197
fe8ab488
A
3198#if defined(CONFIG_SCHED_TIMESHARE_CORE)
3199
3200boolean_t
6d2010ae
A
3201priority_is_urgent(int priority)
3202{
39037602 3203 return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE;
6d2010ae
A
3204}
3205
fe8ab488
A
3206#endif /* CONFIG_SCHED_TIMESHARE_CORE */
3207
55e303ae 3208/*
2d21ac55 3209 * processor_setrun:
55e303ae 3210 *
2d21ac55
A
3211 * Dispatch a thread for execution on a
3212 * processor.
55e303ae 3213 *
2d21ac55
A
3214 * Thread must be locked. Associated pset must
3215 * be locked, and is returned unlocked.
55e303ae 3216 */
2d21ac55
A
3217static void
3218processor_setrun(
3219 processor_t processor,
3220 thread_t thread,
3221 integer_t options)
55e303ae 3222{
2d21ac55
A
3223 processor_set_t pset = processor->processor_set;
3224 ast_t preempt;
39236c6e 3225 enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
3e170ce0 3226 enum { eNoSignal, eDoSignal, eDoDeferredSignal } do_signal_idle = eNoSignal;
55e303ae 3227
3e170ce0 3228 boolean_t do_cause_ast = FALSE;
fe8ab488 3229
6d2010ae
A
3230 thread->chosen_processor = processor;
3231
55e303ae 3232 /*
2d21ac55 3233 * Dispatch directly onto idle processor.
55e303ae 3234 */
6d2010ae
A
3235 if ( (SCHED(direct_dispatch_to_idle_processors) ||
3236 thread->bound_processor == processor)
3237 && processor->state == PROCESSOR_IDLE) {
39037602
A
3238
3239 re_queue_tail(&pset->active_queue, &processor->processor_queue);
2d21ac55
A
3240
3241 processor->next_thread = thread;
39236c6e
A
3242 processor->current_pri = thread->sched_pri;
3243 processor->current_thmode = thread->sched_mode;
fe8ab488 3244 processor->current_sfi_class = thread->sfi_class;
2d21ac55
A
3245 processor->deadline = UINT64_MAX;
3246 processor->state = PROCESSOR_DISPATCHING;
2d21ac55 3247
3e170ce0 3248 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
39236c6e 3249 /* cleared on exit from main processor_idle() loop */
3e170ce0
A
3250 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3251 do_signal_idle = eDoSignal;
39236c6e
A
3252 }
3253
3254 pset_unlock(pset);
3e170ce0
A
3255
3256 if (do_signal_idle == eDoSignal) {
fe8ab488
A
3257 machine_signal_idle(processor);
3258 }
3259
2d21ac55
A
3260 return;
3261 }
55e303ae
A
3262
3263 /*
2d21ac55 3264 * Set preemption mode.
1c79356b 3265 */
3e170ce0
A
3266#if defined(CONFIG_SCHED_DEFERRED_AST)
3267 /* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
3268#endif
6d2010ae
A
3269 if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri)
3270 preempt = (AST_PREEMPT | AST_URGENT);
3271 else if (processor->active_thread && thread_eager_preemption(processor->active_thread))
55e303ae 3272 preempt = (AST_PREEMPT | AST_URGENT);
3e170ce0
A
3273 else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
3274 if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
39236c6e
A
3275 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
3276 } else {
3277 preempt = AST_NONE;
3278 }
3279 } else
2d21ac55 3280 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
9bccf70c 3281
39236c6e 3282 SCHED(processor_enqueue)(processor, thread, options);
9bccf70c 3283
2d21ac55 3284 if (preempt != AST_NONE) {
39236c6e 3285 if (processor->state == PROCESSOR_IDLE) {
39037602
A
3286 re_queue_tail(&pset->active_queue, &processor->processor_queue);
3287
39236c6e
A
3288 processor->next_thread = THREAD_NULL;
3289 processor->current_pri = thread->sched_pri;
3290 processor->current_thmode = thread->sched_mode;
fe8ab488 3291 processor->current_sfi_class = thread->sfi_class;
39236c6e
A
3292 processor->deadline = UINT64_MAX;
3293 processor->state = PROCESSOR_DISPATCHING;
3294
3295 ipi_action = eExitIdle;
3296 } else if ( processor->state == PROCESSOR_DISPATCHING) {
3297 if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) {
3298 processor->current_pri = thread->sched_pri;
3299 processor->current_thmode = thread->sched_mode;
fe8ab488 3300 processor->current_sfi_class = thread->sfi_class;
39236c6e
A
3301 processor->deadline = UINT64_MAX;
3302 }
3303 } else if ( (processor->state == PROCESSOR_RUNNING ||
2d21ac55 3304 processor->state == PROCESSOR_SHUTDOWN) &&
3e170ce0 3305 (thread->sched_pri >= processor->current_pri)) {
39236c6e 3306 ipi_action = eInterruptRunning;
2d21ac55 3307 }
39236c6e
A
3308 } else {
3309 /*
3310 * New thread is not important enough to preempt what is running, but
3311 * special processor states may need special handling
3312 */
3313 if (processor->state == PROCESSOR_SHUTDOWN &&
2d21ac55 3314 thread->sched_pri >= processor->current_pri ) {
39236c6e
A
3315 ipi_action = eInterruptRunning;
3316 } else if ( processor->state == PROCESSOR_IDLE &&
3317 processor != current_processor() ) {
39037602
A
3318 re_queue_tail(&pset->active_queue, &processor->processor_queue);
3319
39236c6e
A
3320 processor->next_thread = THREAD_NULL;
3321 processor->current_pri = thread->sched_pri;
3322 processor->current_thmode = thread->sched_mode;
fe8ab488 3323 processor->current_sfi_class = thread->sfi_class;
39236c6e
A
3324 processor->deadline = UINT64_MAX;
3325 processor->state = PROCESSOR_DISPATCHING;
3326
3327 ipi_action = eExitIdle;
3328 }
2d21ac55 3329 }
39236c6e
A
3330
3331 switch (ipi_action) {
3332 case eDoNothing:
3333 break;
3334 case eExitIdle:
3335 if (processor == current_processor()) {
fe8ab488 3336 if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
39236c6e
A
3337 ast_on(preempt);
3338 } else {
3e170ce0
A
3339#if defined(CONFIG_SCHED_DEFERRED_AST)
3340 if (!(pset->pending_deferred_AST_cpu_mask & (1ULL << processor->cpu_id)) &&
3341 !(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
39236c6e 3342 /* cleared on exit from main processor_idle() loop */
3e170ce0
A
3343 pset->pending_deferred_AST_cpu_mask |= (1ULL << processor->cpu_id);
3344 do_signal_idle = eDoDeferredSignal;
3345 }
3346#else
3347 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3348 /* cleared on exit from main processor_idle() loop */
3349 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3350 do_signal_idle = eDoSignal;
39236c6e 3351 }
3e170ce0 3352#endif
39236c6e
A
3353 }
3354 break;
3355 case eInterruptRunning:
3356 if (processor == current_processor()) {
fe8ab488 3357 if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
39236c6e
A
3358 ast_on(preempt);
3359 } else {
3e170ce0 3360 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
39236c6e 3361 /* cleared after IPI causes csw_check() to be called */
3e170ce0 3362 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
fe8ab488 3363 do_cause_ast = TRUE;
39236c6e
A
3364 }
3365 }
3366 break;
6d2010ae 3367 }
2d21ac55
A
3368
3369 pset_unlock(pset);
fe8ab488 3370
3e170ce0 3371 if (do_signal_idle == eDoSignal) {
fe8ab488 3372 machine_signal_idle(processor);
fe8ab488 3373 }
3e170ce0
A
3374#if defined(CONFIG_SCHED_DEFERRED_AST)
3375 else if (do_signal_idle == eDoDeferredSignal) {
3376 /*
3377 * TODO: The ability to cancel this signal could make
3378 * sending it outside of the pset lock an issue. Do
3379 * we need to address this? Or would the only fallout
3380 * be that the core takes a signal? As long as we do
3381 * not run the risk of having a core marked as signal
3382 * outstanding, with no real signal outstanding, the
3383 * only result should be that we fail to cancel some
3384 * signals.
3385 */
3386 machine_signal_idle_deferred(processor);
316670eb 3387 }
3e170ce0
A
3388#endif
3389 else if (do_cause_ast) {
3390 cause_ast_check(processor);
6d2010ae 3391 }
6d2010ae
A
3392}
3393
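/*
 * Illustrative sketch (not part of the build, helper name hypothetical): the
 * pending_AST_cpu_mask protocol used by processor_setrun() above.  The bit
 * for a CPU is set under the pset lock before any signal is sent, so at most
 * one IPI per processor is outstanding; processor_idle() and csw_check()
 * clear the bit when the AST is consumed.
 */
#if 0
static void
example_signal_processor_once(processor_set_t pset, processor_t processor, boolean_t idle)
{
	/* Caller holds the pset lock at splsched */
	boolean_t do_signal = FALSE;

	if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
		pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
		do_signal = TRUE;
	}

	pset_unlock(pset);

	if (do_signal) {
		if (idle)
			machine_signal_idle(processor);	/* wake the idle loop */
		else
			cause_ast_check(processor);	/* interrupt the running thread */
	}
}
#endif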
2d21ac55
A
3394/*
3395 * choose_next_pset:
3396 *
3397 * Return the next sibling pset containing
3398 * available processors.
3399 *
3400 * Returns the original pset if none other is
3401 * suitable.
3402 */
3403static processor_set_t
3404choose_next_pset(
3405 processor_set_t pset)
3406{
3407 processor_set_t nset = pset;
3408
3409 do {
3410 nset = next_pset(nset);
6d2010ae 3411 } while (nset->online_processor_count < 1 && nset != pset);
2d21ac55 3412
cf7d32b8 3413 return (nset);
2d21ac55
A
3414}
3415
3416/*
3417 * choose_processor:
3418 *
3419 * Choose a processor for the thread, beginning at
b7266188 3420 * the pset. Accepts an optional processor hint in
2d21ac55
A
3421 * the pset.
3422 *
3423 * Returns a processor, possibly from a different pset.
3424 *
3425 * The thread must be locked. The pset must be locked,
3426 * and the resulting pset is locked on return.
3427 */
6d2010ae 3428processor_t
2d21ac55
A
3429choose_processor(
3430 processor_set_t pset,
b7266188 3431 processor_t processor,
2d21ac55
A
3432 thread_t thread)
3433{
3434 processor_set_t nset, cset = pset;
39037602
A
3435
3436 assert(thread->sched_pri <= BASEPRI_RTQUEUES);
3437
cf7d32b8 3438 /*
fe8ab488 3439 * Prefer the hinted processor, when appropriate.
cf7d32b8 3440 */
b7266188 3441
fe8ab488 3442 /* Fold last processor hint from secondary processor to its primary */
0b4c1975 3443 if (processor != PROCESSOR_NULL) {
fe8ab488 3444 processor = processor->processor_primary;
0b4c1975 3445 }
b0d623f7 3446
fe8ab488
A
3447 /*
3448 * Only consult the platform layer if the pset is active, which
3449 * it may not be in some cases when a multi-set system
3450 * is going to sleep.
3451 */
3452 if (pset->online_processor_count) {
3453 if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
3454 processor_t mc_processor = machine_choose_processor(pset, processor);
3455 if (mc_processor != PROCESSOR_NULL)
3456 processor = mc_processor->processor_primary;
3457 }
3458 }
b7266188 3459
fe8ab488
A
3460 /*
3461 * At this point, we may have a processor hint, and we may have
3462 * an initial starting pset. If the hint is not in the pset, or
3463 * if the hint is for a processor in an invalid state, discard
3464 * the hint.
3465 */
0b4c1975 3466 if (processor != PROCESSOR_NULL) {
fe8ab488 3467 if (processor->processor_set != pset) {
cf7d32b8 3468 processor = PROCESSOR_NULL;
3e170ce0
A
3469 } else if (!processor->is_recommended) {
3470 processor = PROCESSOR_NULL;
fe8ab488
A
3471 } else {
3472 switch (processor->state) {
3473 case PROCESSOR_START:
3474 case PROCESSOR_SHUTDOWN:
3475 case PROCESSOR_OFF_LINE:
3476 /*
3477 * Hint is for a processor that cannot support running new threads.
3478 */
3479 processor = PROCESSOR_NULL;
3480 break;
3481 case PROCESSOR_IDLE:
3482 /*
3483 * Hint is for an idle processor. Assume it is no worse than any other
3484 * idle processor. The platform layer had an opportunity to provide
3485 * the "least cost idle" processor above.
3486 */
3487 return (processor);
fe8ab488
A
3488 case PROCESSOR_RUNNING:
3489 case PROCESSOR_DISPATCHING:
3490 /*
3491 * Hint is for an active CPU. This fast-path allows
3492 * realtime threads to preempt non-realtime threads
3493 * to regain their previous executing processor.
3494 */
3495 if ((thread->sched_pri >= BASEPRI_RTQUEUES) &&
3496 (processor->current_pri < BASEPRI_RTQUEUES))
3497 return (processor);
3498
3499 /* Otherwise, use hint as part of search below */
3500 break;
3501 default:
3502 processor = PROCESSOR_NULL;
3503 break;
3504 }
3505 }
b7266188 3506 }
2d21ac55
A
3507
3508 /*
fe8ab488
A
3509 * Iterate through the processor sets to locate
3510 * an appropriate processor. Seed results with
3511 * a last-processor hint, if available, so that
3512 * a search must find something strictly better
3513 * to replace it.
3514 *
3515 * A primary/secondary pair of SMT processors is
3516 * "unpaired" if the primary is busy but its
3517 * corresponding secondary is idle (so the physical
3518 * core has full use of its resources).
2d21ac55 3519 */
fe8ab488
A
3520
3521 integer_t lowest_priority = MAXPRI + 1;
3522 integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
3523 integer_t lowest_count = INT_MAX;
3524 uint64_t furthest_deadline = 1;
3525 processor_t lp_processor = PROCESSOR_NULL;
3526 processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
3527 processor_t lp_unpaired_secondary_processor = PROCESSOR_NULL;
3528 processor_t lc_processor = PROCESSOR_NULL;
3529 processor_t fd_processor = PROCESSOR_NULL;
3530
3531 if (processor != PROCESSOR_NULL) {
3532 /* All other states should be enumerated above. */
3533 assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
3534
3535 lowest_priority = processor->current_pri;
3536 lp_processor = processor;
3537
3538 if (processor->current_pri >= BASEPRI_RTQUEUES) {
3539 furthest_deadline = processor->deadline;
3540 fd_processor = processor;
3541 }
3542
3543 lowest_count = SCHED(processor_runq_count)(processor);
3544 lc_processor = processor;
3545 }
3546
2d21ac55 3547 do {
fe8ab488 3548
9bccf70c 3549 /*
fe8ab488 3550 * Choose an idle processor, in pset traversal order
9bccf70c 3551 */
3e170ce0
A
3552 qe_foreach_element(processor, &cset->idle_queue, processor_queue) {
3553 if (processor->is_recommended)
3554 return processor;
3555 }
1c79356b 3556
fe8ab488
A
3557 /*
3558 * Otherwise, enumerate active and idle processors to find candidates
3559 * with lower priority/etc.
3560 */
0b4c1975 3561
3e170ce0
A
3562 qe_foreach_element(processor, &cset->active_queue, processor_queue) {
3563
3564 if (!processor->is_recommended) {
3565 continue;
3566 }
2d21ac55 3567
fe8ab488
A
3568 integer_t cpri = processor->current_pri;
3569 if (cpri < lowest_priority) {
3570 lowest_priority = cpri;
3571 lp_processor = processor;
3572 }
b0d623f7 3573
fe8ab488
A
3574 if ((cpri >= BASEPRI_RTQUEUES) && (processor->deadline > furthest_deadline)) {
3575 furthest_deadline = processor->deadline;
3576 fd_processor = processor;
3577 }
0b4c1975 3578
fe8ab488
A
3579 integer_t ccount = SCHED(processor_runq_count)(processor);
3580 if (ccount < lowest_count) {
3581 lowest_count = ccount;
3582 lc_processor = processor;
3583 }
fe8ab488
A
3584 }
3585
3586 /*
3587 * For SMT configs, these idle secondary processors must have an active primary. Otherwise
3588 * the idle primary would have short-circuited the loop above.
3589 */
3e170ce0
A
3590 qe_foreach_element(processor, &cset->idle_secondary_queue, processor_queue) {
3591
3592 if (!processor->is_recommended) {
3593 continue;
3594 }
3595
fe8ab488
A
3596 processor_t cprimary = processor->processor_primary;
3597
3598 /* If the primary processor is offline or starting up, it's not a candidate for this path */
3599 if (cprimary->state == PROCESSOR_RUNNING || cprimary->state == PROCESSOR_DISPATCHING) {
3600 integer_t primary_pri = cprimary->current_pri;
3601
3602 if (primary_pri < lowest_unpaired_primary_priority) {
3603 lowest_unpaired_primary_priority = primary_pri;
3604 lp_unpaired_primary_processor = cprimary;
3605 lp_unpaired_secondary_processor = processor;
0b4c1975 3606 }
2d21ac55 3607 }
fe8ab488
A
3608 }
3609
0b4c1975 3610
fe8ab488
A
3611 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
3612
3613 /*
3614 * For realtime threads, the most important aspect is
3615 * scheduling latency, so we attempt to assign threads
3616 * to good preemption candidates (assuming an idle primary
3617 * processor was not available above).
3618 */
3619
3620 if (thread->sched_pri > lowest_unpaired_primary_priority) {
3621 /* Move to end of active queue so that the next thread doesn't also pick it */
39037602 3622 re_queue_tail(&cset->active_queue, &lp_unpaired_primary_processor->processor_queue);
fe8ab488
A
3623 return lp_unpaired_primary_processor;
3624 }
3625 if (thread->sched_pri > lowest_priority) {
3626 /* Move to end of active queue so that the next thread doesn't also pick it */
39037602 3627 re_queue_tail(&cset->active_queue, &lp_processor->processor_queue);
0b4c1975 3628 return lp_processor;
fe8ab488 3629 }
0b4c1975
A
3630 if (thread->realtime.deadline < furthest_deadline)
3631 return fd_processor;
6d2010ae 3632
2d21ac55 3633 /*
fe8ab488
A
3634 * If all primary and secondary CPUs are busy with realtime
3635 * threads with deadlines earlier than ours, move on to the
3636 * next pset.
2d21ac55 3637 */
fe8ab488
A
3638 }
3639 else {
3640
3641 if (thread->sched_pri > lowest_unpaired_primary_priority) {
3642 /* Move to end of active queue so that the next thread doesn't also pick it */
39037602 3643 re_queue_tail(&cset->active_queue, &lp_unpaired_primary_processor->processor_queue);
fe8ab488 3644 return lp_unpaired_primary_processor;
c910b4d9 3645 }
fe8ab488
A
3646 if (thread->sched_pri > lowest_priority) {
3647 /* Move to end of active queue so that the next thread doesn't also pick it */
39037602 3648 re_queue_tail(&cset->active_queue, &lp_processor->processor_queue);
fe8ab488 3649 return lp_processor;
cf7d32b8 3650 }
9bccf70c 3651
9bccf70c 3652 /*
fe8ab488
A
3653 * If all primary processors in this pset are running a higher
3654 * priority thread, move on to the next pset. Only when we have
3655 * exhausted this search do we fall back to other heuristics.
1c79356b 3656 */
2d21ac55
A
3657 }
3658
3659 /*
fe8ab488 3660 * Move onto the next processor set.
2d21ac55
A
3661 */
3662 nset = next_pset(cset);
3663
3664 if (nset != pset) {
3665 pset_unlock(cset);
3666
3667 cset = nset;
3668 pset_lock(cset);
3669 }
3670 } while (nset != pset);
3671
3672 /*
fe8ab488
A
3673 * Make sure that we pick a running processor,
3674 * and that the correct processor set is locked.
3675 * Since we may have unlocked the candidate processor's
3676 * pset, it may have changed state.
3677 *
3678 * All primary processors are running a higher priority
3679 * thread, so the only options left are enqueuing on
3680 * the secondary processor that would perturb the lowest-priority
3681 * primary, or the least busy primary.
2d21ac55 3682 */
cf7d32b8 3683 do {
2d21ac55 3684
fe8ab488
A
3685 /* lowest_priority is evaluated in the main loops above */
3686 if (lp_unpaired_secondary_processor != PROCESSOR_NULL) {
3687 processor = lp_unpaired_secondary_processor;
3688 lp_unpaired_secondary_processor = PROCESSOR_NULL;
3689 } else if (lc_processor != PROCESSOR_NULL) {
3690 processor = lc_processor;
3691 lc_processor = PROCESSOR_NULL;
3692 } else {
cf7d32b8 3693 /*
fe8ab488
A
3694 * All processors are executing higher
3695 * priority threads, and the lowest_count
3696 * candidate was not usable
cf7d32b8 3697 */
fe8ab488 3698 processor = master_processor;
cf7d32b8
A
3699 }
3700
3701 /*
fe8ab488
A
3702 * Check that the correct processor set is
3703 * returned locked.
cf7d32b8
A
3704 */
3705 if (cset != processor->processor_set) {
3706 pset_unlock(cset);
cf7d32b8
A
3707 cset = processor->processor_set;
3708 pset_lock(cset);
3709 }
3710
3711 /*
fe8ab488
A
3712 * We must verify that the chosen processor is still available.
3713 * master_processor is an exception, since we may need to preempt
3714 * a running thread on it during processor shutdown (for sleep),
3715 * and that thread needs to be enqueued on its runqueue to run
3716 * when the processor is restarted.
cf7d32b8 3717 */
fe8ab488 3718 if (processor != master_processor && (processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE))
cf7d32b8 3719 processor = PROCESSOR_NULL;
fe8ab488 3720
cf7d32b8 3721 } while (processor == PROCESSOR_NULL);
2d21ac55
A
3722
3723 return (processor);
3724}
3725
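/*
 * Minimal usage sketch (hypothetical helper, not compiled): the locking
 * convention for choose_processor().  The thread and the starting pset must
 * be locked; the function may return a processor from a different pset, in
 * which case that pset is the one returned locked.
 */
#if 0
static processor_t
example_choose_for_thread(thread_t thread)
{
	/* thread locked at splsched by the caller */
	processor_set_t pset = current_processor()->processor_set;

	pset_lock(pset);
	processor_t chosen = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);

	/* chosen->processor_set is now the locked pset, possibly != pset */
	pset_unlock(chosen->processor_set);

	return chosen;
}
#endif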
3726/*
3727 * thread_setrun:
3728 *
3729 * Dispatch thread for execution, onto an idle
3730 * processor or run queue, and signal a preemption
3731 * as appropriate.
3732 *
3733 * Thread must be locked.
3734 */
3735void
3736thread_setrun(
3737 thread_t thread,
3738 integer_t options)
3739{
3740 processor_t processor;
3741 processor_set_t pset;
3742
3e170ce0
A
3743 assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN);
3744 assert(thread->runq == PROCESSOR_NULL);
3745
2d21ac55
A
3746 /*
3747 * Update priority if needed.
3748 */
6d2010ae
A
3749 if (SCHED(can_update_priority)(thread))
3750 SCHED(update_priority)(thread);
2d21ac55 3751
fe8ab488
A
3752 thread->sfi_class = sfi_thread_classify(thread);
3753
2d21ac55
A
3754 assert(thread->runq == PROCESSOR_NULL);
3755
3e170ce0 3756#if __SMP__
2d21ac55
A
3757 if (thread->bound_processor == PROCESSOR_NULL) {
3758 /*
3759 * Unbound case.
3760 */
3761 if (thread->affinity_set != AFFINITY_SET_NULL) {
3762 /*
3763 * Use affinity set policy hint.
3764 */
3765 pset = thread->affinity_set->aset_pset;
3766 pset_lock(pset);
3767
6d2010ae 3768 processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
39236c6e 3769
3e170ce0 3770 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
39236c6e 3771 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
3e170ce0 3772 } else if (thread->last_processor != PROCESSOR_NULL) {
2d21ac55
A
3773 /*
3774 * Simple (last processor) affinity case.
3775 */
3776 processor = thread->last_processor;
3777 pset = processor->processor_set;
3778 pset_lock(pset);
6d2010ae
A
3779 processor = SCHED(choose_processor)(pset, processor, thread);
3780
3e170ce0 3781 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
39236c6e 3782 (uintptr_t)thread_tid(thread), thread->last_processor->cpu_id, processor->cpu_id, processor->state, 0);
3e170ce0 3783 } else {
2d21ac55
A
3784 /*
3785 * No Affinity case:
3786 *
cf7d32b8
A
3787 * Utilize a per-task hint to spread threads
3788 * among the available processor sets.
2d21ac55 3789 */
cf7d32b8
A
3790 task_t task = thread->task;
3791
3792 pset = task->pset_hint;
3793 if (pset == PROCESSOR_SET_NULL)
3794 pset = current_processor()->processor_set;
3795
3796 pset = choose_next_pset(pset);
2d21ac55 3797 pset_lock(pset);
9bccf70c 3798
6d2010ae 3799 processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
cf7d32b8 3800 task->pset_hint = processor->processor_set;
39236c6e 3801
3e170ce0 3802 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
39236c6e 3803 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
55e303ae 3804 }
3e170ce0 3805 } else {
2d21ac55
A
3806 /*
3807 * Bound case:
3808 *
3809 * Unconditionally dispatch on the processor.
3810 */
3811 processor = thread->bound_processor;
55e303ae 3812 pset = processor->processor_set;
2d21ac55 3813 pset_lock(pset);
39236c6e 3814
3e170ce0 3815 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
39236c6e 3816 (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
2d21ac55 3817 }
3e170ce0
A
3818#else /* !__SMP__ */
3819 /* Only one processor to choose */
3820 assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == master_processor);
3821 processor = master_processor;
3822 pset = processor->processor_set;
3823 pset_lock(pset);
3824#endif /* !__SMP__ */
2d21ac55
A
3825
3826 /*
3e170ce0 3827 * Dispatch the thread on the chosen processor.
fe8ab488 3828 * TODO: This should be based on sched_mode, not sched_pri
2d21ac55
A
3829 */
3830 if (thread->sched_pri >= BASEPRI_RTQUEUES)
3831 realtime_setrun(processor, thread);
3832 else
3833 processor_setrun(processor, thread, options);
3834}
3835
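/*
 * Illustrative sketch (hypothetical helper): the typical caller pattern for
 * thread_setrun().  The thread must already be runnable (TH_RUN set, not on a
 * run queue) and locked at splsched; SCHED_PREEMPT requests a preemption
 * check on the chosen processor, SCHED_TAILQ/SCHED_HEADQ select queue order.
 */
#if 0
static void
example_dispatch(thread_t thread)
{
	spl_t s = splsched();
	thread_lock(thread);

	thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);

	thread_unlock(thread);
	splx(s);
}
#endif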
b0d623f7
A
3836processor_set_t
3837task_choose_pset(
3838 task_t task)
3839{
3840 processor_set_t pset = task->pset_hint;
3841
3842 if (pset != PROCESSOR_SET_NULL)
3843 pset = choose_next_pset(pset);
3844
3845 return (pset);
3846}
3847
9bccf70c 3848/*
c910b4d9
A
3849 * Check for a preemption point in
3850 * the current context.
55e303ae 3851 *
fe8ab488 3852 * Called at splsched with thread locked.
9bccf70c
A
3853 */
3854ast_t
3855csw_check(
fe8ab488
A
3856 processor_t processor,
3857 ast_t check_reason)
39236c6e
A
3858{
3859 processor_set_t pset = processor->processor_set;
3860 ast_t result;
3861
3862 pset_lock(pset);
3863
3864 /* If we were sent a remote AST and interrupted a running processor, acknowledge it here with pset lock held */
3e170ce0 3865 pset->pending_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
39236c6e 3866
fe8ab488 3867 result = csw_check_locked(processor, pset, check_reason);
39236c6e
A
3868
3869 pset_unlock(pset);
3870
3871 return result;
3872}
3873
3874/*
3875 * Check for preemption at splsched with
fe8ab488 3876 * pset and thread locked
39236c6e
A
3877 */
3878ast_t
3879csw_check_locked(
3880 processor_t processor,
fe8ab488
A
3881 processor_set_t pset __unused,
3882 ast_t check_reason)
9bccf70c 3883{
fe8ab488 3884 ast_t result;
316670eb 3885 thread_t thread = processor->active_thread;
9bccf70c 3886
3e170ce0 3887 if (processor->first_timeslice) {
6d2010ae 3888 if (rt_runq.count > 0)
fe8ab488 3889 return (check_reason | AST_PREEMPT | AST_URGENT);
9bccf70c
A
3890 }
3891 else {
39236c6e
A
3892 if (rt_runq.count > 0) {
3893 if (BASEPRI_RTQUEUES > processor->current_pri)
fe8ab488 3894 return (check_reason | AST_PREEMPT | AST_URGENT);
39236c6e 3895 else
fe8ab488 3896 return (check_reason | AST_PREEMPT);
39236c6e 3897 }
1c79356b 3898 }
9bccf70c 3899
316670eb 3900 result = SCHED(processor_csw_check)(processor);
9bccf70c 3901 if (result != AST_NONE)
3e170ce0
A
3902 return (check_reason | result | (thread_eager_preemption(thread) ? AST_URGENT : AST_NONE));
3903
3904#if __SMP__
9bccf70c 3905
3e170ce0
A
3906 /*
3907 * If the current thread is running on a processor that is no longer recommended, gently
3908 * (non-urgently) get to a point where it can block, at which point thread_select() should
3909 * try to idle the processor and re-dispatch the thread to a recommended processor.
3910 */
3911 if (!processor->is_recommended)
fe8ab488 3912 return (check_reason | AST_PREEMPT);
3e170ce0
A
3913
3914 /*
3915 * Even though we could continue executing on this processor, a
3916 * secondary SMT core should try to shed load to another primary core.
3917 *
3918 * TODO: Should this do the same check that thread_select does? i.e.
3919 * if no bound threads target this processor, and idle primaries exist, preempt
3920 * The case of RT threads existing is already taken care of above
3921 * Consider Capri in this scenario.
3922 *
3923 * if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue))
3924 *
3925 * TODO: Alternatively - check if only primary is idle, or check if primary's pri is lower than mine.
3926 */
3927
3928 if (processor->current_pri < BASEPRI_RTQUEUES &&
3929 processor->processor_primary != processor)
3930 return (check_reason | AST_PREEMPT);
3931#endif
3932
316670eb 3933 if (thread->state & TH_SUSP)
fe8ab488
A
3934 return (check_reason | AST_PREEMPT);
3935
3e170ce0 3936#if CONFIG_SCHED_SFI
fe8ab488
A
3937 /*
3938 * Current thread may not need to be preempted, but maybe needs
3939 * an SFI wait?
3940 */
3941 result = sfi_thread_needs_ast(thread, NULL);
3942 if (result != AST_NONE)
3943 return (check_reason | result);
3e170ce0 3944#endif
c910b4d9
A
3945
3946 return (AST_NONE);
1c79356b
A
3947}
3948
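/*
 * Sketch of a preemption point (hypothetical helper): ask csw_check() whether
 * the current processor should context switch, and latch the result as an AST
 * so it is taken at the next opportunity.  This mirrors the pattern used by
 * set_sched_pri() and thread_set_eager_preempt() below.
 */
#if 0
static void
example_preemption_point(void)
{
	/* called at splsched */
	processor_t processor = current_processor();
	ast_t preempt = csw_check(processor, AST_NONE);

	if (preempt != AST_NONE)
		ast_on(preempt);
}
#endif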
3949/*
9bccf70c 3950 * set_sched_pri:
1c79356b 3951 *
55e303ae
A
3952 * Set the scheduled priority of the specified thread.
3953 *
9bccf70c 3954 * This may cause the thread to change queues.
1c79356b 3955 *
55e303ae 3956 * Thread must be locked.
1c79356b
A
3957 */
3958void
9bccf70c 3959set_sched_pri(
3e170ce0
A
3960 thread_t thread,
3961 int priority)
1c79356b 3962{
3e170ce0
A
3963 thread_t cthread = current_thread();
3964 boolean_t is_current_thread = (thread == cthread) ? TRUE : FALSE;
fe8ab488
A
3965 int curgency, nurgency;
3966 uint64_t urgency_param1, urgency_param2;
3e170ce0 3967 boolean_t removed_from_runq = FALSE;
9bccf70c 3968
3e170ce0
A
3969 /* If we're already at this priority, no need to mess with the runqueue */
3970 if (priority == thread->sched_pri)
3971 return;
3972
3973 if (is_current_thread) {
3974 assert(thread->runq == PROCESSOR_NULL);
fe8ab488 3975 curgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
3e170ce0
A
3976 } else {
3977 removed_from_runq = thread_run_queue_remove(thread);
fe8ab488 3978 }
3e170ce0 3979
490019cf
A
3980 thread->sched_pri = priority;
3981
3e170ce0
A
3982 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
3983 (uintptr_t)thread_tid(thread),
3984 thread->base_pri,
3985 thread->sched_pri,
3986 0, /* eventually, 'reason' */
3987 0);
3988
3e170ce0 3989 if (is_current_thread) {
fe8ab488 3990 nurgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
3e170ce0
A
3991 /*
3992 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
3993 * class alterations from user space to occur relatively infrequently, hence
3994 * those are lazily handled. QoS classes have distinct priority bands, and QoS
3995 * inheritance is expected to involve priority changes.
3996 */
fe8ab488 3997 if (nurgency != curgency) {
3e170ce0
A
3998 thread_tell_urgency(nurgency, urgency_param1, urgency_param2, 0, thread);
3999 machine_thread_going_on_core(thread, nurgency, 0);
fe8ab488
A
4000 }
4001 }
4002
3e170ce0
A
4003 /* TODO: Should this be TAILQ if it went down, HEADQ if it went up? */
4004 if (removed_from_runq)
4005 thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
4006 else if (thread->state & TH_RUN) {
4007 processor_t processor = thread->last_processor;
9bccf70c 4008
3e170ce0
A
4009 if (is_current_thread) {
4010 ast_t preempt;
9bccf70c 4011
9bccf70c 4012 processor->current_pri = priority;
6d2010ae 4013 processor->current_thmode = thread->sched_mode;
fe8ab488
A
4014 processor->current_sfi_class = thread->sfi_class = sfi_thread_classify(thread);
4015 if ((preempt = csw_check(processor, AST_NONE)) != AST_NONE)
c910b4d9 4016 ast_on(preempt);
3e170ce0 4017 } else if (processor != PROCESSOR_NULL && processor->active_thread == thread)
9bccf70c 4018 cause_ast_check(processor);
1c79356b
A
4019 }
4020}
4021
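/*
 * Usage sketch (hypothetical helper): raising a thread's scheduled priority.
 * set_sched_pri() takes care of removing/reinserting the thread on its run
 * queue and of triggering a preemption check if the thread is running.
 */
#if 0
static void
example_priority_floor(thread_t thread, int floor_pri)
{
	spl_t s = splsched();
	thread_lock(thread);

	if (thread->sched_pri < floor_pri)
		set_sched_pri(thread, floor_pri);

	thread_unlock(thread);
	splx(s);
}
#endif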
3e170ce0
A
4022/*
4023 * thread_run_queue_remove_for_handoff
4024 *
4025 * Pull a thread or its (recursive) push target out of the runqueue
4026 * so that it is ready for thread_run()
4027 *
4028 * Called at splsched
4029 *
4030 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
4031 * This may be different than the thread that was passed in.
4032 */
4033thread_t
4034thread_run_queue_remove_for_handoff(thread_t thread) {
91447636 4035
3e170ce0 4036 thread_t pulled_thread = THREAD_NULL;
91447636 4037
3e170ce0 4038 thread_lock(thread);
91447636 4039
3e170ce0
A
4040 /*
4041 * Check that the thread is not bound
4042 * to a different processor, and that realtime
4043 * is not involved.
4044 *
4045 * Next, pull it off its run queue. If it
4046 * doesn't come, it's not eligible.
4047 */
91447636 4048
3e170ce0
A
4049 processor_t processor = current_processor();
4050 if (processor->current_pri < BASEPRI_RTQUEUES && thread->sched_pri < BASEPRI_RTQUEUES &&
4051 (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)) {
91447636 4052
3e170ce0
A
4053 if (thread_run_queue_remove(thread))
4054 pulled_thread = thread;
91447636
A
4055 }
4056
3e170ce0 4057 thread_unlock(thread);
6d2010ae 4058
3e170ce0 4059 return pulled_thread;
6d2010ae
A
4060}
4061
1c79356b 4062/*
6d2010ae 4063 * thread_run_queue_remove:
1c79356b 4064 *
fe8ab488 4065 * Remove a thread from its current run queue and
2d21ac55 4066 * return TRUE if successful.
55e303ae
A
4067 *
4068 * Thread must be locked.
fe8ab488
A
4069 *
4070 * If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
4071 * run queues because the caller locked the thread. Otherwise
4072 * the thread is on a run queue, but could be chosen for dispatch
4073 * and removed by another processor under a different lock, which
4074 * will set thread->runq to PROCESSOR_NULL.
4075 *
4076 * Hence the thread select path must not rely on anything that could
4077 * be changed under the thread lock after calling this function,
4078 * most importantly thread->sched_pri.
1c79356b 4079 */
2d21ac55 4080boolean_t
6d2010ae 4081thread_run_queue_remove(
fe8ab488 4082 thread_t thread)
1c79356b 4083{
fe8ab488
A
4084 boolean_t removed = FALSE;
4085 processor_t processor = thread->runq;
1c79356b 4086
fe8ab488
A
4087 if ((thread->state & (TH_RUN|TH_WAIT)) == TH_WAIT) {
4088 /* Thread isn't runnable */
4089 assert(thread->runq == PROCESSOR_NULL);
4090 return FALSE;
4091 }
55e303ae 4092
fe8ab488 4093 if (processor == PROCESSOR_NULL) {
55e303ae 4094 /*
fe8ab488
A
4095 * The thread is either not on the runq,
4096 * or is in the midst of being removed from the runq.
4097 *
4098 * runq is set to NULL under the pset lock, not the thread
4099 * lock, so the thread may still be in the process of being dequeued
4100 * from the runq. It will wait in invoke for the thread lock to be
4101 * dropped.
55e303ae 4102 */
55e303ae 4103
fe8ab488
A
4104 return FALSE;
4105 }
55e303ae 4106
fe8ab488
A
4107 if (thread->sched_pri < BASEPRI_RTQUEUES) {
4108 return SCHED(processor_queue_remove)(processor, thread);
4109 }
55e303ae 4110
3e170ce0 4111 rt_lock_lock();
55e303ae 4112
fe8ab488
A
4113 if (thread->runq != PROCESSOR_NULL) {
4114 /*
3e170ce0 4115 * Thread is on the RT run queue and we have a lock on
fe8ab488
A
4116 * that run queue.
4117 */
4118
3e170ce0 4119 assert(thread->runq == THREAD_ON_RT_RUNQ);
fe8ab488 4120
39037602 4121 remqueue(&thread->runq_links);
fe8ab488
A
4122 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
4123 rt_runq.count--;
4124
4125 thread->runq = PROCESSOR_NULL;
4126
4127 removed = TRUE;
1c79356b
A
4128 }
4129
3e170ce0 4130 rt_lock_unlock();
fe8ab488
A
4131
4132 return (removed);
1c79356b
A
4133}
4134
cf7d32b8 4135/*
3e170ce0 4136 * Put the thread back where it belongs after a thread_run_queue_remove
cf7d32b8 4137 *
3e170ce0 4138 * The thread must have been removed during the same thread lock hold
cf7d32b8 4139 *
3e170ce0 4140 * thread locked, at splsched
cf7d32b8 4141 */
3e170ce0
A
4142void
4143thread_run_queue_reinsert(thread_t thread, integer_t options)
cf7d32b8 4144{
3e170ce0 4145 assert(thread->runq == PROCESSOR_NULL);
cf7d32b8 4146
3e170ce0
A
4147 assert(thread->state & (TH_RUN));
4148 thread_setrun(thread, options);
6d2010ae 4149
6d2010ae
A
4150}
4151
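/*
 * Sketch of the remove/reinsert pattern (hypothetical helper): pull a thread
 * off its run queue, change something that affects its placement, then put it
 * back.  set_sched_pri() above is the canonical in-tree user of this pattern.
 */
#if 0
static void
example_requeue(thread_t thread)
{
	/* thread locked at splsched */
	boolean_t removed = thread_run_queue_remove(thread);

	/* ... adjust state that affects run queue placement ... */

	if (removed)
		thread_run_queue_reinsert(thread, SCHED_TAILQ);
}
#endif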
39236c6e
A
4152void
4153sys_override_cpu_throttle(int flag)
6d2010ae 4154{
39236c6e
A
4155 if (flag == CPU_THROTTLE_ENABLE)
4156 cpu_throttle_enabled = 1;
4157 if (flag == CPU_THROTTLE_DISABLE)
4158 cpu_throttle_enabled = 0;
4159}
6d2010ae 4160
39236c6e
A
4161int
4162thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
4163{
4164 if (thread == NULL || (thread->state & TH_IDLE)) {
4165 *arg1 = 0;
4166 *arg2 = 0;
4167
4168 return (THREAD_URGENCY_NONE);
4169 } else if (thread->sched_mode == TH_MODE_REALTIME) {
4170 *arg1 = thread->realtime.period;
4171 *arg2 = thread->realtime.deadline;
4172
4173 return (THREAD_URGENCY_REAL_TIME);
4174 } else if (cpu_throttle_enabled &&
3e170ce0 4175 ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
39236c6e
A
4176 /*
4177 * Background urgency applied when thread priority is MAXPRI_THROTTLE or lower and thread is not promoted
4178 */
4179 *arg1 = thread->sched_pri;
3e170ce0 4180 *arg2 = thread->base_pri;
6d2010ae 4181
39236c6e
A
4182 return (THREAD_URGENCY_BACKGROUND);
4183 } else {
fe8ab488
A
4184 /* For otherwise unclassified threads, report throughput QoS
4185 * parameters
4186 */
39037602
A
4187 *arg1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
4188 *arg2 = proc_get_effective_task_policy(thread->task, TASK_POLICY_THROUGH_QOS);
4189
6d2010ae
A
4190 return (THREAD_URGENCY_NORMAL);
4191 }
6d2010ae
A
4192}
4193
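/*
 * Query sketch (hypothetical helper): reading a thread's urgency.  The two
 * out-parameters are interpreted per the returned class: period/deadline for
 * THREAD_URGENCY_REAL_TIME, sched_pri/base_pri for THREAD_URGENCY_BACKGROUND,
 * and throughput QoS values for THREAD_URGENCY_NORMAL.  The same locking
 * pattern appears in sched_work_interval_notify() below.
 */
#if 0
static int
example_query_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
{
	spl_t s = splsched();
	thread_lock(thread);

	int urgency = thread_get_urgency(thread, arg1, arg2);

	thread_unlock(thread);
	splx(s);

	return urgency;
}
#endif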
4194
1c79356b 4195/*
2d21ac55
A
4196 * This is the processor idle loop, which just looks for other threads
4197 * to execute. Processor idle threads invoke this without supplying a
4198 * current thread, in order to idle without an asserted wait state.
4199 *
4200 * Returns the next thread to execute if dispatched directly.
1c79356b 4201 */
6d2010ae
A
4202
4203#if 0
4204#define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
4205#else
4206#define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
4207#endif
4208
4209thread_t
2d21ac55
A
4210processor_idle(
4211 thread_t thread,
4212 processor_t processor)
1c79356b 4213{
2d21ac55
A
4214 processor_set_t pset = processor->processor_set;
4215 thread_t new_thread;
4216 int state;
2d21ac55 4217 (void)splsched();
1c79356b 4218
316670eb
A
4219 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4220 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_START,
4221 (uintptr_t)thread_tid(thread), 0, 0, 0, 0);
3a60a9f5 4222
6d2010ae
A
4223 SCHED_STATS_CPU_IDLE_START(processor);
4224
2d21ac55
A
4225 timer_switch(&PROCESSOR_DATA(processor, system_state),
4226 mach_absolute_time(), &PROCESSOR_DATA(processor, idle_state));
4227 PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state);
3a60a9f5 4228
39236c6e 4229 while (1) {
39236c6e
A
4230 if (processor->state != PROCESSOR_IDLE) /* unsafe, but worst case we loop around once */
4231 break;
3e170ce0 4232 if (pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))
39236c6e 4233 break;
3e170ce0
A
4234 if (processor->is_recommended) {
4235 if (rt_runq.count)
4236 break;
4237 } else {
4238 if (SCHED(processor_bound_count)(processor))
4239 break;
4240 }
4241
39236c6e
A
4242#if CONFIG_SCHED_IDLE_IN_PLACE
4243 if (thread != THREAD_NULL) {
4244 /* Did idle-in-place thread wake up */
4245 if ((thread->state & (TH_WAIT|TH_SUSP)) != TH_WAIT || thread->wake_active)
4246 break;
4247 }
4248#endif
4249
6d2010ae
A
4250 IDLE_KERNEL_DEBUG_CONSTANT(
4251 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -1, 0);
4252
4b17d6b6
A
4253 machine_track_platform_idle(TRUE);
4254
2d21ac55 4255 machine_idle();
55e303ae 4256
4b17d6b6
A
4257 machine_track_platform_idle(FALSE);
4258
55e303ae 4259 (void)splsched();
c910b4d9 4260
6d2010ae
A
4261 IDLE_KERNEL_DEBUG_CONSTANT(
4262 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -2, 0);
4263
fe8ab488
A
4264 if (!SCHED(processor_queue_empty)(processor)) {
4265 /* Secondary SMT processors respond to directed wakeups
4266 * exclusively. Some platforms induce 'spurious' SMT wakeups.
4267 */
4268 if (processor->processor_primary == processor)
4269 break;
4270 }
55e303ae
A
4271 }
4272
2d21ac55
A
4273 timer_switch(&PROCESSOR_DATA(processor, idle_state),
4274 mach_absolute_time(), &PROCESSOR_DATA(processor, system_state));
4275 PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state);
1c79356b 4276
2d21ac55
A
4277 pset_lock(pset);
4278
39236c6e 4279 /* If we were sent a remote AST and came out of idle, acknowledge it here with pset lock held */
3e170ce0
A
4280 pset->pending_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
4281#if defined(CONFIG_SCHED_DEFERRED_AST)
4282 pset->pending_deferred_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
4283#endif
39236c6e 4284
55e303ae
A
4285 state = processor->state;
4286 if (state == PROCESSOR_DISPATCHING) {
1c79356b 4287 /*
55e303ae 4288 * Common case -- cpu dispatched.
1c79356b 4289 */
2d21ac55
A
4290 new_thread = processor->next_thread;
4291 processor->next_thread = THREAD_NULL;
55e303ae 4292 processor->state = PROCESSOR_RUNNING;
1c79356b 4293
39236c6e 4294 if ((new_thread != THREAD_NULL) && (SCHED(processor_queue_has_priority)(processor, new_thread->sched_pri, FALSE) ||
3e170ce0 4295 (rt_runq.count > 0)) ) {
fe8ab488 4296 /* Something higher priority has popped up on the runqueue - redispatch this thread elsewhere */
39236c6e
A
4297 processor->current_pri = IDLEPRI;
4298 processor->current_thmode = TH_MODE_FIXED;
fe8ab488 4299 processor->current_sfi_class = SFI_CLASS_KERNEL;
2d21ac55 4300 processor->deadline = UINT64_MAX;
55e303ae 4301
2d21ac55 4302 pset_unlock(pset);
1c79356b 4303
2d21ac55 4304 thread_lock(new_thread);
6d2010ae 4305 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REDISPATCH), (uintptr_t)thread_tid(new_thread), new_thread->sched_pri, rt_runq.count, 0, 0);
2d21ac55
A
4306 thread_setrun(new_thread, SCHED_HEADQ);
4307 thread_unlock(new_thread);
55e303ae 4308
316670eb
A
4309 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4310 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4311 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
6d2010ae 4312
2d21ac55 4313 return (THREAD_NULL);
1c79356b 4314 }
1c79356b 4315
2d21ac55
A
4316 pset_unlock(pset);
4317
316670eb
A
4318 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4319 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4320 (uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0);
39037602 4321
2d21ac55 4322 return (new_thread);
39037602
A
4323
4324 } else if (state == PROCESSOR_IDLE) {
4325 re_queue_tail(&pset->active_queue, &processor->processor_queue);
1c79356b 4326
2d21ac55 4327 processor->state = PROCESSOR_RUNNING;
39236c6e
A
4328 processor->current_pri = IDLEPRI;
4329 processor->current_thmode = TH_MODE_FIXED;
fe8ab488 4330 processor->current_sfi_class = SFI_CLASS_KERNEL;
39236c6e 4331 processor->deadline = UINT64_MAX;
39037602
A
4332
4333 } else if (state == PROCESSOR_SHUTDOWN) {
55e303ae
A
4334 /*
4335 * Going off-line. Force a
4336 * reschedule.
4337 */
2d21ac55
A
4338 if ((new_thread = processor->next_thread) != THREAD_NULL) {
4339 processor->next_thread = THREAD_NULL;
39236c6e
A
4340 processor->current_pri = IDLEPRI;
4341 processor->current_thmode = TH_MODE_FIXED;
fe8ab488 4342 processor->current_sfi_class = SFI_CLASS_KERNEL;
55e303ae 4343 processor->deadline = UINT64_MAX;
2d21ac55
A
4344
4345 pset_unlock(pset);
55e303ae
A
4346
4347 thread_lock(new_thread);
4348 thread_setrun(new_thread, SCHED_HEADQ);
4349 thread_unlock(new_thread);
55e303ae 4350
316670eb
A
4351 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4352 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4353 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
6d2010ae 4354
2d21ac55
A
4355 return (THREAD_NULL);
4356 }
55e303ae
A
4357 }
4358
2d21ac55
A
4359 pset_unlock(pset);
4360
316670eb
A
4361 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4362 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4363 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
6d2010ae 4364
2d21ac55
A
4365 return (THREAD_NULL);
4366}
4367
cf7d32b8
A
4368/*
4369 * Each processor has a dedicated thread which
4370 * executes the idle loop when there is no suitable
4371 * previous context.
4372 */
2d21ac55
A
4373void
4374idle_thread(void)
4375{
4376 processor_t processor = current_processor();
4377 thread_t new_thread;
4378
4379 new_thread = processor_idle(THREAD_NULL, processor);
4380 if (new_thread != THREAD_NULL) {
4381 thread_run(processor->idle_thread, (thread_continue_t)idle_thread, NULL, new_thread);
4382 /*NOTREACHED*/
4383 }
55e303ae 4384
2d21ac55 4385 thread_block((thread_continue_t)idle_thread);
55e303ae 4386 /*NOTREACHED*/
1c79356b
A
4387}
4388
91447636
A
4389kern_return_t
4390idle_thread_create(
4391 processor_t processor)
1c79356b 4392{
91447636
A
4393 kern_return_t result;
4394 thread_t thread;
4395 spl_t s;
4396
4397 result = kernel_thread_create((thread_continue_t)idle_thread, NULL, MAXPRI_KERNEL, &thread);
4398 if (result != KERN_SUCCESS)
4399 return (result);
4400
4401 s = splsched();
4402 thread_lock(thread);
4403 thread->bound_processor = processor;
4404 processor->idle_thread = thread;
3e170ce0 4405 thread->sched_pri = thread->base_pri = IDLEPRI;
91447636 4406 thread->state = (TH_RUN | TH_IDLE);
39236c6e 4407 thread->options |= TH_OPT_IDLE_THREAD;
91447636
A
4408 thread_unlock(thread);
4409 splx(s);
4410
4411 thread_deallocate(thread);
4412
4413 return (KERN_SUCCESS);
1c79356b
A
4414}
4415
91447636
A
4416/*
4417 * sched_startup:
4418 *
4419 * Kicks off scheduler services.
4420 *
4421 * Called at splsched.
4422 */
0b4e3aa0 4423void
91447636 4424sched_startup(void)
0b4e3aa0 4425{
91447636
A
4426 kern_return_t result;
4427 thread_t thread;
4428
3e170ce0
A
4429 simple_lock_init(&sched_vm_group_list_lock, 0);
4430
490019cf 4431
6d2010ae 4432 result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
39236c6e 4433 (void *)SCHED(maintenance_continuation), MAXPRI_KERNEL, &thread);
91447636
A
4434 if (result != KERN_SUCCESS)
4435 panic("sched_startup");
4436
4437 thread_deallocate(thread);
4438
39037602
A
4439 assert_thread_magic(thread);
4440
91447636 4441 /*
316670eb
A
4442 * Yield to the sched_init_thread once, to
4443 * initialize our own thread after being switched
4444 * back to.
91447636
A
4445 *
4446 * The current thread is the only other thread
4447 * active at this point.
4448 */
316670eb 4449 thread_block(THREAD_CONTINUE_NULL);
6d2010ae 4450}
91447636 4451
fe8ab488 4452#if defined(CONFIG_SCHED_TIMESHARE_CORE)
91447636 4453
39236c6e
A
4454static volatile uint64_t sched_maintenance_deadline;
4455static uint64_t sched_tick_last_abstime;
4456static uint64_t sched_tick_delta;
4457uint64_t sched_tick_max_delta;
1c79356b 4458/*
6d2010ae 4459 * sched_init_thread:
1c79356b 4460 *
55e303ae
A
4461 * Perform periodic bookkeeping functions about ten
4462 * times per second.
1c79356b 4463 */
fe8ab488 4464void
3e170ce0 4465sched_timeshare_maintenance_continue(void)
1c79356b 4466{
fe8ab488
A
4467 uint64_t sched_tick_ctime, late_time;
4468
3e170ce0
A
4469 struct sched_update_scan_context scan_context = {
4470 .earliest_bg_make_runnable_time = UINT64_MAX,
4471 .earliest_normal_make_runnable_time = UINT64_MAX,
4472 .earliest_rt_make_runnable_time = UINT64_MAX
4473 };
4474
fe8ab488 4475 sched_tick_ctime = mach_absolute_time();
1c79356b 4476
39236c6e
A
4477 if (__improbable(sched_tick_last_abstime == 0)) {
4478 sched_tick_last_abstime = sched_tick_ctime;
fe8ab488 4479 late_time = 0;
39236c6e
A
4480 sched_tick_delta = 1;
4481 } else {
fe8ab488
A
4482 late_time = sched_tick_ctime - sched_tick_last_abstime;
4483 sched_tick_delta = late_time / sched_tick_interval;
39236c6e
A
4484 /* Ensure a delta of at least 1, since the interval could be slightly
4485 * smaller than the sched_tick_interval due to dispatch
4486 * latencies.
4487 */
4488 sched_tick_delta = MAX(sched_tick_delta, 1);
4489
4490 /* In the event interrupt latencies or platform
4491 * idle events that advanced the timebase resulted
4492 * in periods where no threads were dispatched,
4493 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
4494 * iterations.
4495 */
4496 sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);
4497
4498 sched_tick_last_abstime = sched_tick_ctime;
4499 sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
4500 }
4501
fe8ab488 4502 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)|DBG_FUNC_START,
39037602 4503 sched_tick_delta, late_time, 0, 0, 0);
fe8ab488 4504
39236c6e
A
4505 /* Add a number of pseudo-ticks corresponding to the elapsed interval.
4506 * This could be greater than 1 if there were substantial intervals during
4507 * which all processors were idle, which is rare in practice.
4508 */
39037602 4509
39236c6e 4510 sched_tick += sched_tick_delta;
1c79356b
A
4511
4512 /*
91447636 4513 * Compute various averages.
1c79356b 4514 */
39236c6e 4515 compute_averages(sched_tick_delta);
1c79356b
A
4516
4517 /*
91447636 4518 * Scan the run queues for threads which
39037602
A
4519 * may need to be updated, and find the earliest runnable thread on the runqueue
4520 * to report its latency.
1c79356b 4521 */
3e170ce0
A
4522 SCHED(thread_update_scan)(&scan_context);
4523
4524 rt_runq_scan(&scan_context);
4525
4526 uint64_t ctime = mach_absolute_time();
4527
39037602
A
4528 uint64_t bg_max_latency = (ctime > scan_context.earliest_bg_make_runnable_time) ?
4529 ctime - scan_context.earliest_bg_make_runnable_time : 0;
4530
4531 uint64_t default_max_latency = (ctime > scan_context.earliest_normal_make_runnable_time) ?
4532 ctime - scan_context.earliest_normal_make_runnable_time : 0;
4533
4534 uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
4535 ctime - scan_context.earliest_rt_make_runnable_time : 0;
4536
4537 machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);
3e170ce0
A
4538
4539 /*
4540 * Check to see if the special sched VM group needs attention.
4541 */
4542 sched_vm_group_maintenance();
fe8ab488 4543
490019cf 4544
39037602
A
4545 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END,
4546 sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
4547 sched_pri_shifts[TH_BUCKET_SHARE_UT], 0, 0);
1c79356b 4548
3e170ce0
A
4549 assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
4550 thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
1c79356b
A
4551 /*NOTREACHED*/
4552}
4553
39236c6e
A
4554static uint64_t sched_maintenance_wakeups;
4555
4556/*
4557 * Determine if the set of routines formerly driven by a maintenance timer
4558 * must be invoked, based on a deadline comparison. Signals the scheduler
4559 * maintenance thread on deadline expiration. Must be invoked at an interval
4560 * lower than the "sched_tick_interval", currently accomplished by
4561 * invocation via the quantum expiration timer and at context switch time.
4562 * Performance matters: this routine reuses a timestamp approximating the
4563 * current absolute time received from the caller, and should perform
4564 * no more than a comparison against the deadline in the common case.
4565 */
4566void
3e170ce0 4567sched_timeshare_consider_maintenance(uint64_t ctime) {
39236c6e
A
4568 uint64_t ndeadline, deadline = sched_maintenance_deadline;
4569
4570 if (__improbable(ctime >= deadline)) {
4571 if (__improbable(current_thread() == sched_maintenance_thread))
4572 return;
4573 OSMemoryBarrier();
4574
4575 ndeadline = ctime + sched_tick_interval;
4576
4577 if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) {
3e170ce0 4578 thread_wakeup((event_t)sched_timeshare_maintenance_continue);
39236c6e
A
4579 sched_maintenance_wakeups++;
4580 }
4581 }
4582}
4583
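/*
 * Minimal sketch of the lock-free deadline pattern above, applied to a
 * hypothetical periodic job (names are illustrative, not compiled).  Every
 * caller pays one comparison; only the caller that wins the compare-and-swap
 * does the work for that interval.
 */
#if 0
static volatile uint64_t example_deadline;

static void
example_consider_periodic_work(uint64_t ctime, uint64_t interval)
{
	uint64_t deadline = example_deadline;

	if (__improbable(ctime >= deadline)) {
		uint64_t ndeadline = ctime + interval;

		if (__probable(__sync_bool_compare_and_swap(&example_deadline, deadline, ndeadline))) {
			/* won the race: do the periodic work exactly once per interval */
		}
	}
}
#endif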
fe8ab488 4584#endif /* CONFIG_SCHED_TIMESHARE_CORE */
6d2010ae 4585
1c79356b 4586void
6d2010ae 4587sched_init_thread(void (*continuation)(void))
1c79356b 4588{
316670eb 4589 thread_block(THREAD_CONTINUE_NULL);
91447636 4590
490019cf
A
4591 thread_t thread = current_thread();
4592
39037602
A
4593 thread_set_thread_name(thread, "sched_maintenance_thread");
4594
490019cf
A
4595 sched_maintenance_thread = thread;
4596
6d2010ae 4597 continuation();
1c79356b 4598
1c79356b
A
4599 /*NOTREACHED*/
4600}
4601
fe8ab488 4602#if defined(CONFIG_SCHED_TIMESHARE_CORE)
6d2010ae 4603
1c79356b 4604/*
91447636 4605 * thread_update_scan / runq_scan:
55e303ae 4606 *
91447636
A
4607 * Scan the run queues to account for timesharing threads
4608 * which need to be updated.
1c79356b
A
4609 *
4610 * Scanner runs in two passes. Pass one squirrels likely
91447636 4611 * threads away in an array, pass two does the update.
1c79356b 4612 *
91447636
A
4613 * This is necessary because the run queue is locked for
4614 * the candidate scan, but the thread is locked for the update.
1c79356b 4615 *
91447636
A
4616 * Array should be sized to make forward progress, without
4617 * disabling preemption for long periods.
1c79356b 4618 */
55e303ae 4619
91447636 4620#define THREAD_UPDATE_SIZE 128
55e303ae 4621
39037602
A
4622static thread_t thread_update_array[THREAD_UPDATE_SIZE];
4623static uint32_t thread_update_count = 0;
1c79356b 4624
fe8ab488
A
4625/* Returns TRUE if thread was added, FALSE if thread_update_array is full */
4626boolean_t
4627thread_update_add_thread(thread_t thread)
4628{
4629 if (thread_update_count == THREAD_UPDATE_SIZE)
4630 return (FALSE);
4631
4632 thread_update_array[thread_update_count++] = thread;
4633 thread_reference_internal(thread);
4634 return (TRUE);
4635}
4636
4637void
4638thread_update_process_threads(void)
4639{
39037602 4640 assert(thread_update_count <= THREAD_UPDATE_SIZE);
fe8ab488 4641
39037602
A
4642 for (uint32_t i = 0 ; i < thread_update_count ; i++) {
4643 thread_t thread = thread_update_array[i];
4644 assert_thread_magic(thread);
4645 thread_update_array[i] = THREAD_NULL;
4646
4647 spl_t s = splsched();
fe8ab488 4648 thread_lock(thread);
39037602 4649 if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) {
fe8ab488
A
4650 SCHED(update_priority)(thread);
4651 }
4652 thread_unlock(thread);
4653 splx(s);
4654
4655 thread_deallocate(thread);
4656 }
39037602
A
4657
4658 thread_update_count = 0;
fe8ab488
A
4659}
4660
1c79356b 4661/*
91447636
A
4662 * Scan a runq for candidate threads.
4663 *
4664 * Returns TRUE if retry is needed.
1c79356b 4665 */
fe8ab488 4666boolean_t
91447636 4667runq_scan(
39037602
A
4668 run_queue_t runq,
4669 sched_update_scan_context_t scan_context)
1c79356b 4670{
39037602
A
4671 int count = runq->count;
4672 int queue_index;
1c79356b 4673
39037602
A
4674 assert(count >= 0);
4675
4676 if (count == 0)
4677 return FALSE;
4678
4679 for (queue_index = bitmap_first(runq->bitmap, NRQS);
4680 queue_index >= 0;
4681 queue_index = bitmap_next(runq->bitmap, queue_index)) {
4682
4683 thread_t thread;
4684 queue_t queue = &runq->queues[queue_index];
3e170ce0 4685
39037602
A
4686 qe_foreach_element(thread, queue, runq_links) {
4687 assert(count > 0);
4688 assert_thread_magic(thread);
4689
4690 if (thread->sched_stamp != sched_tick &&
4691 thread->sched_mode == TH_MODE_TIMESHARE) {
4692 if (thread_update_add_thread(thread) == FALSE)
4693 return TRUE;
1c79356b
A
4694 }
4695
39037602
A
4696 if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
4697 if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
4698 scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
4699 }
4700 } else {
4701 if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
4702 scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
4703 }
4704 }
4705 count--;
1c79356b
A
4706 }
4707 }
1c79356b 4708
39037602 4709 return FALSE;
1c79356b
A
4710}
4711
fe8ab488
A
4712#endif /* CONFIG_SCHED_TIMESHARE_CORE */
4713
6d2010ae
A
4714boolean_t
4715thread_eager_preemption(thread_t thread)
4716{
4717 return ((thread->sched_flags & TH_SFLAG_EAGERPREEMPT) != 0);
4718}
4719
4720void
4721thread_set_eager_preempt(thread_t thread)
4722{
4723 spl_t x;
4724 processor_t p;
4725 ast_t ast = AST_NONE;
4726
4727 x = splsched();
4728 p = current_processor();
4729
4730 thread_lock(thread);
4731 thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;
4732
4733 if (thread == current_thread()) {
6d2010ae 4734
fe8ab488
A
4735 ast = csw_check(p, AST_NONE);
4736 thread_unlock(thread);
6d2010ae
A
4737 if (ast != AST_NONE) {
4738 (void) thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
4739 }
4740 } else {
4741 p = thread->last_processor;
4742
4743 if (p != PROCESSOR_NULL && p->state == PROCESSOR_RUNNING &&
4744 p->active_thread == thread) {
4745 cause_ast_check(p);
4746 }
1c79356b 4747
6d2010ae
A
4748 thread_unlock(thread);
4749 }
4750
4751 splx(x);
4752}
4753
4754void
4755thread_clear_eager_preempt(thread_t thread)
4756{
4757 spl_t x;
4758
4759 x = splsched();
4760 thread_lock(thread);
4761
4762 thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
4763
4764 thread_unlock(thread);
4765 splx(x);
4766}
3e170ce0 4767
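/*
 * Usage sketch (hypothetical): marking the current thread as eagerly
 * preemptible.  While the flag is set, processor_setrun() and
 * csw_check_locked() above treat any newly runnable thread as an urgent
 * preemption of this one.
 */
#if 0
static void
example_eagerly_preemptible_section(void)
{
	thread_t self = current_thread();

	thread_set_eager_preempt(self);
	/* ... work during which this thread should yield immediately ... */
	thread_clear_eager_preempt(self);
}
#endif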
6d2010ae
A
4768/*
4769 * Scheduling statistics
4770 */
4771void
4772sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
4773{
4774 struct processor_sched_statistics *stats;
4775 boolean_t to_realtime = FALSE;
4776
4777 stats = &processor->processor_data.sched_stats;
4778 stats->csw_count++;
4779
4780 if (otherpri >= BASEPRI_REALTIME) {
4781 stats->rt_sched_count++;
4782 to_realtime = TRUE;
4783 }
4784
4785 if ((reasons & AST_PREEMPT) != 0) {
4786 stats->preempt_count++;
4787
4788 if (selfpri >= BASEPRI_REALTIME) {
4789 stats->preempted_rt_count++;
4790 }
4791
4792 if (to_realtime) {
4793 stats->preempted_by_rt_count++;
4794 }
4795
4796 }
4797}
4798
4799void
4800sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
4801{
4802 uint64_t timestamp = mach_absolute_time();
4803
4804 stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
4805 stats->last_change_timestamp = timestamp;
4806}
4807
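/*
 * Worked example (illustrative, names hypothetical): count_sum accumulates
 * "runnable threads x time", so dividing by an observation window gives the
 * average run queue depth over that window, assuming the stats were reset at
 * window_start and sched_stats_handle_runq_change() was called on every
 * count change (including one final call at 'now').
 */
#if 0
static uint64_t
example_average_runq_depth(struct runq_stats *stats, uint64_t window_start, uint64_t now)
{
	if (now <= window_start)
		return 0;

	return stats->count_sum / (now - window_start);
}
#endif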
1c79356b 4808/*
6d2010ae 4809 * For calls from assembly code
1c79356b 4810 */
6d2010ae 4811#undef thread_wakeup
1c79356b
A
4812void
4813thread_wakeup(
6d2010ae 4814 event_t x);
1c79356b
A
4815
4816void
4817thread_wakeup(
6d2010ae 4818 event_t x)
1c79356b 4819{
6d2010ae 4820 thread_wakeup_with_result(x, THREAD_AWAKENED);
1c79356b
A
4821}
4822
91447636
A
4823boolean_t
4824preemption_enabled(void)
4825{
4826 return (get_preemption_level() == 0 && ml_get_interrupts_enabled());
4827}
9bccf70c 4828
4b17d6b6
A
4829static void
4830sched_timer_deadline_tracking_init(void) {
4831 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
4832 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
4833}
3e170ce0
A
4834
4835
4836kern_return_t
4837sched_work_interval_notify(thread_t thread, uint64_t work_interval_id, uint64_t start, uint64_t finish, uint64_t deadline, uint64_t next_start, uint32_t flags)
4838{
4839 int urgency;
4840 uint64_t urgency_param1, urgency_param2;
4841 spl_t s;
4842
4843 if (work_interval_id == 0) {
4844 return (KERN_INVALID_ARGUMENT);
4845 }
4846
4847 assert(thread == current_thread());
4848
4849 thread_mtx_lock(thread);
4850 if (thread->work_interval_id != work_interval_id) {
4851 thread_mtx_unlock(thread);
4852 return (KERN_INVALID_ARGUMENT);
4853 }
4854 thread_mtx_unlock(thread);
4855
4856 s = splsched();
4857 thread_lock(thread);
4858 urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
4859 thread_unlock(thread);
4860 splx(s);
4861
4862 machine_work_interval_notify(thread, work_interval_id, start, finish, deadline, next_start, urgency, flags);
4863 return (KERN_SUCCESS);
4864}
4865
4866void thread_set_options(uint32_t thopt) {
4867 spl_t x;
4868 thread_t t = current_thread();
4869
4870 x = splsched();
4871 thread_lock(t);
4872
4873 t->options |= thopt;
4874
4875 thread_unlock(t);
4876 splx(x);
4877}