]> git.saurik.com Git - apple/xnu.git/blame - osfmk/kern/sched_prim.c
xnu-3248.40.184.tar.gz
[apple/xnu.git] / osfmk / kern / sched_prim.c
CommitLineData
1c79356b 1/*
39236c6e 2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
1c79356b 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
1c79356b 5 *
2d21ac55
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
8f6c56a5 14 *
2d21ac55
A
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
8f6c56a5 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b
A
27 */
28/*
29 * @OSF_FREE_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 * File: sched_prim.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1986
62 *
63 * Scheduling primitives
64 *
65 */
66
67#include <debug.h>
91447636
A
68
69#include <mach/mach_types.h>
1c79356b 70#include <mach/machine.h>
91447636
A
71#include <mach/policy.h>
72#include <mach/sync_policy.h>
6d2010ae 73#include <mach/thread_act.h>
91447636 74
1c79356b
A
75#include <machine/machine_routines.h>
76#include <machine/sched_param.h>
0c530ab8 77#include <machine/machine_cpu.h>
6d2010ae 78#include <machine/machlimits.h>
91447636 79
fe8ab488
A
80#ifdef CONFIG_MACH_APPROXIMATE_TIME
81#include <machine/commpage.h>
82#endif
83
91447636 84#include <kern/kern_types.h>
1c79356b
A
85#include <kern/clock.h>
86#include <kern/counters.h>
87#include <kern/cpu_number.h>
88#include <kern/cpu_data.h>
3e170ce0 89#include <kern/smp.h>
91447636 90#include <kern/debug.h>
1c79356b
A
91#include <kern/macro_help.h>
92#include <kern/machine.h>
93#include <kern/misc_protos.h>
94#include <kern/processor.h>
95#include <kern/queue.h>
96#include <kern/sched.h>
97#include <kern/sched_prim.h>
fe8ab488 98#include <kern/sfi.h>
1c79356b
A
99#include <kern/syscall_subr.h>
100#include <kern/task.h>
101#include <kern/thread.h>
316670eb 102#include <kern/ledger.h>
39236c6e 103#include <kern/timer_queue.h>
3e170ce0 104#include <kern/waitq.h>
91447636 105
1c79356b
A
106#include <vm/pmap.h>
107#include <vm/vm_kern.h>
108#include <vm/vm_map.h>
91447636 109
b0d623f7
A
110#include <mach/sdt.h>
111
1c79356b
A
112#include <sys/kdebug.h>
113
0c530ab8 114#include <kern/pms.h>
3a60a9f5 115
fe8ab488
A
116#if defined(CONFIG_TELEMETRY) && defined(CONFIG_SCHED_TIMESHARE_CORE)
117#include <kern/telemetry.h>
118#endif
119
6d2010ae 120struct rt_queue rt_runq;
2d21ac55 121
3e170ce0
A
122uintptr_t sched_thread_on_rt_queue = (uintptr_t)0xDEAFBEE0;
123
124/* Lock RT runq, must be done with interrupts disabled (under splsched()) */
125#if __SMP__
126decl_simple_lock_data(static,rt_lock);
127#define rt_lock_init() simple_lock_init(&rt_lock, 0)
128#define rt_lock_lock() simple_lock(&rt_lock)
129#define rt_lock_unlock() simple_unlock(&rt_lock)
130#else
131#define rt_lock_init() do { } while(0)
132#define rt_lock_lock() do { } while(0)
133#define rt_lock_unlock() do { } while(0)
134#endif
6d2010ae 135
0b4e3aa0 136#define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */
1c79356b
A
137int default_preemption_rate = DEFAULT_PREEMPTION_RATE;
138
316670eb
A
139#define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */
140int default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
141
0b4e3aa0
A
142#define MAX_UNSAFE_QUANTA 800
143int max_unsafe_quanta = MAX_UNSAFE_QUANTA;
144
145#define MAX_POLL_QUANTA 2
146int max_poll_quanta = MAX_POLL_QUANTA;
147
148#define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */
149int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
150
55e303ae
A
151uint64_t max_poll_computation;
152
6d2010ae
A
153uint64_t max_unsafe_computation;
154uint64_t sched_safe_duration;
155
fe8ab488 156#if defined(CONFIG_SCHED_TIMESHARE_CORE)
6d2010ae 157
55e303ae
A
158uint32_t std_quantum;
159uint32_t min_std_quantum;
316670eb 160uint32_t bg_quantum;
55e303ae 161
91447636 162uint32_t std_quantum_us;
316670eb 163uint32_t bg_quantum_us;
91447636 164
fe8ab488 165#endif /* CONFIG_SCHED_TIMESHARE_CORE */
6d2010ae
A
166
167uint32_t thread_depress_time;
168uint32_t default_timeshare_computation;
169uint32_t default_timeshare_constraint;
170
55e303ae
A
171uint32_t max_rt_quantum;
172uint32_t min_rt_quantum;
173
fe8ab488 174#if defined(CONFIG_SCHED_TIMESHARE_CORE)
6d2010ae 175
1c79356b 176unsigned sched_tick;
91447636 177uint32_t sched_tick_interval;
fe8ab488
A
178#if defined(CONFIG_TELEMETRY)
179uint32_t sched_telemetry_interval;
180#endif /* CONFIG_TELEMETRY */
1c79356b 181
2d21ac55 182uint32_t sched_pri_shift = INT8_MAX;
39236c6e
A
183uint32_t sched_background_pri_shift = INT8_MAX;
184uint32_t sched_combined_fgbg_pri_shift = INT8_MAX;
2d21ac55 185uint32_t sched_fixed_shift;
39236c6e
A
186uint32_t sched_use_combined_fgbg_decay = 0;
187
188uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */
2d21ac55 189
fe8ab488
A
190/* Allow foreground to decay past default to resolve inversions */
191#define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
192int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
6d2010ae 193
4b17d6b6
A
194/* Defaults for timer deadline profiling */
195#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
39236c6e 196 * 2ms */
4b17d6b6 197#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
39236c6e
A
198 <= 5ms */
199
4b17d6b6
A
200uint64_t timer_deadline_tracking_bin_1;
201uint64_t timer_deadline_tracking_bin_2;
202
490019cf
A
203#endif /* CONFIG_SCHED_TIMESHARE_CORE */
204
39236c6e
A
205thread_t sched_maintenance_thread;
206
fe8ab488 207
6d2010ae
A
208uint64_t sched_one_second_interval;
209
39236c6e 210uint32_t sched_run_count, sched_share_count, sched_background_count;
2d21ac55
A
211uint32_t sched_load_average, sched_mach_factor;
212
1c79356b 213/* Forwards */
6d2010ae 214
fe8ab488 215#if defined(CONFIG_SCHED_TIMESHARE_CORE)
6d2010ae 216
39236c6e
A
217static void load_shift_init(void);
218static void preempt_pri_init(void);
2d21ac55 219
fe8ab488 220#endif /* CONFIG_SCHED_TIMESHARE_CORE */
c910b4d9 221
6d2010ae
A
222static thread_t thread_select(
223 thread_t thread,
fe8ab488
A
224 processor_t processor,
225 ast_t reason);
b0d623f7 226
6d2010ae 227#if CONFIG_SCHED_IDLE_IN_PLACE
2d21ac55
A
228static thread_t thread_select_idle(
229 thread_t thread,
230 processor_t processor);
6d2010ae 231#endif
1c79356b 232
6d2010ae 233thread_t processor_idle(
2d21ac55
A
234 thread_t thread,
235 processor_t processor);
91447636 236
39236c6e
A
237ast_t
238csw_check_locked( processor_t processor,
fe8ab488
A
239 processor_set_t pset,
240 ast_t check_reason);
39236c6e 241
6d2010ae
A
242static void processor_setrun(
243 processor_t processor,
244 thread_t thread,
245 integer_t options);
246
6d2010ae 247static void
39236c6e 248sched_realtime_init(void);
6d2010ae
A
249
250static void
251sched_realtime_timebase_init(void);
252
4b17d6b6
A
253static void
254sched_timer_deadline_tracking_init(void);
255
2d21ac55
A
256#if DEBUG
257extern int debug_task;
258#define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
259#else
260#define TLOG(a, fmt, args...) do {} while (0)
261#endif
262
3e170ce0
A
263static processor_t
264thread_bind_internal(
265 thread_t thread,
266 processor_t processor);
1c79356b 267
3e170ce0
A
268static void
269sched_vm_group_maintenance(void);
1c79356b 270
fe8ab488 271#if defined(CONFIG_SCHED_TIMESHARE_CORE)
91447636 272int8_t sched_load_shifts[NRQS];
b0d623f7 273int sched_preempt_pri[NRQBM];
fe8ab488 274#endif /* CONFIG_SCHED_TIMESHARE_CORE */
6d2010ae 275
6d2010ae
A
276const struct sched_dispatch_table *sched_current_dispatch = NULL;
277
278/*
279 * Statically allocate a buffer to hold the longest possible
280 * scheduler description string, as currently implemented.
281 * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
282 * to export to userspace via sysctl(3). If either version
283 * changes, update the other.
284 *
285 * Note that in addition to being an upper bound on the strings
286 * in the kernel, it's also an exact parameter to PE_get_default(),
287 * which interrogates the device tree on some platforms. That
288 * API requires the caller know the exact size of the device tree
289 * property, so we need both a legacy size (32) and the current size
290 * (48) to deal with old and new device trees. The device tree property
291 * is similarly padded to a fixed size so that the same kernel image
292 * can run on multiple devices with different schedulers configured
293 * in the device tree.
294 */
6d2010ae 295char sched_string[SCHED_STRING_MAX_LENGTH];
3e170ce0
A
296
297uint32_t sched_debug_flags;
39236c6e
A
298
299/* Global flag which indicates whether Background Stepper Context is enabled */
300static int cpu_throttle_enabled = 1;
91447636 301
1c79356b
A
302void
303sched_init(void)
6d2010ae
A
304{
305 char sched_arg[SCHED_STRING_MAX_LENGTH] = { '\0' };
306
307 /* Check for runtime selection of the scheduler algorithm */
308 if (!PE_parse_boot_argn("sched", sched_arg, sizeof (sched_arg))) {
309 /* If no boot-args override, look in device tree */
310 if (!PE_get_default("kern.sched", sched_arg,
311 SCHED_STRING_MAX_LENGTH)) {
312 sched_arg[0] = '\0';
313 }
314 }
315
fe8ab488
A
316
317 if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
318 /* No boot-args, check in device tree */
319 if (!PE_get_default("kern.sched_pri_decay_limit",
320 &sched_pri_decay_band_limit,
321 sizeof(sched_pri_decay_band_limit))) {
322 /* Allow decay all the way to normal limits */
323 sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
324 }
325 }
326
327 kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);
328
6d2010ae
A
329 if (strlen(sched_arg) > 0) {
330 if (0) {
331 /* Allow pattern below */
332#if defined(CONFIG_SCHED_TRADITIONAL)
3e170ce0 333 } else if (0 == strcmp(sched_arg, sched_traditional_dispatch.sched_name)) {
6d2010ae 334 sched_current_dispatch = &sched_traditional_dispatch;
3e170ce0 335 } else if (0 == strcmp(sched_arg, sched_traditional_with_pset_runqueue_dispatch.sched_name)) {
6d2010ae 336 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
6d2010ae
A
337#endif
338#if defined(CONFIG_SCHED_PROTO)
3e170ce0 339 } else if (0 == strcmp(sched_arg, sched_proto_dispatch.sched_name)) {
6d2010ae 340 sched_current_dispatch = &sched_proto_dispatch;
6d2010ae
A
341#endif
342#if defined(CONFIG_SCHED_GRRR)
3e170ce0 343 } else if (0 == strcmp(sched_arg, sched_grrr_dispatch.sched_name)) {
6d2010ae 344 sched_current_dispatch = &sched_grrr_dispatch;
6d2010ae 345#endif
fe8ab488 346#if defined(CONFIG_SCHED_MULTIQ)
3e170ce0 347 } else if (0 == strcmp(sched_arg, sched_multiq_dispatch.sched_name)) {
fe8ab488 348 sched_current_dispatch = &sched_multiq_dispatch;
3e170ce0 349 } else if (0 == strcmp(sched_arg, sched_dualq_dispatch.sched_name)) {
fe8ab488 350 sched_current_dispatch = &sched_dualq_dispatch;
6d2010ae
A
351#endif
352 } else {
fe8ab488
A
353#if defined(CONFIG_SCHED_TRADITIONAL)
354 printf("Unrecognized scheduler algorithm: %s\n", sched_arg);
3e170ce0 355 printf("Scheduler: Using instead: %s\n", sched_traditional_with_pset_runqueue_dispatch.sched_name);
fe8ab488 356 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
fe8ab488 357#else
6d2010ae 358 panic("Unrecognized scheduler algorithm: %s", sched_arg);
fe8ab488 359#endif
6d2010ae 360 }
3e170ce0 361 kprintf("Scheduler: Runtime selection of %s\n", SCHED(sched_name));
6d2010ae 362 } else {
fe8ab488
A
363#if defined(CONFIG_SCHED_MULTIQ)
364 sched_current_dispatch = &sched_multiq_dispatch;
fe8ab488 365#elif defined(CONFIG_SCHED_TRADITIONAL)
39236c6e 366 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
6d2010ae
A
367#elif defined(CONFIG_SCHED_PROTO)
368 sched_current_dispatch = &sched_proto_dispatch;
6d2010ae
A
369#elif defined(CONFIG_SCHED_GRRR)
370 sched_current_dispatch = &sched_grrr_dispatch;
6d2010ae
A
371#else
372#error No default scheduler implementation
373#endif
3e170ce0
A
374 kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
375 }
376
377 strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));
378
379 if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
380 kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
6d2010ae
A
381 }
382
383 SCHED(init)();
6d2010ae
A
384 sched_realtime_init();
385 ast_init();
4b17d6b6 386 sched_timer_deadline_tracking_init();
39236c6e 387
6d2010ae
A
388 SCHED(pset_init)(&pset0);
389 SCHED(processor_init)(master_processor);
390}
391
392void
393sched_timebase_init(void)
394{
395 uint64_t abstime;
396
397 clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
398 sched_one_second_interval = abstime;
399
400 SCHED(timebase_init)();
401 sched_realtime_timebase_init();
402}
403
fe8ab488 404#if defined(CONFIG_SCHED_TIMESHARE_CORE)
6d2010ae 405
fe8ab488 406void
3e170ce0 407sched_timeshare_init(void)
1c79356b
A
408{
409 /*
0b4e3aa0
A
410 * Calculate the timeslicing quantum
411 * in us.
1c79356b
A
412 */
413 if (default_preemption_rate < 1)
414 default_preemption_rate = DEFAULT_PREEMPTION_RATE;
0b4e3aa0 415 std_quantum_us = (1000 * 1000) / default_preemption_rate;
1c79356b 416
0b4e3aa0 417 printf("standard timeslicing quantum is %d us\n", std_quantum_us);
1c79356b 418
316670eb
A
419 if (default_bg_preemption_rate < 1)
420 default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
421 bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
422
423 printf("standard background quantum is %d us\n", bg_quantum_us);
424
91447636 425 load_shift_init();
4a3eedf9 426 preempt_pri_init();
1c79356b 427 sched_tick = 0;
1c79356b
A
428}
429
fe8ab488 430void
3e170ce0 431sched_timeshare_timebase_init(void)
55e303ae 432{
91447636
A
433 uint64_t abstime;
434 uint32_t shift;
55e303ae 435
91447636 436 /* standard timeslicing quantum */
55e303ae
A
437 clock_interval_to_absolutetime_interval(
438 std_quantum_us, NSEC_PER_USEC, &abstime);
439 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
b0d623f7 440 std_quantum = (uint32_t)abstime;
55e303ae 441
91447636 442 /* smallest remaining quantum (250 us) */
55e303ae
A
443 clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
444 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
b0d623f7 445 min_std_quantum = (uint32_t)abstime;
55e303ae 446
316670eb
A
447 /* quantum for background tasks */
448 clock_interval_to_absolutetime_interval(
449 bg_quantum_us, NSEC_PER_USEC, &abstime);
450 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
451 bg_quantum = (uint32_t)abstime;
452
91447636
A
453 /* scheduler tick interval */
454 clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
455 NSEC_PER_USEC, &abstime);
cf7d32b8 456 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
b0d623f7 457 sched_tick_interval = (uint32_t)abstime;
55e303ae 458
91447636
A
459 /*
460 * Compute conversion factor from usage to
461 * timesharing priorities with 5/8 ** n aging.
462 */
463 abstime = (abstime * 5) / 3;
464 for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift)
465 abstime >>= 1;
2d21ac55 466 sched_fixed_shift = shift;
91447636 467
fe8ab488
A
468 max_unsafe_computation = ((uint64_t)max_unsafe_quanta) * std_quantum;
469 sched_safe_duration = 2 * ((uint64_t)max_unsafe_quanta) * std_quantum;
6d2010ae 470
fe8ab488 471 max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
6d2010ae
A
472 thread_depress_time = 1 * std_quantum;
473 default_timeshare_computation = std_quantum / 2;
474 default_timeshare_constraint = std_quantum;
475
fe8ab488
A
476#if defined(CONFIG_TELEMETRY)
477 /* interval for high frequency telemetry */
478 clock_interval_to_absolutetime_interval(10, NSEC_PER_MSEC, &abstime);
479 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
480 sched_telemetry_interval = (uint32_t)abstime;
481#endif
490019cf 482
6d2010ae
A
483}
484
fe8ab488
A
485#endif /* CONFIG_SCHED_TIMESHARE_CORE */
486
6d2010ae
A
487static void
488sched_realtime_init(void)
489{
3e170ce0 490 rt_lock_init();
6d2010ae
A
491
492 rt_runq.count = 0;
493 queue_init(&rt_runq.queue);
55e303ae
A
494}
495
6d2010ae
A
496static void
497sched_realtime_timebase_init(void)
498{
499 uint64_t abstime;
500
501 /* smallest rt computaton (50 us) */
502 clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
503 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
504 min_rt_quantum = (uint32_t)abstime;
505
506 /* maximum rt computation (50 ms) */
507 clock_interval_to_absolutetime_interval(
508 50, 1000*NSEC_PER_USEC, &abstime);
509 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
510 max_rt_quantum = (uint32_t)abstime;
511
512}
513
fe8ab488 514#if defined(CONFIG_SCHED_TIMESHARE_CORE)
6d2010ae 515
91447636
A
516/*
517 * Set up values for timeshare
518 * loading factors.
519 */
520static void
521load_shift_init(void)
522{
523 int8_t k, *p = sched_load_shifts;
524 uint32_t i, j;
525
39236c6e
A
526 uint32_t sched_decay_penalty = 1;
527
528 if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof (sched_decay_penalty))) {
529 kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
530 }
531
532 if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof (sched_decay_usage_age_factor))) {
533 kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
534 }
535
536 if (PE_parse_boot_argn("sched_use_combined_fgbg_decay", &sched_use_combined_fgbg_decay, sizeof (sched_use_combined_fgbg_decay))) {
537 kprintf("Overriding schedule fg/bg decay calculation: %u\n", sched_use_combined_fgbg_decay);
538 }
539
540 if (sched_decay_penalty == 0) {
541 /*
542 * There is no penalty for timeshare threads for using too much
543 * CPU, so set all load shifts to INT8_MIN. Even under high load,
544 * sched_pri_shift will be >INT8_MAX, and there will be no
545 * penalty applied to threads (nor will sched_usage be updated per
546 * thread).
547 */
548 for (i = 0; i < NRQS; i++) {
549 sched_load_shifts[i] = INT8_MIN;
550 }
551
552 return;
553 }
554
91447636
A
555 *p++ = INT8_MIN; *p++ = 0;
556
39236c6e
A
557 /*
558 * For a given system load "i", the per-thread priority
559 * penalty per quantum of CPU usage is ~2^k priority
560 * levels. "sched_decay_penalty" can cause more
561 * array entries to be filled with smaller "k" values
562 */
563 for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
564 for (j <<= 1; (i < j) && (i < NRQS); ++i)
91447636
A
565 *p++ = k;
566 }
567}
568
4a3eedf9
A
569static void
570preempt_pri_init(void)
571{
572 int i, *p = sched_preempt_pri;
573
39236c6e 574 for (i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i)
4a3eedf9
A
575 setbit(i, p);
576
577 for (i = BASEPRI_PREEMPT; i <= MAXPRI; ++i)
578 setbit(i, p);
579}
580
fe8ab488 581#endif /* CONFIG_SCHED_TIMESHARE_CORE */
6d2010ae 582
1c79356b 583/*
0b4e3aa0 584 * Thread wait timer expiration.
1c79356b
A
585 */
586void
587thread_timer_expire(
91447636
A
588 void *p0,
589 __unused void *p1)
1c79356b
A
590{
591 thread_t thread = p0;
592 spl_t s;
593
594 s = splsched();
55e303ae 595 thread_lock(thread);
91447636 596 if (--thread->wait_timer_active == 0) {
0b4e3aa0
A
597 if (thread->wait_timer_is_set) {
598 thread->wait_timer_is_set = FALSE;
55e303ae 599 clear_wait_internal(thread, THREAD_TIMED_OUT);
0b4e3aa0 600 }
1c79356b 601 }
55e303ae 602 thread_unlock(thread);
1c79356b
A
603 splx(s);
604}
605
1c79356b 606/*
91447636
A
607 * thread_unblock:
608 *
609 * Unblock thread on wake up.
610 *
3e170ce0 611 * Returns TRUE if the thread should now be placed on the runqueue.
91447636
A
612 *
613 * Thread must be locked.
3e170ce0
A
614 *
615 * Called at splsched().
1c79356b 616 */
91447636
A
617boolean_t
618thread_unblock(
619 thread_t thread,
620 wait_result_t wresult)
1c79356b 621{
3e170ce0 622 boolean_t ready_for_runq = FALSE;
4b17d6b6 623 thread_t cthread = current_thread();
fe8ab488 624 uint32_t new_run_count;
0b4e3aa0 625
91447636 626 /*
2d21ac55 627 * Set wait_result.
91447636
A
628 */
629 thread->wait_result = wresult;
1c79356b 630
91447636 631 /*
2d21ac55 632 * Cancel pending wait timer.
91447636 633 */
1c79356b
A
634 if (thread->wait_timer_is_set) {
635 if (timer_call_cancel(&thread->wait_timer))
636 thread->wait_timer_active--;
637 thread->wait_timer_is_set = FALSE;
638 }
639
91447636 640 /*
2d21ac55
A
641 * Update scheduling state: not waiting,
642 * set running.
91447636
A
643 */
644 thread->state &= ~(TH_WAIT|TH_UNINT);
1c79356b 645
91447636
A
646 if (!(thread->state & TH_RUN)) {
647 thread->state |= TH_RUN;
3e170ce0
A
648 thread->last_made_runnable_time = mach_approximate_time();
649
650 ready_for_runq = TRUE;
1c79356b 651
2d21ac55 652 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
1c79356b 653
91447636 654 /*
2d21ac55 655 * Update run counts.
91447636 656 */
fe8ab488 657 new_run_count = sched_run_incr(thread);
39236c6e 658 if (thread->sched_mode == TH_MODE_TIMESHARE) {
fe8ab488 659 sched_share_incr(thread);
39236c6e 660
fe8ab488
A
661 if (thread->sched_flags & TH_SFLAG_THROTTLED)
662 sched_background_incr(thread);
39236c6e 663 }
3e170ce0 664 } else {
2d21ac55
A
665 /*
666 * Signal if idling on another processor.
667 */
6d2010ae 668#if CONFIG_SCHED_IDLE_IN_PLACE
2d21ac55
A
669 if (thread->state & TH_IDLE) {
670 processor_t processor = thread->last_processor;
671
672 if (processor != current_processor())
673 machine_signal_idle(processor);
674 }
6d2010ae
A
675#else
676 assert((thread->state & TH_IDLE) == 0);
677#endif
678
fe8ab488 679 new_run_count = sched_run_count; /* updated in thread_select_idle() */
2d21ac55 680 }
1c79356b 681
3e170ce0 682
91447636
A
683 /*
684 * Calculate deadline for real-time threads.
685 */
6d2010ae 686 if (thread->sched_mode == TH_MODE_REALTIME) {
3e170ce0 687 uint64_t ctime;
fe8ab488
A
688
689 ctime = mach_absolute_time();
690 thread->realtime.deadline = thread->realtime.constraint + ctime;
0b4e3aa0
A
691 }
692
91447636
A
693 /*
694 * Clear old quantum, fail-safe computation, etc.
695 */
fe8ab488 696 thread->quantum_remaining = 0;
91447636
A
697 thread->computation_metered = 0;
698 thread->reason = AST_NONE;
1c79356b 699
4b17d6b6
A
700 /* Obtain power-relevant interrupt and "platform-idle exit" statistics.
701 * We also account for "double hop" thread signaling via
702 * the thread callout infrastructure.
703 * DRK: consider removing the callout wakeup counters in the future
704 * they're present for verification at the moment.
705 */
706 boolean_t aticontext, pidle;
707 ml_get_power_state(&aticontext, &pidle);
39236c6e
A
708
709 if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
4b17d6b6 710 ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
39236c6e
A
711 DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
712
4b17d6b6 713 uint64_t ttd = PROCESSOR_DATA(current_processor(), timer_call_ttd);
39236c6e 714
4b17d6b6
A
715 if (ttd) {
716 if (ttd <= timer_deadline_tracking_bin_1)
717 thread->thread_timer_wakeups_bin_1++;
718 else
719 if (ttd <= timer_deadline_tracking_bin_2)
720 thread->thread_timer_wakeups_bin_2++;
721 }
39236c6e 722
4b17d6b6
A
723 if (pidle) {
724 ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
725 }
39236c6e 726
4b17d6b6
A
727 } else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
728 if (cthread->callout_woken_from_icontext) {
729 ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
730 thread->thread_callout_interrupt_wakeups++;
731 if (cthread->callout_woken_from_platform_idle) {
732 ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
733 thread->thread_callout_platform_idle_wakeups++;
734 }
39236c6e
A
735
736 cthread->callout_woke_thread = TRUE;
4b17d6b6
A
737 }
738 }
739
740 if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
39236c6e
A
741 thread->callout_woken_from_icontext = aticontext;
742 thread->callout_woken_from_platform_idle = pidle;
743 thread->callout_woke_thread = FALSE;
4b17d6b6
A
744 }
745
fe8ab488
A
746 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
747 MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
748 (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result, new_run_count, 0);
b0d623f7
A
749
750 DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
91447636 751
3e170ce0 752 return (ready_for_runq);
1c79356b
A
753}
754
755/*
91447636 756 * Routine: thread_go
1c79356b 757 * Purpose:
91447636 758 * Unblock and dispatch thread.
1c79356b
A
759 * Conditions:
760 * thread lock held, IPC locks may be held.
761 * thread must have been pulled from wait queue under same lock hold.
3e170ce0
A
762 * thread must have been waiting
763 * Returns:
9bccf70c 764 * KERN_SUCCESS - Thread was set running
3e170ce0
A
765 *
766 * TODO: This should return void
1c79356b 767 */
9bccf70c 768kern_return_t
91447636 769thread_go(
3e170ce0
A
770 thread_t thread,
771 wait_result_t wresult)
1c79356b 772{
1c79356b 773 assert(thread->at_safe_point == FALSE);
9bccf70c 774 assert(thread->wait_event == NO_EVENT64);
3e170ce0 775 assert(thread->waitq == NULL);
1c79356b 776
3e170ce0
A
777 assert(!(thread->state & (TH_TERMINATE|TH_TERMINATE2)));
778 assert(thread->state & TH_WAIT);
55e303ae 779
55e303ae 780
3e170ce0
A
781 if (thread_unblock(thread, wresult))
782 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
783
784 return (KERN_SUCCESS);
1c79356b
A
785}
786
9bccf70c
A
787/*
788 * Routine: thread_mark_wait_locked
789 * Purpose:
790 * Mark a thread as waiting. If, given the circumstances,
791 * it doesn't want to wait (i.e. already aborted), then
792 * indicate that in the return value.
793 * Conditions:
794 * at splsched() and thread is locked.
795 */
796__private_extern__
797wait_result_t
1c79356b 798thread_mark_wait_locked(
9bccf70c
A
799 thread_t thread,
800 wait_interrupt_t interruptible)
1c79356b 801{
55e303ae 802 boolean_t at_safe_point;
1c79356b 803
b0d623f7 804 assert(thread == current_thread());
3e170ce0 805 assert(!(thread->state & (TH_WAIT|TH_IDLE|TH_UNINT|TH_TERMINATE2)));
b0d623f7 806
9bccf70c
A
807 /*
808 * The thread may have certain types of interrupts/aborts masked
809 * off. Even if the wait location says these types of interrupts
810 * are OK, we have to honor mask settings (outer-scoped code may
811 * not be able to handle aborts at the moment).
812 */
91447636
A
813 if (interruptible > (thread->options & TH_OPT_INTMASK))
814 interruptible = thread->options & TH_OPT_INTMASK;
9bccf70c
A
815
816 at_safe_point = (interruptible == THREAD_ABORTSAFE);
817
55e303ae 818 if ( interruptible == THREAD_UNINT ||
6d2010ae 819 !(thread->sched_flags & TH_SFLAG_ABORT) ||
55e303ae 820 (!at_safe_point &&
6d2010ae 821 (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
b0d623f7 822
316670eb
A
823 if ( !(thread->state & TH_TERMINATE))
824 DTRACE_SCHED(sleep);
b0d623f7 825
9bccf70c
A
826 thread->state |= (interruptible) ? TH_WAIT : (TH_WAIT | TH_UNINT);
827 thread->at_safe_point = at_safe_point;
9bccf70c 828 return (thread->wait_result = THREAD_WAITING);
9bccf70c 829 }
55e303ae 830 else
6d2010ae
A
831 if (thread->sched_flags & TH_SFLAG_ABORTSAFELY)
832 thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
55e303ae 833
9bccf70c 834 return (thread->wait_result = THREAD_INTERRUPTED);
1c79356b
A
835}
836
9bccf70c
A
837/*
838 * Routine: thread_interrupt_level
839 * Purpose:
840 * Set the maximum interruptible state for the
841 * current thread. The effective value of any
842 * interruptible flag passed into assert_wait
843 * will never exceed this.
844 *
845 * Useful for code that must not be interrupted,
846 * but which calls code that doesn't know that.
847 * Returns:
848 * The old interrupt level for the thread.
849 */
850__private_extern__
851wait_interrupt_t
852thread_interrupt_level(
853 wait_interrupt_t new_level)
854{
855 thread_t thread = current_thread();
91447636 856 wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
1c79356b 857
91447636 858 thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
1c79356b 859
91447636 860 return result;
1c79356b
A
861}
862
863/*
864 * Check to see if an assert wait is possible, without actually doing one.
865 * This is used by debug code in locks and elsewhere to verify that it is
866 * always OK to block when trying to take a blocking lock (since waiting
867 * for the actual assert_wait to catch the case may make it hard to detect
868 * this case.
869 */
870boolean_t
871assert_wait_possible(void)
872{
873
874 thread_t thread;
1c79356b
A
875
876#if DEBUG
877 if(debug_mode) return TRUE; /* Always succeed in debug mode */
878#endif
879
880 thread = current_thread();
881
3e170ce0 882 return (thread == NULL || waitq_wait_possible(thread));
1c79356b
A
883}
884
885/*
886 * assert_wait:
887 *
888 * Assert that the current thread is about to go to
889 * sleep until the specified event occurs.
890 */
9bccf70c 891wait_result_t
1c79356b
A
892assert_wait(
893 event_t event,
9bccf70c 894 wait_interrupt_t interruptible)
1c79356b 895{
3e170ce0
A
896 if (__improbable(event == NO_EVENT))
897 panic("%s() called with NO_EVENT", __func__);
1c79356b 898
316670eb
A
899 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
900 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
4bd07ac2 901 VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
316670eb 902
3e170ce0
A
903 struct waitq *waitq;
904 waitq = global_eventq(event);
905 return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
9bccf70c
A
906}
907
91447636
A
908wait_result_t
909assert_wait_timeout(
910 event_t event,
911 wait_interrupt_t interruptible,
912 uint32_t interval,
913 uint32_t scale_factor)
55e303ae 914{
91447636
A
915 thread_t thread = current_thread();
916 wait_result_t wresult;
91447636
A
917 uint64_t deadline;
918 spl_t s;
919
3e170ce0
A
920 if (__improbable(event == NO_EVENT))
921 panic("%s() called with NO_EVENT", __func__);
fe8ab488 922
3e170ce0
A
923 struct waitq *waitq;
924 waitq = global_eventq(event);
91447636
A
925
926 s = splsched();
3e170ce0 927 waitq_lock(waitq);
91447636
A
928 thread_lock(thread);
929
930 clock_interval_to_deadline(interval, scale_factor, &deadline);
3e170ce0 931
316670eb 932 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3e170ce0 933 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
4bd07ac2 934 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
3e170ce0
A
935
936 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
937 interruptible,
938 TIMEOUT_URGENCY_SYS_NORMAL,
939 deadline, TIMEOUT_NO_LEEWAY,
940 thread);
39236c6e
A
941
942 thread_unlock(thread);
3e170ce0 943 waitq_unlock(waitq);
39236c6e 944 splx(s);
3e170ce0 945 return wresult;
39236c6e
A
946}
947
948wait_result_t
949assert_wait_timeout_with_leeway(
950 event_t event,
951 wait_interrupt_t interruptible,
952 wait_timeout_urgency_t urgency,
953 uint32_t interval,
954 uint32_t leeway,
955 uint32_t scale_factor)
956{
957 thread_t thread = current_thread();
958 wait_result_t wresult;
39236c6e
A
959 uint64_t deadline;
960 uint64_t abstime;
961 uint64_t slop;
962 uint64_t now;
963 spl_t s;
964
3e170ce0
A
965 if (__improbable(event == NO_EVENT))
966 panic("%s() called with NO_EVENT", __func__);
967
39236c6e
A
968 now = mach_absolute_time();
969 clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
970 deadline = now + abstime;
971
972 clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
973
3e170ce0
A
974 struct waitq *waitq;
975 waitq = global_eventq(event);
39236c6e
A
976
977 s = splsched();
3e170ce0 978 waitq_lock(waitq);
39236c6e
A
979 thread_lock(thread);
980
981 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3e170ce0 982 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
4bd07ac2 983 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
3e170ce0
A
984
985 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
986 interruptible,
987 urgency, deadline, slop,
988 thread);
91447636
A
989
990 thread_unlock(thread);
3e170ce0 991 waitq_unlock(waitq);
91447636 992 splx(s);
3e170ce0 993 return wresult;
55e303ae
A
994}
995
996wait_result_t
91447636 997assert_wait_deadline(
55e303ae 998 event_t event,
91447636
A
999 wait_interrupt_t interruptible,
1000 uint64_t deadline)
55e303ae
A
1001{
1002 thread_t thread = current_thread();
91447636 1003 wait_result_t wresult;
55e303ae
A
1004 spl_t s;
1005
3e170ce0
A
1006 if (__improbable(event == NO_EVENT))
1007 panic("%s() called with NO_EVENT", __func__);
1008
1009 struct waitq *waitq;
1010 waitq = global_eventq(event);
55e303ae
A
1011
1012 s = splsched();
3e170ce0 1013 waitq_lock(waitq);
55e303ae
A
1014 thread_lock(thread);
1015
316670eb 1016 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3e170ce0 1017 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
4bd07ac2 1018 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
39236c6e 1019
3e170ce0
A
1020 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1021 interruptible,
1022 TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1023 TIMEOUT_NO_LEEWAY, thread);
39236c6e 1024 thread_unlock(thread);
3e170ce0 1025 waitq_unlock(waitq);
39236c6e 1026 splx(s);
3e170ce0 1027 return wresult;
39236c6e
A
1028}
1029
1030wait_result_t
1031assert_wait_deadline_with_leeway(
1032 event_t event,
1033 wait_interrupt_t interruptible,
1034 wait_timeout_urgency_t urgency,
1035 uint64_t deadline,
1036 uint64_t leeway)
1037{
1038 thread_t thread = current_thread();
1039 wait_result_t wresult;
39236c6e
A
1040 spl_t s;
1041
3e170ce0
A
1042 if (__improbable(event == NO_EVENT))
1043 panic("%s() called with NO_EVENT", __func__);
fe8ab488 1044
3e170ce0
A
1045 struct waitq *waitq;
1046 waitq = global_eventq(event);
39236c6e
A
1047
1048 s = splsched();
3e170ce0 1049 waitq_lock(waitq);
39236c6e
A
1050 thread_lock(thread);
1051
1052 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3e170ce0 1053 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
4bd07ac2 1054 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
39236c6e 1055
3e170ce0
A
1056 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1057 interruptible,
1058 urgency, deadline, leeway,
1059 thread);
55e303ae
A
1060
1061 thread_unlock(thread);
3e170ce0 1062 waitq_unlock(waitq);
55e303ae 1063 splx(s);
3e170ce0 1064 return wresult;
55e303ae 1065}
9bccf70c 1066
39236c6e
A
1067/*
1068 * thread_isoncpu:
1069 *
1070 * Return TRUE if a thread is running on a processor such that an AST
1071 * is needed to pull it out of userspace execution, or if executing in
1072 * the kernel, bring to a context switch boundary that would cause
1073 * thread state to be serialized in the thread PCB.
1074 *
1075 * Thread locked, returns the same way. While locked, fields
fe8ab488 1076 * like "state" cannot change. "runq" can change only from set to unset.
39236c6e
A
1077 */
1078static inline boolean_t
1079thread_isoncpu(thread_t thread)
1080{
1081 /* Not running or runnable */
1082 if (!(thread->state & TH_RUN))
1083 return (FALSE);
1084
1085 /* Waiting on a runqueue, not currently running */
fe8ab488 1086 /* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
39236c6e
A
1087 if (thread->runq != PROCESSOR_NULL)
1088 return (FALSE);
1089
3e170ce0
A
1090 /*
1091 * Thread does not have a stack yet
1092 * It could be on the stack alloc queue or preparing to be invoked
1093 */
1094 if (!thread->kernel_stack)
1095 return (FALSE);
1096
39236c6e
A
1097 /*
1098 * Thread must be running on a processor, or
1099 * about to run, or just did run. In all these
1100 * cases, an AST to the processor is needed
1101 * to guarantee that the thread is kicked out
1102 * of userspace and the processor has
1103 * context switched (and saved register state).
1104 */
1105 return (TRUE);
1106}
1107
1c79356b 1108/*
91447636 1109 * thread_stop:
1c79356b 1110 *
91447636 1111 * Force a preemption point for a thread and wait
39236c6e
A
1112 * for it to stop running on a CPU. If a stronger
1113 * guarantee is requested, wait until no longer
1114 * runnable. Arbitrates access among
91447636 1115 * multiple stop requests. (released by unstop)
1c79356b 1116 *
91447636
A
1117 * The thread must enter a wait state and stop via a
1118 * separate means.
1c79356b 1119 *
91447636 1120 * Returns FALSE if interrupted.
1c79356b
A
1121 */
1122boolean_t
1123thread_stop(
39236c6e
A
1124 thread_t thread,
1125 boolean_t until_not_runnable)
1c79356b 1126{
91447636 1127 wait_result_t wresult;
2d21ac55 1128 spl_t s = splsched();
39236c6e 1129 boolean_t oncpu;
1c79356b 1130
1c79356b 1131 wake_lock(thread);
2d21ac55 1132 thread_lock(thread);
1c79356b
A
1133
1134 while (thread->state & TH_SUSP) {
1135 thread->wake_active = TRUE;
2d21ac55
A
1136 thread_unlock(thread);
1137
91447636 1138 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1c79356b
A
1139 wake_unlock(thread);
1140 splx(s);
1141
91447636
A
1142 if (wresult == THREAD_WAITING)
1143 wresult = thread_block(THREAD_CONTINUE_NULL);
9bccf70c 1144
91447636 1145 if (wresult != THREAD_AWAKENED)
1c79356b
A
1146 return (FALSE);
1147
1148 s = splsched();
1149 wake_lock(thread);
2d21ac55 1150 thread_lock(thread);
1c79356b 1151 }
9bccf70c 1152
1c79356b 1153 thread->state |= TH_SUSP;
1c79356b 1154
39236c6e
A
1155 while ((oncpu = thread_isoncpu(thread)) ||
1156 (until_not_runnable && (thread->state & TH_RUN))) {
1157 processor_t processor;
1158
1159 if (oncpu) {
1160 assert(thread->state & TH_RUN);
1161 processor = thread->chosen_processor;
9bccf70c 1162 cause_ast_check(processor);
39236c6e 1163 }
9bccf70c
A
1164
1165 thread->wake_active = TRUE;
2d21ac55
A
1166 thread_unlock(thread);
1167
91447636 1168 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
9bccf70c
A
1169 wake_unlock(thread);
1170 splx(s);
1171
91447636
A
1172 if (wresult == THREAD_WAITING)
1173 wresult = thread_block(THREAD_CONTINUE_NULL);
9bccf70c 1174
91447636 1175 if (wresult != THREAD_AWAKENED) {
9bccf70c
A
1176 thread_unstop(thread);
1177 return (FALSE);
1178 }
1179
1180 s = splsched();
1181 wake_lock(thread);
1182 thread_lock(thread);
1183 }
1184
1185 thread_unlock(thread);
1c79356b
A
1186 wake_unlock(thread);
1187 splx(s);
39236c6e
A
1188
1189 /*
1190 * We return with the thread unlocked. To prevent it from
1191 * transitioning to a runnable state (or from TH_RUN to
1192 * being on the CPU), the caller must ensure the thread
1193 * is stopped via an external means (such as an AST)
1194 */
1c79356b
A
1195
1196 return (TRUE);
1197}
1198
1199/*
91447636
A
1200 * thread_unstop:
1201 *
1202 * Release a previous stop request and set
1203 * the thread running if appropriate.
1204 *
1205 * Use only after a successful stop operation.
1c79356b
A
1206 */
1207void
1208thread_unstop(
9bccf70c 1209 thread_t thread)
1c79356b 1210{
9bccf70c 1211 spl_t s = splsched();
1c79356b 1212
1c79356b
A
1213 wake_lock(thread);
1214 thread_lock(thread);
1215
3e170ce0 1216 assert((thread->state & (TH_RUN|TH_WAIT|TH_SUSP)) != TH_SUSP);
55e303ae 1217
1c79356b
A
1218 if (thread->state & TH_SUSP) {
1219 thread->state &= ~TH_SUSP;
1220
1221 if (thread->wake_active) {
1222 thread->wake_active = FALSE;
1223 thread_unlock(thread);
2d21ac55
A
1224
1225 thread_wakeup(&thread->wake_active);
1c79356b
A
1226 wake_unlock(thread);
1227 splx(s);
1c79356b
A
1228
1229 return;
1230 }
1231 }
1232
1233 thread_unlock(thread);
1234 wake_unlock(thread);
1235 splx(s);
1236}
1237
1238/*
91447636
A
1239 * thread_wait:
1240 *
1241 * Wait for a thread to stop running. (non-interruptible)
1242 *
1c79356b 1243 */
91447636 1244void
1c79356b 1245thread_wait(
316670eb
A
1246 thread_t thread,
1247 boolean_t until_not_runnable)
1c79356b 1248{
91447636 1249 wait_result_t wresult;
316670eb
A
1250 boolean_t oncpu;
1251 processor_t processor;
1252 spl_t s = splsched();
1c79356b 1253
1c79356b 1254 wake_lock(thread);
9bccf70c 1255 thread_lock(thread);
1c79356b 1256
316670eb
A
1257 /*
1258 * Wait until not running on a CPU. If stronger requirement
1259 * desired, wait until not runnable. Assumption: if thread is
1260 * on CPU, then TH_RUN is set, so we're not waiting in any case
1261 * where the original, pure "TH_RUN" check would have let us
1262 * finish.
1263 */
39236c6e 1264 while ((oncpu = thread_isoncpu(thread)) ||
316670eb 1265 (until_not_runnable && (thread->state & TH_RUN))) {
e7c99d92 1266
316670eb
A
1267 if (oncpu) {
1268 assert(thread->state & TH_RUN);
39236c6e 1269 processor = thread->chosen_processor;
9bccf70c 1270 cause_ast_check(processor);
316670eb 1271 }
1c79356b
A
1272
1273 thread->wake_active = TRUE;
2d21ac55
A
1274 thread_unlock(thread);
1275
91447636 1276 wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
1c79356b
A
1277 wake_unlock(thread);
1278 splx(s);
1279
91447636
A
1280 if (wresult == THREAD_WAITING)
1281 thread_block(THREAD_CONTINUE_NULL);
1c79356b
A
1282
1283 s = splsched();
1284 wake_lock(thread);
9bccf70c 1285 thread_lock(thread);
1c79356b 1286 }
0b4e3aa0 1287
9bccf70c 1288 thread_unlock(thread);
1c79356b
A
1289 wake_unlock(thread);
1290 splx(s);
1c79356b
A
1291}
1292
1c79356b
A
1293/*
1294 * Routine: clear_wait_internal
1295 *
1296 * Clear the wait condition for the specified thread.
1297 * Start the thread executing if that is appropriate.
1298 * Arguments:
1299 * thread thread to awaken
1300 * result Wakeup result the thread should see
1301 * Conditions:
1302 * At splsched
1303 * the thread is locked.
9bccf70c
A
1304 * Returns:
1305 * KERN_SUCCESS thread was rousted out a wait
1306 * KERN_FAILURE thread was waiting but could not be rousted
1307 * KERN_NOT_WAITING thread was not waiting
1c79356b 1308 */
9bccf70c 1309__private_extern__ kern_return_t
1c79356b 1310clear_wait_internal(
9bccf70c 1311 thread_t thread,
55e303ae 1312 wait_result_t wresult)
1c79356b 1313{
060df5ea 1314 uint32_t i = LockTimeOut;
3e170ce0 1315 struct waitq *waitq = thread->waitq;
9bccf70c 1316
9bccf70c 1317 do {
55e303ae
A
1318 if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT))
1319 return (KERN_FAILURE);
9bccf70c 1320
3e170ce0
A
1321 if (waitq != NULL) {
1322 assert(waitq_irq_safe(waitq)); //irqs are already disabled!
1323 if (waitq_lock_try(waitq)) {
1324 waitq_pull_thread_locked(waitq, thread);
1325 waitq_unlock(waitq);
1326 } else {
9bccf70c
A
1327 thread_unlock(thread);
1328 delay(1);
1329 thread_lock(thread);
3e170ce0
A
1330 if (waitq != thread->waitq)
1331 return KERN_NOT_WAITING;
9bccf70c
A
1332 continue;
1333 }
1c79356b 1334 }
55e303ae 1335
3e170ce0
A
1336 /* TODO: Can we instead assert TH_TERMINATE is not set? */
1337 if ((thread->state & (TH_WAIT|TH_TERMINATE)) == TH_WAIT)
1338 return (thread_go(thread, wresult));
1339 else
1340 return (KERN_NOT_WAITING);
060df5ea 1341 } while ((--i > 0) || machine_timeout_suspended());
55e303ae 1342
2d21ac55 1343 panic("clear_wait_internal: deadlock: thread=%p, wq=%p, cpu=%d\n",
3e170ce0 1344 thread, waitq, cpu_number());
55e303ae
A
1345
1346 return (KERN_FAILURE);
1c79356b
A
1347}
1348
1349
1350/*
1351 * clear_wait:
1352 *
1353 * Clear the wait condition for the specified thread. Start the thread
1354 * executing if that is appropriate.
1355 *
1356 * parameters:
1357 * thread thread to awaken
1358 * result Wakeup result the thread should see
1359 */
9bccf70c 1360kern_return_t
1c79356b 1361clear_wait(
9bccf70c
A
1362 thread_t thread,
1363 wait_result_t result)
1c79356b 1364{
9bccf70c 1365 kern_return_t ret;
1c79356b
A
1366 spl_t s;
1367
1368 s = splsched();
1369 thread_lock(thread);
9bccf70c 1370 ret = clear_wait_internal(thread, result);
1c79356b
A
1371 thread_unlock(thread);
1372 splx(s);
9bccf70c 1373 return ret;
1c79356b
A
1374}
1375
1376
1377/*
1378 * thread_wakeup_prim:
1379 *
1380 * Common routine for thread_wakeup, thread_wakeup_with_result,
1381 * and thread_wakeup_one.
1382 *
1383 */
9bccf70c 1384kern_return_t
1c79356b
A
1385thread_wakeup_prim(
1386 event_t event,
1387 boolean_t one_thread,
6d2010ae
A
1388 wait_result_t result)
1389{
1390 return (thread_wakeup_prim_internal(event, one_thread, result, -1));
1391}
1392
1393
1394kern_return_t
1395thread_wakeup_prim_internal(
1396 event_t event,
1397 boolean_t one_thread,
1398 wait_result_t result,
1399 int priority)
1c79356b 1400{
3e170ce0
A
1401 if (__improbable(event == NO_EVENT))
1402 panic("%s() called with NO_EVENT", __func__);
1403
1404 struct waitq *wq;
1c79356b 1405
3e170ce0
A
1406 wq = global_eventq(event);
1407 priority = (priority == -1 ? WAITQ_ALL_PRIORITIES : priority);
fe8ab488 1408
1c79356b 1409 if (one_thread)
3e170ce0 1410 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, priority);
1c79356b 1411 else
3e170ce0 1412 return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, priority);
1c79356b
A
1413}
1414
1415/*
1416 * thread_bind:
1417 *
2d21ac55 1418 * Force the current thread to execute on the specified processor.
fe8ab488 1419 * Takes effect after the next thread_block().
1c79356b 1420 *
55e303ae
A
1421 * Returns the previous binding. PROCESSOR_NULL means
1422 * not bound.
1423 *
1424 * XXX - DO NOT export this to users - XXX
1c79356b 1425 */
55e303ae 1426processor_t
1c79356b 1427thread_bind(
2d21ac55 1428 processor_t processor)
1c79356b 1429{
2d21ac55 1430 thread_t self = current_thread();
55e303ae 1431 processor_t prev;
55e303ae 1432 spl_t s;
1c79356b
A
1433
1434 s = splsched();
2d21ac55 1435 thread_lock(self);
55e303ae 1436
3e170ce0 1437 prev = thread_bind_internal(self, processor);
55e303ae 1438
2d21ac55 1439 thread_unlock(self);
1c79356b 1440 splx(s);
55e303ae
A
1441
1442 return (prev);
1c79356b
A
1443}
1444
3e170ce0
A
1445/*
1446 * thread_bind_internal:
1447 *
1448 * If the specified thread is not the current thread, and it is currently
1449 * running on another CPU, a remote AST must be sent to that CPU to cause
1450 * the thread to migrate to its bound processor. Otherwise, the migration
1451 * will occur at the next quantum expiration or blocking point.
1452 *
1453 * When the thread is the current thread, and explicit thread_block() should
1454 * be used to force the current processor to context switch away and
1455 * let the thread migrate to the bound processor.
1456 *
1457 * Thread must be locked, and at splsched.
1458 */
1459
1460static processor_t
1461thread_bind_internal(
1462 thread_t thread,
1463 processor_t processor)
1464{
1465 processor_t prev;
1466
1467 /* <rdar://problem/15102234> */
1468 assert(thread->sched_pri < BASEPRI_RTQUEUES);
1469 /* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
1470 assert(thread->runq == PROCESSOR_NULL);
1471
1472 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND), thread_tid(thread), processor ? (uintptr_t)processor->cpu_id : (uintptr_t)-1, 0, 0, 0);
1473
1474 prev = thread->bound_processor;
1475 thread->bound_processor = processor;
1476
1477 return (prev);
1478}
1479
1480/*
1481 * thread_vm_bind_group_add:
1482 *
1483 * The "VM bind group" is a special mechanism to mark a collection
1484 * of threads from the VM subsystem that, in general, should be scheduled
1485 * with only one CPU of parallelism. To accomplish this, we initially
1486 * bind all the threads to the master processor, which has the effect
1487 * that only one of the threads in the group can execute at once, including
1488 * preempting threads in the group that are a lower priority. Future
1489 * mechanisms may use more dynamic mechanisms to prevent the collection
1490 * of VM threads from using more CPU time than desired.
1491 *
1492 * The current implementation can result in priority inversions where
1493 * compute-bound priority 95 or realtime threads that happen to have
1494 * landed on the master processor prevent the VM threads from running.
1495 * When this situation is detected, we unbind the threads for one
1496 * scheduler tick to allow the scheduler to run the threads an
1497 * additional CPUs, before restoring the binding (assuming high latency
1498 * is no longer a problem).
1499 */
1500
1501/*
1502 * The current max is provisioned for:
1503 * vm_compressor_swap_trigger_thread (92)
1504 * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
1505 * vm_pageout_continue (92)
1506 * memorystatus_thread (95)
1507 */
1508#define MAX_VM_BIND_GROUP_COUNT (5)
1509decl_simple_lock_data(static,sched_vm_group_list_lock);
1510static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
1511static int sched_vm_group_thread_count;
1512static boolean_t sched_vm_group_temporarily_unbound = FALSE;
1513
1514void
1515thread_vm_bind_group_add(void)
1516{
1517 thread_t self = current_thread();
1518
1519 thread_reference_internal(self);
1520 self->options |= TH_OPT_SCHED_VM_GROUP;
1521
1522 simple_lock(&sched_vm_group_list_lock);
1523 assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
1524 sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
1525 simple_unlock(&sched_vm_group_list_lock);
1526
1527 thread_bind(master_processor);
1528
1529 /* Switch to bound processor if not already there */
1530 thread_block(THREAD_CONTINUE_NULL);
1531}
1532
1533static void
1534sched_vm_group_maintenance(void)
1535{
1536 uint64_t ctime = mach_absolute_time();
1537 uint64_t longtime = ctime - sched_tick_interval;
1538 int i;
1539 spl_t s;
1540 boolean_t high_latency_observed = FALSE;
1541 boolean_t runnable_and_not_on_runq_observed = FALSE;
1542 boolean_t bind_target_changed = FALSE;
1543 processor_t bind_target = PROCESSOR_NULL;
1544
1545 /* Make sure nobody attempts to add new threads while we are enumerating them */
1546 simple_lock(&sched_vm_group_list_lock);
1547
1548 s = splsched();
1549
1550 for (i=0; i < sched_vm_group_thread_count; i++) {
1551 thread_t thread = sched_vm_group_thread_list[i];
1552 assert(thread != THREAD_NULL);
1553 thread_lock(thread);
1554 if ((thread->state & (TH_RUN|TH_WAIT)) == TH_RUN) {
1555 if (thread->runq != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
1556 high_latency_observed = TRUE;
1557 } else if (thread->runq == PROCESSOR_NULL) {
1558 /* There are some cases where a thread be transitiong that also fall into this case */
1559 runnable_and_not_on_runq_observed = TRUE;
1560 }
1561 }
1562 thread_unlock(thread);
1563
1564 if (high_latency_observed && runnable_and_not_on_runq_observed) {
1565 /* All the things we are looking for are true, stop looking */
1566 break;
1567 }
1568 }
1569
1570 splx(s);
1571
1572 if (sched_vm_group_temporarily_unbound) {
1573 /* If we turned off binding, make sure everything is OK before rebinding */
1574 if (!high_latency_observed) {
1575 /* rebind */
1576 bind_target_changed = TRUE;
1577 bind_target = master_processor;
1578 sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
1579 }
1580 } else {
1581 /*
1582 * Check if we're in a bad state, which is defined by high
1583 * latency with no core currently executing a thread. If a
1584 * single thread is making progress on a CPU, that means the
1585 * binding concept to reduce parallelism is working as
1586 * designed.
1587 */
1588 if (high_latency_observed && !runnable_and_not_on_runq_observed) {
1589 /* unbind */
1590 bind_target_changed = TRUE;
1591 bind_target = PROCESSOR_NULL;
1592 sched_vm_group_temporarily_unbound = TRUE;
1593 }
1594 }
1595
1596 if (bind_target_changed) {
1597 s = splsched();
1598 for (i=0; i < sched_vm_group_thread_count; i++) {
1599 thread_t thread = sched_vm_group_thread_list[i];
1600 boolean_t removed;
1601 assert(thread != THREAD_NULL);
1602
1603 thread_lock(thread);
1604 removed = thread_run_queue_remove(thread);
1605 if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
1606 thread_bind_internal(thread, bind_target);
1607 } else {
1608 /*
1609 * Thread was in the middle of being context-switched-to,
1610 * or was in the process of blocking. To avoid switching the bind
1611 * state out mid-flight, defer the change if possible.
1612 */
1613 if (bind_target == PROCESSOR_NULL) {
1614 thread_bind_internal(thread, bind_target);
1615 } else {
1616 sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
1617 }
1618 }
1619
1620 if (removed) {
1621 thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
1622 }
1623 thread_unlock(thread);
1624 }
1625 splx(s);
1626 }
1627
1628 simple_unlock(&sched_vm_group_list_lock);
1629}
1630
fe8ab488
A
1631/* Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
1632 * rebalancing opportunity exists when a core is (instantaneously) idle, but
1633 * other SMT-capable cores may be over-committed. TODO: some possible negatives:
1634 * IPI thrash if this core does not remain idle following the load balancing ASTs
1635 * Idle "thrash", when IPI issue is followed by idle entry/core power down
1636 * followed by a wakeup shortly thereafter.
1637 */
1638
fe8ab488
A
1639#if (DEVELOPMENT || DEBUG)
1640int sched_smt_balance = 1;
1641#endif
1642
3e170ce0
A
1643#if __SMP__
1644/* Invoked with pset locked, returns with pset unlocked */
fe8ab488
A
1645static void
1646sched_SMT_balance(processor_t cprocessor, processor_set_t cpset) {
1647 processor_t ast_processor = NULL;
1648
1649#if (DEVELOPMENT || DEBUG)
1650 if (__improbable(sched_smt_balance == 0))
1651 goto smt_balance_exit;
1652#endif
1653
1654 assert(cprocessor == current_processor());
1655 if (cprocessor->is_SMT == FALSE)
1656 goto smt_balance_exit;
1657
1658 processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;
1659
1660 /* Determine if both this processor and its sibling are idle,
1661 * indicating an SMT rebalancing opportunity.
1662 */
1663 if (sib_processor->state != PROCESSOR_IDLE)
1664 goto smt_balance_exit;
1665
1666 processor_t sprocessor;
1667
1668 sprocessor = (processor_t)queue_first(&cpset->active_queue);
1669
1670 while (!queue_end(&cpset->active_queue, (queue_entry_t)sprocessor)) {
1671 if ((sprocessor->state == PROCESSOR_RUNNING) &&
1672 (sprocessor->processor_primary != sprocessor) &&
1673 (sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
1674 (sprocessor->current_pri < BASEPRI_RTQUEUES) &&
3e170ce0 1675 ((cpset->pending_AST_cpu_mask & (1ULL << sprocessor->cpu_id)) == 0)) {
fe8ab488
A
1676 assert(sprocessor != cprocessor);
1677 ast_processor = sprocessor;
1678 break;
1679 }
1680 sprocessor = (processor_t)queue_next((queue_entry_t)sprocessor);
1681 }
1682
1683smt_balance_exit:
1684 pset_unlock(cpset);
1685
1686 if (ast_processor) {
1687 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
1688 cause_ast_check(ast_processor);
1689 }
1690}
3e170ce0 1691#endif /* __SMP__ */
fe8ab488 1692
1c79356b 1693/*
2d21ac55
A
1694 * thread_select:
1695 *
1696 * Select a new thread for the current processor to execute.
55e303ae
A
1697 *
1698 * May select the current thread, which must be locked.
1c79356b 1699 */
2d21ac55 1700static thread_t
1c79356b 1701thread_select(
2d21ac55 1702 thread_t thread,
fe8ab488
A
1703 processor_t processor,
1704 ast_t reason)
1c79356b 1705{
2d21ac55 1706 processor_set_t pset = processor->processor_set;
cf7d32b8 1707 thread_t new_thread = THREAD_NULL;
1c79356b 1708
6d2010ae 1709 assert(processor == current_processor());
3e170ce0 1710 assert((thread->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN);
6d2010ae 1711
2d21ac55
A
1712 do {
1713 /*
1714 * Update the priority.
1715 */
6d2010ae
A
1716 if (SCHED(can_update_priority)(thread))
1717 SCHED(update_priority)(thread);
1718
2d21ac55 1719 processor->current_pri = thread->sched_pri;
6d2010ae 1720 processor->current_thmode = thread->sched_mode;
fe8ab488 1721 processor->current_sfi_class = thread->sfi_class;
1c79356b 1722
2d21ac55
A
1723 pset_lock(pset);
1724
fe8ab488 1725 assert(processor->state != PROCESSOR_OFF_LINE);
6d2010ae 1726
3e170ce0
A
1727 if (!processor->is_recommended) {
1728 /*
1729 * The performance controller has provided a hint to not dispatch more threads,
1730 * unless they are bound to us (and thus we are the only option).
1731 */
1732 if (!SCHED(processor_bound_count)(processor)) {
1733 goto idle;
1734 }
1735 } else if (processor->processor_primary != processor) {
39236c6e
A
1736 /*
1737 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
1738 * we should look for work only under the same conditions that choose_processor()
1739 * would have assigned work, which is when all primary processors have been assigned work.
1740 *
1741 * An exception is that bound threads are dispatched to a processor without going through
1742 * choose_processor(), so in those cases we should continue trying to dequeue work.
1743 */
fe8ab488 1744 if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue) && !rt_runq.count) {
39236c6e
A
1745 goto idle;
1746 }
1747 }
1748
3e170ce0 1749 rt_lock_lock();
2d21ac55 1750
2d21ac55
A
1751 /*
1752 * Test to see if the current thread should continue
3e170ce0 1753 * to run on this processor. It must not be attempting to wait, must not be
2d21ac55 1754 * bound to a different processor, must not be in the wrong
3e170ce0
A
1755 * processor set, and must not be forced to context switch by TH_SUSP.
1756 *
1757 * Note that there are never any RT threads in the regular runqueue.
1758 *
1759 * This code is extremely tricky.
2d21ac55 1760 */
3e170ce0
A
1761
1762 if (((thread->state & (TH_TERMINATE|TH_IDLE|TH_WAIT|TH_RUN|TH_SUSP)) == TH_RUN) &&
fe8ab488
A
1763 (thread->sched_pri >= BASEPRI_RTQUEUES || processor->processor_primary == processor) &&
1764 (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor) &&
1765 (thread->affinity_set == AFFINITY_SET_NULL || thread->affinity_set->aset_pset == pset)) {
3e170ce0
A
1766 /*
1767 * RT threads with un-expired quantum stay on processor,
1768 * unless there's a valid RT thread with an earlier deadline.
1769 */
1770 if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
6d2010ae 1771 if (rt_runq.count > 0) {
fe8ab488 1772 thread_t next_rt;
2d21ac55 1773
fe8ab488 1774 next_rt = (thread_t)queue_first(&rt_runq.queue);
3e170ce0
A
1775
1776 assert(next_rt->runq == THREAD_ON_RT_RUNQ);
1777
fe8ab488 1778 if (next_rt->realtime.deadline < processor->deadline &&
3e170ce0
A
1779 (next_rt->bound_processor == PROCESSOR_NULL ||
1780 next_rt->bound_processor == processor)) {
1781 /* The next RT thread is better, so pick it off the runqueue. */
1782 goto pick_new_rt_thread;
55e303ae
A
1783 }
1784 }
2d21ac55 1785
3e170ce0 1786 /* This is still the best RT thread to run. */
2d21ac55
A
1787 processor->deadline = thread->realtime.deadline;
1788
3e170ce0 1789 rt_lock_unlock();
2d21ac55
A
1790 pset_unlock(pset);
1791
1792 return (thread);
55e303ae
A
1793 }
1794
3e170ce0
A
1795 if ((rt_runq.count == 0) &&
1796 SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
fe8ab488 1797 /* This thread is still the highest priority runnable (non-idle) thread */
2d21ac55 1798 processor->deadline = UINT64_MAX;
55e303ae 1799
3e170ce0 1800 rt_lock_unlock();
2d21ac55 1801 pset_unlock(pset);
55e303ae 1802
2d21ac55
A
1803 return (thread);
1804 }
1805 }
1806
3e170ce0
A
1807 /* OK, so we're not going to run the current thread. Look at the RT queue. */
1808 if (rt_runq.count > 0) {
1809 thread_t next_rt = (thread_t)queue_first(&rt_runq.queue);
c910b4d9 1810
3e170ce0 1811 assert(next_rt->runq == THREAD_ON_RT_RUNQ);
fe8ab488 1812
3e170ce0
A
1813 if (__probable((next_rt->bound_processor == PROCESSOR_NULL ||
1814 (next_rt->bound_processor == processor)))) {
1815pick_new_rt_thread:
1816 new_thread = (thread_t)dequeue_head(&rt_runq.queue);
6d2010ae 1817
3e170ce0 1818 new_thread->runq = PROCESSOR_NULL;
39236c6e
A
1819 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
1820 rt_runq.count--;
6d2010ae 1821
3e170ce0 1822 processor->deadline = new_thread->realtime.deadline;
c910b4d9 1823
3e170ce0 1824 rt_lock_unlock();
39236c6e 1825 pset_unlock(pset);
c910b4d9 1826
3e170ce0 1827 return (new_thread);
39236c6e 1828 }
c910b4d9 1829 }
2d21ac55 1830
3e170ce0
A
1831 processor->deadline = UINT64_MAX;
1832 rt_lock_unlock();
6d2010ae 1833
3e170ce0
A
1834 /* No RT threads, so let's look at the regular threads. */
1835 if ((new_thread = SCHED(choose_thread)(processor, MINPRI, reason)) != THREAD_NULL) {
6d2010ae 1836 pset_unlock(pset);
6d2010ae
A
1837 return (new_thread);
1838 }
c910b4d9 1839
3e170ce0
A
1840#if __SMP__
1841 if (SCHED(steal_thread_enabled)) {
1842 /*
1843 * No runnable threads, attempt to steal
1844 * from other processors. Returns with pset lock dropped.
1845 */
2d21ac55 1846
3e170ce0
A
1847 if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
1848 return (new_thread);
1849 }
cf7d32b8 1850
3e170ce0
A
1851 /*
1852 * If other threads have appeared, shortcut
1853 * around again.
1854 */
1855 if (!SCHED(processor_queue_empty)(processor) || rt_runq.count > 0)
1856 continue;
1857
1858 pset_lock(pset);
1859 }
1860#endif
55e303ae 1861
39236c6e 1862 idle:
1c79356b
A
1863 /*
1864 * Nothing is runnable, so set this processor idle if it
2d21ac55 1865 * was running.
1c79356b 1866 */
55e303ae 1867 if (processor->state == PROCESSOR_RUNNING) {
6d2010ae 1868 remqueue((queue_entry_t)processor);
55e303ae 1869 processor->state = PROCESSOR_IDLE;
1c79356b 1870
fe8ab488 1871 if (processor->processor_primary == processor) {
b0d623f7 1872 enqueue_head(&pset->idle_queue, (queue_entry_t)processor);
b0d623f7
A
1873 }
1874 else {
fe8ab488 1875 enqueue_head(&pset->idle_secondary_queue, (queue_entry_t)processor);
b0d623f7 1876 }
1c79356b 1877 }
1c79356b 1878
3e170ce0 1879#if __SMP__
fe8ab488
A
1880 /* Invoked with pset locked, returns with pset unlocked */
1881 sched_SMT_balance(processor, pset);
3e170ce0
A
1882#else
1883 pset_unlock(pset);
1884#endif
2d21ac55 1885
6d2010ae 1886#if CONFIG_SCHED_IDLE_IN_PLACE
2d21ac55
A
1887 /*
1888 * Choose idle thread if fast idle is not possible.
1889 */
fe8ab488
A
1890 if (processor->processor_primary != processor)
1891 return (processor->idle_thread);
1892
6d2010ae 1893 if ((thread->state & (TH_IDLE|TH_TERMINATE|TH_SUSP)) || !(thread->state & TH_WAIT) || thread->wake_active || thread->sched_pri >= BASEPRI_RTQUEUES)
2d21ac55
A
1894 return (processor->idle_thread);
1895
1896 /*
1897 * Perform idling activities directly without a
1898 * context switch. Return dispatched thread,
1899 * else check again for a runnable thread.
1900 */
1901 new_thread = thread_select_idle(thread, processor);
1902
6d2010ae
A
1903#else /* !CONFIG_SCHED_IDLE_IN_PLACE */
1904
1905 /*
1906 * Do a full context switch to idle so that the current
1907 * thread can start running on another processor without
1908 * waiting for the fast-idled processor to wake up.
1909 */
3e170ce0 1910 new_thread = processor->idle_thread;
6d2010ae
A
1911
1912#endif /* !CONFIG_SCHED_IDLE_IN_PLACE */
1913
2d21ac55
A
1914 } while (new_thread == THREAD_NULL);
1915
1916 return (new_thread);
1917}
1918
6d2010ae 1919#if CONFIG_SCHED_IDLE_IN_PLACE
2d21ac55
A
1920/*
1921 * thread_select_idle:
1922 *
1923 * Idle the processor using the current thread context.
1924 *
1925 * Called with thread locked, then dropped and relocked.
1926 */
1927static thread_t
1928thread_select_idle(
1929 thread_t thread,
1930 processor_t processor)
1931{
1932 thread_t new_thread;
39236c6e
A
1933 uint64_t arg1, arg2;
1934 int urgency;
1935
1936 if (thread->sched_mode == TH_MODE_TIMESHARE) {
fe8ab488
A
1937 if (thread->sched_flags & TH_SFLAG_THROTTLED)
1938 sched_background_decr(thread);
2d21ac55 1939
fe8ab488 1940 sched_share_decr(thread);
39236c6e 1941 }
fe8ab488 1942 sched_run_decr(thread);
2d21ac55
A
1943
1944 thread->state |= TH_IDLE;
1945 processor->current_pri = IDLEPRI;
6d2010ae 1946 processor->current_thmode = TH_MODE_NONE;
fe8ab488 1947 processor->current_sfi_class = SFI_CLASS_KERNEL;
2d21ac55 1948
316670eb
A
1949 /* Reload precise timing global policy to thread-local policy */
1950 thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
1951
2d21ac55
A
1952 thread_unlock(thread);
1953
1954 /*
1955 * Switch execution timing to processor idle thread.
1956 */
1957 processor->last_dispatch = mach_absolute_time();
fe8ab488
A
1958
1959#ifdef CONFIG_MACH_APPROXIMATE_TIME
1960 commpage_update_mach_approximate_time(processor->last_dispatch);
1961#endif
1962
6d2010ae 1963 thread->last_run_time = processor->last_dispatch;
2d21ac55
A
1964 thread_timer_event(processor->last_dispatch, &processor->idle_thread->system_timer);
1965 PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer;
1966
1967 /*
1968 * Cancel the quantum timer while idling.
1969 */
1970 timer_call_cancel(&processor->quantum_timer);
3e170ce0 1971 processor->first_timeslice = FALSE;
2d21ac55
A
1972
1973 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
1974
3e170ce0 1975 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, NULL);
6d2010ae 1976
2d21ac55
A
1977 /*
1978 * Enable interrupts and perform idling activities. No
1979 * preemption due to TH_IDLE being set.
1980 */
1981 spllo(); new_thread = processor_idle(thread, processor);
1982
cf7d32b8
A
1983 /*
1984 * Return at splsched.
1985 */
2d21ac55
A
1986 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
1987
1988 thread_lock(thread);
1989
1990 /*
1991 * If awakened, switch to thread timer and start a new quantum.
1992 * Otherwise skip; we will context switch to another thread or return here.
1993 */
1994 if (!(thread->state & TH_WAIT)) {
1995 processor->last_dispatch = mach_absolute_time();
1996 thread_timer_event(processor->last_dispatch, &thread->system_timer);
1997 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
1998
1999 thread_quantum_init(thread);
fe8ab488
A
2000 processor->quantum_end = processor->last_dispatch + thread->quantum_remaining;
2001 timer_call_enter1(&processor->quantum_timer, thread, processor->quantum_end, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
3e170ce0 2002 processor->first_timeslice = TRUE;
2d21ac55
A
2003
2004 thread->computation_epoch = processor->last_dispatch;
1c79356b
A
2005 }
2006
2d21ac55 2007 thread->state &= ~TH_IDLE;
55e303ae 2008
39236c6e
A
2009 urgency = thread_get_urgency(thread, &arg1, &arg2);
2010
3e170ce0 2011 thread_tell_urgency(urgency, arg1, arg2, 0, new_thread);
39236c6e 2012
fe8ab488 2013 sched_run_incr(thread);
39236c6e 2014 if (thread->sched_mode == TH_MODE_TIMESHARE) {
fe8ab488 2015 sched_share_incr(thread);
2d21ac55 2016
fe8ab488
A
2017 if (thread->sched_flags & TH_SFLAG_THROTTLED)
2018 sched_background_incr(thread);
39236c6e
A
2019 }
2020
2d21ac55 2021 return (new_thread);
1c79356b 2022}
6d2010ae
A
2023#endif /* CONFIG_SCHED_IDLE_IN_PLACE */
2024
b0d623f7 2025/*
3e170ce0 2026 * thread_invoke
b0d623f7 2027 *
3e170ce0 2028 * Called at splsched with neither thread locked.
b0d623f7 2029 *
3e170ce0 2030 * Perform a context switch and start executing the new thread.
55e303ae 2031 *
3e170ce0
A
2032 * Returns FALSE when the context switch didn't happen.
2033 * The reference to the new thread is still consumed.
39236c6e
A
2034 *
2035 * "self" is what is currently running on the processor,
2036 * "thread" is the new thread to context switch to
2037 * (which may be the same thread in some cases)
2038 */
2d21ac55 2039static boolean_t
1c79356b 2040thread_invoke(
39236c6e
A
2041 thread_t self,
2042 thread_t thread,
91447636 2043 ast_t reason)
1c79356b 2044{
39236c6e 2045 if (__improbable(get_preemption_level() != 0)) {
b0d623f7
A
2046 int pl = get_preemption_level();
2047 panic("thread_invoke: preemption_level %d, possible cause: %s",
2048 pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
2049 "blocking while holding a spinlock, or within interrupt context"));
2050 }
0b4e3aa0 2051
3e170ce0
A
2052 thread_continue_t continuation = self->continuation;
2053 void *parameter = self->parameter;
2054 processor_t processor;
2055
2056 uint64_t ctime = mach_absolute_time();
2057
2058#ifdef CONFIG_MACH_APPROXIMATE_TIME
2059 commpage_update_mach_approximate_time(ctime);
2060#endif
2061
2062#if defined(CONFIG_SCHED_TIMESHARE_CORE)
2063 sched_timeshare_consider_maintenance(ctime);
2064#endif
2065
2d21ac55 2066 assert(self == current_thread());
fe8ab488 2067 assert(self->runq == PROCESSOR_NULL);
3e170ce0 2068 assert((self->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN);
91447636 2069
2d21ac55 2070 thread_lock(thread);
1c79356b 2071
3e170ce0 2072 assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN);
fe8ab488
A
2073 assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor());
2074 assert(thread->runq == PROCESSOR_NULL);
1c79356b 2075
316670eb
A
2076 /* Reload precise timing global policy to thread-local policy */
2077 thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
3e170ce0 2078
fe8ab488
A
2079 /* Update SFI class based on other factors */
2080 thread->sfi_class = sfi_thread_classify(thread);
2081
3e170ce0 2082 /* Allow realtime threads to hang onto a stack. */
6d2010ae 2083 if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack)
2d21ac55 2084 self->reserved_stack = self->kernel_stack;
1c79356b 2085
91447636 2086 if (continuation != NULL) {
2d21ac55 2087 if (!thread->kernel_stack) {
9bccf70c 2088 /*
2d21ac55 2089 * If we are using a privileged stack,
9bccf70c 2090 * check to see whether we can exchange it with
2d21ac55 2091 * that of the other thread.
9bccf70c 2092 */
2d21ac55 2093 if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack)
9bccf70c 2094 goto need_stack;
1c79356b 2095
91447636
A
2096 /*
2097 * Context switch by performing a stack handoff.
2098 */
2d21ac55
A
2099 continuation = thread->continuation;
2100 parameter = thread->parameter;
1c79356b 2101
9bccf70c 2102 processor = current_processor();
2d21ac55
A
2103 processor->active_thread = thread;
2104 processor->current_pri = thread->sched_pri;
6d2010ae 2105 processor->current_thmode = thread->sched_mode;
fe8ab488 2106 processor->current_sfi_class = thread->sfi_class;
2d21ac55
A
2107 if (thread->last_processor != processor && thread->last_processor != NULL) {
2108 if (thread->last_processor->processor_set != processor->processor_set)
2109 thread->ps_switch++;
2110 thread->p_switch++;
2111 }
2112 thread->last_processor = processor;
2113 thread->c_switch++;
2114 ast_context(thread);
3e170ce0 2115
2d21ac55 2116 thread_unlock(thread);
1c79356b 2117
2d21ac55 2118 self->reason = reason;
91447636 2119
39236c6e
A
2120 processor->last_dispatch = ctime;
2121 self->last_run_time = ctime;
2122 thread_timer_event(ctime, &thread->system_timer);
2d21ac55 2123 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
316670eb
A
2124
2125 /*
2126 * Since non-precise user/kernel time doesn't update the state timer
2127 * during privilege transitions, synthesize an event now.
2128 */
2129 if (!thread->precise_user_kernel_time) {
2130 timer_switch(PROCESSOR_DATA(processor, current_state),
39236c6e 2131 ctime,
316670eb
A
2132 PROCESSOR_DATA(processor, current_state));
2133 }
2d21ac55 2134
316670eb
A
2135 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2136 MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)|DBG_FUNC_NONE,
2137 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
1c79356b 2138
39236c6e 2139 if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
3e170ce0 2140 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
6d2010ae
A
2141 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2142 }
2143
b0d623f7
A
2144 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2145
6d2010ae
A
2146 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2147
2148 TLOG(1, "thread_invoke: calling stack_handoff\n");
2149 stack_handoff(self, thread);
9bccf70c 2150
3e170ce0
A
2151 /* 'self' is now off core */
2152 assert(thread == current_thread());
2153
b0d623f7
A
2154 DTRACE_SCHED(on__cpu);
2155
2d21ac55 2156 thread_dispatch(self, thread);
1c79356b 2157
2d21ac55 2158 thread->continuation = thread->parameter = NULL;
1c79356b 2159
2d21ac55 2160 counter(c_thread_invoke_hits++);
1c79356b 2161
9bccf70c 2162 (void) spllo();
1c79356b 2163
2d21ac55
A
2164 assert(continuation);
2165 call_continuation(continuation, parameter, thread->wait_result);
9bccf70c 2166 /*NOTREACHED*/
9bccf70c 2167 }
2d21ac55 2168 else if (thread == self) {
9bccf70c 2169 /* same thread but with continuation */
2d21ac55 2170 ast_context(self);
9bccf70c 2171 counter(++c_thread_invoke_same);
3e170ce0 2172
2d21ac55 2173 thread_unlock(self);
9bccf70c 2174
316670eb
A
2175 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2176 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2177 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
6d2010ae 2178
2d21ac55
A
2179 self->continuation = self->parameter = NULL;
2180
9bccf70c 2181 (void) spllo();
55e303ae 2182
2d21ac55 2183 call_continuation(continuation, parameter, self->wait_result);
9bccf70c
A
2184 /*NOTREACHED*/
2185 }
3e170ce0 2186 } else {
9bccf70c 2187 /*
2d21ac55 2188 * Check that the other thread has a stack
9bccf70c 2189 */
2d21ac55 2190 if (!thread->kernel_stack) {
9bccf70c 2191need_stack:
2d21ac55
A
2192 if (!stack_alloc_try(thread)) {
2193 counter(c_thread_invoke_misses++);
2194 thread_unlock(thread);
2195 thread_stack_enqueue(thread);
9bccf70c
A
2196 return (FALSE);
2197 }
3e170ce0 2198 } else if (thread == self) {
2d21ac55 2199 ast_context(self);
9bccf70c 2200 counter(++c_thread_invoke_same);
2d21ac55 2201 thread_unlock(self);
6d2010ae 2202
316670eb
A
2203 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2204 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2205 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
6d2010ae 2206
9bccf70c
A
2207 return (TRUE);
2208 }
2209 }
1c79356b
A
2210
2211 /*
91447636 2212 * Context switch by full context save.
1c79356b 2213 */
9bccf70c 2214 processor = current_processor();
2d21ac55
A
2215 processor->active_thread = thread;
2216 processor->current_pri = thread->sched_pri;
6d2010ae 2217 processor->current_thmode = thread->sched_mode;
fe8ab488 2218 processor->current_sfi_class = thread->sfi_class;
2d21ac55
A
2219 if (thread->last_processor != processor && thread->last_processor != NULL) {
2220 if (thread->last_processor->processor_set != processor->processor_set)
2221 thread->ps_switch++;
2222 thread->p_switch++;
2223 }
2224 thread->last_processor = processor;
2225 thread->c_switch++;
2226 ast_context(thread);
3e170ce0 2227
2d21ac55 2228 thread_unlock(thread);
1c79356b 2229
2d21ac55 2230 counter(c_thread_invoke_csw++);
1c79356b 2231
2d21ac55 2232 self->reason = reason;
1c79356b 2233
39236c6e
A
2234 processor->last_dispatch = ctime;
2235 self->last_run_time = ctime;
2236 thread_timer_event(ctime, &thread->system_timer);
2d21ac55 2237 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
91447636 2238
316670eb
A
2239 /*
2240 * Since non-precise user/kernel time doesn't update the state timer
2241 * during privilege transitions, synthesize an event now.
2242 */
2243 if (!thread->precise_user_kernel_time) {
2244 timer_switch(PROCESSOR_DATA(processor, current_state),
39236c6e 2245 ctime,
316670eb
A
2246 PROCESSOR_DATA(processor, current_state));
2247 }
3e170ce0 2248
316670eb
A
2249 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2250 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2251 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
b0d623f7 2252
6d2010ae 2253 if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
3e170ce0 2254 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
6d2010ae
A
2255 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2256 }
2257
b0d623f7 2258 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
1c79356b 2259
6d2010ae
A
2260 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2261
1c79356b 2262 /*
91447636 2263 * This is where we actually switch register context,
2d21ac55
A
2264 * and address space if required. We will next run
2265 * as a result of a subsequent context switch.
3e170ce0
A
2266 *
2267 * Once registers are switched and the processor is running "thread",
2268 * the stack variables and non-volatile registers will contain whatever
2269 * was there the last time that thread blocked. No local variables should
2270 * be used after this point, except for the special case of "thread", which
2271 * the platform layer returns as the previous thread running on the processor
2272 * via the function call ABI as a return register, and "self", which may have
2273 * been stored on the stack or a non-volatile register, but a stale idea of
2274 * what was on the CPU is newly-accurate because that thread is again
2275 * running on the CPU.
91447636 2276 */
316670eb 2277 assert(continuation == self->continuation);
2d21ac55 2278 thread = machine_switch_context(self, continuation, thread);
316670eb 2279 assert(self == current_thread());
b0d623f7
A
2280 TLOG(1,"thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
2281
2282 DTRACE_SCHED(on__cpu);
1c79356b
A
2283
2284 /*
2d21ac55 2285 * We have been resumed and are set to run.
1c79356b 2286 */
2d21ac55 2287 thread_dispatch(thread, self);
9bccf70c 2288
91447636 2289 if (continuation) {
2d21ac55
A
2290 self->continuation = self->parameter = NULL;
2291
9bccf70c 2292 (void) spllo();
55e303ae 2293
2d21ac55 2294 call_continuation(continuation, parameter, self->wait_result);
9bccf70c 2295 /*NOTREACHED*/
1c79356b
A
2296 }
2297
9bccf70c 2298 return (TRUE);
1c79356b
A
2299}
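/*
 * Editor's note: a compact, hypothetical sketch of the decision thread_invoke()
 * makes above, reduced to its three outcomes. The enum and helper are
 * illustrative only, not kernel interfaces; details such as the reserved-stack
 * exchange for realtime threads and the same-thread fast path are omitted.
 */
#include <stdbool.h>

enum switch_kind {
	SWITCH_STACK_HANDOFF,	/* donate the current kernel stack to the new thread */
	SWITCH_FULL_CONTEXT,	/* save registers and switch to the new thread's stack */
	SWITCH_NEED_STACK	/* no stack available; enqueue the thread and retry later */
};

static enum switch_kind
pick_switch_kind(bool self_has_continuation, bool next_has_stack, bool stack_alloc_ok)
{
	if (self_has_continuation && !next_has_stack)
		return SWITCH_STACK_HANDOFF;
	if (!next_has_stack && !stack_alloc_ok)
		return SWITCH_NEED_STACK;
	return SWITCH_FULL_CONTEXT;
}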
2300
3e170ce0
A
2301#if defined(CONFIG_SCHED_DEFERRED_AST)
2302/*
2303 * pset_cancel_deferred_dispatch:
2304 *
2305 * Cancels all ASTs that we can cancel for the given processor set
2306 * if the current processor is running the last runnable thread in the
2307 * system.
2308 *
2309 * This function assumes the current thread is runnable. This must
2310 * be called with the pset unlocked.
2311 */
2312static void
2313pset_cancel_deferred_dispatch(
2314 processor_set_t pset,
2315 processor_t processor)
2316{
2317 processor_t active_processor = NULL;
2318 uint32_t sampled_sched_run_count;
2319
2320 pset_lock(pset);
2321 sampled_sched_run_count = (volatile uint32_t) sched_run_count;
2322
2323 /*
2324 * If we have emptied the run queue, and our current thread is runnable, we
2325 * should tell any processors that are still DISPATCHING that they will
2326 * probably not have any work to do. In the event that there are no
2327 * pending signals that we can cancel, this is also uninteresting.
2328 *
2329 * In the unlikely event that another thread becomes runnable while we are
2330 * doing this (sched_run_count is atomically updated, not guarded), the
2331 * codepath making it runnable SHOULD (a dangerous word) need the pset lock
2332 * in order to dispatch it to a processor in our pset. So, the other
2333 * codepath will wait while we squash all cancelable ASTs, get the pset
2334 * lock, and then dispatch the freshly runnable thread. So this should be
2335 * correct (we won't accidentally have a runnable thread that hasn't been
2336 * dispatched to an idle processor), if not ideal (we may be restarting the
2337 * dispatch process, which could have some overhead).
2338 *
2339 */
2340 if ((sampled_sched_run_count == 1) &&
2341 (pset->pending_deferred_AST_cpu_mask)) {
2342 qe_foreach_element_safe(active_processor, &pset->active_queue, processor_queue) {
2343 /*
2344 * If a processor is DISPATCHING, it could be because of
2345 * a cancelable signal.
2346 *
2347 * IF the processor is not our
2348 * current processor (the current processor should not
2349 * be DISPATCHING, so this is a bit paranoid), AND there
2350 * is a cancelable signal pending on the processor, AND
2351 * there is no non-cancelable signal pending (as there is
2352 * no point trying to backtrack on bringing the processor
2353 * up if a signal we cannot cancel is outstanding), THEN
2354 * it should make sense to roll back the processor state
2355 * to the IDLE state.
2356 *
2357 * If the racy nature of this approach (as the signal
2358 * will be arbitrated by hardware, and can fire as we
2359 * roll back state) results in the core responding
2360 * despite being pushed back to the IDLE state, it
2361 * should be no different than if the core took some
2362 * interrupt while IDLE.
2363 */
2364 if ((active_processor->state == PROCESSOR_DISPATCHING) &&
2365 (pset->pending_deferred_AST_cpu_mask & (1ULL << active_processor->cpu_id)) &&
2366 (!(pset->pending_AST_cpu_mask & (1ULL << active_processor->cpu_id))) &&
2367 (active_processor != processor)) {
2368 /*
2369 * Squash all of the processor state back to some
2370 * reasonable facsimile of PROCESSOR_IDLE.
2371 *
2372 * TODO: What queue policy do we actually want here?
2373 * We want to promote selection of a good processor
2374 * to run on. Do we want to enqueue at the head?
2375 * The tail? At the (relative) old position in the
2376 * queue? Or something else entirely?
2377 */
2378 re_queue_head(&pset->idle_queue, (queue_entry_t)active_processor);
2379
2380 assert(active_processor->next_thread == THREAD_NULL);
2381
2382 active_processor->current_pri = IDLEPRI;
2383 active_processor->current_thmode = TH_MODE_FIXED;
2384 active_processor->current_sfi_class = SFI_CLASS_KERNEL;
2385 active_processor->deadline = UINT64_MAX;
2386 active_processor->state = PROCESSOR_IDLE;
2387 pset->pending_deferred_AST_cpu_mask &= ~(1U << active_processor->cpu_id);
2388 machine_signal_idle_cancel(active_processor);
2389 }
2390
2391 }
2392 }
2393
2394 pset_unlock(pset);
2395}
2396#else
2397/* We don't support deferred ASTs; everything is candy canes and sunshine. */
2398#endif
2399
1c79356b 2400/*
2d21ac55 2401 * thread_dispatch:
1c79356b 2402 *
2d21ac55
A
2403 * Handle threads at context switch. Re-dispatch other thread
2404 * if still running, otherwise update run state and perform
2405 * special actions. Update quantum for other thread and begin
2406 * the quantum for ourselves.
91447636 2407 *
3e170ce0
A
2408 * "thread" is the old thread that we have switched away from.
2409 * "self" is the new current thread that we have context switched to
39236c6e 2410 *
91447636 2411 * Called at splsched.
1c79356b
A
2412 */
2413void
2d21ac55
A
2414thread_dispatch(
2415 thread_t thread,
2416 thread_t self)
1c79356b 2417{
2d21ac55
A
2418 processor_t processor = self->last_processor;
2419
3e170ce0
A
2420 assert(processor == current_processor());
2421 assert(self == current_thread());
2422 assert(thread != self);
2423
2d21ac55 2424 if (thread != THREAD_NULL) {
91447636 2425 /*
2d21ac55
A
2426 * If blocked at a continuation, discard
2427 * the stack.
91447636 2428 */
2d21ac55
A
2429 if (thread->continuation != NULL && thread->kernel_stack != 0)
2430 stack_free(thread);
2431
3e170ce0
A
2432 if (thread->state & TH_IDLE) {
2433 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2434 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2435 (uintptr_t)thread_tid(thread), 0, thread->state, sched_run_count, 0);
2436 } else {
316670eb
A
2437 int64_t consumed;
2438 int64_t remainder = 0;
2439
2440 if (processor->quantum_end > processor->last_dispatch)
2441 remainder = processor->quantum_end -
2442 processor->last_dispatch;
2443
fe8ab488 2444 consumed = thread->quantum_remaining - remainder;
316670eb 2445
39236c6e 2446 if ((thread->reason & AST_LEDGER) == 0) {
316670eb 2447 /*
39236c6e
A
2448 * Bill CPU time to both the task and
2449 * the individual thread.
316670eb
A
2450 */
2451 ledger_credit(thread->t_ledger,
2452 task_ledgers.cpu_time, consumed);
2453 ledger_credit(thread->t_threadledger,
2454 thread_ledgers.cpu_time, consumed);
fe8ab488
A
2455#ifdef CONFIG_BANK
2456 if (thread->t_bankledger) {
2457 ledger_credit(thread->t_bankledger,
2458 bank_ledgers.cpu_time,
2459 (consumed - thread->t_deduct_bank_ledger_time));
2460
2461 }
2462 thread->t_deduct_bank_ledger_time = 0;
2463#endif
39236c6e 2464 }
316670eb 2465
2d21ac55
A
2466 wake_lock(thread);
2467 thread_lock(thread);
9bccf70c 2468
91447636 2469 /*
2d21ac55 2470 * Compute remainder of current quantum.
91447636 2471 */
3e170ce0 2472 if (processor->first_timeslice &&
316670eb 2473 processor->quantum_end > processor->last_dispatch)
fe8ab488 2474 thread->quantum_remaining = (uint32_t)remainder;
2d21ac55 2475 else
fe8ab488 2476 thread->quantum_remaining = 0;
2d21ac55 2477
6d2010ae 2478 if (thread->sched_mode == TH_MODE_REALTIME) {
2d21ac55
A
2479 /*
2480 * Cancel the deadline if the thread has
2481 * consumed the entire quantum.
2482 */
fe8ab488 2483 if (thread->quantum_remaining == 0) {
2d21ac55 2484 thread->realtime.deadline = UINT64_MAX;
2d21ac55 2485 }
b7266188 2486 } else {
3e170ce0 2487#if defined(CONFIG_SCHED_TIMESHARE_CORE)
2d21ac55
A
2488 /*
2489 * For non-realtime threads treat a tiny
2490 * remaining quantum as an expired quantum
2491 * but include what's left next time.
2492 */
fe8ab488 2493 if (thread->quantum_remaining < min_std_quantum) {
2d21ac55 2494 thread->reason |= AST_QUANTUM;
fe8ab488 2495 thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
2d21ac55 2496 }
3e170ce0 2497#endif /* CONFIG_SCHED_TIMESHARE_CORE */
2d21ac55
A
2498 }
2499
91447636 2500 /*
2d21ac55
A
2501 * If we are doing a direct handoff then
2502 * take the remainder of the quantum.
91447636 2503 */
2d21ac55 2504 if ((thread->reason & (AST_HANDOFF|AST_QUANTUM)) == AST_HANDOFF) {
fe8ab488 2505 self->quantum_remaining = thread->quantum_remaining;
2d21ac55 2506 thread->reason |= AST_QUANTUM;
fe8ab488
A
2507 thread->quantum_remaining = 0;
2508 } else {
2509#if defined(CONFIG_SCHED_MULTIQ)
3e170ce0
A
2510 if (SCHED(sched_groups_enabled) &&
2511 thread->sched_group == self->sched_group) {
fe8ab488 2512 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3e170ce0 2513 MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
fe8ab488
A
2514 self->reason, (uintptr_t)thread_tid(thread),
2515 self->quantum_remaining, thread->quantum_remaining, 0);
2516
2517 self->quantum_remaining = thread->quantum_remaining;
2518 thread->quantum_remaining = 0;
3e170ce0 2519 /* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
fe8ab488
A
2520 }
2521#endif /* defined(CONFIG_SCHED_MULTIQ) */
91447636 2522 }
91447636 2523
b0d623f7 2524 thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
2d21ac55 2525
39236c6e
A
2526 if ((thread->rwlock_count != 0) && !(LcksOpts & disLkRWPrio)) {
2527 integer_t priority;
2528
2529 priority = thread->sched_pri;
2530
3e170ce0
A
2531 if (priority < thread->base_pri)
2532 priority = thread->base_pri;
39236c6e
A
2533 if (priority < BASEPRI_BACKGROUND)
2534 priority = BASEPRI_BACKGROUND;
2535
2536 if ((thread->sched_pri < priority) || !(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
2537 KERNEL_DEBUG_CONSTANT(
2538 MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_PROMOTE) | DBG_FUNC_NONE,
3e170ce0 2539 (uintptr_t)thread_tid(thread), thread->sched_pri, thread->base_pri, priority, 0);
39236c6e
A
2540
2541 thread->sched_flags |= TH_SFLAG_RW_PROMOTED;
2542
2543 if (thread->sched_pri < priority)
2544 set_sched_pri(thread, priority);
2545 }
2546 }
2547
2d21ac55
A
2548 if (!(thread->state & TH_WAIT)) {
2549 /*
3e170ce0 2550 * Still runnable.
2d21ac55 2551 */
3e170ce0
A
2552 thread->last_made_runnable_time = mach_approximate_time();
2553
2554 machine_thread_going_off_core(thread, FALSE);
2555
2d21ac55
A
2556 if (thread->reason & AST_QUANTUM)
2557 thread_setrun(thread, SCHED_TAILQ);
3e170ce0 2558 else if (thread->reason & AST_PREEMPT)
2d21ac55
A
2559 thread_setrun(thread, SCHED_HEADQ);
2560 else
2561 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
2562
fe8ab488
A
2563 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2564 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2565 (uintptr_t)thread_tid(thread), thread->reason, thread->state, sched_run_count, 0);
3e170ce0 2566
316670eb
A
2567 if (thread->wake_active) {
2568 thread->wake_active = FALSE;
2569 thread_unlock(thread);
2570
2571 thread_wakeup(&thread->wake_active);
3e170ce0 2572 } else {
316670eb 2573 thread_unlock(thread);
3e170ce0 2574 }
316670eb 2575
2d21ac55 2576 wake_unlock(thread);
3e170ce0 2577 } else {
2d21ac55
A
2578 /*
2579 * Waiting.
2580 */
b7266188 2581 boolean_t should_terminate = FALSE;
fe8ab488 2582 uint32_t new_run_count;
b7266188
A
2583
2584 /* Only the first call to thread_dispatch
2585 * after explicit termination should add
2586 * the thread to the termination queue
2587 */
2588 if ((thread->state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) {
2589 should_terminate = TRUE;
2590 thread->state |= TH_TERMINATE2;
2591 }
2592
2d21ac55 2593 thread->state &= ~TH_RUN;
3e170ce0 2594 thread->last_made_runnable_time = ~0ULL;
39236c6e
A
2595 thread->chosen_processor = PROCESSOR_NULL;
2596
2597 if (thread->sched_mode == TH_MODE_TIMESHARE) {
fe8ab488
A
2598 if (thread->sched_flags & TH_SFLAG_THROTTLED)
2599 sched_background_decr(thread);
2600
2601 sched_share_decr(thread);
2602 }
2603 new_run_count = sched_run_decr(thread);
2d21ac55 2604
3e170ce0 2605#if CONFIG_SCHED_SFI
fe8ab488
A
2606 if ((thread->state & (TH_WAIT | TH_TERMINATE)) == TH_WAIT) {
2607 if (thread->reason & AST_SFI) {
2608 thread->wait_sfi_begin_time = processor->last_dispatch;
2609 }
39236c6e 2610 }
3e170ce0
A
2611#endif
2612
2613 machine_thread_going_off_core(thread, should_terminate);
fe8ab488
A
2614
2615 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2616 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2617 (uintptr_t)thread_tid(thread), thread->reason, thread->state, new_run_count, 0);
2d21ac55 2618
b7266188
A
2619 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
2620
2d21ac55
A
2621 if (thread->wake_active) {
2622 thread->wake_active = FALSE;
2623 thread_unlock(thread);
2624
2625 thread_wakeup(&thread->wake_active);
3e170ce0 2626 } else {
2d21ac55 2627 thread_unlock(thread);
3e170ce0 2628 }
91447636 2629
2d21ac55 2630 wake_unlock(thread);
91447636 2631
b7266188 2632 if (should_terminate)
2d21ac55
A
2633 thread_terminate_enqueue(thread);
2634 }
2635 }
91447636 2636 }
91447636 2637
3e170ce0
A
2638 /* Update (new) current thread and reprogram quantum timer */
2639 thread_lock(self);
2d21ac55 2640 if (!(self->state & TH_IDLE)) {
39236c6e
A
2641 uint64_t arg1, arg2;
2642 int urgency;
3e170ce0
A
2643 uint64_t latency;
2644
2645#if CONFIG_SCHED_SFI
fe8ab488
A
2646 ast_t new_ast;
2647
fe8ab488 2648 new_ast = sfi_thread_needs_ast(self, NULL);
fe8ab488
A
2649
2650 if (new_ast != AST_NONE) {
2651 ast_on(new_ast);
2652 }
3e170ce0
A
2653#endif
2654
2655 assert(processor->last_dispatch >= self->last_made_runnable_time);
2656 latency = processor->last_dispatch - self->last_made_runnable_time;
6d2010ae 2657
39236c6e
A
2658 urgency = thread_get_urgency(self, &arg1, &arg2);
2659
3e170ce0
A
2660 thread_tell_urgency(urgency, arg1, arg2, latency, self);
2661
2662 machine_thread_going_on_core(self, urgency, latency);
39236c6e 2663
91447636 2664 /*
2d21ac55 2665 * Get a new quantum if none remaining.
91447636 2666 */
fe8ab488 2667 if (self->quantum_remaining == 0) {
2d21ac55 2668 thread_quantum_init(self);
6d2010ae 2669 }
91447636
A
2670
2671 /*
2d21ac55 2672 * Set up quantum timer and timeslice.
91447636 2673 */
fe8ab488
A
2674 processor->quantum_end = processor->last_dispatch + self->quantum_remaining;
2675 timer_call_enter1(&processor->quantum_timer, self, processor->quantum_end, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
91447636 2676
3e170ce0
A
2677 processor->first_timeslice = TRUE;
2678 } else {
2679 timer_call_cancel(&processor->quantum_timer);
2680 processor->first_timeslice = FALSE;
91447636 2681
3e170ce0
A
2682 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
2683 machine_thread_going_on_core(self, THREAD_URGENCY_NONE, 0);
91447636 2684 }
6d2010ae 2685
3e170ce0
A
2686 self->computation_epoch = processor->last_dispatch;
2687 self->reason = AST_NONE;
2688
2689 thread_unlock(self);
2690
2691#if defined(CONFIG_SCHED_DEFERRED_AST)
2692 /*
2693 * TODO: Can we state that redispatching our old thread is also
2694 * uninteresting?
2695 */
2696 if ((((volatile uint32_t)sched_run_count) == 1) &&
2697 !(self->state & TH_IDLE)) {
2698 pset_cancel_deferred_dispatch(processor->processor_set, processor);
91447636 2699 }
3e170ce0
A
2700#endif
2701
91447636
A
2702}
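/*
 * Editor's note: a self-contained sketch of the quantum bookkeeping performed
 * in thread_dispatch() above. The struct and the min_quantum/fresh_quantum
 * parameters are illustrative stand-ins for processor->quantum_end,
 * thread->quantum_remaining, min_std_quantum and SCHED(initial_quantum_size)();
 * the first_timeslice and direct-handoff cases are simplified away.
 */
#include <stdbool.h>
#include <stdint.h>

struct quantum_acct {
	int64_t	 consumed;	/* CPU time to bill against the ledgers */
	uint32_t remaining;	/* quantum left to the departing thread */
	bool	 expired;	/* treat the quantum as used up (AST_QUANTUM) */
};

static struct quantum_acct
account_quantum(uint64_t quantum_end, uint64_t last_dispatch,
    uint32_t quantum_remaining, uint32_t min_quantum, uint32_t fresh_quantum,
    bool realtime)
{
	struct quantum_acct a = { 0, 0, false };
	uint64_t remainder = 0;

	if (quantum_end > last_dispatch)
		remainder = quantum_end - last_dispatch;

	a.consumed = (int64_t)quantum_remaining - (int64_t)remainder;
	a.remaining = (uint32_t)remainder;

	if (!realtime && a.remaining < min_quantum) {
		/* A tiny leftover counts as an expired quantum, but is carried into the next one. */
		a.expired = true;
		a.remaining += fresh_quantum;
	}
	return a;
}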
2703
2704/*
2d21ac55 2705 * thread_block_reason:
91447636 2706 *
2d21ac55
A
2707 * Forces a reschedule, blocking the caller if a wait
2708 * has been asserted.
91447636 2709 *
2d21ac55
A
2710 * If a continuation is specified, then thread_invoke will
2711 * attempt to discard the thread's kernel stack. When the
2712 * thread resumes, it will execute the continuation function
2713 * on a new kernel stack.
91447636 2714 */
2d21ac55
A
2715counter(mach_counter_t c_thread_block_calls = 0;)
2716
2717wait_result_t
2718thread_block_reason(
2719 thread_continue_t continuation,
2720 void *parameter,
2721 ast_t reason)
91447636 2722{
3e170ce0
A
2723 thread_t self = current_thread();
2724 processor_t processor;
2725 thread_t new_thread;
2726 spl_t s;
1c79356b
A
2727
2728 counter(++c_thread_block_calls);
2729
1c79356b
A
2730 s = splsched();
2731
55e303ae 2732 processor = current_processor();
1c79356b 2733
9bccf70c
A
2734 /* If we're explicitly yielding, force a subsequent quantum */
2735 if (reason & AST_YIELD)
3e170ce0 2736 processor->first_timeslice = FALSE;
0b4e3aa0 2737
9bccf70c
A
2738 /* We're handling all scheduling ASTs */
2739 ast_off(AST_SCHEDULING);
1c79356b 2740
490019cf
A
2741#if PROC_REF_DEBUG
2742 if ((continuation != NULL) && (self->task != kernel_task)) {
2743 if (uthread_get_proc_refcount(self->uthread) != 0) {
2744 panic("thread_block_reason with continuation uthread %p with uu_proc_refcount != 0", self->uthread);
2745 }
2746 }
2747#endif
2748
91447636
A
2749 self->continuation = continuation;
2750 self->parameter = parameter;
2751
fe8ab488 2752 if (self->state & ~(TH_RUN | TH_IDLE)) {
316670eb
A
2753 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2754 MACHDBG_CODE(DBG_MACH_SCHED,MACH_BLOCK),
2755 reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
b0d623f7
A
2756 }
2757
2d21ac55 2758 do {
91447636 2759 thread_lock(self);
fe8ab488 2760 new_thread = thread_select(self, processor, reason);
91447636 2761 thread_unlock(self);
2d21ac55 2762 } while (!thread_invoke(self, new_thread, reason));
1c79356b 2763
1c79356b
A
2764 splx(s);
2765
91447636 2766 return (self->wait_result);
1c79356b
A
2767}
2768
2769/*
2770 * thread_block:
2771 *
9bccf70c 2772 * Block the current thread if a wait has been asserted.
1c79356b 2773 */
91447636 2774wait_result_t
1c79356b 2775thread_block(
9bccf70c 2776 thread_continue_t continuation)
1c79356b 2777{
91447636
A
2778 return thread_block_reason(continuation, NULL, AST_NONE);
2779}
2780
2781wait_result_t
2782thread_block_parameter(
2783 thread_continue_t continuation,
2784 void *parameter)
2785{
2786 return thread_block_reason(continuation, parameter, AST_NONE);
1c79356b
A
2787}
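/*
 * Editor's note: a hedged sketch of how kernel code typically uses the
 * primitives above -- assert_wait() followed by thread_block()/thread_block_parameter()
 * with a continuation -- so a blocked thread can give up its kernel stack.
 * The event variable, the continuation body and its parameter are hypothetical;
 * only assert_wait(), thread_block_parameter(), thread_wakeup() and the
 * thread_continue_t shape are assumed from the usual kern/sched_prim.h
 * declarations.
 */
#include <kern/kern_types.h>
#include <kern/sched_prim.h>

static int example_event;	/* hypothetical wait channel */

static void
example_continue(void *param, wait_result_t wresult)
{
	/* Runs on a fresh kernel stack after the wakeup; the pre-block stack
	 * frame is gone, so anything needed here must arrive via param. */
	(void)param;
	if (wresult == THREAD_AWAKENED) {
		/* ... resume the deferred operation ... */
	}
	/* A continuation does not return to its caller; it typically blocks
	 * again or terminates the current activity. */
}

static void
example_wait(void)
{
	assert_wait((event_t)&example_event, THREAD_UNINT);
	(void)thread_block_parameter(example_continue, NULL);
	/* not reached when the continuation is taken */
}

static void
example_wake(void)
{
	thread_wakeup((event_t)&example_event);
}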
2788
2789/*
2790 * thread_run:
2791 *
91447636 2792 * Switch directly from the current thread to the
55e303ae 2793 * new thread, handing off our quantum if appropriate.
9bccf70c
A
2794 *
2795 * New thread must be runnable, and not on a run queue.
1c79356b 2796 *
55e303ae 2797 * Called at splsched.
1c79356b
A
2798 */
2799int
2800thread_run(
91447636 2801 thread_t self,
9bccf70c 2802 thread_continue_t continuation,
91447636 2803 void *parameter,
9bccf70c 2804 thread_t new_thread)
1c79356b 2805{
9bccf70c
A
2806 ast_t handoff = AST_HANDOFF;
2807
91447636
A
2808 self->continuation = continuation;
2809 self->parameter = parameter;
9bccf70c 2810
91447636 2811 while (!thread_invoke(self, new_thread, handoff)) {
2d21ac55 2812 processor_t processor = current_processor();
9bccf70c 2813
91447636 2814 thread_lock(self);
fe8ab488 2815 new_thread = thread_select(self, processor, AST_NONE);
91447636 2816 thread_unlock(self);
9bccf70c
A
2817 handoff = AST_NONE;
2818 }
2819
91447636 2820 return (self->wait_result);
1c79356b
A
2821}
2822
2823/*
91447636 2824 * thread_continue:
55e303ae 2825 *
91447636
A
2826 * Called at splsched when a thread first receives
2827 * a new stack after a continuation.
1c79356b
A
2828 */
2829void
91447636 2830thread_continue(
3e170ce0 2831 thread_t thread)
1c79356b 2832{
3e170ce0
A
2833 thread_t self = current_thread();
2834 thread_continue_t continuation;
2835 void *parameter;
b0d623f7
A
2836
2837 DTRACE_SCHED(on__cpu);
2838
91447636 2839 continuation = self->continuation;
91447636 2840 parameter = self->parameter;
9bccf70c 2841
2d21ac55 2842 thread_dispatch(thread, self);
9bccf70c 2843
2d21ac55 2844 self->continuation = self->parameter = NULL;
1c79356b 2845
2d21ac55 2846 if (thread != THREAD_NULL)
91447636 2847 (void)spllo();
9bccf70c 2848
2d21ac55 2849 TLOG(1, "thread_continue: calling call_continuation \n");
91447636
A
2850 call_continuation(continuation, parameter, self->wait_result);
2851 /*NOTREACHED*/
1c79356b
A
2852}
2853
2d21ac55 2854void
6d2010ae 2855thread_quantum_init(thread_t thread)
2d21ac55 2856{
6d2010ae 2857 if (thread->sched_mode == TH_MODE_REALTIME) {
fe8ab488 2858 thread->quantum_remaining = thread->realtime.computation;
6d2010ae 2859 } else {
fe8ab488 2860 thread->quantum_remaining = SCHED(initial_quantum_size)(thread);
6d2010ae
A
2861 }
2862}
2d21ac55 2863
fe8ab488 2864uint32_t
3e170ce0 2865sched_timeshare_initial_quantum_size(thread_t thread)
6d2010ae 2866{
fe8ab488 2867 if ((thread == THREAD_NULL) || !(thread->sched_flags & TH_SFLAG_THROTTLED))
316670eb
A
2868 return std_quantum;
2869 else
2870 return bg_quantum;
6d2010ae
A
2871}
2872
6d2010ae
A
2873/*
2874 * run_queue_init:
2875 *
2876 * Initialize a run queue before first use.
2877 */
2878void
2879run_queue_init(
2880 run_queue_t rq)
2881{
2882 int i;
2883
2884 rq->highq = IDLEPRI;
2d21ac55
A
2885 for (i = 0; i < NRQBM; i++)
2886 rq->bitmap[i] = 0;
2887 setbit(MAXPRI - IDLEPRI, rq->bitmap);
2888 rq->urgency = rq->count = 0;
2889 for (i = 0; i < NRQS; i++)
2890 queue_init(&rq->queues[i]);
2891}
1c79356b 2892
2d21ac55
A
2893/*
2894 * run_queue_dequeue:
2895 *
2896 * Perform a dequeue operation on a run queue,
2897 * and return the resulting thread.
2898 *
6d2010ae 2899 * The run queue must be locked (see thread_run_queue_remove()
2d21ac55
A
2900 * for more info), and not empty.
2901 */
6d2010ae 2902thread_t
2d21ac55
A
2903run_queue_dequeue(
2904 run_queue_t rq,
2905 integer_t options)
2906{
2907 thread_t thread;
2908 queue_t queue = rq->queues + rq->highq;
9bccf70c 2909
2d21ac55 2910 if (options & SCHED_HEADQ) {
6d2010ae 2911 thread = (thread_t)dequeue_head(queue);
2d21ac55
A
2912 }
2913 else {
6d2010ae 2914 thread = (thread_t)dequeue_tail(queue);
9bccf70c 2915 }
1c79356b 2916
2d21ac55 2917 thread->runq = PROCESSOR_NULL;
6d2010ae 2918 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2d21ac55 2919 rq->count--;
6d2010ae 2920 if (SCHED(priority_is_urgent)(rq->highq)) {
4a3eedf9
A
2921 rq->urgency--; assert(rq->urgency >= 0);
2922 }
2d21ac55
A
2923 if (queue_empty(queue)) {
2924 if (rq->highq != IDLEPRI)
2925 clrbit(MAXPRI - rq->highq, rq->bitmap);
2926 rq->highq = MAXPRI - ffsbit(rq->bitmap);
2927 }
1c79356b 2928
2d21ac55 2929 return (thread);
1c79356b
A
2930}
2931
6d2010ae
A
2932/*
2933 * run_queue_enqueue:
2934 *
2935 * Perform an enqueue operation on a run queue.
2936 *
2937 * The run queue must be locked (see thread_run_queue_remove()
2938 * for more info).
2939 */
2940boolean_t
2941run_queue_enqueue(
2942 run_queue_t rq,
2943 thread_t thread,
2944 integer_t options)
2945{
2946 queue_t queue = rq->queues + thread->sched_pri;
2947 boolean_t result = FALSE;
2948
2949 if (queue_empty(queue)) {
2950 enqueue_tail(queue, (queue_entry_t)thread);
2951
2952 setbit(MAXPRI - thread->sched_pri, rq->bitmap);
2953 if (thread->sched_pri > rq->highq) {
2954 rq->highq = thread->sched_pri;
2955 result = TRUE;
2956 }
fe8ab488 2957 } else {
6d2010ae
A
2958 if (options & SCHED_TAILQ)
2959 enqueue_tail(queue, (queue_entry_t)thread);
2960 else
2961 enqueue_head(queue, (queue_entry_t)thread);
fe8ab488 2962 }
6d2010ae
A
2963 if (SCHED(priority_is_urgent)(thread->sched_pri))
2964 rq->urgency++;
2965 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2966 rq->count++;
2967
2968 return (result);
2969
2970}
2971
2972/*
2973 * run_queue_remove:
2974 *
2975 * Remove a specific thread from a runqueue.
2976 *
2977 * The run queue must be locked.
2978 */
2979void
2980run_queue_remove(
2981 run_queue_t rq,
2982 thread_t thread)
2983{
2984
2985 remqueue((queue_entry_t)thread);
2986 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2987 rq->count--;
2988 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
2989 rq->urgency--; assert(rq->urgency >= 0);
2990 }
2991
2992 if (queue_empty(rq->queues + thread->sched_pri)) {
2993 /* update run queue status */
2994 if (thread->sched_pri != IDLEPRI)
2995 clrbit(MAXPRI - thread->sched_pri, rq->bitmap);
2996 rq->highq = MAXPRI - ffsbit(rq->bitmap);
2997 }
2998
2999 thread->runq = PROCESSOR_NULL;
3000}
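/*
 * Editor's note: a self-contained illustration of the bitmap trick used by
 * run_queue_dequeue()/run_queue_enqueue()/run_queue_remove() above. Priorities
 * are stored inverted (bit MAXPRI - pri), so the lowest set bit corresponds to
 * the highest non-empty priority and a single find-first-set recovers highq.
 * The _SKETCH names and plain uint32_t words are illustrative; the kernel uses
 * its own setbit/clrbit/ffsbit helpers over rq->bitmap.
 */
#include <stdint.h>

#define MAXPRI_SKETCH	127
#define NRQS_SKETCH	(MAXPRI_SKETCH + 1)
#define NRQBM_SKETCH	((NRQS_SKETCH + 31) / 32)

static void
bitmap_set_pri(uint32_t bm[NRQBM_SKETCH], int pri)
{
	int bit = MAXPRI_SKETCH - pri;
	bm[bit / 32] |= 1u << (bit % 32);
}

static void
bitmap_clear_pri(uint32_t bm[NRQBM_SKETCH], int pri)
{
	int bit = MAXPRI_SKETCH - pri;
	bm[bit / 32] &= ~(1u << (bit % 32));
}

/* Highest priority with a set bit, or -1 if the bitmap is empty. */
static int
bitmap_highest_pri(const uint32_t bm[NRQBM_SKETCH])
{
	for (int w = 0; w < NRQBM_SKETCH; w++) {
		if (bm[w] != 0)
			return MAXPRI_SKETCH - (w * 32 + __builtin_ctz(bm[w]));
	}
	return -1;
}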
3001
3e170ce0
A
3002/* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
3003void
3004rt_runq_scan(sched_update_scan_context_t scan_context)
6d2010ae 3005{
3e170ce0
A
3006 spl_t s;
3007 thread_t thread;
fe8ab488 3008
3e170ce0
A
3009 s = splsched();
3010 rt_lock_lock();
6d2010ae 3011
3e170ce0
A
3012 qe_foreach_element_safe(thread, &rt_runq.queue, links) {
3013 if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
3014 scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
3015 }
3016 }
6d2010ae 3017
3e170ce0
A
3018 rt_lock_unlock();
3019 splx(s);
6d2010ae
A
3020}
3021
3e170ce0 3022
1c79356b 3023/*
2d21ac55
A
3024 * realtime_queue_insert:
3025 *
3026 * Enqueue a thread for realtime execution.
1c79356b 3027 */
2d21ac55
A
3028static boolean_t
3029realtime_queue_insert(
3030 thread_t thread)
1c79356b 3031{
6d2010ae 3032 queue_t queue = &rt_runq.queue;
2d21ac55
A
3033 uint64_t deadline = thread->realtime.deadline;
3034 boolean_t preempt = FALSE;
1c79356b 3035
3e170ce0 3036 rt_lock_lock();
1c79356b 3037
55e303ae
A
3038 if (queue_empty(queue)) {
3039 enqueue_tail(queue, (queue_entry_t)thread);
2d21ac55 3040 preempt = TRUE;
55e303ae
A
3041 }
3042 else {
3043 register thread_t entry = (thread_t)queue_first(queue);
3044
3045 while (TRUE) {
3046 if ( queue_end(queue, (queue_entry_t)entry) ||
3047 deadline < entry->realtime.deadline ) {
3048 entry = (thread_t)queue_prev((queue_entry_t)entry);
3049 break;
3050 }
3051
3052 entry = (thread_t)queue_next((queue_entry_t)entry);
3053 }
3054
3055 if ((queue_entry_t)entry == queue)
2d21ac55 3056 preempt = TRUE;
55e303ae
A
3057
3058 insque((queue_entry_t)thread, (queue_entry_t)entry);
3059 }
3060
3e170ce0 3061 thread->runq = THREAD_ON_RT_RUNQ;
6d2010ae
A
3062 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
3063 rt_runq.count++;
55e303ae 3064
3e170ce0 3065 rt_lock_unlock();
55e303ae 3066
2d21ac55
A
3067 return (preempt);
3068}
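/*
 * Editor's note: a self-contained model of the deadline-ordered (earliest-
 * deadline-first) insert that realtime_queue_insert() performs above. The
 * rt_entry struct and the list head are illustrative; the kernel threads a
 * queue_entry_t through struct thread and holds rt_lock around the insert.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct rt_entry {
	uint64_t	 deadline;
	struct rt_entry	*next;
};

/*
 * Insert 'new_entry' so the list stays sorted by ascending deadline, keeping
 * FIFO order among equal deadlines. Returns true when the new entry became
 * the head, i.e. the case in which the caller would consider preemption.
 */
static bool
rt_insert_edf(struct rt_entry **head, struct rt_entry *new_entry)
{
	struct rt_entry **pp = head;

	while (*pp != NULL && (*pp)->deadline <= new_entry->deadline)
		pp = &(*pp)->next;

	new_entry->next = *pp;
	*pp = new_entry;

	return (*head == new_entry);
}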
55e303ae 3069
2d21ac55
A
3070/*
3071 * realtime_setrun:
3072 *
3073 * Dispatch a thread for realtime execution.
3074 *
3075 * Thread must be locked. Associated pset must
3076 * be locked, and is returned unlocked.
3077 */
3078static void
3079realtime_setrun(
3080 processor_t processor,
3081 thread_t thread)
3082{
3083 processor_set_t pset = processor->processor_set;
39236c6e 3084 ast_t preempt;
55e303ae 3085
fe8ab488
A
3086 boolean_t do_signal_idle = FALSE, do_cause_ast = FALSE;
3087
6d2010ae
A
3088 thread->chosen_processor = processor;
3089
fe8ab488
A
3090 /* <rdar://problem/15102234> */
3091 assert(thread->bound_processor == PROCESSOR_NULL);
3092
2d21ac55
A
3093 /*
3094 * Dispatch directly onto idle processor.
3095 */
6d2010ae
A
3096 if ( (thread->bound_processor == processor)
3097 && processor->state == PROCESSOR_IDLE) {
3098 remqueue((queue_entry_t)processor);
cf7d32b8 3099 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
55e303ae 3100
2d21ac55 3101 processor->next_thread = thread;
39236c6e
A
3102 processor->current_pri = thread->sched_pri;
3103 processor->current_thmode = thread->sched_mode;
fe8ab488 3104 processor->current_sfi_class = thread->sfi_class;
2d21ac55
A
3105 processor->deadline = thread->realtime.deadline;
3106 processor->state = PROCESSOR_DISPATCHING;
55e303ae 3107
39236c6e 3108 if (processor != current_processor()) {
3e170ce0 3109 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
39236c6e 3110 /* cleared on exit from main processor_idle() loop */
3e170ce0 3111 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
fe8ab488 3112 do_signal_idle = TRUE;
39236c6e
A
3113 }
3114 }
39236c6e 3115 pset_unlock(pset);
fe8ab488
A
3116
3117 if (do_signal_idle) {
3118 machine_signal_idle(processor);
3119 }
2d21ac55
A
3120 return;
3121 }
55e303ae 3122
39236c6e
A
3123 if (processor->current_pri < BASEPRI_RTQUEUES)
3124 preempt = (AST_PREEMPT | AST_URGENT);
3125 else if (thread->realtime.deadline < processor->deadline)
3126 preempt = (AST_PREEMPT | AST_URGENT);
3127 else
3128 preempt = AST_NONE;
3129
3130 realtime_queue_insert(thread);
3131
3132 if (preempt != AST_NONE) {
3133 if (processor->state == PROCESSOR_IDLE) {
3134 remqueue((queue_entry_t)processor);
3135 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3136 processor->next_thread = THREAD_NULL;
3137 processor->current_pri = thread->sched_pri;
3138 processor->current_thmode = thread->sched_mode;
fe8ab488 3139 processor->current_sfi_class = thread->sfi_class;
39236c6e
A
3140 processor->deadline = thread->realtime.deadline;
3141 processor->state = PROCESSOR_DISPATCHING;
3142 if (processor == current_processor()) {
3143 ast_on(preempt);
3144 } else {
3e170ce0 3145 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
39236c6e 3146 /* cleared on exit from main processor_idle() loop */
3e170ce0 3147 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
fe8ab488 3148 do_signal_idle = TRUE;
39236c6e 3149 }
fe8ab488 3150 }
39236c6e
A
3151 } else if (processor->state == PROCESSOR_DISPATCHING) {
3152 if ((processor->next_thread == THREAD_NULL) && ((processor->current_pri < thread->sched_pri) || (processor->deadline > thread->realtime.deadline))) {
3153 processor->current_pri = thread->sched_pri;
3154 processor->current_thmode = thread->sched_mode;
fe8ab488 3155 processor->current_sfi_class = thread->sfi_class;
39236c6e
A
3156 processor->deadline = thread->realtime.deadline;
3157 }
3158 } else {
3159 if (processor == current_processor()) {
3160 ast_on(preempt);
3161 } else {
3e170ce0 3162 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
39236c6e 3163 /* cleared after IPI causes csw_check() to be called */
3e170ce0 3164 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
fe8ab488 3165 do_cause_ast = TRUE;
39236c6e
A
3166 }
3167 }
3168 }
3169 } else {
3170 /* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
2d21ac55
A
3171 }
3172
3173 pset_unlock(pset);
fe8ab488
A
3174
3175 if (do_signal_idle) {
3176 machine_signal_idle(processor);
3177 } else if (do_cause_ast) {
3178 cause_ast_check(processor);
3179 }
2d21ac55
A
3180}
3181
6d2010ae 3182
fe8ab488
A
3183#if defined(CONFIG_SCHED_TIMESHARE_CORE)
3184
3185boolean_t
6d2010ae
A
3186priority_is_urgent(int priority)
3187{
3188 return testbit(priority, sched_preempt_pri) ? TRUE : FALSE;
3189}
3190
fe8ab488
A
3191#endif /* CONFIG_SCHED_TIMESHARE_CORE */
3192
55e303ae 3193/*
2d21ac55 3194 * processor_setrun:
55e303ae 3195 *
2d21ac55
A
3196 * Dispatch a thread for execution on a
3197 * processor.
55e303ae 3198 *
2d21ac55
A
3199 * Thread must be locked. Associated pset must
3200 * be locked, and is returned unlocked.
55e303ae 3201 */
2d21ac55
A
3202static void
3203processor_setrun(
3204 processor_t processor,
3205 thread_t thread,
3206 integer_t options)
55e303ae 3207{
2d21ac55
A
3208 processor_set_t pset = processor->processor_set;
3209 ast_t preempt;
39236c6e 3210 enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
3e170ce0 3211 enum { eNoSignal, eDoSignal, eDoDeferredSignal } do_signal_idle = eNoSignal;
55e303ae 3212
3e170ce0 3213 boolean_t do_cause_ast = FALSE;
fe8ab488 3214
6d2010ae
A
3215 thread->chosen_processor = processor;
3216
55e303ae 3217 /*
2d21ac55 3218 * Dispatch directly onto idle processor.
55e303ae 3219 */
6d2010ae
A
3220 if ( (SCHED(direct_dispatch_to_idle_processors) ||
3221 thread->bound_processor == processor)
3222 && processor->state == PROCESSOR_IDLE) {
3223 remqueue((queue_entry_t)processor);
cf7d32b8 3224 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
2d21ac55
A
3225
3226 processor->next_thread = thread;
39236c6e
A
3227 processor->current_pri = thread->sched_pri;
3228 processor->current_thmode = thread->sched_mode;
fe8ab488 3229 processor->current_sfi_class = thread->sfi_class;
2d21ac55
A
3230 processor->deadline = UINT64_MAX;
3231 processor->state = PROCESSOR_DISPATCHING;
2d21ac55 3232
3e170ce0 3233 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
39236c6e 3234 /* cleared on exit from main processor_idle() loop */
3e170ce0
A
3235 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3236 do_signal_idle = eDoSignal;
39236c6e
A
3237 }
3238
3239 pset_unlock(pset);
3e170ce0
A
3240
3241 if (do_signal_idle == eDoSignal) {
fe8ab488
A
3242 machine_signal_idle(processor);
3243 }
3244
2d21ac55
A
3245 return;
3246 }
55e303ae
A
3247
3248 /*
2d21ac55 3249 * Set preemption mode.
1c79356b 3250 */
3e170ce0
A
3251#if defined(CONFIG_SCHED_DEFERRED_AST)
3252 /* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
3253#endif
6d2010ae
A
3254 if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri)
3255 preempt = (AST_PREEMPT | AST_URGENT);
3256 else if(processor->active_thread && thread_eager_preemption(processor->active_thread))
55e303ae 3257 preempt = (AST_PREEMPT | AST_URGENT);
3e170ce0
A
3258 else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
3259 if(SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
39236c6e
A
3260 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
3261 } else {
3262 preempt = AST_NONE;
3263 }
3264 } else
2d21ac55 3265 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
9bccf70c 3266
39236c6e 3267 SCHED(processor_enqueue)(processor, thread, options);
9bccf70c 3268
2d21ac55 3269 if (preempt != AST_NONE) {
39236c6e
A
3270 if (processor->state == PROCESSOR_IDLE) {
3271 remqueue((queue_entry_t)processor);
3272 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3273 processor->next_thread = THREAD_NULL;
3274 processor->current_pri = thread->sched_pri;
3275 processor->current_thmode = thread->sched_mode;
fe8ab488 3276 processor->current_sfi_class = thread->sfi_class;
39236c6e
A
3277 processor->deadline = UINT64_MAX;
3278 processor->state = PROCESSOR_DISPATCHING;
3279
3280 ipi_action = eExitIdle;
3281 } else if ( processor->state == PROCESSOR_DISPATCHING) {
3282 if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) {
3283 processor->current_pri = thread->sched_pri;
3284 processor->current_thmode = thread->sched_mode;
fe8ab488 3285 processor->current_sfi_class = thread->sfi_class;
39236c6e
A
3286 processor->deadline = UINT64_MAX;
3287 }
3288 } else if ( (processor->state == PROCESSOR_RUNNING ||
2d21ac55 3289 processor->state == PROCESSOR_SHUTDOWN) &&
3e170ce0 3290 (thread->sched_pri >= processor->current_pri)) {
39236c6e 3291 ipi_action = eInterruptRunning;
2d21ac55 3292 }
39236c6e
A
3293 } else {
3294 /*
3295 * New thread is not important enough to preempt what is running, but
3296 * special processor states may need special handling
3297 */
3298 if (processor->state == PROCESSOR_SHUTDOWN &&
2d21ac55 3299 thread->sched_pri >= processor->current_pri ) {
39236c6e
A
3300 ipi_action = eInterruptRunning;
3301 } else if ( processor->state == PROCESSOR_IDLE &&
3302 processor != current_processor() ) {
3303 remqueue((queue_entry_t)processor);
3304 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3305 processor->next_thread = THREAD_NULL;
3306 processor->current_pri = thread->sched_pri;
3307 processor->current_thmode = thread->sched_mode;
fe8ab488 3308 processor->current_sfi_class = thread->sfi_class;
39236c6e
A
3309 processor->deadline = UINT64_MAX;
3310 processor->state = PROCESSOR_DISPATCHING;
3311
3312 ipi_action = eExitIdle;
3313 }
2d21ac55 3314 }
39236c6e
A
3315
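	/*
	 * ipi_action now records how the chosen processor must be nudged:
	 * eExitIdle means it was idle and has been marked DISPATCHING, so its
	 * idle loop needs a (possibly deferred) signal; eInterruptRunning
	 * means a running or shutting-down processor must be interrupted so
	 * it re-evaluates preemption; eDoNothing means the enqueue alone is
	 * sufficient.
	 */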
3316 switch (ipi_action) {
3317 case eDoNothing:
3318 break;
3319 case eExitIdle:
3320 if (processor == current_processor()) {
fe8ab488 3321 if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
39236c6e
A
3322 ast_on(preempt);
3323 } else {
3e170ce0
A
3324#if defined(CONFIG_SCHED_DEFERRED_AST)
3325 if (!(pset->pending_deferred_AST_cpu_mask & (1ULL << processor->cpu_id)) &&
3326 !(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
39236c6e 3327 /* cleared on exit from main processor_idle() loop */
3e170ce0
A
3328 pset->pending_deferred_AST_cpu_mask |= (1ULL << processor->cpu_id);
3329 do_signal_idle = eDoDeferredSignal;
3330 }
3331#else
3332 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3333 /* cleared on exit from main processor_idle() loop */
3334 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3335 do_signal_idle = eDoSignal;
39236c6e 3336 }
3e170ce0 3337#endif
39236c6e
A
3338 }
3339 break;
3340 case eInterruptRunning:
3341 if (processor == current_processor()) {
fe8ab488 3342 if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
39236c6e
A
3343 ast_on(preempt);
3344 } else {
3e170ce0 3345 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
39236c6e 3346 /* cleared after IPI causes csw_check() to be called */
3e170ce0 3347 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
fe8ab488 3348 do_cause_ast = TRUE;
39236c6e
A
3349 }
3350 }
3351 break;
6d2010ae 3352 }
2d21ac55
A
3353
3354 pset_unlock(pset);
fe8ab488 3355
3e170ce0 3356 if (do_signal_idle == eDoSignal) {
fe8ab488 3357 machine_signal_idle(processor);
fe8ab488 3358 }
3e170ce0
A
3359#if defined(CONFIG_SCHED_DEFERRED_AST)
3360 else if (do_signal_idle == eDoDeferredSignal) {
3361 /*
3362 * TODO: The ability to cancel this signal could make
3363 * sending it outside of the pset lock an issue. Do
3364 * we need to address this? Or would the only fallout
3365 * be that the core takes a signal? As long as we do
3366 * not run the risk of having a core marked as signal
3367 * outstanding, with no real signal outstanding, the
3368 * only result should be that we fail to cancel some
3369 * signals.
3370 */
3371 machine_signal_idle_deferred(processor);
316670eb 3372 }
3e170ce0
A
3373#endif
3374 else if (do_cause_ast) {
3375 cause_ast_check(processor);
6d2010ae 3376 }
6d2010ae
A
3377}
3378
2d21ac55
A
3379/*
3380 * choose_next_pset:
3381 *
3382 * Return the next sibling pset containing
3383 * available processors.
3384 *
3385 * Returns the original pset if none other is
3386 * suitable.
3387 */
3388static processor_set_t
3389choose_next_pset(
3390 processor_set_t pset)
3391{
3392 processor_set_t nset = pset;
3393
3394 do {
3395 nset = next_pset(nset);
6d2010ae 3396 } while (nset->online_processor_count < 1 && nset != pset);
2d21ac55 3397
cf7d32b8 3398 return (nset);
2d21ac55
A
3399}
3400
3401/*
3402 * choose_processor:
3403 *
3404 * Choose a processor for the thread, beginning at
b7266188 3405 * the pset. Accepts an optional processor hint in
2d21ac55
A
3406 * the pset.
3407 *
3408 * Returns a processor, possibly from a different pset.
3409 *
3410 * The thread must be locked. The pset must be locked,
3411 * and the resulting pset is locked on return.
3412 */
6d2010ae 3413processor_t
2d21ac55
A
3414choose_processor(
3415 processor_set_t pset,
b7266188 3416 processor_t processor,
2d21ac55
A
3417 thread_t thread)
3418{
3419 processor_set_t nset, cset = pset;
0b4c1975 3420
cf7d32b8 3421 /*
fe8ab488 3422 * Prefer the hinted processor, when appropriate.
cf7d32b8 3423 */
b7266188 3424
fe8ab488 3425 /* Fold last processor hint from secondary processor to its primary */
0b4c1975 3426 if (processor != PROCESSOR_NULL) {
fe8ab488 3427 processor = processor->processor_primary;
0b4c1975 3428 }
b0d623f7 3429
fe8ab488
A
3430 /*
3431 * Only consult platform layer if pset is active, which
3432 * it may not be in some cases when a multi-set system
3433 * is going to sleep.
3434 */
3435 if (pset->online_processor_count) {
3436 if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
3437 processor_t mc_processor = machine_choose_processor(pset, processor);
3438 if (mc_processor != PROCESSOR_NULL)
3439 processor = mc_processor->processor_primary;
3440 }
3441 }
b7266188 3442
fe8ab488
A
3443 /*
3444 * At this point, we may have a processor hint, and we may have
3445 * an initial starting pset. If the hint is not in the pset, or
3446 * if the hint is for a processor in an invalid state, discard
3447 * the hint.
3448 */
0b4c1975 3449 if (processor != PROCESSOR_NULL) {
fe8ab488 3450 if (processor->processor_set != pset) {
cf7d32b8 3451 processor = PROCESSOR_NULL;
3e170ce0
A
3452 } else if (!processor->is_recommended) {
3453 processor = PROCESSOR_NULL;
fe8ab488
A
3454 } else {
3455 switch (processor->state) {
3456 case PROCESSOR_START:
3457 case PROCESSOR_SHUTDOWN:
3458 case PROCESSOR_OFF_LINE:
3459 /*
3460 * Hint is for a processor that cannot support running new threads.
3461 */
3462 processor = PROCESSOR_NULL;
3463 break;
3464 case PROCESSOR_IDLE:
3465 /*
3466 * Hint is for an idle processor. Assume it is no worse than any other
3467 * idle processor. The platform layer had an opportunity to provide
3468 * the "least cost idle" processor above.
3469 */
3470 return (processor);
3471 break;
3472 case PROCESSOR_RUNNING:
3473 case PROCESSOR_DISPATCHING:
3474 /*
3475 * Hint is for an active CPU. This fast-path allows
3476 * realtime threads to preempt non-realtime threads
3477 * to regain their previous executing processor.
3478 */
3479 if ((thread->sched_pri >= BASEPRI_RTQUEUES) &&
3480 (processor->current_pri < BASEPRI_RTQUEUES))
3481 return (processor);
3482
3483 /* Otherwise, use hint as part of search below */
3484 break;
3485 default:
3486 processor = PROCESSOR_NULL;
3487 break;
3488 }
3489 }
b7266188 3490 }
2d21ac55
A
3491
3492 /*
fe8ab488
A
3493 * Iterate through the processor sets to locate
3494 * an appropriate processor. Seed results with
3495 * a last-processor hint, if available, so that
3496 * a search must find something strictly better
3497 * to replace it.
3498 *
3499 * A primary/secondary pair of SMT processors is
3500 * "unpaired" if the primary is busy but its
3501 * corresponding secondary is idle (so the physical
3502 * core has full use of its resources).
2d21ac55 3503 */
fe8ab488
A
3504
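	/*
	 * Illustrative sketch only (not part of the original logic): the
	 * "unpaired" test described above amounts to the expression below,
	 * evaluated for each element of cset->idle_secondary_queue.
	 */
#if 0
	boolean_t is_unpaired_secondary =
	    (processor->state == PROCESSOR_IDLE) &&
	    (processor->processor_primary->state == PROCESSOR_RUNNING ||
	     processor->processor_primary->state == PROCESSOR_DISPATCHING);
#endif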
3505 integer_t lowest_priority = MAXPRI + 1;
3506 integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
3507 integer_t lowest_count = INT_MAX;
3508 uint64_t furthest_deadline = 1;
3509 processor_t lp_processor = PROCESSOR_NULL;
3510 processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
3511 processor_t lp_unpaired_secondary_processor = PROCESSOR_NULL;
3512 processor_t lc_processor = PROCESSOR_NULL;
3513 processor_t fd_processor = PROCESSOR_NULL;
3514
3515 if (processor != PROCESSOR_NULL) {
3516 /* All other states should be enumerated above. */
3517 assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
3518
3519 lowest_priority = processor->current_pri;
3520 lp_processor = processor;
3521
3522 if (processor->current_pri >= BASEPRI_RTQUEUES) {
3523 furthest_deadline = processor->deadline;
3524 fd_processor = processor;
3525 }
3526
3527 lowest_count = SCHED(processor_runq_count)(processor);
3528 lc_processor = processor;
3529 }
3530
2d21ac55 3531 do {
fe8ab488 3532
9bccf70c 3533 /*
fe8ab488 3534 * Choose an idle processor, in pset traversal order
9bccf70c 3535 */
3e170ce0
A
3536 qe_foreach_element(processor, &cset->idle_queue, processor_queue) {
3537 if (processor->is_recommended)
3538 return processor;
3539 }
1c79356b 3540
fe8ab488
A
3541 /*
3542 * Otherwise, enumerate active and idle processors to find candidates
3543 * with lower priority/etc.
3544 */
0b4c1975 3545
3e170ce0
A
3546 qe_foreach_element(processor, &cset->active_queue, processor_queue) {
3547
3548 if (!processor->is_recommended) {
3549 continue;
3550 }
2d21ac55 3551
fe8ab488
A
3552 integer_t cpri = processor->current_pri;
3553 if (cpri < lowest_priority) {
3554 lowest_priority = cpri;
3555 lp_processor = processor;
3556 }
b0d623f7 3557
fe8ab488
A
3558 if ((cpri >= BASEPRI_RTQUEUES) && (processor->deadline > furthest_deadline)) {
3559 furthest_deadline = processor->deadline;
3560 fd_processor = processor;
3561 }
0b4c1975 3562
fe8ab488
A
3563 integer_t ccount = SCHED(processor_runq_count)(processor);
3564 if (ccount < lowest_count) {
3565 lowest_count = ccount;
3566 lc_processor = processor;
3567 }
fe8ab488
A
3568 }
3569
3570 /*
3571 * For SMT configs, these idle secondary processors must have an active primary. Otherwise
3572 * the idle primary would have short-circuited the loop above.
3573 */
3e170ce0
A
3574 qe_foreach_element(processor, &cset->idle_secondary_queue, processor_queue) {
3575
3576 if (!processor->is_recommended) {
3577 continue;
3578 }
3579
fe8ab488
A
3580 processor_t cprimary = processor->processor_primary;
3581
3582 /* If the primary processor is offline or starting up, it's not a candidate for this path */
3583 if (cprimary->state == PROCESSOR_RUNNING || cprimary->state == PROCESSOR_DISPATCHING) {
3584 integer_t primary_pri = cprimary->current_pri;
3585
3586 if (primary_pri < lowest_unpaired_primary_priority) {
3587 lowest_unpaired_primary_priority = primary_pri;
3588 lp_unpaired_primary_processor = cprimary;
3589 lp_unpaired_secondary_processor = processor;
0b4c1975 3590 }
2d21ac55 3591 }
fe8ab488
A
3592 }
3593
0b4c1975 3594
fe8ab488
A
3595 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
3596
3597 /*
3598 * For realtime threads, the most important aspect is
3599 * scheduling latency, so we attempt to assign threads
3600 * to good preemption candidates (assuming an idle primary
3601 * processor was not available above).
3602 */
3603
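		/*
		 * Preference order for a realtime thread, given that no idle
		 * recommended processor was found above: the primary of an
		 * "unpaired" pair, then the lowest-priority running processor,
		 * then the processor whose realtime deadline is furthest away;
		 * failing all of these, move on to the next pset.
		 */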
3604 if (thread->sched_pri > lowest_unpaired_primary_priority) {
3605 /* Move to end of active queue so that the next thread doesn't also pick it */
3e170ce0 3606 re_queue_tail(&cset->active_queue, (queue_entry_t)lp_unpaired_primary_processor);
fe8ab488
A
3607 return lp_unpaired_primary_processor;
3608 }
3609 if (thread->sched_pri > lowest_priority) {
3610 /* Move to end of active queue so that the next thread doesn't also pick it */
3e170ce0 3611 re_queue_tail(&cset->active_queue, (queue_entry_t)lp_processor);
0b4c1975 3612 return lp_processor;
fe8ab488 3613 }
0b4c1975
A
3614 if (thread->realtime.deadline < furthest_deadline)
3615 return fd_processor;
6d2010ae 3616
2d21ac55 3617 /*
fe8ab488
A
3618 * If all primary and secondary CPUs are busy with realtime
3619 * threads with deadlines earlier than ours, move on to the next
3620 * pset.
2d21ac55 3621 */
fe8ab488
A
3622 }
3623 else {
3624
3625 if (thread->sched_pri > lowest_unpaired_primary_priority) {
3626 /* Move to end of active queue so that the next thread doesn't also pick it */
3e170ce0 3627 re_queue_tail(&cset->active_queue, (queue_entry_t)lp_unpaired_primary_processor);
fe8ab488 3628 return lp_unpaired_primary_processor;
c910b4d9 3629 }
fe8ab488
A
3630 if (thread->sched_pri > lowest_priority) {
3631 /* Move to end of active queue so that the next thread doesn't also pick it */
3e170ce0 3632 re_queue_tail(&cset->active_queue, (queue_entry_t)lp_processor);
fe8ab488 3633 return lp_processor;
cf7d32b8 3634 }
9bccf70c 3635
9bccf70c 3636 /*
fe8ab488
A
3637 * If all primary processors in this pset are running a higher
3638 * priority thread, move on to the next pset. Only when we have
3639 * exhausted this search do we fall back to other heuristics.
1c79356b 3640 */
2d21ac55
A
3641 }
3642
3643 /*
fe8ab488 3644 * Move onto the next processor set.
2d21ac55
A
3645 */
3646 nset = next_pset(cset);
3647
3648 if (nset != pset) {
3649 pset_unlock(cset);
3650
3651 cset = nset;
3652 pset_lock(cset);
3653 }
3654 } while (nset != pset);
3655
3656 /*
fe8ab488
A
3657 * Make sure that we pick a running processor,
3658 * and that the correct processor set is locked.
3659 * Since we may have unlocked the candidate processor's
3660 * pset, it may have changed state.
3661 *
3662 * All primary processors are running a higher priority
3663 * thread, so the only options left are enqueuing on
3664 * the secondary processor whose primary is running the lowest
3665 * priority thread, or the least busy primary.
2d21ac55 3666 */
cf7d32b8 3667 do {
2d21ac55 3668
fe8ab488
A
3669 /* lowest_priority is evaluated in the main loops above */
3670 if (lp_unpaired_secondary_processor != PROCESSOR_NULL) {
3671 processor = lp_unpaired_secondary_processor;
3672 lp_unpaired_secondary_processor = PROCESSOR_NULL;
3673 } else if (lc_processor != PROCESSOR_NULL) {
3674 processor = lc_processor;
3675 lc_processor = PROCESSOR_NULL;
3676 } else {
cf7d32b8 3677 /*
fe8ab488
A
3678 * All processors are executing higher
3679 * priority threads, and the lowest_count
3680 * candidate was not usable
cf7d32b8 3681 */
fe8ab488 3682 processor = master_processor;
cf7d32b8
A
3683 }
3684
3685 /*
fe8ab488
A
3686 * Check that the correct processor set is
3687 * returned locked.
cf7d32b8
A
3688 */
3689 if (cset != processor->processor_set) {
3690 pset_unlock(cset);
cf7d32b8
A
3691 cset = processor->processor_set;
3692 pset_lock(cset);
3693 }
3694
3695 /*
fe8ab488
A
3696 * We must verify that the chosen processor is still available.
3697 * master_processor is an exception, since we may need to preempt
3698 * a running thread on it during processor shutdown (for sleep),
3699 * and that thread needs to be enqueued on its runqueue to run
3700 * when the processor is restarted.
cf7d32b8 3701 */
fe8ab488 3702 if (processor != master_processor && (processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE))
cf7d32b8 3703 processor = PROCESSOR_NULL;
fe8ab488 3704
cf7d32b8 3705 } while (processor == PROCESSOR_NULL);
2d21ac55
A
3706
3707 return (processor);
3708}
3709
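/*
 * A minimal usage sketch for choose_processor() (drawn from the contract
 * documented above, not new behavior; 'pset', 'processor' and 'thread' are
 * the caller's locals): the caller hands in a locked pset and must continue
 * with whichever pset comes back locked.
 */
#if 0
	/* thread locked, at splsched */
	pset_lock(pset);
	processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
	/* the pset that is now locked may differ from the one passed in */
	pset = processor->processor_set;

	/* enqueue or dispatch the thread here */

	pset_unlock(pset);
#endif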
3710/*
3711 * thread_setrun:
3712 *
3713 * Dispatch thread for execution, onto an idle
3714 * processor or run queue, and signal a preemption
3715 * as appropriate.
3716 *
3717 * Thread must be locked.
3718 */
3719void
3720thread_setrun(
3721 thread_t thread,
3722 integer_t options)
3723{
3724 processor_t processor;
3725 processor_set_t pset;
3726
3e170ce0
A
3727 assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN);
3728 assert(thread->runq == PROCESSOR_NULL);
3729
2d21ac55
A
3730 /*
3731 * Update priority if needed.
3732 */
6d2010ae
A
3733 if (SCHED(can_update_priority)(thread))
3734 SCHED(update_priority)(thread);
2d21ac55 3735
fe8ab488
A
3736 thread->sfi_class = sfi_thread_classify(thread);
3737
2d21ac55
A
3738 assert(thread->runq == PROCESSOR_NULL);
3739
3e170ce0 3740#if __SMP__
2d21ac55
A
3741 if (thread->bound_processor == PROCESSOR_NULL) {
3742 /*
3743 * Unbound case.
3744 */
3745 if (thread->affinity_set != AFFINITY_SET_NULL) {
3746 /*
3747 * Use affinity set policy hint.
3748 */
3749 pset = thread->affinity_set->aset_pset;
3750 pset_lock(pset);
3751
6d2010ae 3752 processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
39236c6e 3753
3e170ce0 3754 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
39236c6e 3755 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
3e170ce0 3756 } else if (thread->last_processor != PROCESSOR_NULL) {
2d21ac55
A
3757 /*
3758 * Simple (last processor) affinity case.
3759 */
3760 processor = thread->last_processor;
3761 pset = processor->processor_set;
3762 pset_lock(pset);
6d2010ae
A
3763 processor = SCHED(choose_processor)(pset, processor, thread);
3764
3e170ce0 3765 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
39236c6e 3766 (uintptr_t)thread_tid(thread), thread->last_processor->cpu_id, processor->cpu_id, processor->state, 0);
3e170ce0 3767 } else {
2d21ac55
A
3768 /*
3769 * No Affinity case:
3770 *
cf7d32b8
A
3771 * Utilize a per-task hint to spread threads
3772 * among the available processor sets.
2d21ac55 3773 */
cf7d32b8
A
3774 task_t task = thread->task;
3775
3776 pset = task->pset_hint;
3777 if (pset == PROCESSOR_SET_NULL)
3778 pset = current_processor()->processor_set;
3779
3780 pset = choose_next_pset(pset);
2d21ac55 3781 pset_lock(pset);
9bccf70c 3782
6d2010ae 3783 processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
cf7d32b8 3784 task->pset_hint = processor->processor_set;
39236c6e 3785
3e170ce0 3786 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
39236c6e 3787 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
55e303ae 3788 }
3e170ce0 3789 } else {
2d21ac55
A
3790 /*
3791 * Bound case:
3792 *
3793 * Unconditionally dispatch on the processor.
3794 */
3795 processor = thread->bound_processor;
55e303ae 3796 pset = processor->processor_set;
2d21ac55 3797 pset_lock(pset);
39236c6e 3798
3e170ce0 3799 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
39236c6e 3800 (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
2d21ac55 3801 }
3e170ce0
A
3802#else /* !__SMP__ */
3803 /* Only one processor to choose */
3804 assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == master_processor);
3805 processor = master_processor;
3806 pset = processor->processor_set;
3807 pset_lock(pset);
3808#endif /* !__SMP__ */
2d21ac55
A
3809
3810 /*
3e170ce0 3811 * Dispatch the thread on the chosen processor.
fe8ab488 3812 * TODO: This should be based on sched_mode, not sched_pri
2d21ac55
A
3813 */
3814 if (thread->sched_pri >= BASEPRI_RTQUEUES)
3815 realtime_setrun(processor, thread);
3816 else
3817 processor_setrun(processor, thread, options);
3818}
3819
b0d623f7
A
3820processor_set_t
3821task_choose_pset(
3822 task_t task)
3823{
3824 processor_set_t pset = task->pset_hint;
3825
3826 if (pset != PROCESSOR_SET_NULL)
3827 pset = choose_next_pset(pset);
3828
3829 return (pset);
3830}
3831
9bccf70c 3832/*
c910b4d9
A
3833 * Check for a preemption point in
3834 * the current context.
55e303ae 3835 *
fe8ab488 3836 * Called at splsched with thread locked.
9bccf70c
A
3837 */
3838ast_t
3839csw_check(
fe8ab488
A
3840 processor_t processor,
3841 ast_t check_reason)
39236c6e
A
3842{
3843 processor_set_t pset = processor->processor_set;
3844 ast_t result;
3845
3846 pset_lock(pset);
3847
3848 /* If we were sent a remote AST and interrupted a running processor, acknowledge it here with pset lock held */
3e170ce0 3849 pset->pending_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
39236c6e 3850
fe8ab488 3851 result = csw_check_locked(processor, pset, check_reason);
39236c6e
A
3852
3853 pset_unlock(pset);
3854
3855 return result;
3856}
3857
3858/*
3859 * Check for preemption at splsched with
fe8ab488 3860 * pset and thread locked
39236c6e
A
3861 */
3862ast_t
3863csw_check_locked(
3864 processor_t processor,
fe8ab488
A
3865 processor_set_t pset __unused,
3866 ast_t check_reason)
9bccf70c 3867{
fe8ab488 3868 ast_t result;
316670eb 3869 thread_t thread = processor->active_thread;
9bccf70c 3870
3e170ce0 3871 if (processor->first_timeslice) {
6d2010ae 3872 if (rt_runq.count > 0)
fe8ab488 3873 return (check_reason | AST_PREEMPT | AST_URGENT);
9bccf70c
A
3874 }
3875 else {
39236c6e
A
3876 if (rt_runq.count > 0) {
3877 if (BASEPRI_RTQUEUES > processor->current_pri)
fe8ab488 3878 return (check_reason | AST_PREEMPT | AST_URGENT);
39236c6e 3879 else
fe8ab488 3880 return (check_reason | AST_PREEMPT);
39236c6e 3881 }
1c79356b 3882 }
9bccf70c 3883
316670eb 3884 result = SCHED(processor_csw_check)(processor);
9bccf70c 3885 if (result != AST_NONE)
3e170ce0
A
3886 return (check_reason | result | (thread_eager_preemption(thread) ? AST_URGENT : AST_NONE));
3887
3888#if __SMP__
9bccf70c 3889
3e170ce0
A
3890 /*
3891 * If the current thread is running on a processor that is no longer recommended, gently
3892 * (non-urgently) get to a point where it can block, at which point thread_select() should
3893 * try to idle the processor and re-dispatch the thread to a recommended processor.
3894 */
3895 if (!processor->is_recommended)
fe8ab488 3896 return (check_reason | AST_PREEMPT);
3e170ce0
A
3897
3898 /*
3899 * Even though we could continue executing on this processor, a
3900 * secondary SMT core should try to shed load to another primary core.
3901 *
3902 * TODO: Should this do the same check that thread_select does? i.e.
3903 * if no bound threads target this processor, and idle primaries exist, preempt
3904 * The case of RT threads existing is already taken care of above
3905 * Consider Capri in this scenario.
3906 *
3907 * if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue))
3908 *
3909 * TODO: Alternatively - check if only primary is idle, or check if primary's pri is lower than mine.
3910 */
3911
3912 if (processor->current_pri < BASEPRI_RTQUEUES &&
3913 processor->processor_primary != processor)
3914 return (check_reason | AST_PREEMPT);
3915#endif
3916
316670eb 3917 if (thread->state & TH_SUSP)
fe8ab488
A
3918 return (check_reason | AST_PREEMPT);
3919
3e170ce0 3920#if CONFIG_SCHED_SFI
fe8ab488
A
3921 /*
3922 * Current thread may not need to be preempted, but maybe needs
3923 * an SFI wait?
3924 */
3925 result = sfi_thread_needs_ast(thread, NULL);
3926 if (result != AST_NONE)
3927 return (check_reason | result);
3e170ce0 3928#endif
c910b4d9
A
3929
3930 return (AST_NONE);
1c79356b
A
3931}
3932
3933/*
9bccf70c 3934 * set_sched_pri:
1c79356b 3935 *
55e303ae
A
3936 * Set the scheduled priority of the specified thread.
3937 *
9bccf70c 3938 * This may cause the thread to change queues.
1c79356b 3939 *
55e303ae 3940 * Thread must be locked.
1c79356b
A
3941 */
3942void
9bccf70c 3943set_sched_pri(
3e170ce0
A
3944 thread_t thread,
3945 int priority)
1c79356b 3946{
3e170ce0
A
3947 thread_t cthread = current_thread();
3948 boolean_t is_current_thread = (thread == cthread) ? TRUE : FALSE;
fe8ab488
A
3949 int curgency, nurgency;
3950 uint64_t urgency_param1, urgency_param2;
3e170ce0 3951 boolean_t removed_from_runq = FALSE;
9bccf70c 3952
3e170ce0
A
3953 /* If we're already at this priority, no need to mess with the runqueue */
3954 if (priority == thread->sched_pri)
3955 return;
3956
3957 if (is_current_thread) {
3958 assert(thread->runq == PROCESSOR_NULL);
fe8ab488 3959 curgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
3e170ce0
A
3960 } else {
3961 removed_from_runq = thread_run_queue_remove(thread);
fe8ab488 3962 }
3e170ce0 3963
490019cf
A
3964 thread->sched_pri = priority;
3965
3e170ce0
A
3966 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
3967 (uintptr_t)thread_tid(thread),
3968 thread->base_pri,
3969 thread->sched_pri,
3970 0, /* eventually, 'reason' */
3971 0);
3972
3e170ce0 3973 if (is_current_thread) {
fe8ab488 3974 nurgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
3e170ce0
A
3975 /*
3976 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
3977 * class alterations from user space to occur relatively infrequently, hence
3978 * those are lazily handled. QoS classes have distinct priority bands, and QoS
3979 * inheritance is expected to involve priority changes.
3980 */
fe8ab488 3981 if (nurgency != curgency) {
3e170ce0
A
3982 thread_tell_urgency(nurgency, urgency_param1, urgency_param2, 0, thread);
3983 machine_thread_going_on_core(thread, nurgency, 0);
fe8ab488
A
3984 }
3985 }
3986
3e170ce0
A
3987 /* TODO: Should this be TAILQ if it went down, HEADQ if it went up? */
3988 if (removed_from_runq)
3989 thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
3990 else if (thread->state & TH_RUN) {
3991 processor_t processor = thread->last_processor;
9bccf70c 3992
3e170ce0
A
3993 if (is_current_thread) {
3994 ast_t preempt;
9bccf70c 3995
9bccf70c 3996 processor->current_pri = priority;
6d2010ae 3997 processor->current_thmode = thread->sched_mode;
fe8ab488
A
3998 processor->current_sfi_class = thread->sfi_class = sfi_thread_classify(thread);
3999 if ((preempt = csw_check(processor, AST_NONE)) != AST_NONE)
c910b4d9 4000 ast_on(preempt);
3e170ce0 4001 } else if (processor != PROCESSOR_NULL && processor->active_thread == thread)
9bccf70c 4002 cause_ast_check(processor);
1c79356b
A
4003 }
4004}
4005
3e170ce0
A
4006/*
4007 * thread_run_queue_remove_for_handoff
4008 *
4009 * Pull a thread or its (recursive) push target out of the runqueue
4010 * so that it is ready for thread_run()
4011 *
4012 * Called at splsched
4013 *
4014 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
4015 * This may be different than the thread that was passed in.
4016 */
4017thread_t
4018thread_run_queue_remove_for_handoff(thread_t thread) {
91447636 4019
3e170ce0 4020 thread_t pulled_thread = THREAD_NULL;
91447636 4021
3e170ce0 4022 thread_lock(thread);
91447636 4023
3e170ce0
A
4024 /*
4025 * Check that the thread is not bound
4026 * to a different processor, and that realtime
4027 * is not involved.
4028 *
4029 * Next, pull it off its run queue. If it
4030 * cannot be removed, it is not eligible.
4031 */
91447636 4032
3e170ce0
A
4033 processor_t processor = current_processor();
4034 if (processor->current_pri < BASEPRI_RTQUEUES && thread->sched_pri < BASEPRI_RTQUEUES &&
4035 (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)) {
91447636 4036
3e170ce0
A
4037 if (thread_run_queue_remove(thread))
4038 pulled_thread = thread;
91447636
A
4039 }
4040
3e170ce0 4041 thread_unlock(thread);
6d2010ae 4042
3e170ce0 4043 return pulled_thread;
6d2010ae
A
4044}
4045
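/*
 * Hedged usage sketch (a hypothetical caller, not code from this file;
 * 'target', 'self' and 'continuation' are illustrative names): a direct
 * handoff pulls the target off its run queue and, if that succeeds,
 * switches to it with thread_run().
 */
#if 0
	thread_t pulled = thread_run_queue_remove_for_handoff(target);

	if (pulled != THREAD_NULL) {
		/* hand the processor directly to the pulled thread */
		thread_run(self, (thread_continue_t)continuation, NULL, pulled);
		/* NOTREACHED when the handoff succeeds */
	}
#endif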
1c79356b 4046/*
6d2010ae 4047 * thread_run_queue_remove:
1c79356b 4048 *
fe8ab488 4049 * Remove a thread from its current run queue and
2d21ac55 4050 * return TRUE if successful.
55e303ae
A
4051 *
4052 * Thread must be locked.
fe8ab488
A
4053 *
4054 * If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
4055 * run queues because the caller locked the thread. Otherwise
4056 * the thread is on a run queue, but could be chosen for dispatch
4057 * and removed by another processor under a different lock, which
4058 * will set thread->runq to PROCESSOR_NULL.
4059 *
4060 * Hence the thread select path must not rely on anything that could
4061 * be changed under the thread lock after calling this function,
4062 * most importantly thread->sched_pri.
1c79356b 4063 */
2d21ac55 4064boolean_t
6d2010ae 4065thread_run_queue_remove(
fe8ab488 4066 thread_t thread)
1c79356b 4067{
fe8ab488
A
4068 boolean_t removed = FALSE;
4069 processor_t processor = thread->runq;
1c79356b 4070
fe8ab488
A
4071 if ((thread->state & (TH_RUN|TH_WAIT)) == TH_WAIT) {
4072 /* Thread isn't runnable */
4073 assert(thread->runq == PROCESSOR_NULL);
4074 return FALSE;
4075 }
55e303ae 4076
fe8ab488 4077 if (processor == PROCESSOR_NULL) {
55e303ae 4078 /*
fe8ab488
A
4079 * The thread is either not on the runq,
4080 * or is in the midst of being removed from the runq.
4081 *
4082 * runq is set to NULL under the pset lock, not the thread
4083 * lock, so the thread may still be in the process of being dequeued
4084 * from the runq. It will wait in thread_invoke() for the thread lock to be
4085 * dropped.
55e303ae 4086 */
55e303ae 4087
fe8ab488
A
4088 return FALSE;
4089 }
55e303ae 4090
fe8ab488
A
4091 if (thread->sched_pri < BASEPRI_RTQUEUES) {
4092 return SCHED(processor_queue_remove)(processor, thread);
4093 }
55e303ae 4094
3e170ce0 4095 rt_lock_lock();
55e303ae 4096
fe8ab488
A
4097 if (thread->runq != PROCESSOR_NULL) {
4098 /*
3e170ce0 4099 * Thread is on the RT run queue and we have a lock on
fe8ab488
A
4100 * that run queue.
4101 */
4102
3e170ce0 4103 assert(thread->runq == THREAD_ON_RT_RUNQ);
fe8ab488
A
4104
4105 remqueue((queue_entry_t)thread);
4106 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
4107 rt_runq.count--;
4108
4109 thread->runq = PROCESSOR_NULL;
4110
4111 removed = TRUE;
1c79356b
A
4112 }
4113
3e170ce0 4114 rt_lock_unlock();
fe8ab488
A
4115
4116 return (removed);
1c79356b
A
4117}
4118
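/*
 * Hedged sketch of the intended remove/reinsert pattern (it mirrors the
 * usage in set_sched_pri() above and is shown only as an illustration;
 * 'new_pri' is a hypothetical value):
 */
#if 0
	s = splsched();
	thread_lock(thread);

	if (thread_run_queue_remove(thread)) {
		/* safe to change fields that affect run queue placement */
		thread->sched_pri = new_pri;
		thread_run_queue_reinsert(thread, SCHED_TAILQ);
	}

	thread_unlock(thread);
	splx(s);
#endif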
cf7d32b8 4119/*
3e170ce0 4120 * Put the thread back where it goes after a thread_run_queue_remove
cf7d32b8 4121 *
3e170ce0 4122 * Thread must have been removed while holding the same thread lock
cf7d32b8 4123 *
3e170ce0 4124 * thread locked, at splsched
cf7d32b8 4125 */
3e170ce0
A
4126void
4127thread_run_queue_reinsert(thread_t thread, integer_t options)
cf7d32b8 4128{
3e170ce0 4129 assert(thread->runq == PROCESSOR_NULL);
cf7d32b8 4130
3e170ce0
A
4131 assert(thread->state & (TH_RUN));
4132 thread_setrun(thread, options);
6d2010ae 4133
6d2010ae
A
4134}
4135
39236c6e
A
4136void
4137sys_override_cpu_throttle(int flag)
6d2010ae 4138{
39236c6e
A
4139 if (flag == CPU_THROTTLE_ENABLE)
4140 cpu_throttle_enabled = 1;
4141 if (flag == CPU_THROTTLE_DISABLE)
4142 cpu_throttle_enabled = 0;
4143}
6d2010ae 4144
39236c6e
A
4145int
4146thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
4147{
4148 if (thread == NULL || (thread->state & TH_IDLE)) {
4149 *arg1 = 0;
4150 *arg2 = 0;
4151
4152 return (THREAD_URGENCY_NONE);
4153 } else if (thread->sched_mode == TH_MODE_REALTIME) {
4154 *arg1 = thread->realtime.period;
4155 *arg2 = thread->realtime.deadline;
4156
4157 return (THREAD_URGENCY_REAL_TIME);
4158 } else if (cpu_throttle_enabled &&
3e170ce0 4159 ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
39236c6e
A
4160 /*
4161 * Background urgency applied when thread priority is MAXPRI_THROTTLE or lower and thread is not promoted
fe8ab488 4162 * TODO: Use TH_SFLAG_THROTTLED instead?
39236c6e
A
4163 */
4164 *arg1 = thread->sched_pri;
3e170ce0 4165 *arg2 = thread->base_pri;
6d2010ae 4166
39236c6e
A
4167 return (THREAD_URGENCY_BACKGROUND);
4168 } else {
fe8ab488
A
4169 /* For otherwise unclassified threads, report throughput QoS
4170 * parameters
4171 */
4172 *arg1 = thread->effective_policy.t_through_qos;
4173 *arg2 = thread->task->effective_policy.t_through_qos;
39236c6e 4174
6d2010ae
A
4175 return (THREAD_URGENCY_NORMAL);
4176 }
6d2010ae
A
4177}
4178
4179
1c79356b 4180/*
2d21ac55
A
4181 * This is the processor idle loop, which just looks for other threads
4182 * to execute. Processor idle threads invoke this without supplying a
4183 * current thread, in order to idle without an asserted wait state.
4184 *
4185 * Returns the next thread to execute if dispatched directly.
1c79356b 4186 */
6d2010ae
A
4187
4188#if 0
4189#define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
4190#else
4191#define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
4192#endif
4193
4194thread_t
2d21ac55
A
4195processor_idle(
4196 thread_t thread,
4197 processor_t processor)
1c79356b 4198{
2d21ac55
A
4199 processor_set_t pset = processor->processor_set;
4200 thread_t new_thread;
4201 int state;
2d21ac55 4202 (void)splsched();
1c79356b 4203
316670eb
A
4204 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4205 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_START,
4206 (uintptr_t)thread_tid(thread), 0, 0, 0, 0);
3a60a9f5 4207
6d2010ae
A
4208 SCHED_STATS_CPU_IDLE_START(processor);
4209
2d21ac55
A
4210 timer_switch(&PROCESSOR_DATA(processor, system_state),
4211 mach_absolute_time(), &PROCESSOR_DATA(processor, idle_state));
4212 PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state);
3a60a9f5 4213
39236c6e 4214 while (1) {
39236c6e
A
4215 if (processor->state != PROCESSOR_IDLE) /* unsafe, but worst case we loop around once */
4216 break;
3e170ce0 4217 if (pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))
39236c6e 4218 break;
3e170ce0
A
4219 if (processor->is_recommended) {
4220 if (rt_runq.count)
4221 break;
4222 } else {
4223 if (SCHED(processor_bound_count)(processor))
4224 break;
4225 }
4226
39236c6e
A
4227#if CONFIG_SCHED_IDLE_IN_PLACE
4228 if (thread != THREAD_NULL) {
4229 /* Did idle-in-place thread wake up */
4230 if ((thread->state & (TH_WAIT|TH_SUSP)) != TH_WAIT || thread->wake_active)
4231 break;
4232 }
4233#endif
4234
6d2010ae
A
4235 IDLE_KERNEL_DEBUG_CONSTANT(
4236 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -1, 0);
4237
4b17d6b6
A
4238 machine_track_platform_idle(TRUE);
4239
2d21ac55 4240 machine_idle();
55e303ae 4241
4b17d6b6
A
4242 machine_track_platform_idle(FALSE);
4243
55e303ae 4244 (void)splsched();
c910b4d9 4245
6d2010ae
A
4246 IDLE_KERNEL_DEBUG_CONSTANT(
4247 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -2, 0);
4248
fe8ab488
A
4249 if (!SCHED(processor_queue_empty)(processor)) {
4250 /* Secondary SMT processors respond to directed wakeups
4251 * exclusively. Some platforms induce 'spurious' SMT wakeups.
4252 */
4253 if (processor->processor_primary == processor)
4254 break;
4255 }
55e303ae
A
4256 }
4257
2d21ac55
A
4258 timer_switch(&PROCESSOR_DATA(processor, idle_state),
4259 mach_absolute_time(), &PROCESSOR_DATA(processor, system_state));
4260 PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state);
1c79356b 4261
2d21ac55
A
4262 pset_lock(pset);
4263
39236c6e 4264 /* If we were sent a remote AST and came out of idle, acknowledge it here with pset lock held */
3e170ce0
A
4265 pset->pending_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
4266#if defined(CONFIG_SCHED_DEFERRED_AST)
4267 pset->pending_deferred_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
4268#endif
39236c6e 4269
55e303ae
A
4270 state = processor->state;
4271 if (state == PROCESSOR_DISPATCHING) {
1c79356b 4272 /*
55e303ae 4273 * Commmon case -- cpu dispatched.
1c79356b 4274 */
2d21ac55
A
4275 new_thread = processor->next_thread;
4276 processor->next_thread = THREAD_NULL;
55e303ae 4277 processor->state = PROCESSOR_RUNNING;
1c79356b 4278
39236c6e 4279 if ((new_thread != THREAD_NULL) && (SCHED(processor_queue_has_priority)(processor, new_thread->sched_pri, FALSE) ||
3e170ce0 4280 (rt_runq.count > 0)) ) {
fe8ab488 4281 /* Something higher priority has popped up on the runqueue - redispatch this thread elsewhere */
39236c6e
A
4282 processor->current_pri = IDLEPRI;
4283 processor->current_thmode = TH_MODE_FIXED;
fe8ab488 4284 processor->current_sfi_class = SFI_CLASS_KERNEL;
2d21ac55 4285 processor->deadline = UINT64_MAX;
55e303ae 4286
2d21ac55 4287 pset_unlock(pset);
1c79356b 4288
2d21ac55 4289 thread_lock(new_thread);
6d2010ae 4290 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REDISPATCH), (uintptr_t)thread_tid(new_thread), new_thread->sched_pri, rt_runq.count, 0, 0);
2d21ac55
A
4291 thread_setrun(new_thread, SCHED_HEADQ);
4292 thread_unlock(new_thread);
55e303ae 4293
316670eb
A
4294 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4295 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4296 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
6d2010ae 4297
2d21ac55 4298 return (THREAD_NULL);
1c79356b 4299 }
1c79356b 4300
2d21ac55
A
4301 pset_unlock(pset);
4302
316670eb
A
4303 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4304 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4305 (uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0);
6d2010ae 4306
2d21ac55 4307 return (new_thread);
55e303ae
A
4308 }
4309 else
4310 if (state == PROCESSOR_IDLE) {
6d2010ae 4311 remqueue((queue_entry_t)processor);
1c79356b 4312
2d21ac55 4313 processor->state = PROCESSOR_RUNNING;
39236c6e
A
4314 processor->current_pri = IDLEPRI;
4315 processor->current_thmode = TH_MODE_FIXED;
fe8ab488 4316 processor->current_sfi_class = SFI_CLASS_KERNEL;
39236c6e 4317 processor->deadline = UINT64_MAX;
cf7d32b8 4318 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
1c79356b 4319 }
55e303ae
A
4320 else
4321 if (state == PROCESSOR_SHUTDOWN) {
4322 /*
4323 * Going off-line. Force a
4324 * reschedule.
4325 */
2d21ac55
A
4326 if ((new_thread = processor->next_thread) != THREAD_NULL) {
4327 processor->next_thread = THREAD_NULL;
39236c6e
A
4328 processor->current_pri = IDLEPRI;
4329 processor->current_thmode = TH_MODE_FIXED;
fe8ab488 4330 processor->current_sfi_class = SFI_CLASS_KERNEL;
55e303ae 4331 processor->deadline = UINT64_MAX;
2d21ac55
A
4332
4333 pset_unlock(pset);
55e303ae
A
4334
4335 thread_lock(new_thread);
4336 thread_setrun(new_thread, SCHED_HEADQ);
4337 thread_unlock(new_thread);
55e303ae 4338
316670eb
A
4339 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4340 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4341 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
6d2010ae 4342
2d21ac55
A
4343 return (THREAD_NULL);
4344 }
55e303ae
A
4345 }
4346
2d21ac55
A
4347 pset_unlock(pset);
4348
316670eb
A
4349 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4350 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4351 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
6d2010ae 4352
2d21ac55
A
4353 return (THREAD_NULL);
4354}
4355
cf7d32b8
A
4356/*
4357 * Each processor has a dedicated thread which
4358 * executes the idle loop when there is no suitable
4359 * previous context.
4360 */
2d21ac55
A
4361void
4362idle_thread(void)
4363{
4364 processor_t processor = current_processor();
4365 thread_t new_thread;
4366
4367 new_thread = processor_idle(THREAD_NULL, processor);
4368 if (new_thread != THREAD_NULL) {
4369 thread_run(processor->idle_thread, (thread_continue_t)idle_thread, NULL, new_thread);
4370 /*NOTREACHED*/
4371 }
55e303ae 4372
2d21ac55 4373 thread_block((thread_continue_t)idle_thread);
55e303ae 4374 /*NOTREACHED*/
1c79356b
A
4375}
4376
91447636
A
4377kern_return_t
4378idle_thread_create(
4379 processor_t processor)
1c79356b 4380{
91447636
A
4381 kern_return_t result;
4382 thread_t thread;
4383 spl_t s;
4384
4385 result = kernel_thread_create((thread_continue_t)idle_thread, NULL, MAXPRI_KERNEL, &thread);
4386 if (result != KERN_SUCCESS)
4387 return (result);
4388
4389 s = splsched();
4390 thread_lock(thread);
4391 thread->bound_processor = processor;
4392 processor->idle_thread = thread;
3e170ce0 4393 thread->sched_pri = thread->base_pri = IDLEPRI;
91447636 4394 thread->state = (TH_RUN | TH_IDLE);
39236c6e 4395 thread->options |= TH_OPT_IDLE_THREAD;
91447636
A
4396 thread_unlock(thread);
4397 splx(s);
4398
4399 thread_deallocate(thread);
4400
4401 return (KERN_SUCCESS);
1c79356b
A
4402}
4403
91447636
A
4404/*
4405 * sched_startup:
4406 *
4407 * Kicks off scheduler services.
4408 *
4409 * Called at splsched.
4410 */
0b4e3aa0 4411void
91447636 4412sched_startup(void)
0b4e3aa0 4413{
91447636
A
4414 kern_return_t result;
4415 thread_t thread;
4416
3e170ce0
A
4417 simple_lock_init(&sched_vm_group_list_lock, 0);
4418
490019cf 4419
6d2010ae 4420 result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
39236c6e 4421 (void *)SCHED(maintenance_continuation), MAXPRI_KERNEL, &thread);
91447636
A
4422 if (result != KERN_SUCCESS)
4423 panic("sched_startup");
4424
4425 thread_deallocate(thread);
4426
4427 /*
316670eb
A
4428 * Yield to the sched_init_thread once, to
4429 * initialize our own thread after being switched
4430 * back to.
91447636
A
4431 *
4432 * The current thread is the only other thread
4433 * active at this point.
4434 */
316670eb 4435 thread_block(THREAD_CONTINUE_NULL);
6d2010ae 4436}
91447636 4437
fe8ab488 4438#if defined(CONFIG_SCHED_TIMESHARE_CORE)
91447636 4439
39236c6e 4440static volatile uint64_t sched_maintenance_deadline;
fe8ab488
A
4441#if defined(CONFIG_TELEMETRY)
4442static volatile uint64_t sched_telemetry_deadline = 0;
4443#endif
39236c6e
A
4444static uint64_t sched_tick_last_abstime;
4445static uint64_t sched_tick_delta;
4446uint64_t sched_tick_max_delta;
1c79356b 4447/*
6d2010ae 4448 * sched_init_thread:
1c79356b 4449 *
55e303ae
A
4450 * Perform periodic bookkeeping functions about ten
4451 * times per second.
1c79356b 4452 */
fe8ab488 4453void
3e170ce0 4454sched_timeshare_maintenance_continue(void)
1c79356b 4455{
fe8ab488
A
4456 uint64_t sched_tick_ctime, late_time;
4457
3e170ce0
A
4458 struct sched_update_scan_context scan_context = {
4459 .earliest_bg_make_runnable_time = UINT64_MAX,
4460 .earliest_normal_make_runnable_time = UINT64_MAX,
4461 .earliest_rt_make_runnable_time = UINT64_MAX
4462 };
4463
fe8ab488 4464 sched_tick_ctime = mach_absolute_time();
1c79356b 4465
39236c6e
A
4466 if (__improbable(sched_tick_last_abstime == 0)) {
4467 sched_tick_last_abstime = sched_tick_ctime;
fe8ab488 4468 late_time = 0;
39236c6e
A
4469 sched_tick_delta = 1;
4470 } else {
fe8ab488
A
4471 late_time = sched_tick_ctime - sched_tick_last_abstime;
4472 sched_tick_delta = late_time / sched_tick_interval;
39236c6e
A
4473 /* Ensure a minimum delta of 1, since the interval could be slightly
4474 * smaller than the sched_tick_interval due to dispatch
4475 * latencies.
4476 */
4477 sched_tick_delta = MAX(sched_tick_delta, 1);
4478
4479 /* In the event that interrupt latencies or platform
4480 * idle events that advanced the timebase resulted
4481 * in periods where no threads were dispatched,
4482 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
4483 * iterations.
4484 */
4485 sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);
4486
4487 sched_tick_last_abstime = sched_tick_ctime;
4488 sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
4489 }
4490
fe8ab488
A
4491 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)|DBG_FUNC_START,
4492 sched_tick_delta,
4493 late_time,
4494 0,
4495 0,
4496 0);
4497
39236c6e
A
4498 /* Add a number of pseudo-ticks corresponding to the elapsed interval.
4499 * This could be greater than 1 if substantial intervals during which
4500 * all processors were idle have occurred, which is rare in practice.
4501 */
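	/* Worked example (illustrative): if late_time was about 3.4 times
	 * sched_tick_interval, sched_tick_delta was computed above as 3
	 * (bounded below by 1 and above by SCHED_TICK_MAX_DELTA), so
	 * sched_tick advances by 3 pseudo-ticks here.
	 */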
4502
4503 sched_tick += sched_tick_delta;
1c79356b
A
4504
4505 /*
91447636 4506 * Compute various averages.
1c79356b 4507 */
39236c6e 4508 compute_averages(sched_tick_delta);
1c79356b
A
4509
4510 /*
91447636
A
4511 * Scan the run queues for threads which
4512 * may need to be updated.
1c79356b 4513 */
3e170ce0
A
4514 SCHED(thread_update_scan)(&scan_context);
4515
4516 rt_runq_scan(&scan_context);
4517
4518 uint64_t ctime = mach_absolute_time();
4519
4520 machine_max_runnable_latency(ctime > scan_context.earliest_bg_make_runnable_time ? ctime - scan_context.earliest_bg_make_runnable_time : 0,
4521 ctime > scan_context.earliest_normal_make_runnable_time ? ctime - scan_context.earliest_normal_make_runnable_time : 0,
4522 ctime > scan_context.earliest_rt_make_runnable_time ? ctime - scan_context.earliest_rt_make_runnable_time : 0);
4523
4524 /*
4525 * Check to see if the special sched VM group needs attention.
4526 */
4527 sched_vm_group_maintenance();
fe8ab488 4528
490019cf 4529
fe8ab488
A
4530 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)|DBG_FUNC_END,
4531 sched_pri_shift,
4532 sched_background_pri_shift,
4533 0,
4534 0,
4535 0);
1c79356b 4536
3e170ce0
A
4537 assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
4538 thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
1c79356b
A
4539 /*NOTREACHED*/
4540}
4541
39236c6e
A
4542static uint64_t sched_maintenance_wakeups;
4543
4544/*
4545 * Determine if the set of routines formerly driven by a maintenance timer
4546 * must be invoked, based on a deadline comparison. Signals the scheduler
4547 * maintenance thread on deadline expiration. Must be invoked at an interval
4548 * lower than the "sched_tick_interval", currently accomplished by
4549 * invocation via the quantum expiration timer and at context switch time.
4550 * Performance matters: this routine reuses a timestamp approximating the
4551 * current absolute time received from the caller, and should perform
4552 * no more than a comparison against the deadline in the common case.
4553 */
4554void
3e170ce0 4555sched_timeshare_consider_maintenance(uint64_t ctime) {
39236c6e
A
4556 uint64_t ndeadline, deadline = sched_maintenance_deadline;
4557
4558 if (__improbable(ctime >= deadline)) {
4559 if (__improbable(current_thread() == sched_maintenance_thread))
4560 return;
4561 OSMemoryBarrier();
4562
4563 ndeadline = ctime + sched_tick_interval;
4564
4565 if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) {
3e170ce0 4566 thread_wakeup((event_t)sched_timeshare_maintenance_continue);
39236c6e
A
4567 sched_maintenance_wakeups++;
4568 }
4569 }
fe8ab488
A
4570
4571#if defined(CONFIG_TELEMETRY)
4572 /*
4573 * Windowed telemetry is driven by the scheduler. It should be safe
4574 * to call compute_telemetry_windowed() even when windowed telemetry
4575 * is disabled, but we should try to avoid doing extra work for no
4576 * reason.
4577 */
4578 if (telemetry_window_enabled) {
4579 deadline = sched_telemetry_deadline;
4580
4581 if (__improbable(ctime >= deadline)) {
4582 ndeadline = ctime + sched_telemetry_interval;
4583
4584 if (__probable(__sync_bool_compare_and_swap(&sched_telemetry_deadline, deadline, ndeadline))) {
4585 compute_telemetry_windowed();
4586 }
4587 }
4588 }
4589#endif /* CONFIG_TELEMETRY */
39236c6e
A
4590}
4591
fe8ab488 4592#endif /* CONFIG_SCHED_TIMESHARE_CORE */
6d2010ae 4593
1c79356b 4594void
6d2010ae 4595sched_init_thread(void (*continuation)(void))
1c79356b 4596{
316670eb 4597 thread_block(THREAD_CONTINUE_NULL);
91447636 4598
490019cf
A
4599 thread_t thread = current_thread();
4600
4601 sched_maintenance_thread = thread;
4602
6d2010ae 4603 continuation();
1c79356b 4604
1c79356b
A
4605 /*NOTREACHED*/
4606}
4607
fe8ab488 4608#if defined(CONFIG_SCHED_TIMESHARE_CORE)
6d2010ae 4609
1c79356b 4610/*
91447636 4611 * thread_update_scan / runq_scan:
55e303ae 4612 *
91447636
A
4613 * Scan the run queues to account for timesharing threads
4614 * which need to be updated.
1c79356b
A
4615 *
4616 * Scanner runs in two passes. Pass one squirrels likely
91447636 4617 * threads away in an array, pass two does the update.
1c79356b 4618 *
91447636
A
4619 * This is necessary because the run queue is locked for
4620 * the candidate scan, but the thread is locked for the update.
1c79356b 4621 *
91447636
A
4622 * Array should be sized to make forward progress, without
4623 * disabling preemption for long periods.
1c79356b 4624 */
55e303ae 4625
91447636 4626#define THREAD_UPDATE_SIZE 128
55e303ae 4627
91447636
A
4628static thread_t thread_update_array[THREAD_UPDATE_SIZE];
4629static int thread_update_count = 0;
1c79356b 4630
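/*
 * Hedged sketch of the two-pass protocol described above (an illustration,
 * not code from this file; 'runq' and 'scan_context' are assumed to be
 * provided by the caller, which also takes and drops the run queue lock
 * around pass one):
 */
#if 0
	boolean_t restart_needed;

	do {
		/* pass one: collect candidates into thread_update_array */
		restart_needed = runq_scan(runq, scan_context);

		/* pass two: update each candidate under only the thread lock */
		thread_update_process_threads();
	} while (restart_needed);
#endif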
fe8ab488
A
4631/* Returns TRUE if thread was added, FALSE if thread_update_array is full */
4632boolean_t
4633thread_update_add_thread(thread_t thread)
4634{
4635 if (thread_update_count == THREAD_UPDATE_SIZE)
4636 return (FALSE);
4637
4638 thread_update_array[thread_update_count++] = thread;
4639 thread_reference_internal(thread);
4640 return (TRUE);
4641}
4642
4643void
4644thread_update_process_threads(void)
4645{
4646 while (thread_update_count > 0) {
4647 spl_t s;
4648 thread_t thread = thread_update_array[--thread_update_count];
4649 thread_update_array[thread_update_count] = THREAD_NULL;
4650
4651 s = splsched();
4652 thread_lock(thread);
4653 if (!(thread->state & (TH_WAIT)) && (SCHED(can_update_priority)(thread))) {
4654 SCHED(update_priority)(thread);
4655 }
4656 thread_unlock(thread);
4657 splx(s);
4658
4659 thread_deallocate(thread);
4660 }
4661}
4662
1c79356b 4663/*
91447636
A
4664 * Scan a runq for candidate threads.
4665 *
4666 * Returns TRUE if retry is needed.
1c79356b 4667 */
fe8ab488 4668boolean_t
91447636 4669runq_scan(
3e170ce0
A
4670 run_queue_t runq,
4671 sched_update_scan_context_t scan_context)
1c79356b 4672{
91447636 4673 register int count;
1c79356b
A
4674 register queue_t q;
4675 register thread_t thread;
1c79356b 4676
1c79356b
A
4677 if ((count = runq->count) > 0) {
4678 q = runq->queues + runq->highq;
4679 while (count > 0) {
4680 queue_iterate(q, thread, thread_t, links) {
55e303ae 4681 if ( thread->sched_stamp != sched_tick &&
6d2010ae 4682 (thread->sched_mode == TH_MODE_TIMESHARE) ) {
fe8ab488 4683 if (thread_update_add_thread(thread) == FALSE)
55e303ae 4684 return (TRUE);
1c79356b
A
4685 }
4686
3e170ce0
A
4687 if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
4688 if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
4689 scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
4690 }
4691 } else {
4692 if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
4693 scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
4694 }
4695 }
4696
1c79356b
A
4697 count--;
4698 }
4699
4700 q--;
4701 }
4702 }
1c79356b 4703
91447636 4704 return (FALSE);
1c79356b
A
4705}
4706
fe8ab488
A
4707#endif /* CONFIG_SCHED_TIMESHARE_CORE */
4708
6d2010ae
A
4709boolean_t
4710thread_eager_preemption(thread_t thread)
4711{
4712 return ((thread->sched_flags & TH_SFLAG_EAGERPREEMPT) != 0);
4713}
4714
4715void
4716thread_set_eager_preempt(thread_t thread)
4717{
4718 spl_t x;
4719 processor_t p;
4720 ast_t ast = AST_NONE;
4721
4722 x = splsched();
4723 p = current_processor();
4724
4725 thread_lock(thread);
4726 thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;
4727
4728 if (thread == current_thread()) {
6d2010ae 4729
fe8ab488
A
4730 ast = csw_check(p, AST_NONE);
4731 thread_unlock(thread);
6d2010ae
A
4732 if (ast != AST_NONE) {
4733 (void) thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
4734 }
4735 } else {
4736 p = thread->last_processor;
4737
4738 if (p != PROCESSOR_NULL && p->state == PROCESSOR_RUNNING &&
4739 p->active_thread == thread) {
4740 cause_ast_check(p);
4741 }
1c79356b 4742
6d2010ae
A
4743 thread_unlock(thread);
4744 }
4745
4746 splx(x);
4747}
4748
4749void
4750thread_clear_eager_preempt(thread_t thread)
4751{
4752 spl_t x;
4753
4754 x = splsched();
4755 thread_lock(thread);
4756
4757 thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
4758
4759 thread_unlock(thread);
4760 splx(x);
4761}
3e170ce0 4762
6d2010ae
A
4763/*
4764 * Scheduling statistics
4765 */
4766void
4767sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
4768{
4769 struct processor_sched_statistics *stats;
4770 boolean_t to_realtime = FALSE;
4771
4772 stats = &processor->processor_data.sched_stats;
4773 stats->csw_count++;
4774
4775 if (otherpri >= BASEPRI_REALTIME) {
4776 stats->rt_sched_count++;
4777 to_realtime = TRUE;
4778 }
4779
4780 if ((reasons & AST_PREEMPT) != 0) {
4781 stats->preempt_count++;
4782
4783 if (selfpri >= BASEPRI_REALTIME) {
4784 stats->preempted_rt_count++;
4785 }
4786
4787 if (to_realtime) {
4788 stats->preempted_by_rt_count++;
4789 }
4790
4791 }
4792}
4793
4794void
4795sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
4796{
4797 uint64_t timestamp = mach_absolute_time();
4798
4799 stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
4800 stats->last_change_timestamp = timestamp;
4801}
4802
1c79356b 4803/*
6d2010ae 4804 * For calls from assembly code
1c79356b 4805 */
6d2010ae 4806#undef thread_wakeup
1c79356b
A
4807void
4808thread_wakeup(
6d2010ae 4809 event_t x);
1c79356b
A
4810
4811void
4812thread_wakeup(
6d2010ae 4813 event_t x)
1c79356b 4814{
6d2010ae 4815 thread_wakeup_with_result(x, THREAD_AWAKENED);
1c79356b
A
4816}
4817
91447636
A
4818boolean_t
4819preemption_enabled(void)
4820{
4821 return (get_preemption_level() == 0 && ml_get_interrupts_enabled());
4822}
9bccf70c 4823
4b17d6b6
A
4824static void
4825sched_timer_deadline_tracking_init(void) {
4826 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
4827 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
4828}
3e170ce0
A
4829
4830
4831kern_return_t
4832sched_work_interval_notify(thread_t thread, uint64_t work_interval_id, uint64_t start, uint64_t finish, uint64_t deadline, uint64_t next_start, uint32_t flags)
4833{
4834 int urgency;
4835 uint64_t urgency_param1, urgency_param2;
4836 spl_t s;
4837
4838 if (work_interval_id == 0) {
4839 return (KERN_INVALID_ARGUMENT);
4840 }
4841
4842 assert(thread == current_thread());
4843
4844 thread_mtx_lock(thread);
4845 if (thread->work_interval_id != work_interval_id) {
4846 thread_mtx_unlock(thread);
4847 return (KERN_INVALID_ARGUMENT);
4848 }
4849 thread_mtx_unlock(thread);
4850
4851 s = splsched();
4852 thread_lock(thread);
4853 urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
4854 thread_unlock(thread);
4855 splx(s);
4856
4857 machine_work_interval_notify(thread, work_interval_id, start, finish, deadline, next_start, urgency, flags);
4858 return (KERN_SUCCESS);
4859}
4860
4861void thread_set_options(uint32_t thopt) {
4862 spl_t x;
4863 thread_t t = current_thread();
4864
4865 x = splsched();
4866 thread_lock(t);
4867
4868 t->options |= thopt;
4869
4870 thread_unlock(t);
4871 splx(x);
4872}