1/*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_FREE_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 * File: sched_prim.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1986
62 *
63 * Scheduling primitives
64 *
65 */
66
67#include <debug.h>
68
69#include <mach/mach_types.h>
70#include <mach/machine.h>
71#include <mach/policy.h>
72#include <mach/sync_policy.h>
73#include <mach/thread_act.h>
74
75#include <machine/machine_routines.h>
76#include <machine/sched_param.h>
77#include <machine/machine_cpu.h>
78#include <machine/machlimits.h>
79
80#ifdef CONFIG_MACH_APPROXIMATE_TIME
81#include <machine/commpage.h>
82#endif
83
84#include <kern/kern_types.h>
85#include <kern/clock.h>
86#include <kern/counters.h>
87#include <kern/cpu_number.h>
88#include <kern/cpu_data.h>
89#include <kern/smp.h>
90#include <kern/debug.h>
91#include <kern/macro_help.h>
92#include <kern/machine.h>
93#include <kern/misc_protos.h>
94#include <kern/processor.h>
95#include <kern/queue.h>
96#include <kern/sched.h>
97#include <kern/sched_prim.h>
98#include <kern/sfi.h>
99#include <kern/syscall_subr.h>
100#include <kern/task.h>
101#include <kern/thread.h>
102#include <kern/ledger.h>
103#include <kern/timer_queue.h>
104#include <kern/waitq.h>
105
106#include <vm/pmap.h>
107#include <vm/vm_kern.h>
108#include <vm/vm_map.h>
109
110#include <mach/sdt.h>
111
112#include <sys/kdebug.h>
113
114#include <kern/pms.h>
115
116#if defined(CONFIG_TELEMETRY) && defined(CONFIG_SCHED_TIMESHARE_CORE)
117#include <kern/telemetry.h>
118#endif
119
120struct rt_queue rt_runq;
121
122uintptr_t sched_thread_on_rt_queue = (uintptr_t)0xDEAFBEE0;
123
124/* Lock RT runq, must be done with interrupts disabled (under splsched()) */
125#if __SMP__
126decl_simple_lock_data(static,rt_lock);
127#define rt_lock_init() simple_lock_init(&rt_lock, 0)
128#define rt_lock_lock() simple_lock(&rt_lock)
129#define rt_lock_unlock() simple_unlock(&rt_lock)
130#else
131#define rt_lock_init() do { } while(0)
132#define rt_lock_lock() do { } while(0)
133#define rt_lock_unlock() do { } while(0)
134#endif
135
136#define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */
137int default_preemption_rate = DEFAULT_PREEMPTION_RATE;
138
139#define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */
140int default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
141
142#define MAX_UNSAFE_QUANTA 800
143int max_unsafe_quanta = MAX_UNSAFE_QUANTA;
144
145#define MAX_POLL_QUANTA 2
146int max_poll_quanta = MAX_POLL_QUANTA;
147
148#define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */
149int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
150
151uint64_t max_poll_computation;
152
153uint64_t max_unsafe_computation;
154uint64_t sched_safe_duration;
155
156#if defined(CONFIG_SCHED_TIMESHARE_CORE)
157
158uint32_t std_quantum;
159uint32_t min_std_quantum;
160uint32_t bg_quantum;
161
162uint32_t std_quantum_us;
163uint32_t bg_quantum_us;
164
165#endif /* CONFIG_SCHED_TIMESHARE_CORE */
166
167uint32_t thread_depress_time;
168uint32_t default_timeshare_computation;
169uint32_t default_timeshare_constraint;
170
171uint32_t max_rt_quantum;
172uint32_t min_rt_quantum;
173
174#if defined(CONFIG_SCHED_TIMESHARE_CORE)
175
176unsigned sched_tick;
177uint32_t sched_tick_interval;
178#if defined(CONFIG_TELEMETRY)
179uint32_t sched_telemetry_interval;
180#endif /* CONFIG_TELEMETRY */
181
182uint32_t sched_pri_shift = INT8_MAX;
183uint32_t sched_background_pri_shift = INT8_MAX;
184uint32_t sched_combined_fgbg_pri_shift = INT8_MAX;
185uint32_t sched_fixed_shift;
186uint32_t sched_use_combined_fgbg_decay = 0;
187
188uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */
189
190/* Allow foreground to decay past default to resolve inversions */
191#define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
192int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
193
194/* Defaults for timer deadline profiling */
195#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines <=
196 * 2ms */
197#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
198 <= 5ms */
199
200uint64_t timer_deadline_tracking_bin_1;
201uint64_t timer_deadline_tracking_bin_2;
202
203#endif /* CONFIG_SCHED_TIMESHARE_CORE */
204
205thread_t sched_maintenance_thread;
206
207
208uint64_t sched_one_second_interval;
209
210uint32_t sched_run_count, sched_share_count, sched_background_count;
211uint32_t sched_load_average, sched_mach_factor;
212
213/* Forwards */
214
215#if defined(CONFIG_SCHED_TIMESHARE_CORE)
216
217static void load_shift_init(void);
218static void preempt_pri_init(void);
219
220#endif /* CONFIG_SCHED_TIMESHARE_CORE */
221
222static thread_t thread_select(
223 thread_t thread,
224 processor_t processor,
225 ast_t reason);
226
227#if CONFIG_SCHED_IDLE_IN_PLACE
228static thread_t thread_select_idle(
229 thread_t thread,
230 processor_t processor);
231#endif
232
233thread_t processor_idle(
234 thread_t thread,
235 processor_t processor);
236
237ast_t
238csw_check_locked( processor_t processor,
239 processor_set_t pset,
240 ast_t check_reason);
241
242static void processor_setrun(
243 processor_t processor,
244 thread_t thread,
245 integer_t options);
246
247static void
248sched_realtime_init(void);
249
250static void
251sched_realtime_timebase_init(void);
252
253static void
254sched_timer_deadline_tracking_init(void);
255
256#if DEBUG
257extern int debug_task;
258#define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
259#else
260#define TLOG(a, fmt, args...) do {} while (0)
261#endif
262
263static processor_t
264thread_bind_internal(
265 thread_t thread,
266 processor_t processor);
267
268static void
269sched_vm_group_maintenance(void);
270
271#if defined(CONFIG_SCHED_TIMESHARE_CORE)
272int8_t sched_load_shifts[NRQS];
273int sched_preempt_pri[NRQBM];
274#endif /* CONFIG_SCHED_TIMESHARE_CORE */
275
276const struct sched_dispatch_table *sched_current_dispatch = NULL;
277
278/*
279 * Statically allocate a buffer to hold the longest possible
280 * scheduler description string, as currently implemented.
281 * bsd/kern/kern_sysctl.c has a corresponding definition in bsd/
282 * to export to userspace via sysctl(3). If either version
283 * changes, update the other.
284 *
285 * Note that in addition to being an upper bound on the strings
286 * in the kernel, it's also an exact parameter to PE_get_default(),
287 * which interrogates the device tree on some platforms. That
288 * API requires the caller know the exact size of the device tree
289 * property, so we need both a legacy size (32) and the current size
290 * (48) to deal with old and new device trees. The device tree property
291 * is similarly padded to a fixed size so that the same kernel image
292 * can run on multiple devices with different schedulers configured
293 * in the device tree.
294 */
295char sched_string[SCHED_STRING_MAX_LENGTH];
296
297uint32_t sched_debug_flags;
298
299/* Global flag which indicates whether Background Stepper Context is enabled */
300static int cpu_throttle_enabled = 1;
301
302void
303sched_init(void)
304{
305 char sched_arg[SCHED_STRING_MAX_LENGTH] = { '\0' };
306
307 /* Check for runtime selection of the scheduler algorithm */
308 if (!PE_parse_boot_argn("sched", sched_arg, sizeof (sched_arg))) {
309 /* If no boot-args override, look in device tree */
310 if (!PE_get_default("kern.sched", sched_arg,
311 SCHED_STRING_MAX_LENGTH)) {
312 sched_arg[0] = '\0';
313 }
314 }
315
316
317 if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
318 /* No boot-args, check in device tree */
319 if (!PE_get_default("kern.sched_pri_decay_limit",
320 &sched_pri_decay_band_limit,
321 sizeof(sched_pri_decay_band_limit))) {
322 /* Allow decay all the way to normal limits */
323 sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
324 }
325 }
326
327 kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);
328
329 if (strlen(sched_arg) > 0) {
330 if (0) {
331 /* Allow pattern below */
332#if defined(CONFIG_SCHED_TRADITIONAL)
333 } else if (0 == strcmp(sched_arg, sched_traditional_dispatch.sched_name)) {
334 sched_current_dispatch = &sched_traditional_dispatch;
335 } else if (0 == strcmp(sched_arg, sched_traditional_with_pset_runqueue_dispatch.sched_name)) {
336 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
337#endif
338#if defined(CONFIG_SCHED_PROTO)
339 } else if (0 == strcmp(sched_arg, sched_proto_dispatch.sched_name)) {
340 sched_current_dispatch = &sched_proto_dispatch;
341#endif
342#if defined(CONFIG_SCHED_GRRR)
343 } else if (0 == strcmp(sched_arg, sched_grrr_dispatch.sched_name)) {
344 sched_current_dispatch = &sched_grrr_dispatch;
345#endif
346#if defined(CONFIG_SCHED_MULTIQ)
347 } else if (0 == strcmp(sched_arg, sched_multiq_dispatch.sched_name)) {
348 sched_current_dispatch = &sched_multiq_dispatch;
349 } else if (0 == strcmp(sched_arg, sched_dualq_dispatch.sched_name)) {
350 sched_current_dispatch = &sched_dualq_dispatch;
351#endif
352 } else {
353#if defined(CONFIG_SCHED_TRADITIONAL)
354 printf("Unrecognized scheduler algorithm: %s\n", sched_arg);
355 printf("Scheduler: Using instead: %s\n", sched_traditional_with_pset_runqueue_dispatch.sched_name);
356 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
357#else
358 panic("Unrecognized scheduler algorithm: %s", sched_arg);
359#endif
360 }
361 kprintf("Scheduler: Runtime selection of %s\n", SCHED(sched_name));
362 } else {
363#if defined(CONFIG_SCHED_MULTIQ)
364 sched_current_dispatch = &sched_multiq_dispatch;
365#elif defined(CONFIG_SCHED_TRADITIONAL)
366 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
367#elif defined(CONFIG_SCHED_PROTO)
368 sched_current_dispatch = &sched_proto_dispatch;
369#elif defined(CONFIG_SCHED_GRRR)
370 sched_current_dispatch = &sched_grrr_dispatch;
371#else
372#error No default scheduler implementation
373#endif
374 kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
375 }
376
377 strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));
378
379 if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
380 kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
381 }
382
383 SCHED(init)();
384 sched_realtime_init();
385 ast_init();
386 sched_timer_deadline_tracking_init();
387
388 SCHED(pset_init)(&pset0);
389 SCHED(processor_init)(master_processor);
390}
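/*
 * Editor's note (not part of the original source): the runtime selection
 * above is driven by the "sched" boot-arg, falling back to the "kern.sched"
 * device-tree property.  Assuming a dispatch table whose sched_name is
 * "dualq" has been compiled in (CONFIG_SCHED_MULTIQ), it could be selected
 * with a boot-arg such as:
 *
 *	nvram boot-args="sched=dualq"
 *
 * The name that was actually chosen is copied into sched_string and exported
 * to userspace through the sysctl mentioned in the comment above.
 */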
391
392void
393sched_timebase_init(void)
394{
395 uint64_t abstime;
396
397 clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
398 sched_one_second_interval = abstime;
399
400 SCHED(timebase_init)();
401 sched_realtime_timebase_init();
402}
403
404#if defined(CONFIG_SCHED_TIMESHARE_CORE)
405
406void
407sched_timeshare_init(void)
408{
409 /*
410 * Calculate the timeslicing quantum
411 * in us.
412 */
413 if (default_preemption_rate < 1)
414 default_preemption_rate = DEFAULT_PREEMPTION_RATE;
415 std_quantum_us = (1000 * 1000) / default_preemption_rate;
416
417 printf("standard timeslicing quantum is %d us\n", std_quantum_us);
418
419 if (default_bg_preemption_rate < 1)
420 default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
421 bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
422
423 printf("standard background quantum is %d us\n", bg_quantum_us);
424
425 load_shift_init();
426 preempt_pri_init();
427 sched_tick = 0;
428}
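/*
 * Worked example (editor's note): with the defaults above,
 * std_quantum_us = (1000 * 1000) / 100 = 10000 us, i.e. a 10 ms timeshare
 * quantum, and bg_quantum_us = (1000 * 1000) / 400 = 2500 us, i.e. a 2.5 ms
 * quantum for background threads.
 */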
429
430void
431sched_timeshare_timebase_init(void)
432{
433 uint64_t abstime;
434 uint32_t shift;
435
436 /* standard timeslicing quantum */
437 clock_interval_to_absolutetime_interval(
438 std_quantum_us, NSEC_PER_USEC, &abstime);
439 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
440 std_quantum = (uint32_t)abstime;
441
442 /* smallest remaining quantum (250 us) */
443 clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
444 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
445 min_std_quantum = (uint32_t)abstime;
446
447 /* quantum for background tasks */
448 clock_interval_to_absolutetime_interval(
449 bg_quantum_us, NSEC_PER_USEC, &abstime);
450 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
451 bg_quantum = (uint32_t)abstime;
452
453 /* scheduler tick interval */
454 clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
455 NSEC_PER_USEC, &abstime);
456 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
457 sched_tick_interval = (uint32_t)abstime;
458
459 /*
460 * Compute conversion factor from usage to
461 * timesharing priorities with 5/8 ** n aging.
462 */
463 abstime = (abstime * 5) / 3;
464 for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift)
465 abstime >>= 1;
466 sched_fixed_shift = shift;
467
468 max_unsafe_computation = ((uint64_t)max_unsafe_quanta) * std_quantum;
469 sched_safe_duration = 2 * ((uint64_t)max_unsafe_quanta) * std_quantum;
470
471 max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
472 thread_depress_time = 1 * std_quantum;
473 default_timeshare_computation = std_quantum / 2;
474 default_timeshare_constraint = std_quantum;
475
476#if defined(CONFIG_TELEMETRY)
477 /* interval for high frequency telemetry */
478 clock_interval_to_absolutetime_interval(10, NSEC_PER_MSEC, &abstime);
479 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
480 sched_telemetry_interval = (uint32_t)abstime;
481#endif
482
483}
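/*
 * Editor's note: sched_fixed_shift is the number of right shifts needed to
 * scale one scheduler tick's worth of CPU time (multiplied by 5/3) down into
 * the priority range (<= BASEPRI_DEFAULT).  Roughly speaking, the timeshare
 * priority code outside this file then charges a penalty of approximately
 *
 *	sched_usage >> (sched_fixed_shift - sched_load_shifts[load])
 *
 * so the same CPU usage depresses priority more as system load rises.
 */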
484
485#endif /* CONFIG_SCHED_TIMESHARE_CORE */
486
487static void
488sched_realtime_init(void)
489{
490 rt_lock_init();
491
492 rt_runq.count = 0;
493 queue_init(&rt_runq.queue);
494}
495
496static void
497sched_realtime_timebase_init(void)
498{
499 uint64_t abstime;
500
501 /* smallest rt computation (50 us) */
502 clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
503 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
504 min_rt_quantum = (uint32_t)abstime;
505
506 /* maximum rt computation (50 ms) */
507 clock_interval_to_absolutetime_interval(
508 50, 1000*NSEC_PER_USEC, &abstime);
509 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
510 max_rt_quantum = (uint32_t)abstime;
511
512}
513
514#if defined(CONFIG_SCHED_TIMESHARE_CORE)
515
516/*
517 * Set up values for timeshare
518 * loading factors.
519 */
520static void
521load_shift_init(void)
522{
523 int8_t k, *p = sched_load_shifts;
524 uint32_t i, j;
525
526 uint32_t sched_decay_penalty = 1;
527
528 if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof (sched_decay_penalty))) {
529 kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
530 }
531
532 if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof (sched_decay_usage_age_factor))) {
533 kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
534 }
535
536 if (PE_parse_boot_argn("sched_use_combined_fgbg_decay", &sched_use_combined_fgbg_decay, sizeof (sched_use_combined_fgbg_decay))) {
537 kprintf("Overriding schedule fg/bg decay calculation: %u\n", sched_use_combined_fgbg_decay);
538 }
539
540 if (sched_decay_penalty == 0) {
541 /*
542 * There is no penalty for timeshare threads for using too much
543 * CPU, so set all load shifts to INT8_MIN. Even under high load,
544 * sched_pri_shift will be >INT8_MAX, and there will be no
545 * penalty applied to threads (nor will sched_usage be updated per
546 * thread).
547 */
548 for (i = 0; i < NRQS; i++) {
549 sched_load_shifts[i] = INT8_MIN;
550 }
551
552 return;
553 }
554
555 *p++ = INT8_MIN; *p++ = 0;
556
557 /*
558 * For a given system load "i", the per-thread priority
559 * penalty per quantum of CPU usage is ~2^k priority
560 * levels. "sched_decay_penalty" can cause more
561 * array entries to be filled with smaller "k" values
562 */
563 for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
564 for (j <<= 1; (i < j) && (i < NRQS); ++i)
565 *p++ = k;
566 }
567}
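/*
 * Worked example (editor's note): with the default sched_decay_penalty of 1,
 * the loops above fill the table as
 *
 *	sched_load_shifts[0] = INT8_MIN, [1] = 0, [2..3] = 1, [4..7] = 2,
 *	[8..15] = 3, [16..31] = 4, ...
 *
 * i.e. roughly log2(load).  Each doubling of the runnable load therefore
 * raises the load shift by one, which approximately doubles the priority
 * penalty charged for the same amount of CPU usage.
 */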
568
569static void
570preempt_pri_init(void)
571{
572 int i, *p = sched_preempt_pri;
573
574 for (i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i)
575 setbit(i, p);
576
577 for (i = BASEPRI_PREEMPT; i <= MAXPRI; ++i)
578 setbit(i, p);
579}
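/*
 * Editor's note: the bitmap built here covers the foreground band
 * [BASEPRI_FOREGROUND, MINPRI_KERNEL) and the kernel-preemption band
 * [BASEPRI_PREEMPT, MAXPRI].  It is consulted by the scheduler's urgency
 * checks to decide whether a newly runnable thread at one of these
 * priorities should trigger an immediate (urgent) preemption.
 */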
580
581#endif /* CONFIG_SCHED_TIMESHARE_CORE */
582
583/*
584 * Thread wait timer expiration.
585 */
586void
587thread_timer_expire(
588 void *p0,
589 __unused void *p1)
590{
591 thread_t thread = p0;
592 spl_t s;
593
594 s = splsched();
595 thread_lock(thread);
596 if (--thread->wait_timer_active == 0) {
597 if (thread->wait_timer_is_set) {
598 thread->wait_timer_is_set = FALSE;
599 clear_wait_internal(thread, THREAD_TIMED_OUT);
600 }
601 }
602 thread_unlock(thread);
603 splx(s);
604}
605
606/*
607 * thread_unblock:
608 *
609 * Unblock thread on wake up.
610 *
611 * Returns TRUE if the thread should now be placed on the runqueue.
612 *
613 * Thread must be locked.
614 *
615 * Called at splsched().
616 */
617boolean_t
618thread_unblock(
619 thread_t thread,
620 wait_result_t wresult)
621{
622 boolean_t ready_for_runq = FALSE;
623 thread_t cthread = current_thread();
624 uint32_t new_run_count;
625
626 /*
627 * Set wait_result.
628 */
629 thread->wait_result = wresult;
630
631 /*
632 * Cancel pending wait timer.
633 */
634 if (thread->wait_timer_is_set) {
635 if (timer_call_cancel(&thread->wait_timer))
636 thread->wait_timer_active--;
637 thread->wait_timer_is_set = FALSE;
638 }
639
640 /*
641 * Update scheduling state: not waiting,
642 * set running.
643 */
644 thread->state &= ~(TH_WAIT|TH_UNINT);
645
646 if (!(thread->state & TH_RUN)) {
647 thread->state |= TH_RUN;
648 thread->last_made_runnable_time = mach_approximate_time();
649
650 ready_for_runq = TRUE;
651
652 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
653
654 /*
655 * Update run counts.
656 */
657 new_run_count = sched_run_incr(thread);
658 if (thread->sched_mode == TH_MODE_TIMESHARE) {
659 sched_share_incr(thread);
660
661 if (thread->sched_flags & TH_SFLAG_THROTTLED)
662 sched_background_incr(thread);
663 }
664 } else {
665 /*
666 * Signal if idling on another processor.
667 */
668#if CONFIG_SCHED_IDLE_IN_PLACE
669 if (thread->state & TH_IDLE) {
670 processor_t processor = thread->last_processor;
671
672 if (processor != current_processor())
673 machine_signal_idle(processor);
674 }
675#else
676 assert((thread->state & TH_IDLE) == 0);
677#endif
678
679 new_run_count = sched_run_count; /* updated in thread_select_idle() */
680 }
681
682
683 /*
684 * Calculate deadline for real-time threads.
685 */
686 if (thread->sched_mode == TH_MODE_REALTIME) {
687 uint64_t ctime;
688
689 ctime = mach_absolute_time();
690 thread->realtime.deadline = thread->realtime.constraint + ctime;
691 }
692
693 /*
694 * Clear old quantum, fail-safe computation, etc.
695 */
696 thread->quantum_remaining = 0;
697 thread->computation_metered = 0;
698 thread->reason = AST_NONE;
699
700 /* Obtain power-relevant interrupt and "platform-idle exit" statistics.
701 * We also account for "double hop" thread signaling via
702 * the thread callout infrastructure.
703 * DRK: consider removing the callout wakeup counters in the future;
704 * they're present for verification at the moment.
705 */
706 boolean_t aticontext, pidle;
707 ml_get_power_state(&aticontext, &pidle);
708
709 if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
710 ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
711 DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
712
713 uint64_t ttd = PROCESSOR_DATA(current_processor(), timer_call_ttd);
714
715 if (ttd) {
716 if (ttd <= timer_deadline_tracking_bin_1)
717 thread->thread_timer_wakeups_bin_1++;
718 else
719 if (ttd <= timer_deadline_tracking_bin_2)
720 thread->thread_timer_wakeups_bin_2++;
721 }
722
723 if (pidle) {
724 ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
725 }
726
727 } else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
728 if (cthread->callout_woken_from_icontext) {
729 ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
730 thread->thread_callout_interrupt_wakeups++;
731 if (cthread->callout_woken_from_platform_idle) {
732 ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
733 thread->thread_callout_platform_idle_wakeups++;
734 }
735
736 cthread->callout_woke_thread = TRUE;
737 }
738 }
739
740 if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
741 thread->callout_woken_from_icontext = aticontext;
742 thread->callout_woken_from_platform_idle = pidle;
743 thread->callout_woke_thread = FALSE;
744 }
745
746 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
747 MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
748 (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result, new_run_count, 0);
749
750 DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
751
752 return (ready_for_runq);
753}
754
755/*
756 * Routine: thread_go
757 * Purpose:
758 * Unblock and dispatch thread.
759 * Conditions:
760 * thread lock held, IPC locks may be held.
761 * thread must have been pulled from wait queue under same lock hold.
762 * thread must have been waiting
763 * Returns:
764 * KERN_SUCCESS - Thread was set running
765 *
766 * TODO: This should return void
767 */
768kern_return_t
769thread_go(
770 thread_t thread,
771 wait_result_t wresult)
772{
773 assert(thread->at_safe_point == FALSE);
774 assert(thread->wait_event == NO_EVENT64);
775 assert(thread->waitq == NULL);
776
777 assert(!(thread->state & (TH_TERMINATE|TH_TERMINATE2)));
778 assert(thread->state & TH_WAIT);
779
780
781 if (thread_unblock(thread, wresult))
782 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
783
784 return (KERN_SUCCESS);
785}
786
787/*
788 * Routine: thread_mark_wait_locked
789 * Purpose:
790 * Mark a thread as waiting. If, given the circumstances,
791 * it doesn't want to wait (i.e. already aborted), then
792 * indicate that in the return value.
793 * Conditions:
794 * at splsched() and thread is locked.
795 */
796__private_extern__
797wait_result_t
798thread_mark_wait_locked(
799 thread_t thread,
800 wait_interrupt_t interruptible)
801{
802 boolean_t at_safe_point;
803
804 assert(thread == current_thread());
805 assert(!(thread->state & (TH_WAIT|TH_IDLE|TH_UNINT|TH_TERMINATE2)));
806
807 /*
808 * The thread may have certain types of interrupts/aborts masked
809 * off. Even if the wait location says these types of interrupts
810 * are OK, we have to honor mask settings (outer-scoped code may
811 * not be able to handle aborts at the moment).
812 */
813 if (interruptible > (thread->options & TH_OPT_INTMASK))
814 interruptible = thread->options & TH_OPT_INTMASK;
815
816 at_safe_point = (interruptible == THREAD_ABORTSAFE);
817
818 if ( interruptible == THREAD_UNINT ||
819 !(thread->sched_flags & TH_SFLAG_ABORT) ||
820 (!at_safe_point &&
821 (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
822
823 if ( !(thread->state & TH_TERMINATE))
824 DTRACE_SCHED(sleep);
825
826 thread->state |= (interruptible) ? TH_WAIT : (TH_WAIT | TH_UNINT);
827 thread->at_safe_point = at_safe_point;
828 return (thread->wait_result = THREAD_WAITING);
829 }
830 else
831 if (thread->sched_flags & TH_SFLAG_ABORTSAFELY)
832 thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
833
834 return (thread->wait_result = THREAD_INTERRUPTED);
835}
836
837/*
838 * Routine: thread_interrupt_level
839 * Purpose:
840 * Set the maximum interruptible state for the
841 * current thread. The effective value of any
842 * interruptible flag passed into assert_wait
843 * will never exceed this.
844 *
845 * Useful for code that must not be interrupted,
846 * but which calls code that doesn't know that.
847 * Returns:
848 * The old interrupt level for the thread.
849 */
850__private_extern__
851wait_interrupt_t
852thread_interrupt_level(
853 wait_interrupt_t new_level)
854{
855 thread_t thread = current_thread();
856 wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
857
858 thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
859
860 return result;
861}
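/*
 * Example usage (editor's sketch, not part of the original source):
 *
 *	wait_interrupt_t saved = thread_interrupt_level(THREAD_UNINT);
 *
 *	... call code that may internally wait THREAD_ABORTSAFE; because of
 *	the mask set above, such waits are effectively uninterruptible ...
 *
 *	thread_interrupt_level(saved);
 */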
862
863/*
864 * Check to see if an assert wait is possible, without actually doing one.
865 * This is used by debug code in locks and elsewhere to verify that it is
866 * always OK to block when trying to take a blocking lock (since waiting
867 * for the actual assert_wait to catch the case may make it hard to detect
868 * this case.
869 */
870boolean_t
871assert_wait_possible(void)
872{
873
874 thread_t thread;
875
876#if DEBUG
877 if(debug_mode) return TRUE; /* Always succeed in debug mode */
878#endif
879
880 thread = current_thread();
881
882 return (thread == NULL || waitq_wait_possible(thread));
883}
884
885/*
886 * assert_wait:
887 *
888 * Assert that the current thread is about to go to
889 * sleep until the specified event occurs.
890 */
891wait_result_t
892assert_wait(
893 event_t event,
894 wait_interrupt_t interruptible)
895{
896 if (__improbable(event == NO_EVENT))
897 panic("%s() called with NO_EVENT", __func__);
898
899 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
900 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
901 VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
902
903 struct waitq *waitq;
904 waitq = global_eventq(event);
905 return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
906}
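/*
 * Example usage (editor's sketch, not part of the original source).  A
 * typical sleep on the waiting side, where "my_condition" is a hypothetical
 * variable whose address serves as the event:
 *
 *	wait_result_t wres;
 *
 *	wres = assert_wait((event_t)&my_condition, THREAD_UNINT);
 *	if (wres == THREAD_WAITING)
 *		wres = thread_block(THREAD_CONTINUE_NULL);
 *
 * paired with, on the waking side:
 *
 *	thread_wakeup((event_t)&my_condition);
 */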
907
908wait_result_t
909assert_wait_timeout(
910 event_t event,
911 wait_interrupt_t interruptible,
912 uint32_t interval,
913 uint32_t scale_factor)
914{
915 thread_t thread = current_thread();
916 wait_result_t wresult;
917 uint64_t deadline;
918 spl_t s;
919
920 if (__improbable(event == NO_EVENT))
921 panic("%s() called with NO_EVENT", __func__);
922
923 struct waitq *waitq;
924 waitq = global_eventq(event);
925
926 s = splsched();
927 waitq_lock(waitq);
928 thread_lock(thread);
929
930 clock_interval_to_deadline(interval, scale_factor, &deadline);
931
932 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
933 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
934 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
935
936 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
937 interruptible,
938 TIMEOUT_URGENCY_SYS_NORMAL,
939 deadline, TIMEOUT_NO_LEEWAY,
940 thread);
941
942 thread_unlock(thread);
943 waitq_unlock(waitq);
944 splx(s);
945 return wresult;
946}
947
948wait_result_t
949assert_wait_timeout_with_leeway(
950 event_t event,
951 wait_interrupt_t interruptible,
952 wait_timeout_urgency_t urgency,
953 uint32_t interval,
954 uint32_t leeway,
955 uint32_t scale_factor)
956{
957 thread_t thread = current_thread();
958 wait_result_t wresult;
959 uint64_t deadline;
960 uint64_t abstime;
961 uint64_t slop;
962 uint64_t now;
963 spl_t s;
964
965 if (__improbable(event == NO_EVENT))
966 panic("%s() called with NO_EVENT", __func__);
967
968 now = mach_absolute_time();
969 clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
970 deadline = now + abstime;
971
972 clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
973
974 struct waitq *waitq;
975 waitq = global_eventq(event);
976
977 s = splsched();
978 waitq_lock(waitq);
979 thread_lock(thread);
980
981 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
982 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
983 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
984
985 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
986 interruptible,
987 urgency, deadline, slop,
988 thread);
989
990 thread_unlock(thread);
991 waitq_unlock(waitq);
992 splx(s);
993 return wresult;
994}
995
996wait_result_t
997assert_wait_deadline(
998 event_t event,
999 wait_interrupt_t interruptible,
1000 uint64_t deadline)
1001{
1002 thread_t thread = current_thread();
1003 wait_result_t wresult;
1004 spl_t s;
1005
1006 if (__improbable(event == NO_EVENT))
1007 panic("%s() called with NO_EVENT", __func__);
1008
1009 struct waitq *waitq;
1010 waitq = global_eventq(event);
1011
1012 s = splsched();
1013 waitq_lock(waitq);
1014 thread_lock(thread);
1015
1016 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1017 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1018 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1019
1020 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1021 interruptible,
1022 TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1023 TIMEOUT_NO_LEEWAY, thread);
1024 thread_unlock(thread);
1025 waitq_unlock(waitq);
1026 splx(s);
1027 return wresult;
1028}
1029
1030wait_result_t
1031assert_wait_deadline_with_leeway(
1032 event_t event,
1033 wait_interrupt_t interruptible,
1034 wait_timeout_urgency_t urgency,
1035 uint64_t deadline,
1036 uint64_t leeway)
1037{
1038 thread_t thread = current_thread();
1039 wait_result_t wresult;
1040 spl_t s;
1041
1042 if (__improbable(event == NO_EVENT))
1043 panic("%s() called with NO_EVENT", __func__);
1044
1045 struct waitq *waitq;
1046 waitq = global_eventq(event);
1047
1048 s = splsched();
1049 waitq_lock(waitq);
1050 thread_lock(thread);
1051
1052 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1053 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1054 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1055
1056 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1057 interruptible,
1058 urgency, deadline, leeway,
1059 thread);
1060
1061 thread_unlock(thread);
1062 waitq_unlock(waitq);
1063 splx(s);
1064 return wresult;
1065}
1066
1067/*
1068 * thread_isoncpu:
1069 *
1070 * Return TRUE if a thread is running on a processor such that an AST
1071 * is needed to pull it out of userspace execution, or if executing in
1072 * the kernel, bring to a context switch boundary that would cause
1073 * thread state to be serialized in the thread PCB.
1074 *
1075 * Thread locked, returns the same way. While locked, fields
1076 * like "state" cannot change. "runq" can change only from set to unset.
1077 */
1078static inline boolean_t
1079thread_isoncpu(thread_t thread)
1080{
1081 /* Not running or runnable */
1082 if (!(thread->state & TH_RUN))
1083 return (FALSE);
1084
1085 /* Waiting on a runqueue, not currently running */
1086 /* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
1087 if (thread->runq != PROCESSOR_NULL)
1088 return (FALSE);
1089
1090 /*
1091 * Thread does not have a stack yet
1092 * It could be on the stack alloc queue or preparing to be invoked
1093 */
1094 if (!thread->kernel_stack)
1095 return (FALSE);
1096
1097 /*
1098 * Thread must be running on a processor, or
1099 * about to run, or just did run. In all these
1100 * cases, an AST to the processor is needed
1101 * to guarantee that the thread is kicked out
1102 * of userspace and the processor has
1103 * context switched (and saved register state).
1104 */
1105 return (TRUE);
1106}
1107
1108/*
1109 * thread_stop:
1110 *
1111 * Force a preemption point for a thread and wait
1112 * for it to stop running on a CPU. If a stronger
1113 * guarantee is requested, wait until no longer
1114 * runnable. Arbitrates access among
1115 * multiple stop requests. (released by unstop)
1116 *
1117 * The thread must enter a wait state and stop via a
1118 * separate means.
1119 *
1120 * Returns FALSE if interrupted.
1121 */
1122boolean_t
1123thread_stop(
1124 thread_t thread,
1125 boolean_t until_not_runnable)
1126{
1127 wait_result_t wresult;
1128 spl_t s = splsched();
1129 boolean_t oncpu;
1130
1131 wake_lock(thread);
1132 thread_lock(thread);
1133
1134 while (thread->state & TH_SUSP) {
1135 thread->wake_active = TRUE;
1136 thread_unlock(thread);
1137
1138 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1139 wake_unlock(thread);
1140 splx(s);
1141
1142 if (wresult == THREAD_WAITING)
1143 wresult = thread_block(THREAD_CONTINUE_NULL);
1144
1145 if (wresult != THREAD_AWAKENED)
1146 return (FALSE);
1147
1148 s = splsched();
1149 wake_lock(thread);
1150 thread_lock(thread);
1151 }
1152
1153 thread->state |= TH_SUSP;
1154
1155 while ((oncpu = thread_isoncpu(thread)) ||
1156 (until_not_runnable && (thread->state & TH_RUN))) {
1157 processor_t processor;
1158
1159 if (oncpu) {
1160 assert(thread->state & TH_RUN);
1161 processor = thread->chosen_processor;
1162 cause_ast_check(processor);
1163 }
1164
1165 thread->wake_active = TRUE;
1166 thread_unlock(thread);
1167
1168 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1169 wake_unlock(thread);
1170 splx(s);
1171
1172 if (wresult == THREAD_WAITING)
1173 wresult = thread_block(THREAD_CONTINUE_NULL);
1174
1175 if (wresult != THREAD_AWAKENED) {
1176 thread_unstop(thread);
1177 return (FALSE);
1178 }
1179
1180 s = splsched();
1181 wake_lock(thread);
1182 thread_lock(thread);
1183 }
1184
1185 thread_unlock(thread);
1186 wake_unlock(thread);
1187 splx(s);
1188
1189 /*
1190 * We return with the thread unlocked. To prevent it from
1191 * transitioning to a runnable state (or from TH_RUN to
1192 * being on the CPU), the caller must ensure the thread
1193 * is stopped via an external means (such as an AST)
1194 */
1195
1196 return (TRUE);
1197}
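/*
 * Example usage (editor's sketch, not part of the original source): callers
 * bracket the manipulation of another thread's state with a stop/unstop
 * pair, honoring the FALSE (interrupted) return:
 *
 *	if (thread_stop(thread, FALSE)) {
 *		... examine or manipulate the stopped thread ...
 *		thread_unstop(thread);
 *	} else {
 *		... interrupted; the stop was not granted ...
 *	}
 */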
1198
1199/*
1200 * thread_unstop:
1201 *
1202 * Release a previous stop request and set
1203 * the thread running if appropriate.
1204 *
1205 * Use only after a successful stop operation.
1206 */
1207void
1208thread_unstop(
1209 thread_t thread)
1210{
1211 spl_t s = splsched();
1212
1213 wake_lock(thread);
1214 thread_lock(thread);
1215
1216 assert((thread->state & (TH_RUN|TH_WAIT|TH_SUSP)) != TH_SUSP);
1217
1218 if (thread->state & TH_SUSP) {
1219 thread->state &= ~TH_SUSP;
1220
1221 if (thread->wake_active) {
1222 thread->wake_active = FALSE;
1223 thread_unlock(thread);
1224
1225 thread_wakeup(&thread->wake_active);
1226 wake_unlock(thread);
1227 splx(s);
1228
1229 return;
1230 }
1231 }
1232
1233 thread_unlock(thread);
1234 wake_unlock(thread);
1235 splx(s);
1236}
1237
1238/*
1239 * thread_wait:
1240 *
1241 * Wait for a thread to stop running. (non-interruptible)
1242 *
1243 */
1244void
1245thread_wait(
1246 thread_t thread,
1247 boolean_t until_not_runnable)
1248{
1249 wait_result_t wresult;
1250 boolean_t oncpu;
1251 processor_t processor;
1252 spl_t s = splsched();
1253
1254 wake_lock(thread);
1255 thread_lock(thread);
1256
1257 /*
1258 * Wait until not running on a CPU. If stronger requirement
1259 * desired, wait until not runnable. Assumption: if thread is
1260 * on CPU, then TH_RUN is set, so we're not waiting in any case
1261 * where the original, pure "TH_RUN" check would have let us
1262 * finish.
1263 */
1264 while ((oncpu = thread_isoncpu(thread)) ||
1265 (until_not_runnable && (thread->state & TH_RUN))) {
1266
1267 if (oncpu) {
1268 assert(thread->state & TH_RUN);
1269 processor = thread->chosen_processor;
1270 cause_ast_check(processor);
1271 }
1272
1273 thread->wake_active = TRUE;
1274 thread_unlock(thread);
1275
1276 wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
1277 wake_unlock(thread);
1278 splx(s);
1279
1280 if (wresult == THREAD_WAITING)
1281 thread_block(THREAD_CONTINUE_NULL);
1282
1283 s = splsched();
1284 wake_lock(thread);
1285 thread_lock(thread);
1286 }
1287
1288 thread_unlock(thread);
1289 wake_unlock(thread);
1290 splx(s);
1291}
1292
1293/*
1294 * Routine: clear_wait_internal
1295 *
1296 * Clear the wait condition for the specified thread.
1297 * Start the thread executing if that is appropriate.
1298 * Arguments:
1299 * thread thread to awaken
1300 * result Wakeup result the thread should see
1301 * Conditions:
1302 * At splsched
1303 * the thread is locked.
1304 * Returns:
1305 * KERN_SUCCESS thread was rousted out a wait
1306 * KERN_FAILURE thread was waiting but could not be rousted
1307 * KERN_NOT_WAITING thread was not waiting
1308 */
1309__private_extern__ kern_return_t
1310clear_wait_internal(
1311 thread_t thread,
1312 wait_result_t wresult)
1313{
1314 uint32_t i = LockTimeOut;
1315 struct waitq *waitq = thread->waitq;
1316
1317 do {
1318 if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT))
1319 return (KERN_FAILURE);
1320
1321 if (waitq != NULL) {
1322 assert(waitq_irq_safe(waitq)); //irqs are already disabled!
1323 if (waitq_lock_try(waitq)) {
1324 waitq_pull_thread_locked(waitq, thread);
1325 waitq_unlock(waitq);
1326 } else {
1327 thread_unlock(thread);
1328 delay(1);
1329 thread_lock(thread);
1330 if (waitq != thread->waitq)
1331 return KERN_NOT_WAITING;
1332 continue;
1333 }
1334 }
1335
1336 /* TODO: Can we instead assert TH_TERMINATE is not set? */
1337 if ((thread->state & (TH_WAIT|TH_TERMINATE)) == TH_WAIT)
1338 return (thread_go(thread, wresult));
1339 else
1340 return (KERN_NOT_WAITING);
1341 } while ((--i > 0) || machine_timeout_suspended());
1342
1343 panic("clear_wait_internal: deadlock: thread=%p, wq=%p, cpu=%d\n",
1344 thread, waitq, cpu_number());
1345
1346 return (KERN_FAILURE);
1347}
1348
1349
1350/*
1351 * clear_wait:
1352 *
1353 * Clear the wait condition for the specified thread. Start the thread
1354 * executing if that is appropriate.
1355 *
1356 * parameters:
1357 * thread thread to awaken
1358 * result Wakeup result the thread should see
1359 */
1360kern_return_t
1361clear_wait(
1362 thread_t thread,
1363 wait_result_t result)
1364{
1365 kern_return_t ret;
1366 spl_t s;
1367
1368 s = splsched();
1369 thread_lock(thread);
1370 ret = clear_wait_internal(thread, result);
1371 thread_unlock(thread);
1372 splx(s);
1373 return ret;
1374}
1375
1376
1377/*
1378 * thread_wakeup_prim:
1379 *
1380 * Common routine for thread_wakeup, thread_wakeup_with_result,
1381 * and thread_wakeup_one.
1382 *
1383 */
1384kern_return_t
1385thread_wakeup_prim(
1386 event_t event,
1387 boolean_t one_thread,
1388 wait_result_t result)
1389{
1390 return (thread_wakeup_prim_internal(event, one_thread, result, -1));
1391}
1392
1393
1394kern_return_t
1395thread_wakeup_prim_internal(
1396 event_t event,
1397 boolean_t one_thread,
1398 wait_result_t result,
1399 int priority)
1400{
1401 if (__improbable(event == NO_EVENT))
1402 panic("%s() called with NO_EVENT", __func__);
1403
1404 struct waitq *wq;
1405
1406 wq = global_eventq(event);
1407 priority = (priority == -1 ? WAITQ_ALL_PRIORITIES : priority);
1408
1409 if (one_thread)
1410 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, priority);
1411 else
1412 return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, priority);
1413}
1414
1415/*
1416 * thread_bind:
1417 *
1418 * Force the current thread to execute on the specified processor.
1419 * Takes effect after the next thread_block().
1420 *
1421 * Returns the previous binding. PROCESSOR_NULL means
1422 * not bound.
1423 *
1424 * XXX - DO NOT export this to users - XXX
1425 */
1426processor_t
1427thread_bind(
1428 processor_t processor)
1429{
1430 thread_t self = current_thread();
1431 processor_t prev;
1432 spl_t s;
1433
1434 s = splsched();
1435 thread_lock(self);
1436
1437 prev = thread_bind_internal(self, processor);
1438
1439 thread_unlock(self);
1440 splx(s);
1441
1442 return (prev);
1443}
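/*
 * Example usage (editor's sketch, not part of the original source),
 * mirroring the pattern in thread_vm_bind_group_add() below: bind, block to
 * migrate onto the bound processor, do the processor-local work, then
 * restore the previous binding:
 *
 *	processor_t prev = thread_bind(master_processor);
 *	thread_block(THREAD_CONTINUE_NULL);	(switch to the bound CPU)
 *	... processor-local work ...
 *	thread_bind(prev);
 *	thread_block(THREAD_CONTINUE_NULL);
 */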
1444
1445/*
1446 * thread_bind_internal:
1447 *
1448 * If the specified thread is not the current thread, and it is currently
1449 * running on another CPU, a remote AST must be sent to that CPU to cause
1450 * the thread to migrate to its bound processor. Otherwise, the migration
1451 * will occur at the next quantum expiration or blocking point.
1452 *
1453 * When the thread is the current thread, an explicit thread_block() should
1454 * be used to force the current processor to context switch away and
1455 * let the thread migrate to the bound processor.
1456 *
1457 * Thread must be locked, and at splsched.
1458 */
1459
1460static processor_t
1461thread_bind_internal(
1462 thread_t thread,
1463 processor_t processor)
1464{
1465 processor_t prev;
1466
1467 /* <rdar://problem/15102234> */
1468 assert(thread->sched_pri < BASEPRI_RTQUEUES);
1469 /* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
1470 assert(thread->runq == PROCESSOR_NULL);
1471
1472 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND), thread_tid(thread), processor ? (uintptr_t)processor->cpu_id : (uintptr_t)-1, 0, 0, 0);
1473
1474 prev = thread->bound_processor;
1475 thread->bound_processor = processor;
1476
1477 return (prev);
1478}
1479
1480/*
1481 * thread_vm_bind_group_add:
1482 *
1483 * The "VM bind group" is a special mechanism to mark a collection
1484 * of threads from the VM subsystem that, in general, should be scheduled
1485 * with only one CPU of parallelism. To accomplish this, we initially
1486 * bind all the threads to the master processor, which has the effect
1487 * that only one of the threads in the group can execute at once, including
1488 * preempting threads in the group that are of lower priority. Future
1489 * mechanisms may use more dynamic mechanisms to prevent the collection
1490 * of VM threads from using more CPU time than desired.
1491 *
1492 * The current implementation can result in priority inversions where
1493 * compute-bound priority 95 or realtime threads that happen to have
1494 * landed on the master processor prevent the VM threads from running.
1495 * When this situation is detected, we unbind the threads for one
1496 * scheduler tick to allow the scheduler to run the threads on
1497 * additional CPUs, before restoring the binding (assuming high latency
1498 * is no longer a problem).
1499 */
1500
1501/*
1502 * The current max is provisioned for:
1503 * vm_compressor_swap_trigger_thread (92)
1504 * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
1505 * vm_pageout_continue (92)
1506 * memorystatus_thread (95)
1507 */
1508#define MAX_VM_BIND_GROUP_COUNT (5)
1509decl_simple_lock_data(static,sched_vm_group_list_lock);
1510static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
1511static int sched_vm_group_thread_count;
1512static boolean_t sched_vm_group_temporarily_unbound = FALSE;
1513
1514void
1515thread_vm_bind_group_add(void)
1516{
1517 thread_t self = current_thread();
1518
1519 thread_reference_internal(self);
1520 self->options |= TH_OPT_SCHED_VM_GROUP;
1521
1522 simple_lock(&sched_vm_group_list_lock);
1523 assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
1524 sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
1525 simple_unlock(&sched_vm_group_list_lock);
1526
1527 thread_bind(master_processor);
1528
1529 /* Switch to bound processor if not already there */
1530 thread_block(THREAD_CONTINUE_NULL);
1531}
1532
1533static void
1534sched_vm_group_maintenance(void)
1535{
1536 uint64_t ctime = mach_absolute_time();
1537 uint64_t longtime = ctime - sched_tick_interval;
1538 int i;
1539 spl_t s;
1540 boolean_t high_latency_observed = FALSE;
1541 boolean_t runnable_and_not_on_runq_observed = FALSE;
1542 boolean_t bind_target_changed = FALSE;
1543 processor_t bind_target = PROCESSOR_NULL;
1544
1545 /* Make sure nobody attempts to add new threads while we are enumerating them */
1546 simple_lock(&sched_vm_group_list_lock);
1547
1548 s = splsched();
1549
1550 for (i=0; i < sched_vm_group_thread_count; i++) {
1551 thread_t thread = sched_vm_group_thread_list[i];
1552 assert(thread != THREAD_NULL);
1553 thread_lock(thread);
1554 if ((thread->state & (TH_RUN|TH_WAIT)) == TH_RUN) {
1555 if (thread->runq != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
1556 high_latency_observed = TRUE;
1557 } else if (thread->runq == PROCESSOR_NULL) {
1558 /* There are some cases where a thread may be transitioning that also fall into this case */
1559 runnable_and_not_on_runq_observed = TRUE;
1560 }
1561 }
1562 thread_unlock(thread);
1563
1564 if (high_latency_observed && runnable_and_not_on_runq_observed) {
1565 /* All the things we are looking for are true, stop looking */
1566 break;
1567 }
1568 }
1569
1570 splx(s);
1571
1572 if (sched_vm_group_temporarily_unbound) {
1573 /* If we turned off binding, make sure everything is OK before rebinding */
1574 if (!high_latency_observed) {
1575 /* rebind */
1576 bind_target_changed = TRUE;
1577 bind_target = master_processor;
1578 sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
1579 }
1580 } else {
1581 /*
1582 * Check if we're in a bad state, which is defined by high
1583 * latency with no core currently executing a thread. If a
1584 * single thread is making progress on a CPU, that means the
1585 * binding concept to reduce parallelism is working as
1586 * designed.
1587 */
1588 if (high_latency_observed && !runnable_and_not_on_runq_observed) {
1589 /* unbind */
1590 bind_target_changed = TRUE;
1591 bind_target = PROCESSOR_NULL;
1592 sched_vm_group_temporarily_unbound = TRUE;
1593 }
1594 }
1595
1596 if (bind_target_changed) {
1597 s = splsched();
1598 for (i=0; i < sched_vm_group_thread_count; i++) {
1599 thread_t thread = sched_vm_group_thread_list[i];
1600 boolean_t removed;
1601 assert(thread != THREAD_NULL);
1602
1603 thread_lock(thread);
1604 removed = thread_run_queue_remove(thread);
1605 if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
1606 thread_bind_internal(thread, bind_target);
1607 } else {
1608 /*
1609 * Thread was in the middle of being context-switched-to,
1610 * or was in the process of blocking. To avoid switching the bind
1611 * state out mid-flight, defer the change if possible.
1612 */
1613 if (bind_target == PROCESSOR_NULL) {
1614 thread_bind_internal(thread, bind_target);
1615 } else {
1616 sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
1617 }
1618 }
1619
1620 if (removed) {
1621 thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
1622 }
1623 thread_unlock(thread);
1624 }
1625 splx(s);
1626 }
1627
1628 simple_unlock(&sched_vm_group_list_lock);
1629}
1630
1631/* Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
1632 * rebalancing opportunity exists when a core is (instantaneously) idle, but
1633 * other SMT-capable cores may be over-committed. TODO: some possible negatives:
1634 * IPI thrash if this core does not remain idle following the load balancing ASTs
1635 * Idle "thrash", when IPI issue is followed by idle entry/core power down
1636 * followed by a wakeup shortly thereafter.
1637 */
1638
1639#if (DEVELOPMENT || DEBUG)
1640int sched_smt_balance = 1;
1641#endif
1642
1643#if __SMP__
1644/* Invoked with pset locked, returns with pset unlocked */
1645static void
1646sched_SMT_balance(processor_t cprocessor, processor_set_t cpset) {
1647 processor_t ast_processor = NULL;
1648
1649#if (DEVELOPMENT || DEBUG)
1650 if (__improbable(sched_smt_balance == 0))
1651 goto smt_balance_exit;
1652#endif
1653
1654 assert(cprocessor == current_processor());
1655 if (cprocessor->is_SMT == FALSE)
1656 goto smt_balance_exit;
1657
1658 processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;
1659
1660 /* Determine if both this processor and its sibling are idle,
1661 * indicating an SMT rebalancing opportunity.
1662 */
1663 if (sib_processor->state != PROCESSOR_IDLE)
1664 goto smt_balance_exit;
1665
1666 processor_t sprocessor;
1667
1668 sprocessor = (processor_t)queue_first(&cpset->active_queue);
1669
1670 while (!queue_end(&cpset->active_queue, (queue_entry_t)sprocessor)) {
1671 if ((sprocessor->state == PROCESSOR_RUNNING) &&
1672 (sprocessor->processor_primary != sprocessor) &&
1673 (sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
1674 (sprocessor->current_pri < BASEPRI_RTQUEUES) &&
1675 ((cpset->pending_AST_cpu_mask & (1ULL << sprocessor->cpu_id)) == 0)) {
1676 assert(sprocessor != cprocessor);
1677 ast_processor = sprocessor;
1678 break;
1679 }
1680 sprocessor = (processor_t)queue_next((queue_entry_t)sprocessor);
1681 }
1682
1683smt_balance_exit:
1684 pset_unlock(cpset);
1685
1686 if (ast_processor) {
1687 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
1688 cause_ast_check(ast_processor);
1689 }
1690}
1691#endif /* __SMP__ */
1692
1693/*
1694 * thread_select:
1695 *
1696 * Select a new thread for the current processor to execute.
1697 *
1698 * May select the current thread, which must be locked.
1699 */
1700static thread_t
1701thread_select(
1702 thread_t thread,
1703 processor_t processor,
1704 ast_t reason)
1705{
1706 processor_set_t pset = processor->processor_set;
1707 thread_t new_thread = THREAD_NULL;
1708
1709 assert(processor == current_processor());
1710 assert((thread->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN);
1711
1712 do {
1713 /*
1714 * Update the priority.
1715 */
1716 if (SCHED(can_update_priority)(thread))
1717 SCHED(update_priority)(thread);
1718
1719 processor->current_pri = thread->sched_pri;
1720 processor->current_thmode = thread->sched_mode;
1721 processor->current_sfi_class = thread->sfi_class;
1722
1723 pset_lock(pset);
1724
1725 assert(processor->state != PROCESSOR_OFF_LINE);
1726
1727 if (!processor->is_recommended) {
1728 /*
1729 * The performance controller has provided a hint to not dispatch more threads,
1730 * unless they are bound to us (and thus we are the only option).
1731 */
1732 if (!SCHED(processor_bound_count)(processor)) {
1733 goto idle;
1734 }
1735 } else if (processor->processor_primary != processor) {
1736 /*
1737 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
1738 * we should look for work only under the same conditions that choose_processor()
1739 * would have assigned work, which is when all primary processors have been assigned work.
1740 *
1741 * An exception is that bound threads are dispatched to a processor without going through
1742 * choose_processor(), so in those cases we should continue trying to dequeue work.
1743 */
1744 if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue) && !rt_runq.count) {
1745 goto idle;
1746 }
1747 }
1748
1749 rt_lock_lock();
1750
1751 /*
1752 * Test to see if the current thread should continue
1753 * to run on this processor. Must not be attempting to wait, and not
1754 * bound to a different processor, nor be in the wrong
1755 * processor set, nor be forced to context switch by TH_SUSP.
1756 *
1757 * Note that there are never any RT threads in the regular runqueue.
1758 *
1759 * This code is insanely tricky.
1760 */
1761
1762 if (((thread->state & (TH_TERMINATE|TH_IDLE|TH_WAIT|TH_RUN|TH_SUSP)) == TH_RUN) &&
1763 (thread->sched_pri >= BASEPRI_RTQUEUES || processor->processor_primary == processor) &&
1764 (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor) &&
1765 (thread->affinity_set == AFFINITY_SET_NULL || thread->affinity_set->aset_pset == pset)) {
1766 /*
1767 * RT threads with un-expired quantum stay on processor,
1768 * unless there's a valid RT thread with an earlier deadline.
1769 */
1770 if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
1771 if (rt_runq.count > 0) {
1772 thread_t next_rt;
1773
1774 next_rt = (thread_t)queue_first(&rt_runq.queue);
1775
1776 assert(next_rt->runq == THREAD_ON_RT_RUNQ);
1777
1778 if (next_rt->realtime.deadline < processor->deadline &&
1779 (next_rt->bound_processor == PROCESSOR_NULL ||
1780 next_rt->bound_processor == processor)) {
1781 /* The next RT thread is better, so pick it off the runqueue. */
1782 goto pick_new_rt_thread;
1783 }
1784 }
1785
1786 /* This is still the best RT thread to run. */
1787 processor->deadline = thread->realtime.deadline;
1788
1789 rt_lock_unlock();
1790 pset_unlock(pset);
1791
1792 return (thread);
1793 }
1794
1795 if ((rt_runq.count == 0) &&
1796 SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
1797 /* This thread is still the highest priority runnable (non-idle) thread */
1798 processor->deadline = UINT64_MAX;
1799
1800 rt_lock_unlock();
1801 pset_unlock(pset);
1802
1803 return (thread);
1804 }
1805 }
1806
1807 /* OK, so we're not going to run the current thread. Look at the RT queue. */
1808 if (rt_runq.count > 0) {
1809 thread_t next_rt = (thread_t)queue_first(&rt_runq.queue);
1810
1811 assert(next_rt->runq == THREAD_ON_RT_RUNQ);
1812
1813 if (__probable((next_rt->bound_processor == PROCESSOR_NULL ||
1814 (next_rt->bound_processor == processor)))) {
1815pick_new_rt_thread:
1816 new_thread = (thread_t)dequeue_head(&rt_runq.queue);
1817
1818 new_thread->runq = PROCESSOR_NULL;
1819 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
1820 rt_runq.count--;
1821
1822 processor->deadline = new_thread->realtime.deadline;
1823
1824 rt_lock_unlock();
1825 pset_unlock(pset);
1826
1827 return (new_thread);
1828 }
1829 }
1830
1831 processor->deadline = UINT64_MAX;
1832 rt_lock_unlock();
1833
1834 /* No RT threads, so let's look at the regular threads. */
1835 if ((new_thread = SCHED(choose_thread)(processor, MINPRI, reason)) != THREAD_NULL) {
1836 pset_unlock(pset);
1837 return (new_thread);
1838 }
1839
1840#if __SMP__
1841 if (SCHED(steal_thread_enabled)) {
1842 /*
1843 * No runnable threads, attempt to steal
1844 * from other processors. Returns with pset lock dropped.
1845 */
1846
1847 if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
1848 return (new_thread);
1849 }
1850
1851 /*
1852 * If other threads have appeared, shortcut
1853 * around again.
1854 */
1855 if (!SCHED(processor_queue_empty)(processor) || rt_runq.count > 0)
1856 continue;
1857
1858 pset_lock(pset);
1859 }
1860#endif
1861
1862 idle:
1863 /*
1864 * Nothing is runnable, so set this processor idle if it
1865 * was running.
1866 */
1867 if (processor->state == PROCESSOR_RUNNING) {
1868 remqueue((queue_entry_t)processor);
1869 processor->state = PROCESSOR_IDLE;
1870
1871 if (processor->processor_primary == processor) {
1872 enqueue_head(&pset->idle_queue, (queue_entry_t)processor);
1873 }
1874 else {
1875 enqueue_head(&pset->idle_secondary_queue, (queue_entry_t)processor);
1876 }
1877 }
1878
1879#if __SMP__
1880 /* Invoked with pset locked, returns with pset unlocked */
1881 sched_SMT_balance(processor, pset);
1882#else
1883 pset_unlock(pset);
1884#endif
1885
1886#if CONFIG_SCHED_IDLE_IN_PLACE
1887 /*
1888 * Choose idle thread if fast idle is not possible.
1889 */
1890 if (processor->processor_primary != processor)
1891 return (processor->idle_thread);
1892
1893 if ((thread->state & (TH_IDLE|TH_TERMINATE|TH_SUSP)) || !(thread->state & TH_WAIT) || thread->wake_active || thread->sched_pri >= BASEPRI_RTQUEUES)
1894 return (processor->idle_thread);
1895
1896 /*
1897 * Perform idling activities directly without a
1898 * context switch. Return dispatched thread,
1899 * else check again for a runnable thread.
1900 */
1901 new_thread = thread_select_idle(thread, processor);
1902
1903#else /* !CONFIG_SCHED_IDLE_IN_PLACE */
1904
1905 /*
1906 * Do a full context switch to idle so that the current
1907 * thread can start running on another processor without
1908 * waiting for the fast-idled processor to wake up.
1909 */
1910 new_thread = processor->idle_thread;
1911
1912#endif /* !CONFIG_SCHED_IDLE_IN_PLACE */
1913
1914 } while (new_thread == THREAD_NULL);
1915
1916 return (new_thread);
1917}
1918
1919#if CONFIG_SCHED_IDLE_IN_PLACE
1920/*
1921 * thread_select_idle:
1922 *
1923 * Idle the processor using the current thread context.
1924 *
1925 * Called with thread locked, then dropped and relocked.
1926 */
1927static thread_t
1928thread_select_idle(
1929 thread_t thread,
1930 processor_t processor)
1931{
1932 thread_t new_thread;
1933 uint64_t arg1, arg2;
1934 int urgency;
1935
1936 if (thread->sched_mode == TH_MODE_TIMESHARE) {
1937 if (thread->sched_flags & TH_SFLAG_THROTTLED)
1938 sched_background_decr(thread);
1939
1940 sched_share_decr(thread);
1941 }
1942 sched_run_decr(thread);
1943
1944 thread->state |= TH_IDLE;
1945 processor->current_pri = IDLEPRI;
1946 processor->current_thmode = TH_MODE_NONE;
1947 processor->current_sfi_class = SFI_CLASS_KERNEL;
1948
1949 /* Reload precise timing global policy to thread-local policy */
1950 thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
1951
1952 thread_unlock(thread);
1953
1954 /*
1955 * Switch execution timing to processor idle thread.
1956 */
1957 processor->last_dispatch = mach_absolute_time();
1958
1959#ifdef CONFIG_MACH_APPROXIMATE_TIME
1960 commpage_update_mach_approximate_time(processor->last_dispatch);
1961#endif
1962
1963 thread->last_run_time = processor->last_dispatch;
1964 thread_timer_event(processor->last_dispatch, &processor->idle_thread->system_timer);
1965 PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer;
1966
1967 /*
1968 * Cancel the quantum timer while idling.
1969 */
1970 timer_call_cancel(&processor->quantum_timer);
1971 processor->first_timeslice = FALSE;
1972
1973 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
1974
1975 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, NULL);
1976
1977 /*
1978 * Enable interrupts and perform idling activities. No
1979 * preemption due to TH_IDLE being set.
1980 */
1981 spllo(); new_thread = processor_idle(thread, processor);
1982
1983 /*
1984 * Return at splsched.
1985 */
1986 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
1987
1988 thread_lock(thread);
1989
1990 /*
1991 * If awakened, switch to thread timer and start a new quantum.
1992 * Otherwise skip; we will context switch to another thread or return here.
1993 */
1994 if (!(thread->state & TH_WAIT)) {
1995 processor->last_dispatch = mach_absolute_time();
1996 thread_timer_event(processor->last_dispatch, &thread->system_timer);
1997 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
1998
1999 thread_quantum_init(thread);
2000 processor->quantum_end = processor->last_dispatch + thread->quantum_remaining;
2001 timer_call_enter1(&processor->quantum_timer, thread, processor->quantum_end, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
2002 processor->first_timeslice = TRUE;
2003
2004 thread->computation_epoch = processor->last_dispatch;
2005 }
2006
2007 thread->state &= ~TH_IDLE;
2008
2009 urgency = thread_get_urgency(thread, &arg1, &arg2);
2010
2011 thread_tell_urgency(urgency, arg1, arg2, 0, new_thread);
2012
2013 sched_run_incr(thread);
2014 if (thread->sched_mode == TH_MODE_TIMESHARE) {
2015 sched_share_incr(thread);
2016
2017 if (thread->sched_flags & TH_SFLAG_THROTTLED)
2018 sched_background_incr(thread);
2019 }
2020
2021 return (new_thread);
2022}
2023#endif /* CONFIG_SCHED_IDLE_IN_PLACE */
2024
2025/*
2026 * thread_invoke
2027 *
2028 * Called at splsched with neither thread locked.
2029 *
2030 * Perform a context switch and start executing the new thread.
2031 *
2032 * Returns FALSE when the context switch didn't happen.
2033 * The reference to the new thread is still consumed.
2034 *
2035 * "self" is what is currently running on the processor,
2036 * "thread" is the new thread to context switch to
2037 * (which may be the same thread in some cases)
2038 * (which may be the same thread in some cases)
 */
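/*
 * Three paths follow: a stack handoff when the old thread is leaving via a
 * continuation and the new thread has no kernel stack, a direct call of the
 * continuation when the new thread is the current thread, and otherwise a
 * full register save/restore through machine_switch_context().
 */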
2039static boolean_t
2040thread_invoke(
2041 thread_t self,
2042 thread_t thread,
2043 ast_t reason)
2044{
2045 if (__improbable(get_preemption_level() != 0)) {
2046 int pl = get_preemption_level();
2047 panic("thread_invoke: preemption_level %d, possible cause: %s",
2048 pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
2049 "blocking while holding a spinlock, or within interrupt context"));
2050 }
2051
2052 thread_continue_t continuation = self->continuation;
2053 void *parameter = self->parameter;
2054 processor_t processor;
2055
2056 uint64_t ctime = mach_absolute_time();
2057
2058#ifdef CONFIG_MACH_APPROXIMATE_TIME
2059 commpage_update_mach_approximate_time(ctime);
2060#endif
2061
2062#if defined(CONFIG_SCHED_TIMESHARE_CORE)
2063 sched_timeshare_consider_maintenance(ctime);
2064#endif
2065
2066 assert(self == current_thread());
2067 assert(self->runq == PROCESSOR_NULL);
2068 assert((self->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN);
2069
2070 thread_lock(thread);
2071
2072 assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN);
2073 assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor());
2074 assert(thread->runq == PROCESSOR_NULL);
2075
2076 /* Reload precise timing global policy to thread-local policy */
2077 thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
2078
2079 /* Update SFI class based on other factors */
2080 thread->sfi_class = sfi_thread_classify(thread);
2081
2082 /* Allow realtime threads to hang onto a stack. */
2083 if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack)
2084 self->reserved_stack = self->kernel_stack;
2085
2086 if (continuation != NULL) {
2087 if (!thread->kernel_stack) {
2088 /*
2089 * If we are using a privileged stack,
2090 * check to see whether we can exchange it with
2091 * that of the other thread.
2092 */
2093 if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack)
2094 goto need_stack;
2095
2096 /*
2097 * Context switch by performing a stack handoff.
2098 */
2099 continuation = thread->continuation;
2100 parameter = thread->parameter;
2101
2102 processor = current_processor();
2103 processor->active_thread = thread;
2104 processor->current_pri = thread->sched_pri;
2105 processor->current_thmode = thread->sched_mode;
2106 processor->current_sfi_class = thread->sfi_class;
2107 if (thread->last_processor != processor && thread->last_processor != NULL) {
2108 if (thread->last_processor->processor_set != processor->processor_set)
2109 thread->ps_switch++;
2110 thread->p_switch++;
2111 }
2112 thread->last_processor = processor;
2113 thread->c_switch++;
2114 ast_context(thread);
2115
2116 thread_unlock(thread);
2117
2118 self->reason = reason;
2119
2120 processor->last_dispatch = ctime;
2121 self->last_run_time = ctime;
2122 thread_timer_event(ctime, &thread->system_timer);
2123 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2124
2125 /*
2126 * Since non-precise user/kernel time doesn't update the state timer
2127 * during privilege transitions, synthesize an event now.
2128 */
2129 if (!thread->precise_user_kernel_time) {
2130 timer_switch(PROCESSOR_DATA(processor, current_state),
2131 ctime,
2132 PROCESSOR_DATA(processor, current_state));
2133 }
2134
2135 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2136 MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)|DBG_FUNC_NONE,
2137 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2138
2139 if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
2140 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
2141 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2142 }
2143
2144 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2145
2146 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2147
2148 TLOG(1, "thread_invoke: calling stack_handoff\n");
2149 stack_handoff(self, thread);
2150
2151 /* 'self' is now off core */
2152 assert(thread == current_thread());
2153
2154 DTRACE_SCHED(on__cpu);
2155
2156 thread_dispatch(self, thread);
2157
2158 thread->continuation = thread->parameter = NULL;
2159
2160 counter(c_thread_invoke_hits++);
2161
2162 (void) spllo();
2163
2164 assert(continuation);
2165 call_continuation(continuation, parameter, thread->wait_result);
2166 /*NOTREACHED*/
2167 }
2168 else if (thread == self) {
2169 /* same thread but with continuation */
2170 ast_context(self);
2171 counter(++c_thread_invoke_same);
2172
2173 thread_unlock(self);
2174
2175 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2176 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2177 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2178
2179 self->continuation = self->parameter = NULL;
2180
2181 (void) spllo();
2182
2183 call_continuation(continuation, parameter, self->wait_result);
2184 /*NOTREACHED*/
2185 }
2186 } else {
2187 /*
2188 * Check that the other thread has a stack
2189 */
2190 if (!thread->kernel_stack) {
2191need_stack:
2192 if (!stack_alloc_try(thread)) {
2193 counter(c_thread_invoke_misses++);
2194 thread_unlock(thread);
2195 thread_stack_enqueue(thread);
2196 return (FALSE);
2197 }
2198 } else if (thread == self) {
2199 ast_context(self);
2200 counter(++c_thread_invoke_same);
2201 thread_unlock(self);
2202
2203 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2204 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2205 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2206
2207 return (TRUE);
2208 }
2209 }
2210
2211 /*
2212 * Context switch by full context save.
2213 */
2214 processor = current_processor();
2215 processor->active_thread = thread;
2216 processor->current_pri = thread->sched_pri;
2217 processor->current_thmode = thread->sched_mode;
2218 processor->current_sfi_class = thread->sfi_class;
2219 if (thread->last_processor != processor && thread->last_processor != NULL) {
2220 if (thread->last_processor->processor_set != processor->processor_set)
2221 thread->ps_switch++;
2222 thread->p_switch++;
2223 }
2224 thread->last_processor = processor;
2225 thread->c_switch++;
2226 ast_context(thread);
2227
2228 thread_unlock(thread);
2229
2230 counter(c_thread_invoke_csw++);
2231
2232 self->reason = reason;
2233
2234 processor->last_dispatch = ctime;
2235 self->last_run_time = ctime;
2236 thread_timer_event(ctime, &thread->system_timer);
2237 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2238
2239 /*
2240 * Since non-precise user/kernel time doesn't update the state timer
2241 * during privilege transitions, synthesize an event now.
2242 */
2243 if (!thread->precise_user_kernel_time) {
2244 timer_switch(PROCESSOR_DATA(processor, current_state),
2245 ctime,
2246 PROCESSOR_DATA(processor, current_state));
2247 }
2248
2249 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2250 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2251 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2252
2253 if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
2254 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
2255 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2256 }
2257
2258 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2259
2260 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2261
2262 /*
2263 * This is where we actually switch register context,
2264 * and address space if required. We will next run
2265 * as a result of a subsequent context switch.
2266 *
2267 * Once registers are switched and the processor is running "thread",
2268 * the stack variables and non-volatile registers will contain whatever
2269 * was there the last time that thread blocked. No local variables should
2270 * be used after this point, except for the special case of "thread", which
2271 * the platform layer returns as the previous thread running on the processor
2272 * via the function call ABI as a return register, and "self", which may have
2273 * been stored on the stack or in a non-volatile register; that saved value
2274 * is stale while we are off core, but it is accurate again once this
2275 * thread is back running on the CPU.
2276 */
2277 assert(continuation == self->continuation);
2278 thread = machine_switch_context(self, continuation, thread);
2279 assert(self == current_thread());
2280 TLOG(1,"thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
2281
2282 DTRACE_SCHED(on__cpu);
2283
2284 /*
2285 * We have been resumed and are set to run.
2286 */
2287 thread_dispatch(thread, self);
2288
2289 if (continuation) {
2290 self->continuation = self->parameter = NULL;
2291
2292 (void) spllo();
2293
2294 call_continuation(continuation, parameter, self->wait_result);
2295 /*NOTREACHED*/
2296 }
2297
2298 return (TRUE);
2299}
2300
2301#if defined(CONFIG_SCHED_DEFERRED_AST)
2302/*
2303 * pset_cancel_deferred_dispatch:
2304 *
2305 * Cancels all ASTs that we can cancel for the given processor set
2306 * if the current processor is running the last runnable thread in the
2307 * system.
2308 *
2309 * This function assumes the current thread is runnable. This must
2310 * be called with the pset unlocked.
2311 */
2312static void
2313pset_cancel_deferred_dispatch(
2314 processor_set_t pset,
2315 processor_t processor)
2316{
2317 processor_t active_processor = NULL;
2318 uint32_t sampled_sched_run_count;
2319
2320 pset_lock(pset);
2321 sampled_sched_run_count = (volatile uint32_t) sched_run_count;
2322
2323 /*
2324 * If we have emptied the run queue, and our current thread is runnable, we
2325 * should tell any processors that are still DISPATCHING that they will
2326 * probably not have any work to do. In the event that there are no
2327 * pending signals that we can cancel, this is also uninteresting.
2328 *
2329 * In the unlikely event that another thread becomes runnable while we are
2330 * doing this (sched_run_count is atomically updated, not guarded), the
2331 * codepath making it runnable SHOULD (a dangerous word) need the pset lock
2332 * in order to dispatch it to a processor in our pset. So, the other
2333 * codepath will wait while we squash all cancelable ASTs, get the pset
2334 * lock, and then dispatch the freshly runnable thread. So this should be
2335 * correct (we won't accidentally have a runnable thread that hasn't been
2336 * dispatched to an idle processor), if not ideal (we may be restarting the
2337 * dispatch process, which could have some overhead).
2338 *
2339 *
 */
2340 if ((sampled_sched_run_count == 1) &&
2341 (pset->pending_deferred_AST_cpu_mask)) {
2342 qe_foreach_element_safe(active_processor, &pset->active_queue, processor_queue) {
2343 /*
2344 * If a processor is DISPATCHING, it could be because of
2345 * a cancelable signal.
2346 *
2347 * IF the processor is not our
2348 * current processor (the current processor should not
2349 * be DISPATCHING, so this is a bit paranoid), AND there
2350 * is a cancelable signal pending on the processor, AND
2351 * there is no non-cancelable signal pending (as there is
2352 * no point trying to backtrack on bringing the processor
2353 * up if a signal we cannot cancel is outstanding), THEN
2354 * it should make sense to roll back the processor state
2355 * to the IDLE state.
2356 *
2357 * If the racy nature of this approach (as the signal
2358 * will be arbitrated by hardware, and can fire as we
2359 * roll back state) results in the core responding
2360 * despite being pushed back to the IDLE state, it
2361 * should be no different than if the core took some
2362 * interrupt while IDLE.
2363 */
2364 if ((active_processor->state == PROCESSOR_DISPATCHING) &&
2365 (pset->pending_deferred_AST_cpu_mask & (1ULL << active_processor->cpu_id)) &&
2366 (!(pset->pending_AST_cpu_mask & (1ULL << active_processor->cpu_id))) &&
2367 (active_processor != processor)) {
2368 /*
2369 * Squash all of the processor state back to some
2370 * reasonable facsimile of PROCESSOR_IDLE.
2371 *
2372 * TODO: What queue policy do we actually want here?
2373 * We want to promote selection of a good processor
2374 * to run on. Do we want to enqueue at the head?
2375 * The tail? At the (relative) old position in the
2376 * queue? Or something else entirely?
2377 */
2378 re_queue_head(&pset->idle_queue, (queue_entry_t)active_processor);
2379
2380 assert(active_processor->next_thread == THREAD_NULL);
2381
2382 active_processor->current_pri = IDLEPRI;
2383 active_processor->current_thmode = TH_MODE_FIXED;
2384 active_processor->current_sfi_class = SFI_CLASS_KERNEL;
2385 active_processor->deadline = UINT64_MAX;
2386 active_processor->state = PROCESSOR_IDLE;
2387 pset->pending_deferred_AST_cpu_mask &= ~(1ULL << active_processor->cpu_id);
2388 machine_signal_idle_cancel(active_processor);
2389 }
2390
2391 }
2392 }
2393
2394 pset_unlock(pset);
2395}
2396#else
2397/* We don't support deferred ASTs; everything is candycanes and sunshine. */
2398#endif
2399
2400/*
2401 * thread_dispatch:
2402 *
2403 * Handle threads at context switch. Re-dispatch other thread
2404 * if still running, otherwise update run state and perform
2405 * special actions. Update quantum for other thread and begin
2406 * the quantum for ourselves.
2407 *
2408 * "thread" is the old thread that we have switched away from.
2409 * "self" is the new current thread that we have context switched to
2410 *
2411 * Called at splsched.
2412 */
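/*
 * Two cases are handled for the old thread: if it is still runnable it is
 * put back on a run queue via thread_setrun(); if it is waiting, TH_RUN is
 * cleared and, on the first dispatch after an explicit termination, the
 * thread is handed to thread_terminate_enqueue() for reaping.
 */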
2413void
2414thread_dispatch(
2415 thread_t thread,
2416 thread_t self)
2417{
2418 processor_t processor = self->last_processor;
2419
2420 assert(processor == current_processor());
2421 assert(self == current_thread());
2422 assert(thread != self);
2423
2424 if (thread != THREAD_NULL) {
2425 /*
2426 * If blocked at a continuation, discard
2427 * the stack.
2428 */
2429 if (thread->continuation != NULL && thread->kernel_stack != 0)
2430 stack_free(thread);
2431
2432 if (thread->state & TH_IDLE) {
2433 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2434 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2435 (uintptr_t)thread_tid(thread), 0, thread->state, sched_run_count, 0);
2436 } else {
2437 int64_t consumed;
2438 int64_t remainder = 0;
2439
2440 if (processor->quantum_end > processor->last_dispatch)
2441 remainder = processor->quantum_end -
2442 processor->last_dispatch;
2443
2444 consumed = thread->quantum_remaining - remainder;
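 /*
  * For example, with a 10 ms quantum, if the quantum timer still had
  * 4 ms to go at this dispatch, then remainder == 4 ms and
  * consumed == 6 ms of CPU time to bill against the ledgers below.
  */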
2445
2446 if ((thread->reason & AST_LEDGER) == 0) {
2447 /*
2448 * Bill CPU time to both the task and
2449 * the individual thread.
2450 */
2451 ledger_credit(thread->t_ledger,
2452 task_ledgers.cpu_time, consumed);
2453 ledger_credit(thread->t_threadledger,
2454 thread_ledgers.cpu_time, consumed);
2455#ifdef CONFIG_BANK
2456 if (thread->t_bankledger) {
2457 ledger_credit(thread->t_bankledger,
2458 bank_ledgers.cpu_time,
2459 (consumed - thread->t_deduct_bank_ledger_time));
2460
2461 }
2462 thread->t_deduct_bank_ledger_time = 0;
2463#endif
2464 }
2465
2466 wake_lock(thread);
2467 thread_lock(thread);
2468
2469 /*
2470 * Compute remainder of current quantum.
2471 */
2472 if (processor->first_timeslice &&
2473 processor->quantum_end > processor->last_dispatch)
2474 thread->quantum_remaining = (uint32_t)remainder;
2475 else
2476 thread->quantum_remaining = 0;
2477
2478 if (thread->sched_mode == TH_MODE_REALTIME) {
2479 /*
2480 * Cancel the deadline if the thread has
2481 * consumed the entire quantum.
2482 */
2483 if (thread->quantum_remaining == 0) {
2484 thread->realtime.deadline = UINT64_MAX;
2485 }
2486 } else {
2487#if defined(CONFIG_SCHED_TIMESHARE_CORE)
2488 /*
2489 * For non-realtime threads treat a tiny
2490 * remaining quantum as an expired quantum
2491 * but include what's left next time.
2492 */
2493 if (thread->quantum_remaining < min_std_quantum) {
2494 thread->reason |= AST_QUANTUM;
2495 thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
2496 }
2497#endif /* CONFIG_SCHED_TIMESHARE_CORE */
2498 }
2499
2500 /*
2501 * If we are doing a direct handoff then
2502 * take the remainder of the quantum.
2503 */
2504 if ((thread->reason & (AST_HANDOFF|AST_QUANTUM)) == AST_HANDOFF) {
2505 self->quantum_remaining = thread->quantum_remaining;
2506 thread->reason |= AST_QUANTUM;
2507 thread->quantum_remaining = 0;
2508 } else {
2509#if defined(CONFIG_SCHED_MULTIQ)
2510 if (SCHED(sched_groups_enabled) &&
2511 thread->sched_group == self->sched_group) {
2512 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2513 MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
2514 self->reason, (uintptr_t)thread_tid(thread),
2515 self->quantum_remaining, thread->quantum_remaining, 0);
2516
2517 self->quantum_remaining = thread->quantum_remaining;
2518 thread->quantum_remaining = 0;
2519 /* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
2520 }
2521#endif /* defined(CONFIG_SCHED_MULTIQ) */
2522 }
2523
2524 thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
2525
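 /*
  * A thread that goes off core while still holding rwlocks is floored at
  * BASEPRI_BACKGROUND (or its base priority, if higher) and marked
  * TH_SFLAG_RW_PROMOTED, limiting the priority inversion seen by waiters
  * on those locks.
  */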
2526 if ((thread->rwlock_count != 0) && !(LcksOpts & disLkRWPrio)) {
2527 integer_t priority;
2528
2529 priority = thread->sched_pri;
2530
2531 if (priority < thread->base_pri)
2532 priority = thread->base_pri;
2533 if (priority < BASEPRI_BACKGROUND)
2534 priority = BASEPRI_BACKGROUND;
2535
2536 if ((thread->sched_pri < priority) || !(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
2537 KERNEL_DEBUG_CONSTANT(
2538 MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_PROMOTE) | DBG_FUNC_NONE,
2539 (uintptr_t)thread_tid(thread), thread->sched_pri, thread->base_pri, priority, 0);
2540
2541 thread->sched_flags |= TH_SFLAG_RW_PROMOTED;
2542
2543 if (thread->sched_pri < priority)
2544 set_sched_pri(thread, priority);
2545 }
2546 }
2547
2548 if (!(thread->state & TH_WAIT)) {
2549 /*
2550 * Still runnable.
2551 */
2552 thread->last_made_runnable_time = mach_approximate_time();
2553
2554 machine_thread_going_off_core(thread, FALSE);
2555
2556 if (thread->reason & AST_QUANTUM)
2557 thread_setrun(thread, SCHED_TAILQ);
2558 else if (thread->reason & AST_PREEMPT)
2559 thread_setrun(thread, SCHED_HEADQ);
2560 else
2561 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
2562
2563 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2564 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2565 (uintptr_t)thread_tid(thread), thread->reason, thread->state, sched_run_count, 0);
2566
2567 if (thread->wake_active) {
2568 thread->wake_active = FALSE;
2569 thread_unlock(thread);
2570
2571 thread_wakeup(&thread->wake_active);
2572 } else {
2573 thread_unlock(thread);
2574 }
2575
2576 wake_unlock(thread);
2577 } else {
2578 /*
2579 * Waiting.
2580 */
2581 boolean_t should_terminate = FALSE;
2582 uint32_t new_run_count;
2583
2584 /* Only the first call to thread_dispatch
2585 * after explicit termination should add
2586 * the thread to the termination queue
2587 */
2588 if ((thread->state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) {
2589 should_terminate = TRUE;
2590 thread->state |= TH_TERMINATE2;
2591 }
2592
2593 thread->state &= ~TH_RUN;
2594 thread->last_made_runnable_time = ~0ULL;
2595 thread->chosen_processor = PROCESSOR_NULL;
2596
2597 if (thread->sched_mode == TH_MODE_TIMESHARE) {
2598 if (thread->sched_flags & TH_SFLAG_THROTTLED)
2599 sched_background_decr(thread);
2600
2601 sched_share_decr(thread);
2602 }
2603 new_run_count = sched_run_decr(thread);
2604
2605#if CONFIG_SCHED_SFI
2606 if ((thread->state & (TH_WAIT | TH_TERMINATE)) == TH_WAIT) {
2607 if (thread->reason & AST_SFI) {
2608 thread->wait_sfi_begin_time = processor->last_dispatch;
2609 }
2610 }
2611#endif
2612
2613 machine_thread_going_off_core(thread, should_terminate);
2614
2615 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2616 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2617 (uintptr_t)thread_tid(thread), thread->reason, thread->state, new_run_count, 0);
2618
2619 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
2620
2621 if (thread->wake_active) {
2622 thread->wake_active = FALSE;
2623 thread_unlock(thread);
2624
2625 thread_wakeup(&thread->wake_active);
2626 } else {
2627 thread_unlock(thread);
2628 }
2629
2630 wake_unlock(thread);
2631
2632 if (should_terminate)
2633 thread_terminate_enqueue(thread);
2634 }
2635 }
2636 }
2637
2638 /* Update (new) current thread and reprogram quantum timer */
2639 thread_lock(self);
2640 if (!(self->state & TH_IDLE)) {
2641 uint64_t arg1, arg2;
2642 int urgency;
2643 uint64_t latency;
2644
2645#if CONFIG_SCHED_SFI
2646 ast_t new_ast;
2647
2648 new_ast = sfi_thread_needs_ast(self, NULL);
2649
2650 if (new_ast != AST_NONE) {
2651 ast_on(new_ast);
2652 }
2653#endif
2654
2655 assert(processor->last_dispatch >= self->last_made_runnable_time);
2656 latency = processor->last_dispatch - self->last_made_runnable_time;
2657
2658 urgency = thread_get_urgency(self, &arg1, &arg2);
2659
2660 thread_tell_urgency(urgency, arg1, arg2, latency, self);
2661
2662 machine_thread_going_on_core(self, urgency, latency);
2663
2664 /*
2665 * Get a new quantum if none remaining.
2666 */
2667 if (self->quantum_remaining == 0) {
2668 thread_quantum_init(self);
2669 }
2670
2671 /*
2672 * Set up quantum timer and timeslice.
2673 */
2674 processor->quantum_end = processor->last_dispatch + self->quantum_remaining;
2675 timer_call_enter1(&processor->quantum_timer, self, processor->quantum_end, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
2676
2677 processor->first_timeslice = TRUE;
2678 } else {
2679 timer_call_cancel(&processor->quantum_timer);
2680 processor->first_timeslice = FALSE;
2681
2682 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
2683 machine_thread_going_on_core(self, THREAD_URGENCY_NONE, 0);
2684 }
2685
2686 self->computation_epoch = processor->last_dispatch;
2687 self->reason = AST_NONE;
2688
2689 thread_unlock(self);
2690
2691#if defined(CONFIG_SCHED_DEFERRED_AST)
2692 /*
2693 * TODO: Can we state that redispatching our old thread is also
2694 * uninteresting?
2695 */
2696 if ((((volatile uint32_t)sched_run_count) == 1) &&
2697 !(self->state & TH_IDLE)) {
2698 pset_cancel_deferred_dispatch(processor->processor_set, processor);
2699 }
2700#endif
2701
2702}
2703
2704/*
2705 * thread_block_reason:
2706 *
2707 * Forces a reschedule, blocking the caller if a wait
2708 * has been asserted.
2709 *
2710 * If a continuation is specified, then thread_invoke will
2711 * attempt to discard the thread's kernel stack. When the
2712 * thread resumes, it will execute the continuation function
2713 * on a new kernel stack.
2714 */
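/*
 * thread_invoke() may refuse the switch (for example, when the chosen
 * thread needs a kernel stack and none can be allocated immediately), so
 * selection is retried in a loop until a switch succeeds.
 */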
2715counter(mach_counter_t c_thread_block_calls = 0;)
2716
2717wait_result_t
2718thread_block_reason(
2719 thread_continue_t continuation,
2720 void *parameter,
2721 ast_t reason)
2722{
2723 thread_t self = current_thread();
2724 processor_t processor;
2725 thread_t new_thread;
2726 spl_t s;
2727
2728 counter(++c_thread_block_calls);
2729
2730 s = splsched();
2731
2732 processor = current_processor();
2733
2734 /* If we're explicitly yielding, force a subsequent quantum */
2735 if (reason & AST_YIELD)
2736 processor->first_timeslice = FALSE;
2737
2738 /* We're handling all scheduling AST's */
2739 ast_off(AST_SCHEDULING);
2740
2741#if PROC_REF_DEBUG
2742 if ((continuation != NULL) && (self->task != kernel_task)) {
2743 if (uthread_get_proc_refcount(self->uthread) != 0) {
2744 panic("thread_block_reason with continuation uthread %p with uu_proc_refcount != 0", self->uthread);
2745 }
2746 }
2747#endif
2748
2749 self->continuation = continuation;
2750 self->parameter = parameter;
2751
2752 if (self->state & ~(TH_RUN | TH_IDLE)) {
2753 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2754 MACHDBG_CODE(DBG_MACH_SCHED,MACH_BLOCK),
2755 reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
2756 }
2757
2758 do {
2759 thread_lock(self);
2760 new_thread = thread_select(self, processor, reason);
2761 thread_unlock(self);
2762 } while (!thread_invoke(self, new_thread, reason));
2763
2764 splx(s);
2765
2766 return (self->wait_result);
2767}
2768
2769/*
2770 * thread_block:
2771 *
2772 * Block the current thread if a wait has been asserted.
2773 */
2774wait_result_t
2775thread_block(
2776 thread_continue_t continuation)
2777{
2778 return thread_block_reason(continuation, NULL, AST_NONE);
2779}
2780
2781wait_result_t
2782thread_block_parameter(
2783 thread_continue_t continuation,
2784 void *parameter)
2785{
2786 return thread_block_reason(continuation, parameter, AST_NONE);
2787}
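/*
 * Illustrative caller pattern (a sketch, not code used by this file):
 *
 *	assert_wait(event, THREAD_UNINT);
 *	... drop any locks protecting the waited-on state ...
 *	wait_result_t wr = thread_block(THREAD_CONTINUE_NULL);
 *	if (wr == THREAD_AWAKENED)
 *		... the event was posted via thread_wakeup() ...
 *
 * Passing a real continuation instead of THREAD_CONTINUE_NULL lets the
 * kernel stack be discarded while the thread is blocked.
 */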
2788
2789/*
2790 * thread_run:
2791 *
2792 * Switch directly from the current thread to the
2793 * new thread, handing off our quantum if appropriate.
2794 *
2795 * New thread must be runnable, and not on a run queue.
2796 *
2797 * Called at splsched.
2798 */
2799int
2800thread_run(
2801 thread_t self,
2802 thread_continue_t continuation,
2803 void *parameter,
2804 thread_t new_thread)
2805{
2806 ast_t handoff = AST_HANDOFF;
2807
2808 self->continuation = continuation;
2809 self->parameter = parameter;
2810
2811 while (!thread_invoke(self, new_thread, handoff)) {
2812 processor_t processor = current_processor();
2813
2814 thread_lock(self);
2815 new_thread = thread_select(self, processor, AST_NONE);
2816 thread_unlock(self);
2817 handoff = AST_NONE;
2818 }
2819
2820 return (self->wait_result);
2821}
2822
2823/*
2824 * thread_continue:
2825 *
2826 * Called at splsched when a thread first receives
2827 * a new stack after a continuation.
2828 */
2829void
2830thread_continue(
2831 thread_t thread)
2832{
2833 thread_t self = current_thread();
2834 thread_continue_t continuation;
2835 void *parameter;
2836
2837 DTRACE_SCHED(on__cpu);
2838
2839 continuation = self->continuation;
2840 parameter = self->parameter;
2841
2842 thread_dispatch(thread, self);
2843
2844 self->continuation = self->parameter = NULL;
2845
2846 if (thread != THREAD_NULL)
2847 (void)spllo();
2848
2849 TLOG(1, "thread_continue: calling call_continuation \n");
2850 call_continuation(continuation, parameter, self->wait_result);
2851 /*NOTREACHED*/
2852}
2853
2854void
2855thread_quantum_init(thread_t thread)
2856{
2857 if (thread->sched_mode == TH_MODE_REALTIME) {
2858 thread->quantum_remaining = thread->realtime.computation;
2859 } else {
2860 thread->quantum_remaining = SCHED(initial_quantum_size)(thread);
2861 }
2862}
2863
2864uint32_t
2865sched_timeshare_initial_quantum_size(thread_t thread)
2866{
2867 if ((thread == THREAD_NULL) || !(thread->sched_flags & TH_SFLAG_THROTTLED))
2868 return std_quantum;
2869 else
2870 return bg_quantum;
2871}
2872
2873/*
2874 * run_queue_init:
2875 *
2876 * Initialize a run queue before first use.
2877 */
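/*
 * The occupancy bitmap is indexed by (MAXPRI - pri), so ffsbit() yields the
 * highest occupied priority directly (e.g. with MAXPRI of 127, priority 80
 * maps to bit 47). The IDLEPRI bit is set here as a permanent sentinel,
 * which is why dequeue/remove never clear it and highq falls back to
 * IDLEPRI when every other queue is empty.
 */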
2878void
2879run_queue_init(
2880 run_queue_t rq)
2881{
2882 int i;
2883
2884 rq->highq = IDLEPRI;
2885 for (i = 0; i < NRQBM; i++)
2886 rq->bitmap[i] = 0;
2887 setbit(MAXPRI - IDLEPRI, rq->bitmap);
2888 rq->urgency = rq->count = 0;
2889 for (i = 0; i < NRQS; i++)
2890 queue_init(&rq->queues[i]);
2891}
2892
2893/*
2894 * run_queue_dequeue:
2895 *
2896 * Perform a dequeue operation on a run queue,
2897 * and return the resulting thread.
2898 *
2899 * The run queue must be locked (see thread_run_queue_remove()
2900 * for more info), and not empty.
2901 */
2902thread_t
2903run_queue_dequeue(
2904 run_queue_t rq,
2905 integer_t options)
2906{
2907 thread_t thread;
2908 queue_t queue = rq->queues + rq->highq;
2909
2910 if (options & SCHED_HEADQ) {
2911 thread = (thread_t)dequeue_head(queue);
2912 }
2913 else {
2914 thread = (thread_t)dequeue_tail(queue);
2915 }
2916
2917 thread->runq = PROCESSOR_NULL;
2918 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2919 rq->count--;
2920 if (SCHED(priority_is_urgent)(rq->highq)) {
2921 rq->urgency--; assert(rq->urgency >= 0);
2922 }
2923 if (queue_empty(queue)) {
2924 if (rq->highq != IDLEPRI)
2925 clrbit(MAXPRI - rq->highq, rq->bitmap);
2926 rq->highq = MAXPRI - ffsbit(rq->bitmap);
2927 }
2928
2929 return (thread);
2930}
2931
2932/*
2933 * run_queue_enqueue:
2934 *
2935 * Perform an enqueue operation on a run queue.
2936 *
2937 * The run queue must be locked (see thread_run_queue_remove()
2938 * for more info).
2939 */
2940boolean_t
2941run_queue_enqueue(
2942 run_queue_t rq,
2943 thread_t thread,
2944 integer_t options)
2945{
2946 queue_t queue = rq->queues + thread->sched_pri;
2947 boolean_t result = FALSE;
2948
2949 if (queue_empty(queue)) {
2950 enqueue_tail(queue, (queue_entry_t)thread);
2951
2952 setbit(MAXPRI - thread->sched_pri, rq->bitmap);
2953 if (thread->sched_pri > rq->highq) {
2954 rq->highq = thread->sched_pri;
2955 result = TRUE;
2956 }
2957 } else {
2958 if (options & SCHED_TAILQ)
2959 enqueue_tail(queue, (queue_entry_t)thread);
2960 else
2961 enqueue_head(queue, (queue_entry_t)thread);
2962 }
2963 if (SCHED(priority_is_urgent)(thread->sched_pri))
2964 rq->urgency++;
2965 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2966 rq->count++;
2967
2968 return (result);
2969
2970}
2971
2972/*
2973 * run_queue_remove:
2974 *
2975 * Remove a specific thread from a runqueue.
2976 *
2977 * The run queue must be locked.
2978 */
2979void
2980run_queue_remove(
2981 run_queue_t rq,
2982 thread_t thread)
2983{
2984
2985 remqueue((queue_entry_t)thread);
2986 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2987 rq->count--;
2988 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
2989 rq->urgency--; assert(rq->urgency >= 0);
2990 }
2991
2992 if (queue_empty(rq->queues + thread->sched_pri)) {
2993 /* update run queue status */
2994 if (thread->sched_pri != IDLEPRI)
2995 clrbit(MAXPRI - thread->sched_pri, rq->bitmap);
2996 rq->highq = MAXPRI - ffsbit(rq->bitmap);
2997 }
2998
2999 thread->runq = PROCESSOR_NULL;
3000}
3001
3002/* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
3003void
3004rt_runq_scan(sched_update_scan_context_t scan_context)
3005{
3006 spl_t s;
3007 thread_t thread;
3008
3009 s = splsched();
3010 rt_lock_lock();
3011
3012 qe_foreach_element_safe(thread, &rt_runq.queue, links) {
3013 if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
3014 scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
3015 }
3016 }
3017
3018 rt_lock_unlock();
3019 splx(s);
3020}
3021
3022
3023/*
3024 * realtime_queue_insert:
3025 *
3026 * Enqueue a thread for realtime execution.
3027 */
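/*
 * The queue is kept in nondecreasing deadline order: insertion scans from
 * the head for the first entry with a later deadline and places the new
 * thread before it. TRUE is returned when the new thread lands at the
 * head, i.e. it now has the earliest deadline and should preempt.
 */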
3028static boolean_t
3029realtime_queue_insert(
3030 thread_t thread)
3031{
3032 queue_t queue = &rt_runq.queue;
3033 uint64_t deadline = thread->realtime.deadline;
3034 boolean_t preempt = FALSE;
3035
3036 rt_lock_lock();
3037
3038 if (queue_empty(queue)) {
3039 enqueue_tail(queue, (queue_entry_t)thread);
3040 preempt = TRUE;
3041 }
3042 else {
3043 register thread_t entry = (thread_t)queue_first(queue);
3044
3045 while (TRUE) {
3046 if ( queue_end(queue, (queue_entry_t)entry) ||
3047 deadline < entry->realtime.deadline ) {
3048 entry = (thread_t)queue_prev((queue_entry_t)entry);
3049 break;
3050 }
3051
3052 entry = (thread_t)queue_next((queue_entry_t)entry);
3053 }
3054
3055 if ((queue_entry_t)entry == queue)
3056 preempt = TRUE;
3057
3058 insque((queue_entry_t)thread, (queue_entry_t)entry);
3059 }
3060
3061 thread->runq = THREAD_ON_RT_RUNQ;
3062 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
3063 rt_runq.count++;
3064
3065 rt_lock_unlock();
3066
3067 return (preempt);
3068}
3069
3070/*
3071 * realtime_setrun:
3072 *
3073 * Dispatch a thread for realtime execution.
3074 *
3075 * Thread must be locked. Associated pset must
3076 * be locked, and is returned unlocked.
3077 */
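/*
 * If the thread cannot be handed directly to an idle processor, it is
 * inserted into rt_runq; when preemption is warranted, the chosen processor
 * is either woken from idle (machine_signal_idle) or sent an AST check IPI
 * (cause_ast_check), depending on its current state.
 */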
3078static void
3079realtime_setrun(
3080 processor_t processor,
3081 thread_t thread)
3082{
3083 processor_set_t pset = processor->processor_set;
3084 ast_t preempt;
3085
3086 boolean_t do_signal_idle = FALSE, do_cause_ast = FALSE;
3087
3088 thread->chosen_processor = processor;
3089
3090 /* <rdar://problem/15102234> */
3091 assert(thread->bound_processor == PROCESSOR_NULL);
3092
3093 /*
3094 * Dispatch directly onto idle processor.
3095 */
3096 if ( (thread->bound_processor == processor)
3097 && processor->state == PROCESSOR_IDLE) {
3098 remqueue((queue_entry_t)processor);
3099 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3100
3101 processor->next_thread = thread;
3102 processor->current_pri = thread->sched_pri;
3103 processor->current_thmode = thread->sched_mode;
3104 processor->current_sfi_class = thread->sfi_class;
3105 processor->deadline = thread->realtime.deadline;
3106 processor->state = PROCESSOR_DISPATCHING;
3107
3108 if (processor != current_processor()) {
3109 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3110 /* cleared on exit from main processor_idle() loop */
3111 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3112 do_signal_idle = TRUE;
3113 }
3114 }
3115 pset_unlock(pset);
3116
3117 if (do_signal_idle) {
3118 machine_signal_idle(processor);
3119 }
3120 return;
3121 }
3122
3123 if (processor->current_pri < BASEPRI_RTQUEUES)
3124 preempt = (AST_PREEMPT | AST_URGENT);
3125 else if (thread->realtime.deadline < processor->deadline)
3126 preempt = (AST_PREEMPT | AST_URGENT);
3127 else
3128 preempt = AST_NONE;
3129
3130 realtime_queue_insert(thread);
3131
3132 if (preempt != AST_NONE) {
3133 if (processor->state == PROCESSOR_IDLE) {
3134 remqueue((queue_entry_t)processor);
3135 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3136 processor->next_thread = THREAD_NULL;
3137 processor->current_pri = thread->sched_pri;
3138 processor->current_thmode = thread->sched_mode;
3139 processor->current_sfi_class = thread->sfi_class;
3140 processor->deadline = thread->realtime.deadline;
3141 processor->state = PROCESSOR_DISPATCHING;
3142 if (processor == current_processor()) {
3143 ast_on(preempt);
3144 } else {
3145 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3146 /* cleared on exit from main processor_idle() loop */
3147 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3148 do_signal_idle = TRUE;
3149 }
3150 }
3151 } else if (processor->state == PROCESSOR_DISPATCHING) {
3152 if ((processor->next_thread == THREAD_NULL) && ((processor->current_pri < thread->sched_pri) || (processor->deadline > thread->realtime.deadline))) {
3153 processor->current_pri = thread->sched_pri;
3154 processor->current_thmode = thread->sched_mode;
3155 processor->current_sfi_class = thread->sfi_class;
3156 processor->deadline = thread->realtime.deadline;
3157 }
3158 } else {
3159 if (processor == current_processor()) {
3160 ast_on(preempt);
3161 } else {
3162 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3163 /* cleared after IPI causes csw_check() to be called */
3164 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3165 do_cause_ast = TRUE;
3166 }
3167 }
3168 }
3169 } else {
3170 /* Selected processor was too busy, just keep thread enqueued and let other processors drain it naturally. */
3171 }
3172
3173 pset_unlock(pset);
3174
3175 if (do_signal_idle) {
3176 machine_signal_idle(processor);
3177 } else if (do_cause_ast) {
3178 cause_ast_check(processor);
3179 }
3180}
3181
3182
3183#if defined(CONFIG_SCHED_TIMESHARE_CORE)
3184
3185boolean_t
3186priority_is_urgent(int priority)
3187{
3188 return testbit(priority, sched_preempt_pri) ? TRUE : FALSE;
3189}
3190
3191#endif /* CONFIG_SCHED_TIMESHARE_CORE */
3192
3193/*
3194 * processor_setrun:
3195 *
3196 * Dispatch a thread for execution on a
3197 * processor.
3198 *
3199 * Thread must be locked. Associated pset must
3200 * be locked, and is returned unlocked.
3201 */
3202static void
3203processor_setrun(
3204 processor_t processor,
3205 thread_t thread,
3206 integer_t options)
3207{
3208 processor_set_t pset = processor->processor_set;
3209 ast_t preempt;
3210 enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
3211 enum { eNoSignal, eDoSignal, eDoDeferredSignal } do_signal_idle = eNoSignal;
3212
3213 boolean_t do_cause_ast = FALSE;
3214
3215 thread->chosen_processor = processor;
3216
3217 /*
3218 * Dispatch directly onto idle processor.
3219 */
3220 if ( (SCHED(direct_dispatch_to_idle_processors) ||
3221 thread->bound_processor == processor)
3222 && processor->state == PROCESSOR_IDLE) {
3223 remqueue((queue_entry_t)processor);
3224 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3225
3226 processor->next_thread = thread;
3227 processor->current_pri = thread->sched_pri;
3228 processor->current_thmode = thread->sched_mode;
3229 processor->current_sfi_class = thread->sfi_class;
3230 processor->deadline = UINT64_MAX;
3231 processor->state = PROCESSOR_DISPATCHING;
3232
3233 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3234 /* cleared on exit from main processor_idle() loop */
3235 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3236 do_signal_idle = eDoSignal;
3237 }
3238
3239 pset_unlock(pset);
3240
3241 if (do_signal_idle == eDoSignal) {
3242 machine_signal_idle(processor);
3243 }
3244
3245 return;
3246 }
3247
3248 /*
3249 * Set preemption mode.
3250 */
3251#if defined(CONFIG_SCHED_DEFERRED_AST)
3252 /* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
3253#endif
3254 if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri)
3255 preempt = (AST_PREEMPT | AST_URGENT);
3256 else if (processor->active_thread && thread_eager_preemption(processor->active_thread))
3257 preempt = (AST_PREEMPT | AST_URGENT);
3258 else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
3259 if (SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
3260 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
3261 } else {
3262 preempt = AST_NONE;
3263 }
3264 } else
3265 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
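 /*
  * Net effect: urgent preemption when the new thread's priority is in the
  * preempt set and beats the running priority, or when the running thread
  * asked for eager preemption; a timeshare thread currently depressed below
  * its base priority mostly declines to preempt, and everything else
  * preempts only if the caller passed SCHED_PREEMPT.
  */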
3266
3267 SCHED(processor_enqueue)(processor, thread, options);
3268
3269 if (preempt != AST_NONE) {
3270 if (processor->state == PROCESSOR_IDLE) {
3271 remqueue((queue_entry_t)processor);
3272 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3273 processor->next_thread = THREAD_NULL;
3274 processor->current_pri = thread->sched_pri;
3275 processor->current_thmode = thread->sched_mode;
3276 processor->current_sfi_class = thread->sfi_class;
3277 processor->deadline = UINT64_MAX;
3278 processor->state = PROCESSOR_DISPATCHING;
3279
3280 ipi_action = eExitIdle;
3281 } else if ( processor->state == PROCESSOR_DISPATCHING) {
3282 if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) {
3283 processor->current_pri = thread->sched_pri;
3284 processor->current_thmode = thread->sched_mode;
3285 processor->current_sfi_class = thread->sfi_class;
3286 processor->deadline = UINT64_MAX;
3287 }
3288 } else if ( (processor->state == PROCESSOR_RUNNING ||
3289 processor->state == PROCESSOR_SHUTDOWN) &&
3290 (thread->sched_pri >= processor->current_pri)) {
3291 ipi_action = eInterruptRunning;
3292 }
3293 } else {
3294 /*
3295 * New thread is not important enough to preempt what is running, but
3296 * special processor states may need special handling
3297 */
3298 if (processor->state == PROCESSOR_SHUTDOWN &&
3299 thread->sched_pri >= processor->current_pri ) {
3300 ipi_action = eInterruptRunning;
3301 } else if ( processor->state == PROCESSOR_IDLE &&
3302 processor != current_processor() ) {
3303 remqueue((queue_entry_t)processor);
3304 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3305 processor->next_thread = THREAD_NULL;
3306 processor->current_pri = thread->sched_pri;
3307 processor->current_thmode = thread->sched_mode;
3308 processor->current_sfi_class = thread->sfi_class;
3309 processor->deadline = UINT64_MAX;
3310 processor->state = PROCESSOR_DISPATCHING;
3311
3312 ipi_action = eExitIdle;
3313 }
3314 }
3315
3316 switch (ipi_action) {
3317 case eDoNothing:
3318 break;
3319 case eExitIdle:
3320 if (processor == current_processor()) {
3321 if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
3322 ast_on(preempt);
3323 } else {
3324#if defined(CONFIG_SCHED_DEFERRED_AST)
3325 if (!(pset->pending_deferred_AST_cpu_mask & (1ULL << processor->cpu_id)) &&
3326 !(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3327 /* cleared on exit from main processor_idle() loop */
3328 pset->pending_deferred_AST_cpu_mask |= (1ULL << processor->cpu_id);
3329 do_signal_idle = eDoDeferredSignal;
3330 }
3331#else
3332 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3333 /* cleared on exit from main processor_idle() loop */
3334 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3335 do_signal_idle = eDoSignal;
3336 }
3337#endif
3338 }
3339 break;
3340 case eInterruptRunning:
3341 if (processor == current_processor()) {
3342 if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
3343 ast_on(preempt);
3344 } else {
3345 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3346 /* cleared after IPI causes csw_check() to be called */
3347 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3348 do_cause_ast = TRUE;
3349 }
3350 }
3351 break;
3352 }
3353
3354 pset_unlock(pset);
3355
3356 if (do_signal_idle == eDoSignal) {
3357 machine_signal_idle(processor);
3358 }
3359#if defined(CONFIG_SCHED_DEFERRED_AST)
3360 else if (do_signal_idle == eDoDeferredSignal) {
3361 /*
3362 * TODO: The ability to cancel this signal could make
3363 * sending it outside of the pset lock an issue. Do
3364 * we need to address this? Or would the only fallout
3365 * be that the core takes a signal? As long as we do
3366 * not run the risk of having a core marked as signal
3367 * outstanding, with no real signal outstanding, the
3368 * only result should be that we fail to cancel some
3369 * signals.
3370 */
3371 machine_signal_idle_deferred(processor);
3372 }
3373#endif
3374 else if (do_cause_ast) {
3375 cause_ast_check(processor);
3376 }
3377}
3378
3379/*
3380 * choose_next_pset:
3381 *
3382 * Return the next sibling pset containing
3383 * available processors.
3384 *
3385 * Returns the original pset if none other is
3386 * suitable.
3387 */
3388static processor_set_t
3389choose_next_pset(
3390 processor_set_t pset)
3391{
3392 processor_set_t nset = pset;
3393
3394 do {
3395 nset = next_pset(nset);
3396 } while (nset->online_processor_count < 1 && nset != pset);
3397
3398 return (nset);
3399}
3400
3401/*
3402 * choose_processor:
3403 *
3404 * Choose a processor for the thread, beginning at
3405 * the pset. Accepts an optional processor hint in
3406 * the pset.
3407 *
3408 * Returns a processor, possibly from a different pset.
3409 *
3410 * The thread must be locked. The pset must be locked,
3411 * and the resulting pset is locked on return.
3412 */
3413processor_t
3414choose_processor(
3415 processor_set_t pset,
3416 processor_t processor,
3417 thread_t thread)
3418{
3419 processor_set_t nset, cset = pset;
3420
3421 /*
3422 * Prefer the hinted processor, when appropriate.
3423 */
3424
3425 /* Fold last processor hint from secondary processor to its primary */
3426 if (processor != PROCESSOR_NULL) {
3427 processor = processor->processor_primary;
3428 }
3429
3430 /*
3431 * Only consult platform layer if pset is active, which
3432 * it may not be in some cases when a multi-set system
3433 * is going to sleep.
3434 */
3435 if (pset->online_processor_count) {
3436 if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
3437 processor_t mc_processor = machine_choose_processor(pset, processor);
3438 if (mc_processor != PROCESSOR_NULL)
3439 processor = mc_processor->processor_primary;
3440 }
3441 }
3442
3443 /*
3444 * At this point, we may have a processor hint, and we may have
3445 * an initial starting pset. If the hint is not in the pset, or
3446 * if the hint is for a processor in an invalid state, discard
3447 * the hint.
3448 */
3449 if (processor != PROCESSOR_NULL) {
3450 if (processor->processor_set != pset) {
3451 processor = PROCESSOR_NULL;
3452 } else if (!processor->is_recommended) {
3453 processor = PROCESSOR_NULL;
3454 } else {
3455 switch (processor->state) {
3456 case PROCESSOR_START:
3457 case PROCESSOR_SHUTDOWN:
3458 case PROCESSOR_OFF_LINE:
3459 /*
3460 * Hint is for a processor that cannot support running new threads.
3461 */
3462 processor = PROCESSOR_NULL;
3463 break;
3464 case PROCESSOR_IDLE:
3465 /*
3466 * Hint is for an idle processor. Assume it is no worse than any other
3467 * idle processor. The platform layer had an opportunity to provide
3468 * the "least cost idle" processor above.
3469 */
3470 return (processor);
3471 break;
3472 case PROCESSOR_RUNNING:
3473 case PROCESSOR_DISPATCHING:
3474 /*
3475 * Hint is for an active CPU. This fast-path allows
3476 * realtime threads to preempt non-realtime threads
3477 * to regain their previous executing processor.
3478 */
3479 if ((thread->sched_pri >= BASEPRI_RTQUEUES) &&
3480 (processor->current_pri < BASEPRI_RTQUEUES))
3481 return (processor);
3482
3483 /* Otherwise, use hint as part of search below */
3484 break;
3485 default:
3486 processor = PROCESSOR_NULL;
3487 break;
3488 }
3489 }
3490 }
3491
3492 /*
3493 * Iterate through the processor sets to locate
3494 * an appropriate processor. Seed results with
3495 * a last-processor hint, if available, so that
3496 * a search must find something strictly better
3497 * to replace it.
3498 *
3499 * A primary/secondary pair of SMT processors are
3500 * "unpaired" if the primary is busy but its
3501 * corresponding secondary is idle (so the physical
3502 * core has full use of its resources).
3503 */
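 /*
  * Candidates tracked below: lp_processor runs the lowest-priority thread,
  * lc_processor has the shortest runqueue, fd_processor runs the realtime
  * thread with the furthest deadline, and the "unpaired" entries record an
  * idle SMT secondary whose primary is busy.
  */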
3504
3505 integer_t lowest_priority = MAXPRI + 1;
3506 integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
3507 integer_t lowest_count = INT_MAX;
3508 uint64_t furthest_deadline = 1;
3509 processor_t lp_processor = PROCESSOR_NULL;
3510 processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
3511 processor_t lp_unpaired_secondary_processor = PROCESSOR_NULL;
3512 processor_t lc_processor = PROCESSOR_NULL;
3513 processor_t fd_processor = PROCESSOR_NULL;
3514
3515 if (processor != PROCESSOR_NULL) {
3516 /* All other states should be enumerated above. */
3517 assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
3518
3519 lowest_priority = processor->current_pri;
3520 lp_processor = processor;
3521
3522 if (processor->current_pri >= BASEPRI_RTQUEUES) {
3523 furthest_deadline = processor->deadline;
3524 fd_processor = processor;
3525 }
3526
3527 lowest_count = SCHED(processor_runq_count)(processor);
3528 lc_processor = processor;
3529 }
3530
3531 do {
3532
3533 /*
3534 * Choose an idle processor, in pset traversal order
3535 */
3536 qe_foreach_element(processor, &cset->idle_queue, processor_queue) {
3537 if (processor->is_recommended)
3538 return processor;
3539 }
3540
3541 /*
3542 * Otherwise, enumerate active and idle processors to find candidates
3543 * with lower priority/etc.
3544 */
3545
3546 qe_foreach_element(processor, &cset->active_queue, processor_queue) {
3547
3548 if (!processor->is_recommended) {
3549 continue;
3550 }
3551
3552 integer_t cpri = processor->current_pri;
3553 if (cpri < lowest_priority) {
3554 lowest_priority = cpri;
3555 lp_processor = processor;
3556 }
3557
3558 if ((cpri >= BASEPRI_RTQUEUES) && (processor->deadline > furthest_deadline)) {
3559 furthest_deadline = processor->deadline;
3560 fd_processor = processor;
3561 }
3562
3563 integer_t ccount = SCHED(processor_runq_count)(processor);
3564 if (ccount < lowest_count) {
3565 lowest_count = ccount;
3566 lc_processor = processor;
3567 }
3568 }
3569
3570 /*
3571 * For SMT configs, these idle secondary processors must have an active primary;
3572 * otherwise the idle primary would have short-circuited the loop above.
3573 */
3574 qe_foreach_element(processor, &cset->idle_secondary_queue, processor_queue) {
3575
3576 if (!processor->is_recommended) {
3577 continue;
3578 }
3579
3580 processor_t cprimary = processor->processor_primary;
3581
3582 /* If the primary processor is offline or starting up, it's not a candidate for this path */
3583 if (cprimary->state == PROCESSOR_RUNNING || cprimary->state == PROCESSOR_DISPATCHING) {
3584 integer_t primary_pri = cprimary->current_pri;
3585
3586 if (primary_pri < lowest_unpaired_primary_priority) {
3587 lowest_unpaired_primary_priority = primary_pri;
3588 lp_unpaired_primary_processor = cprimary;
3589 lp_unpaired_secondary_processor = processor;
3590 }
3591 }
3592 }
3593
3594
3595 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
3596
3597 /*
3598 * For realtime threads, the most important aspect is
3599 * scheduling latency, so we attempt to assign threads
3600 * to good preemption candidates (assuming an idle primary
3601 * processor was not available above).
3602 */
3603
3604 if (thread->sched_pri > lowest_unpaired_primary_priority) {
3605 /* Move to end of active queue so that the next thread doesn't also pick it */
3606 re_queue_tail(&cset->active_queue, (queue_entry_t)lp_unpaired_primary_processor);
3607 return lp_unpaired_primary_processor;
3608 }
3609 if (thread->sched_pri > lowest_priority) {
3610 /* Move to end of active queue so that the next thread doesn't also pick it */
3611 re_queue_tail(&cset->active_queue, (queue_entry_t)lp_processor);
3612 return lp_processor;
3613 }
3614 if (thread->realtime.deadline < furthest_deadline)
3615 return fd_processor;
3616
3617 /*
3618 * If all primary and secondary CPUs are busy with realtime
3619 * threads whose deadlines are earlier than ours, move on to
3620 * the next pset.
3621 */
3622 }
3623 else {
3624
3625 if (thread->sched_pri > lowest_unpaired_primary_priority) {
3626 /* Move to end of active queue so that the next thread doesn't also pick it */
3627 re_queue_tail(&cset->active_queue, (queue_entry_t)lp_unpaired_primary_processor);
3628 return lp_unpaired_primary_processor;
3629 }
3630 if (thread->sched_pri > lowest_priority) {
3631 /* Move to end of active queue so that the next thread doesn't also pick it */
3632 re_queue_tail(&cset->active_queue, (queue_entry_t)lp_processor);
3633 return lp_processor;
3634 }
3635
3636 /*
3637 * If all primary processors in this pset are running a higher
3638 * priority thread, move on to the next pset. Only when we have
3639 * exhausted this search do we fall back to other heuristics.
3640 */
3641 }
3642
3643 /*
3644 * Move onto the next processor set.
3645 */
3646 nset = next_pset(cset);
3647
3648 if (nset != pset) {
3649 pset_unlock(cset);
3650
3651 cset = nset;
3652 pset_lock(cset);
3653 }
3654 } while (nset != pset);
3655
3656 /*
3657 * Make sure that we pick a running processor,
3658 * and that the correct processor set is locked.
3659 * Since we may have unlocked the candidate processor's
3660 * pset, it may have changed state.
3661 *
3662 * All primary processors are running a higher priority
3663 * thread, so the only options left are enqueuing on the
3664 * secondary processor whose primary is running the lowest
3665 * priority thread, or on the least busy primary.
3666 */
3667 do {
3668
3669 /* lowest_priority is evaluated in the main loops above */
3670 if (lp_unpaired_secondary_processor != PROCESSOR_NULL) {
3671 processor = lp_unpaired_secondary_processor;
3672 lp_unpaired_secondary_processor = PROCESSOR_NULL;
3673 } else if (lc_processor != PROCESSOR_NULL) {
3674 processor = lc_processor;
3675 lc_processor = PROCESSOR_NULL;
3676 } else {
3677 /*
3678 * All processors are executing higher
3679 * priority threads, and the lowest_count
3680 * candidate was not usable
3681 */
3682 processor = master_processor;
3683 }
3684
3685 /*
3686 * Check that the correct processor set is
3687 * returned locked.
3688 */
3689 if (cset != processor->processor_set) {
3690 pset_unlock(cset);
3691 cset = processor->processor_set;
3692 pset_lock(cset);
3693 }
3694
3695 /*
3696 * We must verify that the chosen processor is still available.
3697 * master_processor is an exception, since we may need to preempt
3698 * a running thread on it during processor shutdown (for sleep),
3699 * and that thread needs to be enqueued on its runqueue to run
3700 * when the processor is restarted.
3701 */
3702 if (processor != master_processor && (processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE))
3703 processor = PROCESSOR_NULL;
3704
3705 } while (processor == PROCESSOR_NULL);
3706
3707 return (processor);
3708}
3709
3710/*
3711 * thread_setrun:
3712 *
3713 * Dispatch thread for execution, onto an idle
3714 * processor or run queue, and signal a preemption
3715 * as appropriate.
3716 *
3717 * Thread must be locked.
3718 */
3719void
3720thread_setrun(
3721 thread_t thread,
3722 integer_t options)
3723{
3724 processor_t processor;
3725 processor_set_t pset;
3726
3727 assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN);
3728 assert(thread->runq == PROCESSOR_NULL);
3729
3730 /*
3731 * Update priority if needed.
3732 */
3733 if (SCHED(can_update_priority)(thread))
3734 SCHED(update_priority)(thread);
3735
3736 thread->sfi_class = sfi_thread_classify(thread);
3737
3738 assert(thread->runq == PROCESSOR_NULL);
3739
3740#if __SMP__
3741 if (thread->bound_processor == PROCESSOR_NULL) {
3742 /*
3743 * Unbound case.
3744 */
3745 if (thread->affinity_set != AFFINITY_SET_NULL) {
3746 /*
3747 * Use affinity set policy hint.
3748 */
3749 pset = thread->affinity_set->aset_pset;
3750 pset_lock(pset);
3751
3752 processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
3753
3754 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3755 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
3756 } else if (thread->last_processor != PROCESSOR_NULL) {
3757 /*
3758 * Simple (last processor) affinity case.
3759 */
3760 processor = thread->last_processor;
3761 pset = processor->processor_set;
3762 pset_lock(pset);
3763 processor = SCHED(choose_processor)(pset, processor, thread);
3764
3765 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3766 (uintptr_t)thread_tid(thread), thread->last_processor->cpu_id, processor->cpu_id, processor->state, 0);
3767 } else {
3768 /*
3769 * No Affinity case:
3770 *
3771			 * Utilize a per-task hint to spread threads
3772 * among the available processor sets.
3773 */
3774 task_t task = thread->task;
3775
3776 pset = task->pset_hint;
3777 if (pset == PROCESSOR_SET_NULL)
3778 pset = current_processor()->processor_set;
3779
3780 pset = choose_next_pset(pset);
3781 pset_lock(pset);
3782
3783 processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
3784 task->pset_hint = processor->processor_set;
3785
3786 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3787 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
3788 }
3789 } else {
3790 /*
3791 * Bound case:
3792 *
3793 * Unconditionally dispatch on the processor.
3794 */
3795 processor = thread->bound_processor;
3796 pset = processor->processor_set;
3797 pset_lock(pset);
3798
3799 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3800 (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
3801 }
3802#else /* !__SMP__ */
3803 /* Only one processor to choose */
3804 assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == master_processor);
3805 processor = master_processor;
3806 pset = processor->processor_set;
3807 pset_lock(pset);
3808#endif /* !__SMP__ */
3809
3810 /*
3811 * Dispatch the thread on the chosen processor.
3812 * TODO: This should be based on sched_mode, not sched_pri
3813 */
3814 if (thread->sched_pri >= BASEPRI_RTQUEUES)
3815 realtime_setrun(processor, thread);
3816 else
3817 processor_setrun(processor, thread, options);
3818}
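
/*
 * Illustrative sketch only (not a definitive call site): the locking
 * discipline a caller typically follows to make a thread runnable via
 * thread_setrun(), mirroring the redispatch pattern in processor_idle()
 * below. It assumes the thread is already runnable (TH_RUN) and not on a
 * run queue; the helper name example_make_runnable is hypothetical.
 */
#if 0
static void
example_make_runnable(thread_t thread)
{
	spl_t s = splsched();			/* run at splsched */

	thread_lock(thread);			/* thread_setrun requires the thread locked */
	thread_setrun(thread, SCHED_HEADQ);
	thread_unlock(thread);

	splx(s);
}
#endif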
3819
3820processor_set_t
3821task_choose_pset(
3822 task_t task)
3823{
3824 processor_set_t pset = task->pset_hint;
3825
3826 if (pset != PROCESSOR_SET_NULL)
3827 pset = choose_next_pset(pset);
3828
3829 return (pset);
3830}
3831
3832/*
3833 * Check for a preemption point in
3834 * the current context.
3835 *
3836 * Called at splsched with thread locked.
3837 */
3838ast_t
3839csw_check(
3840 processor_t processor,
3841 ast_t check_reason)
3842{
3843 processor_set_t pset = processor->processor_set;
3844 ast_t result;
3845
3846 pset_lock(pset);
3847
3848 /* If we were sent a remote AST and interrupted a running processor, acknowledge it here with pset lock held */
3849 pset->pending_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
3850
3851 result = csw_check_locked(processor, pset, check_reason);
3852
3853 pset_unlock(pset);
3854
3855 return result;
3856}
3857
3858/*
3859 * Check for preemption at splsched with
3860 * pset and thread locked
3861 */
3862ast_t
3863csw_check_locked(
3864 processor_t processor,
3865 processor_set_t pset __unused,
3866 ast_t check_reason)
3867{
3868 ast_t result;
3869 thread_t thread = processor->active_thread;
3870
3871 if (processor->first_timeslice) {
3872 if (rt_runq.count > 0)
3873 return (check_reason | AST_PREEMPT | AST_URGENT);
3874 }
3875 else {
3876 if (rt_runq.count > 0) {
3877 if (BASEPRI_RTQUEUES > processor->current_pri)
3878 return (check_reason | AST_PREEMPT | AST_URGENT);
3879 else
3880 return (check_reason | AST_PREEMPT);
3881 }
3882 }
3883
3884 result = SCHED(processor_csw_check)(processor);
3885 if (result != AST_NONE)
3886 return (check_reason | result | (thread_eager_preemption(thread) ? AST_URGENT : AST_NONE));
3887
3888#if __SMP__
3889
3890 /*
3891 * If the current thread is running on a processor that is no longer recommended, gently
3892	 * (non-urgently) get to a point where it can block, at which point thread_select() should
3893 * try to idle the processor and re-dispatch the thread to a recommended processor.
3894 */
3895 if (!processor->is_recommended)
3896 return (check_reason | AST_PREEMPT);
3897
3898 /*
3899 * Even though we could continue executing on this processor, a
3900 * secondary SMT core should try to shed load to another primary core.
3901 *
3902 * TODO: Should this do the same check that thread_select does? i.e.
3903 * if no bound threads target this processor, and idle primaries exist, preempt
3904 * The case of RT threads existing is already taken care of above
3905 * Consider Capri in this scenario.
3906 *
3907 * if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue))
3908 *
3909 * TODO: Alternatively - check if only primary is idle, or check if primary's pri is lower than mine.
3910 */
3911
3912 if (processor->current_pri < BASEPRI_RTQUEUES &&
3913 processor->processor_primary != processor)
3914 return (check_reason | AST_PREEMPT);
3915#endif
3916
3917 if (thread->state & TH_SUSP)
3918 return (check_reason | AST_PREEMPT);
3919
3920#if CONFIG_SCHED_SFI
3921 /*
3922 * Current thread may not need to be preempted, but maybe needs
3923 * an SFI wait?
3924 */
3925 result = sfi_thread_needs_ast(thread, NULL);
3926 if (result != AST_NONE)
3927 return (check_reason | result);
3928#endif
3929
3930 return (AST_NONE);
3931}
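
/*
 * Illustrative sketch only: how a csw_check() result is typically consumed
 * on the current processor, as in set_sched_pri() and
 * thread_set_eager_preempt() below. Assumes the caller is at splsched with
 * the current thread locked; example_check_preemption is a hypothetical
 * helper.
 */
#if 0
static void
example_check_preemption(void)
{
	ast_t preempt;

	preempt = csw_check(current_processor(), AST_NONE);
	if (preempt != AST_NONE)
		ast_on(preempt);	/* arrange a context switch at the next AST point */
}
#endif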
3932
3933/*
3934 * set_sched_pri:
3935 *
3936 * Set the scheduled priority of the specified thread.
3937 *
3938 * This may cause the thread to change queues.
3939 *
3940 * Thread must be locked.
3941 */
3942void
3943set_sched_pri(
3944 thread_t thread,
3945 int priority)
3946{
3947 thread_t cthread = current_thread();
3948 boolean_t is_current_thread = (thread == cthread) ? TRUE : FALSE;
3949 int curgency, nurgency;
3950 uint64_t urgency_param1, urgency_param2;
3951 boolean_t removed_from_runq = FALSE;
3952
3953 /* If we're already at this priority, no need to mess with the runqueue */
3954 if (priority == thread->sched_pri)
3955 return;
3956
3957 if (is_current_thread) {
3958 assert(thread->runq == PROCESSOR_NULL);
3959 curgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
3960 } else {
3961 removed_from_runq = thread_run_queue_remove(thread);
3962 }
3963
3964 thread->sched_pri = priority;
3965
3966 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
3967 (uintptr_t)thread_tid(thread),
3968 thread->base_pri,
3969 thread->sched_pri,
3970 0, /* eventually, 'reason' */
3971 0);
3972
3973 if (is_current_thread) {
3974 nurgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
3975 /*
3976 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
3977 * class alterations from user space to occur relatively infrequently, hence
3978 * those are lazily handled. QoS classes have distinct priority bands, and QoS
3979 * inheritance is expected to involve priority changes.
3980 */
3981 if (nurgency != curgency) {
3982 thread_tell_urgency(nurgency, urgency_param1, urgency_param2, 0, thread);
3983 machine_thread_going_on_core(thread, nurgency, 0);
3984 }
3985 }
3986
3987 /* TODO: Should this be TAILQ if it went down, HEADQ if it went up? */
3988 if (removed_from_runq)
3989 thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
3990 else if (thread->state & TH_RUN) {
3991 processor_t processor = thread->last_processor;
3992
3993 if (is_current_thread) {
3994 ast_t preempt;
3995
3996 processor->current_pri = priority;
3997 processor->current_thmode = thread->sched_mode;
3998 processor->current_sfi_class = thread->sfi_class = sfi_thread_classify(thread);
3999 if ((preempt = csw_check(processor, AST_NONE)) != AST_NONE)
4000 ast_on(preempt);
4001 } else if (processor != PROCESSOR_NULL && processor->active_thread == thread)
4002 cause_ast_check(processor);
4003 }
4004}
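
/*
 * Illustrative sketch only: the locking a caller needs before invoking
 * set_sched_pri(). The helper name example_set_priority is hypothetical.
 */
#if 0
static void
example_set_priority(thread_t thread, int priority)
{
	spl_t s = splsched();

	thread_lock(thread);		/* set_sched_pri requires the thread locked */
	set_sched_pri(thread, priority);
	thread_unlock(thread);

	splx(s);
}
#endif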
4005
4006/*
4007 * thread_run_queue_remove_for_handoff
4008 *
4009 * Pull a thread or its (recursive) push target out of the runqueue
4010 * so that it is ready for thread_run()
4011 *
4012 * Called at splsched
4013 *
4014 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
4015 * This may be different than the thread that was passed in.
4016 */
4017thread_t
4018thread_run_queue_remove_for_handoff(thread_t thread) {
4019
4020 thread_t pulled_thread = THREAD_NULL;
4021
4022 thread_lock(thread);
4023
4024 /*
4025 * Check that the thread is not bound
4026 * to a different processor, and that realtime
4027 * is not involved.
4028 *
4029 * Next, pull it off its run queue. If it
4030	 * cannot be removed, it is not eligible.
4031 */
4032
4033 processor_t processor = current_processor();
4034 if (processor->current_pri < BASEPRI_RTQUEUES && thread->sched_pri < BASEPRI_RTQUEUES &&
4035 (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)) {
4036
4037 if (thread_run_queue_remove(thread))
4038 pulled_thread = thread;
4039 }
4040
4041 thread_unlock(thread);
4042
4043 return pulled_thread;
4044}
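
/*
 * Illustrative sketch only: a direct handoff built on
 * thread_run_queue_remove_for_handoff(). If the target cannot be pulled,
 * the caller simply does not hand off. example_handoff and
 * example_continuation are hypothetical names.
 */
#if 0
static void
example_handoff(thread_t target, thread_continue_t example_continuation)
{
	spl_t		s = splsched();
	thread_t	pulled = thread_run_queue_remove_for_handoff(target);

	if (pulled != THREAD_NULL) {
		/* run the pulled thread directly, resuming via the continuation */
		thread_run(current_thread(), example_continuation, NULL, pulled);
		/*NOTREACHED*/
	}

	splx(s);
}
#endif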
4045
4046/*
4047 * thread_run_queue_remove:
4048 *
4049 * Remove a thread from its current run queue and
4050 * return TRUE if successful.
4051 *
4052 * Thread must be locked.
4053 *
4054 * If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
4055 * run queues because the caller locked the thread. Otherwise
4056 * the thread is on a run queue, but could be chosen for dispatch
4057 * and removed by another processor under a different lock, which
4058 * will set thread->runq to PROCESSOR_NULL.
4059 *
4060 * Hence the thread select path must not rely on anything that could
4061 * be changed under the thread lock after calling this function,
4062 * most importantly thread->sched_pri.
4063 */
4064boolean_t
4065thread_run_queue_remove(
4066 thread_t thread)
4067{
4068 boolean_t removed = FALSE;
4069 processor_t processor = thread->runq;
4070
4071 if ((thread->state & (TH_RUN|TH_WAIT)) == TH_WAIT) {
4072 /* Thread isn't runnable */
4073 assert(thread->runq == PROCESSOR_NULL);
4074 return FALSE;
4075 }
4076
4077 if (processor == PROCESSOR_NULL) {
4078 /*
4079 * The thread is either not on the runq,
4080 * or is in the midst of being removed from the runq.
4081 *
4082 * runq is set to NULL under the pset lock, not the thread
4083 * lock, so the thread may still be in the process of being dequeued
4084 * from the runq. It will wait in invoke for the thread lock to be
4085 * dropped.
4086 */
4087
4088 return FALSE;
4089 }
4090
4091 if (thread->sched_pri < BASEPRI_RTQUEUES) {
4092 return SCHED(processor_queue_remove)(processor, thread);
4093 }
4094
4095 rt_lock_lock();
4096
4097 if (thread->runq != PROCESSOR_NULL) {
4098 /*
4099 * Thread is on the RT run queue and we have a lock on
4100 * that run queue.
4101 */
4102
4103 assert(thread->runq == THREAD_ON_RT_RUNQ);
4104
4105 remqueue((queue_entry_t)thread);
4106 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
4107 rt_runq.count--;
4108
4109 thread->runq = PROCESSOR_NULL;
4110
4111 removed = TRUE;
4112 }
4113
4114 rt_lock_unlock();
4115
4116 return (removed);
4117}
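
/*
 * Illustrative sketch only: the remove/modify/reinsert pairing used by
 * set_sched_pri() above. Both steps must occur under the same thread lock
 * hold, at splsched. example_requeue is a hypothetical helper.
 */
#if 0
static void
example_requeue(thread_t thread)
{
	boolean_t removed = thread_run_queue_remove(thread);

	/* ... adjust the thread's scheduling state here ... */

	if (removed)
		thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
}
#endif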
4118
4119/*
4120 * Put the thread back where it goes after a thread_run_queue_remove
4121 *
4122 * Thread must have been removed while holding the same thread lock
4123 *
4124 * thread locked, at splsched
4125 */
4126void
4127thread_run_queue_reinsert(thread_t thread, integer_t options)
4128{
4129 assert(thread->runq == PROCESSOR_NULL);
4130
4131 assert(thread->state & (TH_RUN));
4132 thread_setrun(thread, options);
4133
4134}
4135
4136void
4137sys_override_cpu_throttle(int flag)
4138{
4139 if (flag == CPU_THROTTLE_ENABLE)
4140 cpu_throttle_enabled = 1;
4141 if (flag == CPU_THROTTLE_DISABLE)
4142 cpu_throttle_enabled = 0;
4143}
4144
4145int
4146thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
4147{
4148 if (thread == NULL || (thread->state & TH_IDLE)) {
4149 *arg1 = 0;
4150 *arg2 = 0;
4151
4152 return (THREAD_URGENCY_NONE);
4153 } else if (thread->sched_mode == TH_MODE_REALTIME) {
4154 *arg1 = thread->realtime.period;
4155 *arg2 = thread->realtime.deadline;
4156
4157 return (THREAD_URGENCY_REAL_TIME);
4158 } else if (cpu_throttle_enabled &&
4159 ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
4160 /*
4161 * Background urgency applied when thread priority is MAXPRI_THROTTLE or lower and thread is not promoted
4162 * TODO: Use TH_SFLAG_THROTTLED instead?
4163 */
4164 *arg1 = thread->sched_pri;
4165 *arg2 = thread->base_pri;
4166
4167 return (THREAD_URGENCY_BACKGROUND);
4168 } else {
4169 /* For otherwise unclassified threads, report throughput QoS
4170 * parameters
4171 */
4172 *arg1 = thread->effective_policy.t_through_qos;
4173 *arg2 = thread->task->effective_policy.t_through_qos;
4174
4175 return (THREAD_URGENCY_NORMAL);
4176 }
4177}
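
/*
 * Illustrative sketch only: querying a thread's urgency, as done by
 * set_sched_pri() and sched_work_interval_notify(). Assumes the caller is
 * at splsched with the thread locked; example_is_realtime is a
 * hypothetical helper.
 */
#if 0
static boolean_t
example_is_realtime(thread_t thread)
{
	uint64_t	urgency_param1, urgency_param2;
	int		urgency;

	urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);

	/* for realtime threads the parameters are the period and deadline */
	return (urgency == THREAD_URGENCY_REAL_TIME);
}
#endif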
4178
4179
4180/*
4181 * This is the processor idle loop, which just looks for other threads
4182 * to execute. Processor idle threads invoke this without supplying a
4183 * current thread, in order to idle without an asserted wait state.
4184 *
4185 * Returns the next thread to execute if dispatched directly.
4186 */
4187
4188#if 0
4189#define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
4190#else
4191#define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
4192#endif
4193
4194thread_t
4195processor_idle(
4196 thread_t thread,
4197 processor_t processor)
4198{
4199 processor_set_t pset = processor->processor_set;
4200 thread_t new_thread;
4201 int state;
4202 (void)splsched();
4203
4204 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4205 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_START,
4206 (uintptr_t)thread_tid(thread), 0, 0, 0, 0);
4207
4208 SCHED_STATS_CPU_IDLE_START(processor);
4209
4210 timer_switch(&PROCESSOR_DATA(processor, system_state),
4211 mach_absolute_time(), &PROCESSOR_DATA(processor, idle_state));
4212 PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state);
4213
4214 while (1) {
4215 if (processor->state != PROCESSOR_IDLE) /* unsafe, but worst case we loop around once */
4216 break;
4217 if (pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))
4218 break;
4219 if (processor->is_recommended) {
4220 if (rt_runq.count)
4221 break;
4222 } else {
4223 if (SCHED(processor_bound_count)(processor))
4224 break;
4225 }
4226
4227#if CONFIG_SCHED_IDLE_IN_PLACE
4228 if (thread != THREAD_NULL) {
4229			/* Did the idle-in-place thread wake up? */
4230 if ((thread->state & (TH_WAIT|TH_SUSP)) != TH_WAIT || thread->wake_active)
4231 break;
4232 }
4233#endif
4234
4235 IDLE_KERNEL_DEBUG_CONSTANT(
4236 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -1, 0);
4237
4238 machine_track_platform_idle(TRUE);
4239
4240 machine_idle();
4241
4242 machine_track_platform_idle(FALSE);
4243
4244 (void)splsched();
4245
4246 IDLE_KERNEL_DEBUG_CONSTANT(
4247 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -2, 0);
4248
4249 if (!SCHED(processor_queue_empty)(processor)) {
4250 /* Secondary SMT processors respond to directed wakeups
4251 * exclusively. Some platforms induce 'spurious' SMT wakeups.
4252 */
4253 if (processor->processor_primary == processor)
4254 break;
4255 }
4256 }
4257
4258 timer_switch(&PROCESSOR_DATA(processor, idle_state),
4259 mach_absolute_time(), &PROCESSOR_DATA(processor, system_state));
4260 PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state);
4261
4262 pset_lock(pset);
4263
4264 /* If we were sent a remote AST and came out of idle, acknowledge it here with pset lock held */
4265 pset->pending_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
4266#if defined(CONFIG_SCHED_DEFERRED_AST)
4267 pset->pending_deferred_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
4268#endif
4269
4270 state = processor->state;
4271 if (state == PROCESSOR_DISPATCHING) {
4272 /*
4273		 * Common case -- cpu dispatched.
4274 */
4275 new_thread = processor->next_thread;
4276 processor->next_thread = THREAD_NULL;
4277 processor->state = PROCESSOR_RUNNING;
4278
4279 if ((new_thread != THREAD_NULL) && (SCHED(processor_queue_has_priority)(processor, new_thread->sched_pri, FALSE) ||
4280 (rt_runq.count > 0)) ) {
4281 /* Something higher priority has popped up on the runqueue - redispatch this thread elsewhere */
4282 processor->current_pri = IDLEPRI;
4283 processor->current_thmode = TH_MODE_FIXED;
4284 processor->current_sfi_class = SFI_CLASS_KERNEL;
4285 processor->deadline = UINT64_MAX;
4286
4287 pset_unlock(pset);
4288
4289 thread_lock(new_thread);
4290 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REDISPATCH), (uintptr_t)thread_tid(new_thread), new_thread->sched_pri, rt_runq.count, 0, 0);
4291 thread_setrun(new_thread, SCHED_HEADQ);
4292 thread_unlock(new_thread);
4293
4294 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4295 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4296 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4297
4298 return (THREAD_NULL);
4299 }
4300
4301 pset_unlock(pset);
4302
4303 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4304 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4305 (uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0);
4306
4307 return (new_thread);
4308 }
4309 else
4310 if (state == PROCESSOR_IDLE) {
4311 remqueue((queue_entry_t)processor);
4312
4313 processor->state = PROCESSOR_RUNNING;
4314 processor->current_pri = IDLEPRI;
4315 processor->current_thmode = TH_MODE_FIXED;
4316 processor->current_sfi_class = SFI_CLASS_KERNEL;
4317 processor->deadline = UINT64_MAX;
4318 enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
4319 }
4320 else
4321 if (state == PROCESSOR_SHUTDOWN) {
4322 /*
4323 * Going off-line. Force a
4324 * reschedule.
4325 */
4326 if ((new_thread = processor->next_thread) != THREAD_NULL) {
4327 processor->next_thread = THREAD_NULL;
4328 processor->current_pri = IDLEPRI;
4329 processor->current_thmode = TH_MODE_FIXED;
4330 processor->current_sfi_class = SFI_CLASS_KERNEL;
4331 processor->deadline = UINT64_MAX;
4332
4333 pset_unlock(pset);
4334
4335 thread_lock(new_thread);
4336 thread_setrun(new_thread, SCHED_HEADQ);
4337 thread_unlock(new_thread);
4338
4339 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4340 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4341 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4342
4343 return (THREAD_NULL);
4344 }
4345 }
4346
4347 pset_unlock(pset);
4348
4349 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4350 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4351 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4352
4353 return (THREAD_NULL);
4354}
4355
4356/*
4357 * Each processor has a dedicated thread which
4358 * executes the idle loop when there is no suitable
4359 * previous context.
4360 */
4361void
4362idle_thread(void)
4363{
4364 processor_t processor = current_processor();
4365 thread_t new_thread;
4366
4367 new_thread = processor_idle(THREAD_NULL, processor);
4368 if (new_thread != THREAD_NULL) {
4369 thread_run(processor->idle_thread, (thread_continue_t)idle_thread, NULL, new_thread);
4370 /*NOTREACHED*/
4371 }
4372
4373 thread_block((thread_continue_t)idle_thread);
4374 /*NOTREACHED*/
4375}
4376
4377kern_return_t
4378idle_thread_create(
4379 processor_t processor)
4380{
4381 kern_return_t result;
4382 thread_t thread;
4383 spl_t s;
4384
4385 result = kernel_thread_create((thread_continue_t)idle_thread, NULL, MAXPRI_KERNEL, &thread);
4386 if (result != KERN_SUCCESS)
4387 return (result);
4388
4389 s = splsched();
4390 thread_lock(thread);
4391 thread->bound_processor = processor;
4392 processor->idle_thread = thread;
4393 thread->sched_pri = thread->base_pri = IDLEPRI;
4394 thread->state = (TH_RUN | TH_IDLE);
4395 thread->options |= TH_OPT_IDLE_THREAD;
4396 thread_unlock(thread);
4397 splx(s);
4398
4399 thread_deallocate(thread);
4400
4401 return (KERN_SUCCESS);
4402}
4403
4404/*
4405 * sched_startup:
4406 *
4407 * Kicks off scheduler services.
4408 *
4409 * Called at splsched.
4410 */
4411void
4412sched_startup(void)
4413{
4414 kern_return_t result;
4415 thread_t thread;
4416
4417 simple_lock_init(&sched_vm_group_list_lock, 0);
4418
4419
4420 result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
4421 (void *)SCHED(maintenance_continuation), MAXPRI_KERNEL, &thread);
4422 if (result != KERN_SUCCESS)
4423 panic("sched_startup");
4424
4425 thread_deallocate(thread);
4426
4427 /*
4428 * Yield to the sched_init_thread once, to
4429 * initialize our own thread after being switched
4430 * back to.
4431 *
4432 * The current thread is the only other thread
4433 * active at this point.
4434 */
4435 thread_block(THREAD_CONTINUE_NULL);
4436}
4437
4438#if defined(CONFIG_SCHED_TIMESHARE_CORE)
4439
4440static volatile uint64_t sched_maintenance_deadline;
4441#if defined(CONFIG_TELEMETRY)
4442static volatile uint64_t sched_telemetry_deadline = 0;
4443#endif
4444static uint64_t sched_tick_last_abstime;
4445static uint64_t sched_tick_delta;
4446uint64_t sched_tick_max_delta;
4447/*
4448 * sched_init_thread:
4449 *
4450 * Perform periodic bookkeeping functions about ten
4451 * times per second.
4452 */
4453void
4454sched_timeshare_maintenance_continue(void)
4455{
4456 uint64_t sched_tick_ctime, late_time;
4457
4458 struct sched_update_scan_context scan_context = {
4459 .earliest_bg_make_runnable_time = UINT64_MAX,
4460 .earliest_normal_make_runnable_time = UINT64_MAX,
4461 .earliest_rt_make_runnable_time = UINT64_MAX
4462 };
4463
4464 sched_tick_ctime = mach_absolute_time();
4465
4466 if (__improbable(sched_tick_last_abstime == 0)) {
4467 sched_tick_last_abstime = sched_tick_ctime;
4468 late_time = 0;
4469 sched_tick_delta = 1;
4470 } else {
4471 late_time = sched_tick_ctime - sched_tick_last_abstime;
4472 sched_tick_delta = late_time / sched_tick_interval;
4473		/* Ensure a delta of at least 1, since the interval could be slightly
4474 * smaller than the sched_tick_interval due to dispatch
4475 * latencies.
4476 */
4477 sched_tick_delta = MAX(sched_tick_delta, 1);
4478
4479		/* In the event that interrupt latencies or platform
4480 * idle events that advanced the timebase resulted
4481 * in periods where no threads were dispatched,
4482 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
4483 * iterations.
4484 */
4485 sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);
4486
4487 sched_tick_last_abstime = sched_tick_ctime;
4488 sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
4489 }
4490
4491 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)|DBG_FUNC_START,
4492 sched_tick_delta,
4493 late_time,
4494 0,
4495 0,
4496 0);
4497
4498	/* Add a number of pseudo-ticks corresponding to the elapsed interval.
4499	 * This could be greater than 1 if there were substantial intervals
4500	 * during which all processors were idle, which is rare in practice.
4501 */
4502
4503 sched_tick += sched_tick_delta;
4504
4505 /*
4506 * Compute various averages.
4507 */
4508 compute_averages(sched_tick_delta);
4509
4510 /*
4511 * Scan the run queues for threads which
4512 * may need to be updated.
4513 */
4514 SCHED(thread_update_scan)(&scan_context);
4515
4516 rt_runq_scan(&scan_context);
4517
4518 uint64_t ctime = mach_absolute_time();
4519
4520 machine_max_runnable_latency(ctime > scan_context.earliest_bg_make_runnable_time ? ctime - scan_context.earliest_bg_make_runnable_time : 0,
4521 ctime > scan_context.earliest_normal_make_runnable_time ? ctime - scan_context.earliest_normal_make_runnable_time : 0,
4522 ctime > scan_context.earliest_rt_make_runnable_time ? ctime - scan_context.earliest_rt_make_runnable_time : 0);
4523
4524 /*
4525 * Check to see if the special sched VM group needs attention.
4526 */
4527 sched_vm_group_maintenance();
4528
4529
4530 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)|DBG_FUNC_END,
4531 sched_pri_shift,
4532 sched_background_pri_shift,
4533 0,
4534 0,
4535 0);
4536
4537 assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
4538 thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
4539 /*NOTREACHED*/
4540}
4541
4542static uint64_t sched_maintenance_wakeups;
4543
4544/*
4545 * Determine if the set of routines formerly driven by a maintenance timer
4546 * must be invoked, based on a deadline comparison. Signals the scheduler
4547 * maintenance thread on deadline expiration. Must be invoked at an interval
4548 * lower than the "sched_tick_interval", currently accomplished by
4549 * invocation via the quantum expiration timer and at context switch time.
4550 * Performance matters: this routine reuses a timestamp approximating the
4551 * current absolute time received from the caller, and should perform
4552 * no more than a comparison against the deadline in the common case.
4553 */
4554void
4555sched_timeshare_consider_maintenance(uint64_t ctime) {
4556 uint64_t ndeadline, deadline = sched_maintenance_deadline;
4557
4558 if (__improbable(ctime >= deadline)) {
4559 if (__improbable(current_thread() == sched_maintenance_thread))
4560 return;
4561 OSMemoryBarrier();
4562
4563 ndeadline = ctime + sched_tick_interval;
4564
4565 if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) {
4566 thread_wakeup((event_t)sched_timeshare_maintenance_continue);
4567 sched_maintenance_wakeups++;
4568 }
4569 }
4570
4571#if defined(CONFIG_TELEMETRY)
4572 /*
4573 * Windowed telemetry is driven by the scheduler. It should be safe
4574 * to call compute_telemetry_windowed() even when windowed telemetry
4575 * is disabled, but we should try to avoid doing extra work for no
4576 * reason.
4577 */
4578 if (telemetry_window_enabled) {
4579 deadline = sched_telemetry_deadline;
4580
4581 if (__improbable(ctime >= deadline)) {
4582 ndeadline = ctime + sched_telemetry_interval;
4583
4584 if (__probable(__sync_bool_compare_and_swap(&sched_telemetry_deadline, deadline, ndeadline))) {
4585 compute_telemetry_windowed();
4586 }
4587 }
4588 }
4589#endif /* CONFIG_TELEMETRY */
4590}
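
/*
 * Illustrative sketch only: callers on the quantum-expiration and
 * context-switch paths pass an already-sampled timestamp, so the common
 * case is a single comparison against sched_maintenance_deadline.
 * example_csw_hook is a hypothetical call site.
 */
#if 0
static void
example_csw_hook(void)
{
	uint64_t ctime = mach_absolute_time();

	sched_timeshare_consider_maintenance(ctime);
}
#endif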
4591
4592#endif /* CONFIG_SCHED_TIMESHARE_CORE */
4593
4594void
4595sched_init_thread(void (*continuation)(void))
4596{
4597 thread_block(THREAD_CONTINUE_NULL);
4598
4599 thread_t thread = current_thread();
4600
4601 sched_maintenance_thread = thread;
4602
4603 continuation();
4604
4605 /*NOTREACHED*/
4606}
4607
4608#if defined(CONFIG_SCHED_TIMESHARE_CORE)
4609
4610/*
4611 * thread_update_scan / runq_scan:
4612 *
4613 * Scan the run queues to account for timesharing threads
4614 * which need to be updated.
4615 *
4616 * Scanner runs in two passes. Pass one squirrels likely
4617 * threads away in an array, pass two does the update.
4618 *
4619 * This is necessary because the run queue is locked for
4620 * the candidate scan, but the thread is locked for the update.
4621 *
4622 * Array should be sized to make forward progress, without
4623 * disabling preemption for long periods.
4624 */
4625
4626#define THREAD_UPDATE_SIZE 128
4627
4628static thread_t thread_update_array[THREAD_UPDATE_SIZE];
4629static int thread_update_count = 0;
4630
4631/* Returns TRUE if thread was added, FALSE if thread_update_array is full */
4632boolean_t
4633thread_update_add_thread(thread_t thread)
4634{
4635 if (thread_update_count == THREAD_UPDATE_SIZE)
4636 return (FALSE);
4637
4638 thread_update_array[thread_update_count++] = thread;
4639 thread_reference_internal(thread);
4640 return (TRUE);
4641}
4642
4643void
4644thread_update_process_threads(void)
4645{
4646 while (thread_update_count > 0) {
4647 spl_t s;
4648 thread_t thread = thread_update_array[--thread_update_count];
4649 thread_update_array[thread_update_count] = THREAD_NULL;
4650
4651 s = splsched();
4652 thread_lock(thread);
4653 if (!(thread->state & (TH_WAIT)) && (SCHED(can_update_priority)(thread))) {
4654 SCHED(update_priority)(thread);
4655 }
4656 thread_unlock(thread);
4657 splx(s);
4658
4659 thread_deallocate(thread);
4660 }
4661}
4662
4663/*
4664 * Scan a runq for candidate threads.
4665 *
4666 * Returns TRUE if retry is needed.
4667 */
4668boolean_t
4669runq_scan(
4670 run_queue_t runq,
4671 sched_update_scan_context_t scan_context)
4672{
4673 register int count;
4674 register queue_t q;
4675 register thread_t thread;
4676
4677 if ((count = runq->count) > 0) {
4678 q = runq->queues + runq->highq;
4679 while (count > 0) {
4680 queue_iterate(q, thread, thread_t, links) {
4681 if ( thread->sched_stamp != sched_tick &&
4682 (thread->sched_mode == TH_MODE_TIMESHARE) ) {
4683 if (thread_update_add_thread(thread) == FALSE)
4684 return (TRUE);
4685 }
4686
4687 if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
4688 if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
4689 scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
4690 }
4691 } else {
4692 if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
4693 scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
4694 }
4695 }
4696
4697 count--;
4698 }
4699
4700 q--;
4701 }
4702 }
4703
4704 return (FALSE);
4705}
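
/*
 * Illustrative sketch only: the two-pass structure a thread_update_scan()
 * implementation follows, using runq_scan() for the candidate pass and
 * thread_update_process_threads() for the update pass. Run queue locking
 * and restart bookkeeping are elided; example_update_scan and example_runq
 * are hypothetical.
 */
#if 0
static void
example_update_scan(run_queue_t example_runq, sched_update_scan_context_t scan_context)
{
	boolean_t restart_needed;

	do {
		/* pass one: collect candidates with the run queue locked */
		restart_needed = runq_scan(example_runq, scan_context);

		/* pass two: update priorities with each thread locked */
		thread_update_process_threads();
	} while (restart_needed);
}
#endif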
4706
4707#endif /* CONFIG_SCHED_TIMESHARE_CORE */
4708
4709boolean_t
4710thread_eager_preemption(thread_t thread)
4711{
4712 return ((thread->sched_flags & TH_SFLAG_EAGERPREEMPT) != 0);
4713}
4714
4715void
4716thread_set_eager_preempt(thread_t thread)
4717{
4718 spl_t x;
4719 processor_t p;
4720 ast_t ast = AST_NONE;
4721
4722 x = splsched();
4723 p = current_processor();
4724
4725 thread_lock(thread);
4726 thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;
4727
4728 if (thread == current_thread()) {
4729
4730 ast = csw_check(p, AST_NONE);
4731 thread_unlock(thread);
4732 if (ast != AST_NONE) {
4733 (void) thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
4734 }
4735 } else {
4736 p = thread->last_processor;
4737
4738 if (p != PROCESSOR_NULL && p->state == PROCESSOR_RUNNING &&
4739 p->active_thread == thread) {
4740 cause_ast_check(p);
4741 }
4742
4743 thread_unlock(thread);
4744 }
4745
4746 splx(x);
4747}
4748
4749void
4750thread_clear_eager_preempt(thread_t thread)
4751{
4752 spl_t x;
4753
4754 x = splsched();
4755 thread_lock(thread);
4756
4757 thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
4758
4759 thread_unlock(thread);
4760 splx(x);
4761}
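
/*
 * Illustrative sketch only: marking the current thread for eager preemption
 * around a long-running section, so that any pending preemption against it
 * is upgraded to AST_URGENT (see csw_check_locked() above), then clearing
 * the flag afterwards. example_long_running_work is hypothetical.
 */
#if 0
extern void example_long_running_work(void);

static void
example_yield_friendly_section(void)
{
	thread_t self = current_thread();

	thread_set_eager_preempt(self);
	example_long_running_work();
	thread_clear_eager_preempt(self);
}
#endif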
4762
4763/*
4764 * Scheduling statistics
4765 */
4766void
4767sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
4768{
4769 struct processor_sched_statistics *stats;
4770 boolean_t to_realtime = FALSE;
4771
4772 stats = &processor->processor_data.sched_stats;
4773 stats->csw_count++;
4774
4775 if (otherpri >= BASEPRI_REALTIME) {
4776 stats->rt_sched_count++;
4777 to_realtime = TRUE;
4778 }
4779
4780 if ((reasons & AST_PREEMPT) != 0) {
4781 stats->preempt_count++;
4782
4783 if (selfpri >= BASEPRI_REALTIME) {
4784 stats->preempted_rt_count++;
4785 }
4786
4787 if (to_realtime) {
4788 stats->preempted_by_rt_count++;
4789 }
4790
4791 }
4792}
4793
4794void
4795sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
4796{
4797 uint64_t timestamp = mach_absolute_time();
4798
4799 stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
4800 stats->last_change_timestamp = timestamp;
4801}
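
/*
 * Illustrative sketch only: count_sum accumulates (elapsed time * depth),
 * so the time-weighted average run queue depth over a window is count_sum
 * divided by the elapsed absolute time. It assumes count_sum was zeroed at
 * window_start and kept current via sched_stats_handle_runq_change() at
 * every depth change; example_average_depth is hypothetical.
 */
#if 0
static uint64_t
example_average_depth(struct runq_stats *stats, uint64_t window_start, uint64_t window_end)
{
	if (window_end <= window_start)
		return 0;

	return stats->count_sum / (window_end - window_start);
}
#endif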
4802
4803/*
4804 * For calls from assembly code
4805 */
4806#undef thread_wakeup
4807void
4808thread_wakeup(
4809 event_t x);
4810
4811void
4812thread_wakeup(
4813 event_t x)
4814{
4815 thread_wakeup_with_result(x, THREAD_AWAKENED);
4816}
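
/*
 * Illustrative sketch only: the assert_wait()/thread_block() and
 * thread_wakeup() pairing on a shared event, as used by
 * sched_timeshare_maintenance_continue() and
 * sched_timeshare_consider_maintenance() above. example_event and the
 * helper names are hypothetical.
 */
#if 0
static void
example_waiter(event_t example_event)
{
	assert_wait(example_event, THREAD_UNINT);
	thread_block(THREAD_CONTINUE_NULL);
	/* resumes here once example_waker() runs */
}

static void
example_waker(event_t example_event)
{
	thread_wakeup(example_event);	/* waiters resume with THREAD_AWAKENED */
}
#endif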
4817
4818boolean_t
4819preemption_enabled(void)
4820{
4821 return (get_preemption_level() == 0 && ml_get_interrupts_enabled());
4822}
4823
4824static void
4825sched_timer_deadline_tracking_init(void) {
4826 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
4827 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
4828}
4829
4830
4831kern_return_t
4832sched_work_interval_notify(thread_t thread, uint64_t work_interval_id, uint64_t start, uint64_t finish, uint64_t deadline, uint64_t next_start, uint32_t flags)
4833{
4834 int urgency;
4835 uint64_t urgency_param1, urgency_param2;
4836 spl_t s;
4837
4838 if (work_interval_id == 0) {
4839 return (KERN_INVALID_ARGUMENT);
4840 }
4841
4842 assert(thread == current_thread());
4843
4844 thread_mtx_lock(thread);
4845 if (thread->work_interval_id != work_interval_id) {
4846 thread_mtx_unlock(thread);
4847 return (KERN_INVALID_ARGUMENT);
4848 }
4849 thread_mtx_unlock(thread);
4850
4851 s = splsched();
4852 thread_lock(thread);
4853 urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
4854 thread_unlock(thread);
4855 splx(s);
4856
4857 machine_work_interval_notify(thread, work_interval_id, start, finish, deadline, next_start, urgency, flags);
4858 return (KERN_SUCCESS);
4859}
4860
4861void thread_set_options(uint32_t thopt) {
4862 spl_t x;
4863 thread_t t = current_thread();
4864
4865 x = splsched();
4866 thread_lock(t);
4867
4868 t->options |= thopt;
4869
4870 thread_unlock(t);
4871 splx(x);
4872}