[apple/xnu.git] / osfmk / kern / sched_prim.c (xnu-3789.41.3)
1 /*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_FREE_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: sched_prim.c
60 * Author: Avadis Tevanian, Jr.
61 * Date: 1986
62 *
63 * Scheduling primitives
64 *
65 */
66
67 #include <debug.h>
68
69 #include <mach/mach_types.h>
70 #include <mach/machine.h>
71 #include <mach/policy.h>
72 #include <mach/sync_policy.h>
73 #include <mach/thread_act.h>
74
75 #include <machine/machine_routines.h>
76 #include <machine/sched_param.h>
77 #include <machine/machine_cpu.h>
78 #include <machine/machlimits.h>
79
80 #ifdef CONFIG_MACH_APPROXIMATE_TIME
81 #include <machine/commpage.h>
82 #endif
83
84 #include <kern/kern_types.h>
85 #include <kern/backtrace.h>
86 #include <kern/clock.h>
87 #include <kern/counters.h>
88 #include <kern/cpu_number.h>
89 #include <kern/cpu_data.h>
90 #include <kern/smp.h>
91 #include <kern/debug.h>
92 #include <kern/macro_help.h>
93 #include <kern/machine.h>
94 #include <kern/misc_protos.h>
95 #include <kern/processor.h>
96 #include <kern/queue.h>
97 #include <kern/sched.h>
98 #include <kern/sched_prim.h>
99 #include <kern/sfi.h>
100 #include <kern/syscall_subr.h>
101 #include <kern/task.h>
102 #include <kern/thread.h>
103 #include <kern/ledger.h>
104 #include <kern/timer_queue.h>
105 #include <kern/waitq.h>
106 #include <kern/policy_internal.h>
107
108 #include <vm/pmap.h>
109 #include <vm/vm_kern.h>
110 #include <vm/vm_map.h>
111
112 #include <mach/sdt.h>
113
114 #include <sys/kdebug.h>
115 #include <kperf/kperf.h>
116 #include <kern/kpc.h>
117
118 #include <kern/pms.h>
119
120 struct rt_queue rt_runq;
121
122 uintptr_t sched_thread_on_rt_queue = (uintptr_t)0xDEAFBEE0;
123
124 /* Lock RT runq, must be done with interrupts disabled (under splsched()) */
125 #if __SMP__
126 decl_simple_lock_data(static,rt_lock);
127 #define rt_lock_init() simple_lock_init(&rt_lock, 0)
128 #define rt_lock_lock() simple_lock(&rt_lock)
129 #define rt_lock_unlock() simple_unlock(&rt_lock)
130 #else
131 #define rt_lock_init() do { } while(0)
132 #define rt_lock_lock() do { } while(0)
133 #define rt_lock_unlock() do { } while(0)
134 #endif
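/*
 * Note (descriptive comment added here): on non-SMP configurations the
 * rt_lock_* macros intentionally compile away. With a single CPU, running
 * at splsched() (interrupts disabled) is sufficient to serialize access
 * to the RT runqueue, so no spinlock is required.
 */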
135
136 #define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */
137 int default_preemption_rate = DEFAULT_PREEMPTION_RATE;
138
139 #define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */
140 int default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
141
142 #define MAX_UNSAFE_QUANTA 800
143 int max_unsafe_quanta = MAX_UNSAFE_QUANTA;
144
145 #define MAX_POLL_QUANTA 2
146 int max_poll_quanta = MAX_POLL_QUANTA;
147
148 #define SCHED_POLL_YIELD_SHIFT 4 /* 1/16 */
149 int sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
150
151 uint64_t max_poll_computation;
152
153 uint64_t max_unsafe_computation;
154 uint64_t sched_safe_duration;
155
156 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
157
158 uint32_t std_quantum;
159 uint32_t min_std_quantum;
160 uint32_t bg_quantum;
161
162 uint32_t std_quantum_us;
163 uint32_t bg_quantum_us;
164
165 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
166
167 uint32_t thread_depress_time;
168 uint32_t default_timeshare_computation;
169 uint32_t default_timeshare_constraint;
170
171 uint32_t max_rt_quantum;
172 uint32_t min_rt_quantum;
173
174 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
175
176 unsigned sched_tick;
177 uint32_t sched_tick_interval;
178
179 uint32_t sched_pri_shifts[TH_BUCKET_MAX];
180 uint32_t sched_fixed_shift;
181
182 uint32_t sched_decay_usage_age_factor = 1; /* accelerate 5/8^n usage aging */
183
184 /* Allow foreground to decay past default to resolve inversions */
185 #define DEFAULT_DECAY_BAND_LIMIT ((BASEPRI_FOREGROUND - BASEPRI_DEFAULT) + 2)
186 int sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
187
188 /* Defaults for timer deadline profiling */
189 #define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines
190 * <= 2ms */
191 #define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
192 * <= 5ms */
193
194 uint64_t timer_deadline_tracking_bin_1;
195 uint64_t timer_deadline_tracking_bin_2;
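/*
 * These bins are consumed by thread_unblock() below: when a thread is woken
 * from an interrupt context, the timer's remaining time-to-deadline is
 * compared against bin 1 (<= 2 ms by default) and bin 2 (<= 5 ms) to update
 * the per-thread timer-wakeup statistics used for power accounting.
 */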
196
197 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
198
199 thread_t sched_maintenance_thread;
200
201
202 uint64_t sched_one_second_interval;
203
204 /* Forwards */
205
206 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
207
208 static void load_shift_init(void);
209 static void preempt_pri_init(void);
210
211 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
212
213 static thread_t thread_select(
214 thread_t thread,
215 processor_t processor,
216 ast_t reason);
217
218 #if CONFIG_SCHED_IDLE_IN_PLACE
219 static thread_t thread_select_idle(
220 thread_t thread,
221 processor_t processor);
222 #endif
223
224 thread_t processor_idle(
225 thread_t thread,
226 processor_t processor);
227
228 ast_t
229 csw_check_locked( processor_t processor,
230 processor_set_t pset,
231 ast_t check_reason);
232
233 static void processor_setrun(
234 processor_t processor,
235 thread_t thread,
236 integer_t options);
237
238 static void
239 sched_realtime_init(void);
240
241 static void
242 sched_realtime_timebase_init(void);
243
244 static void
245 sched_timer_deadline_tracking_init(void);
246
247 #if DEBUG
248 extern int debug_task;
249 #define TLOG(a, fmt, args...) if(debug_task & a) kprintf(fmt, ## args)
250 #else
251 #define TLOG(a, fmt, args...) do {} while (0)
252 #endif
253
254 static processor_t
255 thread_bind_internal(
256 thread_t thread,
257 processor_t processor);
258
259 static void
260 sched_vm_group_maintenance(void);
261
262 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
263 int8_t sched_load_shifts[NRQS];
264 bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS)];
265 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
266
267 const struct sched_dispatch_table *sched_current_dispatch = NULL;
268
269 /*
270 * Statically allocate a buffer to hold the longest possible
271 * scheduler description string, as currently implemented.
272 * bsd/kern/kern_sysctl.c has a corresponding definition, used to
273 * export the string to userspace via sysctl(3). If either version
274 * changes, update the other.
275 *
276 * Note that in addition to being an upper bound on the strings
277 * in the kernel, it's also an exact parameter to PE_get_default(),
278 * which interrogates the device tree on some platforms. That
279 * API requires the caller know the exact size of the device tree
280 * property, so we need both a legacy size (32) and the current size
281 * (48) to deal with old and new device trees. The device tree property
282 * is similarly padded to a fixed size so that the same kernel image
283 * can run on multiple devices with different schedulers configured
284 * in the device tree.
285 */
286 char sched_string[SCHED_STRING_MAX_LENGTH];
287
288 uint32_t sched_debug_flags;
289
290 /* Global flag which indicates whether Background Stepper Context is enabled */
291 static int cpu_throttle_enabled = 1;
292
293 void
294 sched_init(void)
295 {
296 char sched_arg[SCHED_STRING_MAX_LENGTH] = { '\0' };
297
298 /* Check for runtime selection of the scheduler algorithm */
299 if (!PE_parse_boot_argn("sched", sched_arg, sizeof (sched_arg))) {
300 /* If no boot-args override, look in device tree */
301 if (!PE_get_default("kern.sched", sched_arg,
302 SCHED_STRING_MAX_LENGTH)) {
303 sched_arg[0] = '\0';
304 }
305 }
306
307
308 if (!PE_parse_boot_argn("sched_pri_decay_limit", &sched_pri_decay_band_limit, sizeof(sched_pri_decay_band_limit))) {
309 /* No boot-args, check in device tree */
310 if (!PE_get_default("kern.sched_pri_decay_limit",
311 &sched_pri_decay_band_limit,
312 sizeof(sched_pri_decay_band_limit))) {
313 /* Allow decay all the way to normal limits */
314 sched_pri_decay_band_limit = DEFAULT_DECAY_BAND_LIMIT;
315 }
316 }
317
318 kprintf("Setting scheduler priority decay band limit %d\n", sched_pri_decay_band_limit);
319
320 if (strlen(sched_arg) > 0) {
321 if (0) {
322 /* Allow pattern below */
323 #if defined(CONFIG_SCHED_TRADITIONAL)
324 } else if (0 == strcmp(sched_arg, sched_traditional_dispatch.sched_name)) {
325 sched_current_dispatch = &sched_traditional_dispatch;
326 } else if (0 == strcmp(sched_arg, sched_traditional_with_pset_runqueue_dispatch.sched_name)) {
327 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
328 #endif
329 #if defined(CONFIG_SCHED_PROTO)
330 } else if (0 == strcmp(sched_arg, sched_proto_dispatch.sched_name)) {
331 sched_current_dispatch = &sched_proto_dispatch;
332 #endif
333 #if defined(CONFIG_SCHED_GRRR)
334 } else if (0 == strcmp(sched_arg, sched_grrr_dispatch.sched_name)) {
335 sched_current_dispatch = &sched_grrr_dispatch;
336 #endif
337 #if defined(CONFIG_SCHED_MULTIQ)
338 } else if (0 == strcmp(sched_arg, sched_multiq_dispatch.sched_name)) {
339 sched_current_dispatch = &sched_multiq_dispatch;
340 } else if (0 == strcmp(sched_arg, sched_dualq_dispatch.sched_name)) {
341 sched_current_dispatch = &sched_dualq_dispatch;
342 #endif
343 } else {
344 #if defined(CONFIG_SCHED_TRADITIONAL)
345 printf("Unrecognized scheduler algorithm: %s\n", sched_arg);
346 printf("Scheduler: Using instead: %s\n", sched_traditional_with_pset_runqueue_dispatch.sched_name);
347 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
348 #else
349 panic("Unrecognized scheduler algorithm: %s", sched_arg);
350 #endif
351 }
352 kprintf("Scheduler: Runtime selection of %s\n", SCHED(sched_name));
353 } else {
354 #if defined(CONFIG_SCHED_MULTIQ)
355 sched_current_dispatch = &sched_multiq_dispatch;
356 #elif defined(CONFIG_SCHED_TRADITIONAL)
357 sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
358 #elif defined(CONFIG_SCHED_PROTO)
359 sched_current_dispatch = &sched_proto_dispatch;
360 #elif defined(CONFIG_SCHED_GRRR)
361 sched_current_dispatch = &sched_grrr_dispatch;
362 #else
363 #error No default scheduler implementation
364 #endif
365 kprintf("Scheduler: Default of %s\n", SCHED(sched_name));
366 }
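/*
 * Illustrative example (not part of the build): on a kernel built with
 * CONFIG_SCHED_MULTIQ, passing a boot-arg of the form sched=<name>, where
 * <name> matches one of the compiled-in sched_name strings (e.g.
 * sched_dualq_dispatch.sched_name), selects that dispatch table above; an
 * unrecognized name either falls back to the traditional pset-runqueue
 * scheduler or panics, depending on CONFIG_SCHED_TRADITIONAL.
 */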
367
368 strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));
369
370 if (PE_parse_boot_argn("sched_debug", &sched_debug_flags, sizeof(sched_debug_flags))) {
371 kprintf("Scheduler: Debug flags 0x%08x\n", sched_debug_flags);
372 }
373
374 SCHED(init)();
375 sched_realtime_init();
376 ast_init();
377 sched_timer_deadline_tracking_init();
378
379 SCHED(pset_init)(&pset0);
380 SCHED(processor_init)(master_processor);
381 }
382
383 void
384 sched_timebase_init(void)
385 {
386 uint64_t abstime;
387
388 clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
389 sched_one_second_interval = abstime;
390
391 SCHED(timebase_init)();
392 sched_realtime_timebase_init();
393 }
394
395 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
396
397 void
398 sched_timeshare_init(void)
399 {
400 /*
401 * Calculate the timeslicing quantum
402 * in us.
403 */
404 if (default_preemption_rate < 1)
405 default_preemption_rate = DEFAULT_PREEMPTION_RATE;
406 std_quantum_us = (1000 * 1000) / default_preemption_rate;
407
408 printf("standard timeslicing quantum is %d us\n", std_quantum_us);
409
410 if (default_bg_preemption_rate < 1)
411 default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
412 bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
413
414 printf("standard background quantum is %d us\n", bg_quantum_us);
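/*
 * Worked example with the defaults above: default_preemption_rate of 100 Hz
 * gives std_quantum_us = 1000000 / 100 = 10000 us (a 10 ms quantum), and
 * default_bg_preemption_rate of 400 Hz gives bg_quantum_us =
 * 1000000 / 400 = 2500 us.
 */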
415
416 load_shift_init();
417 preempt_pri_init();
418 sched_tick = 0;
419 }
420
421 void
422 sched_timeshare_timebase_init(void)
423 {
424 uint64_t abstime;
425 uint32_t shift;
426
427 /* standard timeslicing quantum */
428 clock_interval_to_absolutetime_interval(
429 std_quantum_us, NSEC_PER_USEC, &abstime);
430 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
431 std_quantum = (uint32_t)abstime;
432
433 /* smallest remaining quantum (250 us) */
434 clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
435 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
436 min_std_quantum = (uint32_t)abstime;
437
438 /* quantum for background tasks */
439 clock_interval_to_absolutetime_interval(
440 bg_quantum_us, NSEC_PER_USEC, &abstime);
441 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
442 bg_quantum = (uint32_t)abstime;
443
444 /* scheduler tick interval */
445 clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
446 NSEC_PER_USEC, &abstime);
447 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
448 sched_tick_interval = (uint32_t)abstime;
449
450 /*
451 * Compute conversion factor from usage to
452 * timesharing priorities with 5/8 ** n aging.
453 */
454 abstime = (abstime * 5) / 3;
455 for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift)
456 abstime >>= 1;
457 sched_fixed_shift = shift;
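/*
 * At this point sched_fixed_shift is the smallest shift for which
 * (sched_tick_interval * 5 / 3) >> shift no longer exceeds BASEPRI_DEFAULT;
 * the loop simply halves the scaled tick interval until it fits within the
 * default priority band.
 */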
458
459 for (uint32_t i = 0 ; i < TH_BUCKET_MAX ; i++)
460 sched_pri_shifts[i] = INT8_MAX;
461
462 max_unsafe_computation = ((uint64_t)max_unsafe_quanta) * std_quantum;
463 sched_safe_duration = 2 * ((uint64_t)max_unsafe_quanta) * std_quantum;
464
465 max_poll_computation = ((uint64_t)max_poll_quanta) * std_quantum;
466 thread_depress_time = 1 * std_quantum;
467 default_timeshare_computation = std_quantum / 2;
468 default_timeshare_constraint = std_quantum;
469
470 }
471
472 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
473
474 static void
475 sched_realtime_init(void)
476 {
477 rt_lock_init();
478
479 rt_runq.count = 0;
480 queue_init(&rt_runq.queue);
481 }
482
483 static void
484 sched_realtime_timebase_init(void)
485 {
486 uint64_t abstime;
487
488 /* smallest rt computation (50 us) */
489 clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
490 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
491 min_rt_quantum = (uint32_t)abstime;
492
493 /* maximum rt computation (50 ms) */
494 clock_interval_to_absolutetime_interval(
495 50, 1000*NSEC_PER_USEC, &abstime);
496 assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
497 max_rt_quantum = (uint32_t)abstime;
498
499 }
500
501 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
502
503 /*
504 * Set up values for timeshare
505 * loading factors.
506 */
507 static void
508 load_shift_init(void)
509 {
510 int8_t k, *p = sched_load_shifts;
511 uint32_t i, j;
512
513 uint32_t sched_decay_penalty = 1;
514
515 if (PE_parse_boot_argn("sched_decay_penalty", &sched_decay_penalty, sizeof (sched_decay_penalty))) {
516 kprintf("Overriding scheduler decay penalty %u\n", sched_decay_penalty);
517 }
518
519 if (PE_parse_boot_argn("sched_decay_usage_age_factor", &sched_decay_usage_age_factor, sizeof (sched_decay_usage_age_factor))) {
520 kprintf("Overriding scheduler decay usage age factor %u\n", sched_decay_usage_age_factor);
521 }
522
523 if (sched_decay_penalty == 0) {
524 /*
525 * There is no penalty for timeshare threads for using too much
526 * CPU, so set all load shifts to INT8_MIN. Even under high load,
527 * sched_pri_shift will be >INT8_MAX, and there will be no
528 * penalty applied to threads (nor will sched_usage be updated per
529 * thread).
530 */
531 for (i = 0; i < NRQS; i++) {
532 sched_load_shifts[i] = INT8_MIN;
533 }
534
535 return;
536 }
537
538 *p++ = INT8_MIN; *p++ = 0;
539
540 /*
541 * For a given system load "i", the per-thread priority
542 * penalty per quantum of CPU usage is ~2^k priority
543 * levels. "sched_decay_penalty" can cause more
544 * array entries to be filled with smaller "k" values
545 */
546 for (i = 2, j = 1 << sched_decay_penalty, k = 1; i < NRQS; ++k) {
547 for (j <<= 1; (i < j) && (i < NRQS); ++i)
548 *p++ = k;
549 }
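/*
 * Worked example (sched_decay_penalty == 1, the default): indices 2-3
 * receive k = 1, 4-7 receive k = 2, 8-15 receive k = 3, and so on, i.e.
 * sched_load_shifts[i] is approximately log2(i) for i >= 2.
 */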
550 }
551
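/*
 * Priorities set in the sched_preempt_pri bitmap below (the foreground band
 * up to MINPRI_KERNEL, plus BASEPRI_PREEMPT and above) are treated as
 * urgent: a thread becoming runnable at one of these priorities is expected
 * to trigger an immediate preemption check rather than waiting for the next
 * quantum boundary.
 */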
552 static void
553 preempt_pri_init(void)
554 {
555 bitmap_t *p = sched_preempt_pri;
556
557 for (int i = BASEPRI_FOREGROUND; i < MINPRI_KERNEL; ++i)
558 bitmap_set(p, i);
559
560 for (int i = BASEPRI_PREEMPT; i <= MAXPRI; ++i)
561 bitmap_set(p, i);
562 }
563
564 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
565
566 /*
567 * Thread wait timer expiration.
568 */
569 void
570 thread_timer_expire(
571 void *p0,
572 __unused void *p1)
573 {
574 thread_t thread = p0;
575 spl_t s;
576
577 assert_thread_magic(thread);
578
579 s = splsched();
580 thread_lock(thread);
581 if (--thread->wait_timer_active == 0) {
582 if (thread->wait_timer_is_set) {
583 thread->wait_timer_is_set = FALSE;
584 clear_wait_internal(thread, THREAD_TIMED_OUT);
585 }
586 }
587 thread_unlock(thread);
588 splx(s);
589 }
590
591 /*
592 * thread_unblock:
593 *
594 * Unblock thread on wake up.
595 *
596 * Returns TRUE if the thread should now be placed on the runqueue.
597 *
598 * Thread must be locked.
599 *
600 * Called at splsched().
601 */
602 boolean_t
603 thread_unblock(
604 thread_t thread,
605 wait_result_t wresult)
606 {
607 boolean_t ready_for_runq = FALSE;
608 thread_t cthread = current_thread();
609 uint32_t new_run_count;
610
611 /*
612 * Set wait_result.
613 */
614 thread->wait_result = wresult;
615
616 /*
617 * Cancel pending wait timer.
618 */
619 if (thread->wait_timer_is_set) {
620 if (timer_call_cancel(&thread->wait_timer))
621 thread->wait_timer_active--;
622 thread->wait_timer_is_set = FALSE;
623 }
624
625 /*
626 * Update scheduling state: not waiting,
627 * set running.
628 */
629 thread->state &= ~(TH_WAIT|TH_UNINT);
630
631 if (!(thread->state & TH_RUN)) {
632 thread->state |= TH_RUN;
633 thread->last_made_runnable_time = mach_approximate_time();
634
635 ready_for_runq = TRUE;
636
637 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
638
639 /* Update the runnable thread count */
640 new_run_count = sched_run_incr(thread);
641 } else {
642 /*
643 * Either the thread is idling in place on another processor,
644 * or it hasn't finished context switching yet.
645 */
646 #if CONFIG_SCHED_IDLE_IN_PLACE
647 if (thread->state & TH_IDLE) {
648 processor_t processor = thread->last_processor;
649
650 if (processor != current_processor())
651 machine_signal_idle(processor);
652 }
653 #else
654 assert((thread->state & TH_IDLE) == 0);
655 #endif
656 /*
657 * The run count is only dropped after the context switch completes
658 * and the thread is still waiting, so we should not run_incr here
659 */
660 new_run_count = sched_run_buckets[TH_BUCKET_RUN];
661 }
662
663
664 /*
665 * Calculate deadline for real-time threads.
666 */
667 if (thread->sched_mode == TH_MODE_REALTIME) {
668 uint64_t ctime;
669
670 ctime = mach_absolute_time();
671 thread->realtime.deadline = thread->realtime.constraint + ctime;
672 }
673
674 /*
675 * Clear old quantum, fail-safe computation, etc.
676 */
677 thread->quantum_remaining = 0;
678 thread->computation_metered = 0;
679 thread->reason = AST_NONE;
680
681 /* Obtain power-relevant interrupt and "platform-idle exit" statistics.
682 * We also account for "double hop" thread signaling via
683 * the thread callout infrastructure.
684 * DRK: consider removing the callout wakeup counters in the future;
685 * they're present for verification at the moment.
686 */
687 boolean_t aticontext, pidle;
688 ml_get_power_state(&aticontext, &pidle);
689
690 if (__improbable(aticontext && !(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT))) {
691 ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
692 DTRACE_SCHED2(iwakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
693
694 uint64_t ttd = PROCESSOR_DATA(current_processor(), timer_call_ttd);
695
696 if (ttd) {
697 if (ttd <= timer_deadline_tracking_bin_1)
698 thread->thread_timer_wakeups_bin_1++;
699 else
700 if (ttd <= timer_deadline_tracking_bin_2)
701 thread->thread_timer_wakeups_bin_2++;
702 }
703
704 if (pidle) {
705 ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
706 }
707
708 } else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
709 if (cthread->callout_woken_from_icontext) {
710 ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
711 thread->thread_callout_interrupt_wakeups++;
712 if (cthread->callout_woken_from_platform_idle) {
713 ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
714 thread->thread_callout_platform_idle_wakeups++;
715 }
716
717 cthread->callout_woke_thread = TRUE;
718 }
719 }
720
721 if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
722 thread->callout_woken_from_icontext = aticontext;
723 thread->callout_woken_from_platform_idle = pidle;
724 thread->callout_woke_thread = FALSE;
725 }
726
727 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
728 MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
729 (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
730 sched_run_buckets[TH_BUCKET_RUN], 0);
731
732 DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
733
734 return (ready_for_runq);
735 }
736
737 /*
738 * Routine: thread_go
739 * Purpose:
740 * Unblock and dispatch thread.
741 * Conditions:
742 * thread lock held, IPC locks may be held.
743 * thread must have been pulled from wait queue under same lock hold.
744 * thread must have been waiting
745 * Returns:
746 * KERN_SUCCESS - Thread was set running
747 *
748 * TODO: This should return void
749 */
750 kern_return_t
751 thread_go(
752 thread_t thread,
753 wait_result_t wresult)
754 {
755 assert_thread_magic(thread);
756
757 assert(thread->at_safe_point == FALSE);
758 assert(thread->wait_event == NO_EVENT64);
759 assert(thread->waitq == NULL);
760
761 assert(!(thread->state & (TH_TERMINATE|TH_TERMINATE2)));
762 assert(thread->state & TH_WAIT);
763
764
765 if (thread_unblock(thread, wresult)) {
766 #if SCHED_TRACE_THREAD_WAKEUPS
767 backtrace(&thread->thread_wakeup_bt[0],
768 (sizeof(thread->thread_wakeup_bt)/sizeof(uintptr_t)));
769 #endif
770 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
771 }
772
773 return (KERN_SUCCESS);
774 }
775
776 /*
777 * Routine: thread_mark_wait_locked
778 * Purpose:
779 * Mark a thread as waiting. If, given the circumstances,
780 * it doesn't want to wait (i.e. already aborted), then
781 * indicate that in the return value.
782 * Conditions:
783 * at splsched() and thread is locked.
784 */
785 __private_extern__
786 wait_result_t
787 thread_mark_wait_locked(
788 thread_t thread,
789 wait_interrupt_t interruptible)
790 {
791 boolean_t at_safe_point;
792
793 assert(!(thread->state & (TH_WAIT|TH_IDLE|TH_UNINT|TH_TERMINATE2)));
794
795 /*
796 * The thread may have certain types of interrupts/aborts masked
797 * off. Even if the wait location says these types of interrupts
798 * are OK, we have to honor mask settings (outer-scoped code may
799 * not be able to handle aborts at the moment).
800 */
801 if (interruptible > (thread->options & TH_OPT_INTMASK))
802 interruptible = thread->options & TH_OPT_INTMASK;
803
804 at_safe_point = (interruptible == THREAD_ABORTSAFE);
805
806 if ( interruptible == THREAD_UNINT ||
807 !(thread->sched_flags & TH_SFLAG_ABORT) ||
808 (!at_safe_point &&
809 (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
810
811 if ( !(thread->state & TH_TERMINATE))
812 DTRACE_SCHED(sleep);
813
814 thread->state |= (interruptible) ? TH_WAIT : (TH_WAIT | TH_UNINT);
815 thread->at_safe_point = at_safe_point;
816 return (thread->wait_result = THREAD_WAITING);
817 }
818 else
819 if (thread->sched_flags & TH_SFLAG_ABORTSAFELY)
820 thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
821
822 return (thread->wait_result = THREAD_INTERRUPTED);
823 }
824
825 /*
826 * Routine: thread_interrupt_level
827 * Purpose:
828 * Set the maximum interruptible state for the
829 * current thread. The effective value of any
830 * interruptible flag passed into assert_wait
831 * will never exceed this.
832 *
833 * Useful for code that must not be interrupted,
834 * but which calls code that doesn't know that.
835 * Returns:
836 * The old interrupt level for the thread.
837 */
838 __private_extern__
839 wait_interrupt_t
840 thread_interrupt_level(
841 wait_interrupt_t new_level)
842 {
843 thread_t thread = current_thread();
844 wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
845
846 thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
847
848 return result;
849 }
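/*
 * Illustrative usage sketch (not part of the build): code that must not be
 * interrupted typically saves and restores the previous level around calls
 * into code that may block:
 *
 *	wait_interrupt_t saved = thread_interrupt_level(THREAD_UNINT);
 *	// ... call code that might otherwise take an interruptible wait ...
 *	thread_interrupt_level(saved);
 */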
850
851 /*
852 * Check to see if an assert wait is possible, without actually doing one.
853 * This is used by debug code in locks and elsewhere to verify that it is
854 * always OK to block when trying to take a blocking lock (since waiting
855 * for the actual assert_wait to catch the case may make it hard to detect
856 * this case).
857 */
858 boolean_t
859 assert_wait_possible(void)
860 {
861
862 thread_t thread;
863
864 #if DEBUG
865 if(debug_mode) return TRUE; /* Always succeed in debug mode */
866 #endif
867
868 thread = current_thread();
869
870 return (thread == NULL || waitq_wait_possible(thread));
871 }
872
873 /*
874 * assert_wait:
875 *
876 * Assert that the current thread is about to go to
877 * sleep until the specified event occurs.
878 */
879 wait_result_t
880 assert_wait(
881 event_t event,
882 wait_interrupt_t interruptible)
883 {
884 if (__improbable(event == NO_EVENT))
885 panic("%s() called with NO_EVENT", __func__);
886
887 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
888 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
889 VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0);
890
891 struct waitq *waitq;
892 waitq = global_eventq(event);
893 return waitq_assert_wait64(waitq, CAST_EVENT64_T(event), interruptible, TIMEOUT_WAIT_FOREVER);
894 }
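/*
 * Illustrative usage sketch (not part of the build): callers pair
 * assert_wait() with thread_block(), as thread_stop() does later in this
 * file. "object" here is a hypothetical wait event used only for the
 * example:
 *
 *	wait_result_t wres = assert_wait((event_t)&object, THREAD_UNINT);
 *	if (wres == THREAD_WAITING)
 *		wres = thread_block(THREAD_CONTINUE_NULL);
 */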
895
896 /*
897 * assert_wait_queue:
898 *
899 * Return the global waitq for the specified event
900 */
901 struct waitq *
902 assert_wait_queue(
903 event_t event)
904 {
905 return global_eventq(event);
906 }
907
908 wait_result_t
909 assert_wait_timeout(
910 event_t event,
911 wait_interrupt_t interruptible,
912 uint32_t interval,
913 uint32_t scale_factor)
914 {
915 thread_t thread = current_thread();
916 wait_result_t wresult;
917 uint64_t deadline;
918 spl_t s;
919
920 if (__improbable(event == NO_EVENT))
921 panic("%s() called with NO_EVENT", __func__);
922
923 struct waitq *waitq;
924 waitq = global_eventq(event);
925
926 s = splsched();
927 waitq_lock(waitq);
928
929 clock_interval_to_deadline(interval, scale_factor, &deadline);
930
931 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
932 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
933 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
934
935 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
936 interruptible,
937 TIMEOUT_URGENCY_SYS_NORMAL,
938 deadline, TIMEOUT_NO_LEEWAY,
939 thread);
940
941 waitq_unlock(waitq);
942 splx(s);
943 return wresult;
944 }
945
946 wait_result_t
947 assert_wait_timeout_with_leeway(
948 event_t event,
949 wait_interrupt_t interruptible,
950 wait_timeout_urgency_t urgency,
951 uint32_t interval,
952 uint32_t leeway,
953 uint32_t scale_factor)
954 {
955 thread_t thread = current_thread();
956 wait_result_t wresult;
957 uint64_t deadline;
958 uint64_t abstime;
959 uint64_t slop;
960 uint64_t now;
961 spl_t s;
962
963 if (__improbable(event == NO_EVENT))
964 panic("%s() called with NO_EVENT", __func__);
965
966 now = mach_absolute_time();
967 clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
968 deadline = now + abstime;
969
970 clock_interval_to_absolutetime_interval(leeway, scale_factor, &slop);
971
972 struct waitq *waitq;
973 waitq = global_eventq(event);
974
975 s = splsched();
976 waitq_lock(waitq);
977
978 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
979 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
980 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
981
982 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
983 interruptible,
984 urgency, deadline, slop,
985 thread);
986
987 waitq_unlock(waitq);
988 splx(s);
989 return wresult;
990 }
991
992 wait_result_t
993 assert_wait_deadline(
994 event_t event,
995 wait_interrupt_t interruptible,
996 uint64_t deadline)
997 {
998 thread_t thread = current_thread();
999 wait_result_t wresult;
1000 spl_t s;
1001
1002 if (__improbable(event == NO_EVENT))
1003 panic("%s() called with NO_EVENT", __func__);
1004
1005 struct waitq *waitq;
1006 waitq = global_eventq(event);
1007
1008 s = splsched();
1009 waitq_lock(waitq);
1010
1011 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1012 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1013 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1014
1015 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1016 interruptible,
1017 TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1018 TIMEOUT_NO_LEEWAY, thread);
1019 waitq_unlock(waitq);
1020 splx(s);
1021 return wresult;
1022 }
1023
1024 wait_result_t
1025 assert_wait_deadline_with_leeway(
1026 event_t event,
1027 wait_interrupt_t interruptible,
1028 wait_timeout_urgency_t urgency,
1029 uint64_t deadline,
1030 uint64_t leeway)
1031 {
1032 thread_t thread = current_thread();
1033 wait_result_t wresult;
1034 spl_t s;
1035
1036 if (__improbable(event == NO_EVENT))
1037 panic("%s() called with NO_EVENT", __func__);
1038
1039 struct waitq *waitq;
1040 waitq = global_eventq(event);
1041
1042 s = splsched();
1043 waitq_lock(waitq);
1044
1045 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1046 MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1047 VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0);
1048
1049 wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event),
1050 interruptible,
1051 urgency, deadline, leeway,
1052 thread);
1053 waitq_unlock(waitq);
1054 splx(s);
1055 return wresult;
1056 }
1057
1058 /*
1059 * thread_isoncpu:
1060 *
1061 * Return TRUE if a thread is running on a processor such that an AST
1062 * is needed to pull it out of userspace execution, or, if executing in
1063 * the kernel, to bring it to a context switch boundary that would cause
1064 * thread state to be serialized in the thread PCB.
1065 *
1066 * Thread locked, returns the same way. While locked, fields
1067 * like "state" cannot change. "runq" can change only from set to unset.
1068 */
1069 static inline boolean_t
1070 thread_isoncpu(thread_t thread)
1071 {
1072 /* Not running or runnable */
1073 if (!(thread->state & TH_RUN))
1074 return (FALSE);
1075
1076 /* Waiting on a runqueue, not currently running */
1077 /* TODO: This is invalid - it can get dequeued without thread lock, but not context switched. */
1078 if (thread->runq != PROCESSOR_NULL)
1079 return (FALSE);
1080
1081 /*
1082 * Thread does not have a stack yet
1083 * It could be on the stack alloc queue or preparing to be invoked
1084 */
1085 if (!thread->kernel_stack)
1086 return (FALSE);
1087
1088 /*
1089 * Thread must be running on a processor, or
1090 * about to run, or just did run. In all these
1091 * cases, an AST to the processor is needed
1092 * to guarantee that the thread is kicked out
1093 * of userspace and the processor has
1094 * context switched (and saved register state).
1095 */
1096 return (TRUE);
1097 }
1098
1099 /*
1100 * thread_stop:
1101 *
1102 * Force a preemption point for a thread and wait
1103 * for it to stop running on a CPU. If a stronger
1104 * guarantee is requested, wait until no longer
1105 * runnable. Arbitrates access among
1106 * multiple stop requests. (released by unstop)
1107 *
1108 * The thread must enter a wait state and stop via a
1109 * separate means.
1110 *
1111 * Returns FALSE if interrupted.
1112 */
1113 boolean_t
1114 thread_stop(
1115 thread_t thread,
1116 boolean_t until_not_runnable)
1117 {
1118 wait_result_t wresult;
1119 spl_t s = splsched();
1120 boolean_t oncpu;
1121
1122 wake_lock(thread);
1123 thread_lock(thread);
1124
1125 while (thread->state & TH_SUSP) {
1126 thread->wake_active = TRUE;
1127 thread_unlock(thread);
1128
1129 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1130 wake_unlock(thread);
1131 splx(s);
1132
1133 if (wresult == THREAD_WAITING)
1134 wresult = thread_block(THREAD_CONTINUE_NULL);
1135
1136 if (wresult != THREAD_AWAKENED)
1137 return (FALSE);
1138
1139 s = splsched();
1140 wake_lock(thread);
1141 thread_lock(thread);
1142 }
1143
1144 thread->state |= TH_SUSP;
1145
1146 while ((oncpu = thread_isoncpu(thread)) ||
1147 (until_not_runnable && (thread->state & TH_RUN))) {
1148 processor_t processor;
1149
1150 if (oncpu) {
1151 assert(thread->state & TH_RUN);
1152 processor = thread->chosen_processor;
1153 cause_ast_check(processor);
1154 }
1155
1156 thread->wake_active = TRUE;
1157 thread_unlock(thread);
1158
1159 wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1160 wake_unlock(thread);
1161 splx(s);
1162
1163 if (wresult == THREAD_WAITING)
1164 wresult = thread_block(THREAD_CONTINUE_NULL);
1165
1166 if (wresult != THREAD_AWAKENED) {
1167 thread_unstop(thread);
1168 return (FALSE);
1169 }
1170
1171 s = splsched();
1172 wake_lock(thread);
1173 thread_lock(thread);
1174 }
1175
1176 thread_unlock(thread);
1177 wake_unlock(thread);
1178 splx(s);
1179
1180 /*
1181 * We return with the thread unlocked. To prevent it from
1182 * transitioning to a runnable state (or from TH_RUN to
1183 * being on the CPU), the caller must ensure the thread
1184 * is stopped via an external means (such as an AST)
1185 */
1186
1187 return (TRUE);
1188 }
1189
1190 /*
1191 * thread_unstop:
1192 *
1193 * Release a previous stop request and set
1194 * the thread running if appropriate.
1195 *
1196 * Use only after a successful stop operation.
1197 */
1198 void
1199 thread_unstop(
1200 thread_t thread)
1201 {
1202 spl_t s = splsched();
1203
1204 wake_lock(thread);
1205 thread_lock(thread);
1206
1207 assert((thread->state & (TH_RUN|TH_WAIT|TH_SUSP)) != TH_SUSP);
1208
1209 if (thread->state & TH_SUSP) {
1210 thread->state &= ~TH_SUSP;
1211
1212 if (thread->wake_active) {
1213 thread->wake_active = FALSE;
1214 thread_unlock(thread);
1215
1216 thread_wakeup(&thread->wake_active);
1217 wake_unlock(thread);
1218 splx(s);
1219
1220 return;
1221 }
1222 }
1223
1224 thread_unlock(thread);
1225 wake_unlock(thread);
1226 splx(s);
1227 }
1228
1229 /*
1230 * thread_wait:
1231 *
1232 * Wait for a thread to stop running. (non-interruptible)
1233 *
1234 */
1235 void
1236 thread_wait(
1237 thread_t thread,
1238 boolean_t until_not_runnable)
1239 {
1240 wait_result_t wresult;
1241 boolean_t oncpu;
1242 processor_t processor;
1243 spl_t s = splsched();
1244
1245 wake_lock(thread);
1246 thread_lock(thread);
1247
1248 /*
1249 * Wait until not running on a CPU. If stronger requirement
1250 * desired, wait until not runnable. Assumption: if thread is
1251 * on CPU, then TH_RUN is set, so we're not waiting in any case
1252 * where the original, pure "TH_RUN" check would have let us
1253 * finish.
1254 */
1255 while ((oncpu = thread_isoncpu(thread)) ||
1256 (until_not_runnable && (thread->state & TH_RUN))) {
1257
1258 if (oncpu) {
1259 assert(thread->state & TH_RUN);
1260 processor = thread->chosen_processor;
1261 cause_ast_check(processor);
1262 }
1263
1264 thread->wake_active = TRUE;
1265 thread_unlock(thread);
1266
1267 wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
1268 wake_unlock(thread);
1269 splx(s);
1270
1271 if (wresult == THREAD_WAITING)
1272 thread_block(THREAD_CONTINUE_NULL);
1273
1274 s = splsched();
1275 wake_lock(thread);
1276 thread_lock(thread);
1277 }
1278
1279 thread_unlock(thread);
1280 wake_unlock(thread);
1281 splx(s);
1282 }
1283
1284 /*
1285 * Routine: clear_wait_internal
1286 *
1287 * Clear the wait condition for the specified thread.
1288 * Start the thread executing if that is appropriate.
1289 * Arguments:
1290 * thread thread to awaken
1291 * result Wakeup result the thread should see
1292 * Conditions:
1293 * At splsched
1294 * the thread is locked.
1295 * Returns:
1296 * KERN_SUCCESS thread was rousted out a wait
1297 * KERN_FAILURE thread was waiting but could not be rousted
1298 * KERN_NOT_WAITING thread was not waiting
1299 */
1300 __private_extern__ kern_return_t
1301 clear_wait_internal(
1302 thread_t thread,
1303 wait_result_t wresult)
1304 {
1305 uint32_t i = LockTimeOutUsec;
1306 struct waitq *waitq = thread->waitq;
1307
1308 do {
1309 if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT))
1310 return (KERN_FAILURE);
1311
1312 if (waitq != NULL) {
1313 if (!waitq_pull_thread_locked(waitq, thread)) {
1314 thread_unlock(thread);
1315 delay(1);
1316 if (i > 0 && !machine_timeout_suspended())
1317 i--;
1318 thread_lock(thread);
1319 if (waitq != thread->waitq)
1320 return KERN_NOT_WAITING;
1321 continue;
1322 }
1323 }
1324
1325 /* TODO: Can we instead assert TH_TERMINATE is not set? */
1326 if ((thread->state & (TH_WAIT|TH_TERMINATE)) == TH_WAIT)
1327 return (thread_go(thread, wresult));
1328 else
1329 return (KERN_NOT_WAITING);
1330 } while (i > 0);
1331
1332 panic("clear_wait_internal: deadlock: thread=%p, wq=%p, cpu=%d\n",
1333 thread, waitq, cpu_number());
1334
1335 return (KERN_FAILURE);
1336 }
1337
1338
1339 /*
1340 * clear_wait:
1341 *
1342 * Clear the wait condition for the specified thread. Start the thread
1343 * executing if that is appropriate.
1344 *
1345 * parameters:
1346 * thread thread to awaken
1347 * result Wakeup result the thread should see
1348 */
1349 kern_return_t
1350 clear_wait(
1351 thread_t thread,
1352 wait_result_t result)
1353 {
1354 kern_return_t ret;
1355 spl_t s;
1356
1357 s = splsched();
1358 thread_lock(thread);
1359 ret = clear_wait_internal(thread, result);
1360 thread_unlock(thread);
1361 splx(s);
1362 return ret;
1363 }
1364
1365
1366 /*
1367 * thread_wakeup_prim:
1368 *
1369 * Common routine for thread_wakeup, thread_wakeup_with_result,
1370 * and thread_wakeup_one.
1371 *
1372 */
1373 kern_return_t
1374 thread_wakeup_prim(
1375 event_t event,
1376 boolean_t one_thread,
1377 wait_result_t result)
1378 {
1379 if (__improbable(event == NO_EVENT))
1380 panic("%s() called with NO_EVENT", __func__);
1381
1382 struct waitq *wq = global_eventq(event);
1383
1384 if (one_thread)
1385 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
1386 else
1387 return waitq_wakeup64_all(wq, CAST_EVENT64_T(event), result, WAITQ_ALL_PRIORITIES);
1388 }
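/*
 * For reference (assumption: these wrappers live in kern/sched_prim.h):
 * thread_wakeup(x) expands to thread_wakeup_prim((x), FALSE,
 * THREAD_AWAKENED) and thread_wakeup_one(x) to
 * thread_wakeup_prim((x), TRUE, THREAD_AWAKENED), which is how the
 * wake_active wakeups elsewhere in this file reach the routine above.
 */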
1389
1390 /*
1391 * Wakeup a specified thread if and only if it's waiting for this event
1392 */
1393 kern_return_t
1394 thread_wakeup_thread(
1395 event_t event,
1396 thread_t thread)
1397 {
1398 if (__improbable(event == NO_EVENT))
1399 panic("%s() called with NO_EVENT", __func__);
1400
1401 struct waitq *wq = global_eventq(event);
1402
1403 return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED);
1404 }
1405
1406 /*
1407 * Wakeup a thread waiting on an event and promote it to a priority.
1408 *
1409 * Requires woken thread to un-promote itself when done.
1410 */
1411 kern_return_t
1412 thread_wakeup_one_with_pri(
1413 event_t event,
1414 int priority)
1415 {
1416 if (__improbable(event == NO_EVENT))
1417 panic("%s() called with NO_EVENT", __func__);
1418
1419 struct waitq *wq = global_eventq(event);
1420
1421 return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1422 }
1423
1424 /*
1425 * Wakeup a thread waiting on an event,
1426 * promote it to a priority,
1427 * and return a reference to the woken thread.
1428 *
1429 * Requires woken thread to un-promote itself when done.
1430 */
1431 thread_t
1432 thread_wakeup_identify(event_t event,
1433 int priority)
1434 {
1435 if (__improbable(event == NO_EVENT))
1436 panic("%s() called with NO_EVENT", __func__);
1437
1438 struct waitq *wq = global_eventq(event);
1439
1440 return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority);
1441 }
1442
1443 /*
1444 * thread_bind:
1445 *
1446 * Force the current thread to execute on the specified processor.
1447 * Takes effect after the next thread_block().
1448 *
1449 * Returns the previous binding. PROCESSOR_NULL means
1450 * not bound.
1451 *
1452 * XXX - DO NOT export this to users - XXX
1453 */
1454 processor_t
1455 thread_bind(
1456 processor_t processor)
1457 {
1458 thread_t self = current_thread();
1459 processor_t prev;
1460 spl_t s;
1461
1462 s = splsched();
1463 thread_lock(self);
1464
1465 prev = thread_bind_internal(self, processor);
1466
1467 thread_unlock(self);
1468 splx(s);
1469
1470 return (prev);
1471 }
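/*
 * Illustrative usage sketch (not part of the build): temporarily pin the
 * calling thread to the master processor, then restore the old binding:
 *
 *	processor_t prev = thread_bind(master_processor);
 *	thread_block(THREAD_CONTINUE_NULL);	// migrate to the bound CPU
 *	// ... work that must run on master_processor ...
 *	thread_bind(prev);
 */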
1472
1473 /*
1474 * thread_bind_internal:
1475 *
1476 * If the specified thread is not the current thread, and it is currently
1477 * running on another CPU, a remote AST must be sent to that CPU to cause
1478 * the thread to migrate to its bound processor. Otherwise, the migration
1479 * will occur at the next quantum expiration or blocking point.
1480 *
1481 * When the thread is the current thread, an explicit thread_block() should
1482 * be used to force the current processor to context switch away and
1483 * let the thread migrate to the bound processor.
1484 *
1485 * Thread must be locked, and at splsched.
1486 */
1487
1488 static processor_t
1489 thread_bind_internal(
1490 thread_t thread,
1491 processor_t processor)
1492 {
1493 processor_t prev;
1494
1495 /* <rdar://problem/15102234> */
1496 assert(thread->sched_pri < BASEPRI_RTQUEUES);
1497 /* A thread can't be bound if it's sitting on a (potentially incorrect) runqueue */
1498 assert(thread->runq == PROCESSOR_NULL);
1499
1500 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_THREAD_BIND), thread_tid(thread), processor ? (uintptr_t)processor->cpu_id : (uintptr_t)-1, 0, 0, 0);
1501
1502 prev = thread->bound_processor;
1503 thread->bound_processor = processor;
1504
1505 return (prev);
1506 }
1507
1508 /*
1509 * thread_vm_bind_group_add:
1510 *
1511 * The "VM bind group" is a special mechanism to mark a collection
1512 * of threads from the VM subsystem that, in general, should be scheduled
1513 * with only one CPU of parallelism. To accomplish this, we initially
1514 * bind all the threads to the master processor, which has the effect
1515 * that only one of the threads in the group can execute at once, including
1516 * preempting threads in the group that are at a lower priority. Future
1517 * mechanisms may use more dynamic mechanisms to prevent the collection
1518 * of VM threads from using more CPU time than desired.
1519 *
1520 * The current implementation can result in priority inversions where
1521 * compute-bound priority 95 or realtime threads that happen to have
1522 * landed on the master processor prevent the VM threads from running.
1523 * When this situation is detected, we unbind the threads for one
1524 * scheduler tick to allow the scheduler to run the threads on
1525 * additional CPUs, before restoring the binding (assuming high latency
1526 * is no longer a problem).
1527 */
1528
1529 /*
1530 * The current max is provisioned for:
1531 * vm_compressor_swap_trigger_thread (92)
1532 * 2 x vm_pageout_iothread_internal (92) when vm_restricted_to_single_processor==TRUE
1533 * vm_pageout_continue (92)
1534 * memorystatus_thread (95)
1535 */
1536 #define MAX_VM_BIND_GROUP_COUNT (5)
1537 decl_simple_lock_data(static,sched_vm_group_list_lock);
1538 static thread_t sched_vm_group_thread_list[MAX_VM_BIND_GROUP_COUNT];
1539 static int sched_vm_group_thread_count;
1540 static boolean_t sched_vm_group_temporarily_unbound = FALSE;
1541
1542 void
1543 thread_vm_bind_group_add(void)
1544 {
1545 thread_t self = current_thread();
1546
1547 thread_reference_internal(self);
1548 self->options |= TH_OPT_SCHED_VM_GROUP;
1549
1550 simple_lock(&sched_vm_group_list_lock);
1551 assert(sched_vm_group_thread_count < MAX_VM_BIND_GROUP_COUNT);
1552 sched_vm_group_thread_list[sched_vm_group_thread_count++] = self;
1553 simple_unlock(&sched_vm_group_list_lock);
1554
1555 thread_bind(master_processor);
1556
1557 /* Switch to bound processor if not already there */
1558 thread_block(THREAD_CONTINUE_NULL);
1559 }
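/*
 * Illustrative call pattern (sketch, not part of the build): a VM daemon
 * thread opts itself into the bind group from its own context during
 * startup; vm_example_thread below is hypothetical:
 *
 *	static void vm_example_thread(void)
 *	{
 *		thread_vm_bind_group_add();	// serialize with the other VM threads
 *		for (;;) {
 *			// ... perform paging work, block, repeat ...
 *		}
 *	}
 */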
1560
1561 static void
1562 sched_vm_group_maintenance(void)
1563 {
1564 uint64_t ctime = mach_absolute_time();
1565 uint64_t longtime = ctime - sched_tick_interval;
1566 int i;
1567 spl_t s;
1568 boolean_t high_latency_observed = FALSE;
1569 boolean_t runnable_and_not_on_runq_observed = FALSE;
1570 boolean_t bind_target_changed = FALSE;
1571 processor_t bind_target = PROCESSOR_NULL;
1572
1573 /* Make sure nobody attempts to add new threads while we are enumerating them */
1574 simple_lock(&sched_vm_group_list_lock);
1575
1576 s = splsched();
1577
1578 for (i=0; i < sched_vm_group_thread_count; i++) {
1579 thread_t thread = sched_vm_group_thread_list[i];
1580 assert(thread != THREAD_NULL);
1581 thread_lock(thread);
1582 if ((thread->state & (TH_RUN|TH_WAIT)) == TH_RUN) {
1583 if (thread->runq != PROCESSOR_NULL && thread->last_made_runnable_time < longtime) {
1584 high_latency_observed = TRUE;
1585 } else if (thread->runq == PROCESSOR_NULL) {
1586 /* There are some cases where a thread in transition also falls into this case */
1587 runnable_and_not_on_runq_observed = TRUE;
1588 }
1589 }
1590 thread_unlock(thread);
1591
1592 if (high_latency_observed && runnable_and_not_on_runq_observed) {
1593 /* All the things we are looking for are true, stop looking */
1594 break;
1595 }
1596 }
1597
1598 splx(s);
1599
1600 if (sched_vm_group_temporarily_unbound) {
1601 /* If we turned off binding, make sure everything is OK before rebinding */
1602 if (!high_latency_observed) {
1603 /* rebind */
1604 bind_target_changed = TRUE;
1605 bind_target = master_processor;
1606 sched_vm_group_temporarily_unbound = FALSE; /* might be reset to TRUE if change cannot be completed */
1607 }
1608 } else {
1609 /*
1610 * Check if we're in a bad state, which is defined by high
1611 * latency with no core currently executing a thread. If a
1612 * single thread is making progress on a CPU, that means the
1613 * binding concept to reduce parallelism is working as
1614 * designed.
1615 */
1616 if (high_latency_observed && !runnable_and_not_on_runq_observed) {
1617 /* unbind */
1618 bind_target_changed = TRUE;
1619 bind_target = PROCESSOR_NULL;
1620 sched_vm_group_temporarily_unbound = TRUE;
1621 }
1622 }
1623
1624 if (bind_target_changed) {
1625 s = splsched();
1626 for (i=0; i < sched_vm_group_thread_count; i++) {
1627 thread_t thread = sched_vm_group_thread_list[i];
1628 boolean_t removed;
1629 assert(thread != THREAD_NULL);
1630
1631 thread_lock(thread);
1632 removed = thread_run_queue_remove(thread);
1633 if (removed || ((thread->state & (TH_RUN | TH_WAIT)) == TH_WAIT)) {
1634 thread_bind_internal(thread, bind_target);
1635 } else {
1636 /*
1637 * Thread was in the middle of being context-switched-to,
1638 * or was in the process of blocking. To avoid switching the bind
1639 * state out mid-flight, defer the change if possible.
1640 */
1641 if (bind_target == PROCESSOR_NULL) {
1642 thread_bind_internal(thread, bind_target);
1643 } else {
1644 sched_vm_group_temporarily_unbound = TRUE; /* next pass will try again */
1645 }
1646 }
1647
1648 if (removed) {
1649 thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
1650 }
1651 thread_unlock(thread);
1652 }
1653 splx(s);
1654 }
1655
1656 simple_unlock(&sched_vm_group_list_lock);
1657 }
1658
1659 /* Invoked prior to idle entry to determine if, on SMT capable processors, an SMT
1660 * rebalancing opportunity exists when a core is (instantaneously) idle, but
1661 * other SMT-capable cores may be over-committed. TODO: some possible negatives:
1662 * IPI thrash if this core does not remain idle following the load balancing ASTs
1663 * Idle "thrash", when IPI issue is followed by idle entry/core power down
1664 * followed by a wakeup shortly thereafter.
1665 */
1666
1667 #if (DEVELOPMENT || DEBUG)
1668 int sched_smt_balance = 1;
1669 #endif
1670
1671 #if __SMP__
1672 /* Invoked with pset locked, returns with pset unlocked */
1673 static void
1674 sched_SMT_balance(processor_t cprocessor, processor_set_t cpset) {
1675 processor_t ast_processor = NULL;
1676
1677 #if (DEVELOPMENT || DEBUG)
1678 if (__improbable(sched_smt_balance == 0))
1679 goto smt_balance_exit;
1680 #endif
1681
1682 assert(cprocessor == current_processor());
1683 if (cprocessor->is_SMT == FALSE)
1684 goto smt_balance_exit;
1685
1686 processor_t sib_processor = cprocessor->processor_secondary ? cprocessor->processor_secondary : cprocessor->processor_primary;
1687
1688 /* Determine if both this processor and its sibling are idle,
1689 * indicating an SMT rebalancing opportunity.
1690 */
1691 if (sib_processor->state != PROCESSOR_IDLE)
1692 goto smt_balance_exit;
1693
1694 processor_t sprocessor;
1695
1696 qe_foreach_element(sprocessor, &cpset->active_queue, processor_queue) {
1697 if ((sprocessor->state == PROCESSOR_RUNNING) &&
1698 (sprocessor->processor_primary != sprocessor) &&
1699 (sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
1700 (sprocessor->current_pri < BASEPRI_RTQUEUES) &&
1701 ((cpset->pending_AST_cpu_mask & (1ULL << sprocessor->cpu_id)) == 0)) {
1702 assert(sprocessor != cprocessor);
1703 ast_processor = sprocessor;
1704 break;
1705 }
1706 }
1707
1708 smt_balance_exit:
1709 pset_unlock(cpset);
1710
1711 if (ast_processor) {
1712 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_SMT_BALANCE), ast_processor->cpu_id, ast_processor->state, ast_processor->processor_primary->state, 0, 0);
1713 cause_ast_check(ast_processor);
1714 }
1715 }
1716 #endif /* __SMP__ */
1717
1718 /*
1719 * thread_select:
1720 *
1721 * Select a new thread for the current processor to execute.
1722 *
1723 * May select the current thread, which must be locked.
1724 */
1725 static thread_t
1726 thread_select(
1727 thread_t thread,
1728 processor_t processor,
1729 ast_t reason)
1730 {
1731 processor_set_t pset = processor->processor_set;
1732 thread_t new_thread = THREAD_NULL;
1733
1734 assert(processor == current_processor());
1735 assert((thread->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN);
1736
1737 do {
1738 /*
1739 * Update the priority.
1740 */
1741 if (SCHED(can_update_priority)(thread))
1742 SCHED(update_priority)(thread);
1743
1744 processor->current_pri = thread->sched_pri;
1745 processor->current_thmode = thread->sched_mode;
1746 processor->current_sfi_class = thread->sfi_class;
1747
1748 pset_lock(pset);
1749
1750 assert(processor->state != PROCESSOR_OFF_LINE);
1751
1752 if (!processor->is_recommended) {
1753 /*
1754 * The performance controller has provided a hint to not dispatch more threads,
1755 * unless they are bound to us (and thus we are the only option).
1756 */
1757 if (!SCHED(processor_bound_count)(processor)) {
1758 goto idle;
1759 }
1760 } else if (processor->processor_primary != processor) {
1761 /*
1762 * Should this secondary SMT processor attempt to find work? For pset runqueue systems,
1763 * we should look for work only under the same conditions that choose_processor()
1764 * would have assigned work, which is when all primary processors have been assigned work.
1765 *
1766 * An exception is that bound threads are dispatched to a processor without going through
1767 * choose_processor(), so in those cases we should continue trying to dequeue work.
1768 */
1769 if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue) && !rt_runq.count) {
1770 goto idle;
1771 }
1772 }
1773
1774 rt_lock_lock();
1775
1776 /*
1777 * Test to see if the current thread should continue
1778 * to run on this processor. It must not be attempting to wait, must not
1779 * be bound to a different processor, must not be in the wrong
1780 * processor set, and must not be forced to context switch by TH_SUSP.
1781 *
1782 * Note that there are never any RT threads in the regular runqueue.
1783 *
1784 * This code is extremely tricky.
1785 */
1786
1787 if (((thread->state & (TH_TERMINATE|TH_IDLE|TH_WAIT|TH_RUN|TH_SUSP)) == TH_RUN) &&
1788 (thread->sched_pri >= BASEPRI_RTQUEUES || processor->processor_primary == processor) &&
1789 (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor) &&
1790 (thread->affinity_set == AFFINITY_SET_NULL || thread->affinity_set->aset_pset == pset)) {
1791 /*
1792 * RT threads with un-expired quantum stay on processor,
1793 * unless there's a valid RT thread with an earlier deadline.
1794 */
1795 if (thread->sched_pri >= BASEPRI_RTQUEUES && processor->first_timeslice) {
1796 if (rt_runq.count > 0) {
1797 thread_t next_rt = qe_queue_first(&rt_runq.queue, struct thread, runq_links);
1798
1799 assert(next_rt->runq == THREAD_ON_RT_RUNQ);
1800
1801 if (next_rt->realtime.deadline < processor->deadline &&
1802 (next_rt->bound_processor == PROCESSOR_NULL ||
1803 next_rt->bound_processor == processor)) {
1804 /* The next RT thread is better, so pick it off the runqueue. */
1805 goto pick_new_rt_thread;
1806 }
1807 }
1808
1809 /* This is still the best RT thread to run. */
1810 processor->deadline = thread->realtime.deadline;
1811
1812 rt_lock_unlock();
1813 pset_unlock(pset);
1814
1815 return (thread);
1816 }
1817
1818 if ((rt_runq.count == 0) &&
1819 SCHED(processor_queue_has_priority)(processor, thread->sched_pri, TRUE) == FALSE) {
1820 /* This thread is still the highest priority runnable (non-idle) thread */
1821 processor->deadline = UINT64_MAX;
1822
1823 rt_lock_unlock();
1824 pset_unlock(pset);
1825
1826 return (thread);
1827 }
1828 }
1829
1830 /* OK, so we're not going to run the current thread. Look at the RT queue. */
1831 if (rt_runq.count > 0) {
1832 thread_t next_rt = qe_queue_first(&rt_runq.queue, struct thread, runq_links);
1833
1834 assert(next_rt->runq == THREAD_ON_RT_RUNQ);
1835
1836 if (__probable((next_rt->bound_processor == PROCESSOR_NULL ||
1837 (next_rt->bound_processor == processor)))) {
1838 pick_new_rt_thread:
1839 new_thread = qe_dequeue_head(&rt_runq.queue, struct thread, runq_links);
1840
1841 new_thread->runq = PROCESSOR_NULL;
1842 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
1843 rt_runq.count--;
1844
1845 processor->deadline = new_thread->realtime.deadline;
1846
1847 rt_lock_unlock();
1848 pset_unlock(pset);
1849
1850 return (new_thread);
1851 }
1852 }
1853
1854 processor->deadline = UINT64_MAX;
1855 rt_lock_unlock();
1856
1857 /* No RT threads, so let's look at the regular threads. */
1858 if ((new_thread = SCHED(choose_thread)(processor, MINPRI, reason)) != THREAD_NULL) {
1859 pset_unlock(pset);
1860 return (new_thread);
1861 }
1862
1863 #if __SMP__
1864 if (SCHED(steal_thread_enabled)) {
1865 /*
1866 * No runnable threads, attempt to steal
1867 * from other processors. Returns with pset lock dropped.
1868 */
1869
1870 if ((new_thread = SCHED(steal_thread)(pset)) != THREAD_NULL) {
1871 return (new_thread);
1872 }
1873
1874 /*
1875 * If other threads have appeared, shortcut
1876 * around again.
1877 */
1878 if (!SCHED(processor_queue_empty)(processor) || rt_runq.count > 0)
1879 continue;
1880
1881 pset_lock(pset);
1882 }
1883 #endif
1884
1885 idle:
1886 /*
1887 * Nothing is runnable, so set this processor idle if it
1888 * was running.
1889 */
1890 if (processor->state == PROCESSOR_RUNNING) {
1891 processor->state = PROCESSOR_IDLE;
1892
1893 if (processor->processor_primary == processor) {
1894 re_queue_head(&pset->idle_queue, &processor->processor_queue);
1895 } else {
1896 re_queue_head(&pset->idle_secondary_queue, &processor->processor_queue);
1897 }
1898 }
1899
1900 #if __SMP__
1901 /* Invoked with pset locked, returns with pset unlocked */
1902 sched_SMT_balance(processor, pset);
1903 #else
1904 pset_unlock(pset);
1905 #endif
1906
1907 #if CONFIG_SCHED_IDLE_IN_PLACE
1908 /*
1909 * Choose idle thread if fast idle is not possible.
1910 */
1911 if (processor->processor_primary != processor)
1912 return (processor->idle_thread);
1913
1914 if ((thread->state & (TH_IDLE|TH_TERMINATE|TH_SUSP)) || !(thread->state & TH_WAIT) || thread->wake_active || thread->sched_pri >= BASEPRI_RTQUEUES)
1915 return (processor->idle_thread);
1916
1917 /*
1918 * Perform idling activities directly without a
1919 * context switch. Return dispatched thread,
1920 * else check again for a runnable thread.
1921 */
1922 new_thread = thread_select_idle(thread, processor);
1923
1924 #else /* !CONFIG_SCHED_IDLE_IN_PLACE */
1925
1926 /*
1927 * Do a full context switch to idle so that the current
1928 * thread can start running on another processor without
1929 * waiting for the fast-idled processor to wake up.
1930 */
1931 new_thread = processor->idle_thread;
1932
1933 #endif /* !CONFIG_SCHED_IDLE_IN_PLACE */
1934
1935 } while (new_thread == THREAD_NULL);
1936
1937 return (new_thread);
1938 }
1939
1940 #if CONFIG_SCHED_IDLE_IN_PLACE
1941 /*
1942 * thread_select_idle:
1943 *
1944 * Idle the processor using the current thread context.
1945 *
1946 * Called with thread locked, then dropped and relocked.
1947 */
1948 static thread_t
1949 thread_select_idle(
1950 thread_t thread,
1951 processor_t processor)
1952 {
1953 thread_t new_thread;
1954 uint64_t arg1, arg2;
1955 int urgency;
1956
1957 sched_run_decr(thread);
1958
1959 thread->state |= TH_IDLE;
1960 processor->current_pri = IDLEPRI;
1961 processor->current_thmode = TH_MODE_NONE;
1962 processor->current_sfi_class = SFI_CLASS_KERNEL;
1963
1964 /* Reload precise timing global policy to thread-local policy */
1965 thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
1966
1967 thread_unlock(thread);
1968
1969 /*
1970 * Switch execution timing to processor idle thread.
1971 */
1972 processor->last_dispatch = mach_absolute_time();
1973
1974 #ifdef CONFIG_MACH_APPROXIMATE_TIME
1975 commpage_update_mach_approximate_time(processor->last_dispatch);
1976 #endif
1977
1978 thread->last_run_time = processor->last_dispatch;
1979 thread_timer_event(processor->last_dispatch, &processor->idle_thread->system_timer);
1980 PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer;
1981
1982 /*
1983 * Cancel the quantum timer while idling.
1984 */
1985 timer_call_cancel(&processor->quantum_timer);
1986 processor->first_timeslice = FALSE;
1987
1988 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
1989
1990 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, NULL);
1991
1992 /*
1993 * Enable interrupts and perform idling activities. No
1994 * preemption due to TH_IDLE being set.
1995 */
1996 spllo(); new_thread = processor_idle(thread, processor);
1997
1998 /*
1999 * Return at splsched.
2000 */
2001 (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
2002
2003 thread_lock(thread);
2004
2005 /*
2006 * If awakened, switch to thread timer and start a new quantum.
2007 * Otherwise skip; we will context switch to another thread or return here.
2008 */
2009 if (!(thread->state & TH_WAIT)) {
2010 processor->last_dispatch = mach_absolute_time();
2011 thread_timer_event(processor->last_dispatch, &thread->system_timer);
2012 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2013
2014 thread_quantum_init(thread);
2015 processor->quantum_end = processor->last_dispatch + thread->quantum_remaining;
2016 timer_call_enter1(&processor->quantum_timer, thread, processor->quantum_end, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
2017 processor->first_timeslice = TRUE;
2018
2019 thread->computation_epoch = processor->last_dispatch;
2020 }
2021
2022 thread->state &= ~TH_IDLE;
2023
2024 urgency = thread_get_urgency(thread, &arg1, &arg2);
2025
2026 thread_tell_urgency(urgency, arg1, arg2, 0, new_thread);
2027
2028 sched_run_incr(thread);
2029
2030 return (new_thread);
2031 }
2032 #endif /* CONFIG_SCHED_IDLE_IN_PLACE */
2033
2034 /*
2035 * thread_invoke
2036 *
2037 * Called at splsched with neither thread locked.
2038 *
2039 * Perform a context switch and start executing the new thread.
2040 *
2041 * Returns FALSE when the context switch didn't happen.
2042 * The reference to the new thread is still consumed.
2043 *
2044 * "self" is what is currently running on the processor,
2045 * "thread" is the new thread to context switch to
2046 * (which may be the same thread in some cases)
2047 */
2048 static boolean_t
2049 thread_invoke(
2050 thread_t self,
2051 thread_t thread,
2052 ast_t reason)
2053 {
2054 if (__improbable(get_preemption_level() != 0)) {
2055 int pl = get_preemption_level();
2056 panic("thread_invoke: preemption_level %d, possible cause: %s",
2057 pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
2058 "blocking while holding a spinlock, or within interrupt context"));
2059 }
2060
2061 thread_continue_t continuation = self->continuation;
2062 void *parameter = self->parameter;
2063 processor_t processor;
2064
2065 uint64_t ctime = mach_absolute_time();
2066
2067 #ifdef CONFIG_MACH_APPROXIMATE_TIME
2068 commpage_update_mach_approximate_time(ctime);
2069 #endif
2070
2071 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
2072 sched_timeshare_consider_maintenance(ctime);
2073 #endif
2074
2075 assert_thread_magic(self);
2076 assert(self == current_thread());
2077 assert(self->runq == PROCESSOR_NULL);
2078 assert((self->state & (TH_RUN|TH_TERMINATE2)) == TH_RUN);
2079
2080 thread_lock(thread);
2081
2082 assert_thread_magic(thread);
2083 assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN);
2084 assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == current_processor());
2085 assert(thread->runq == PROCESSOR_NULL);
2086
2087 /* Reload precise timing global policy to thread-local policy */
2088 thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
2089
2090 /* Update SFI class based on other factors */
2091 thread->sfi_class = sfi_thread_classify(thread);
2092
2093 /* Allow realtime threads to hang onto a stack. */
2094 if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack)
2095 self->reserved_stack = self->kernel_stack;
2096
2097 if (continuation != NULL) {
2098 if (!thread->kernel_stack) {
2099 /*
2100 * If we are using a privileged stack,
2101 * check to see whether we can exchange it with
2102 * that of the other thread.
2103 */
2104 if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack)
2105 goto need_stack;
2106
2107 /*
2108 * Context switch by performing a stack handoff.
2109 */
2110 continuation = thread->continuation;
2111 parameter = thread->parameter;
2112
2113 processor = current_processor();
2114 processor->active_thread = thread;
2115 processor->current_pri = thread->sched_pri;
2116 processor->current_thmode = thread->sched_mode;
2117 processor->current_sfi_class = thread->sfi_class;
2118 if (thread->last_processor != processor && thread->last_processor != NULL) {
2119 if (thread->last_processor->processor_set != processor->processor_set)
2120 thread->ps_switch++;
2121 thread->p_switch++;
2122 }
2123 thread->last_processor = processor;
2124 thread->c_switch++;
2125 ast_context(thread);
2126
2127 thread_unlock(thread);
2128
2129 self->reason = reason;
2130
2131 processor->last_dispatch = ctime;
2132 self->last_run_time = ctime;
2133 thread_timer_event(ctime, &thread->system_timer);
2134 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2135
2136 /*
2137 * Since non-precise user/kernel time doesn't update the state timer
2138 * during privilege transitions, synthesize an event now.
2139 */
2140 if (!thread->precise_user_kernel_time) {
2141 timer_switch(PROCESSOR_DATA(processor, current_state),
2142 ctime,
2143 PROCESSOR_DATA(processor, current_state));
2144 }
2145
2146 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2147 MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)|DBG_FUNC_NONE,
2148 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2149
2150 if ((thread->chosen_processor != processor) && (thread->chosen_processor != PROCESSOR_NULL)) {
2151 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
2152 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2153 }
2154
2155 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2156
2157 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2158
2159 TLOG(1, "thread_invoke: calling stack_handoff\n");
2160 stack_handoff(self, thread);
2161
2162 /* 'self' is now off core */
2163 assert(thread == current_thread());
2164
2165 DTRACE_SCHED(on__cpu);
2166
2167 #if KPERF
2168 kperf_on_cpu(thread, continuation, NULL);
2169 #endif /* KPERF */
2170
2171 thread_dispatch(self, thread);
2172
2173 thread->continuation = thread->parameter = NULL;
2174
2175 counter(c_thread_invoke_hits++);
2176
2177 (void) spllo();
2178
2179 assert(continuation);
2180 call_continuation(continuation, parameter, thread->wait_result);
2181 /*NOTREACHED*/
2182 }
2183 else if (thread == self) {
2184 /* same thread but with continuation */
2185 ast_context(self);
2186 counter(++c_thread_invoke_same);
2187
2188 thread_unlock(self);
2189
2190 #if KPERF
2191 kperf_on_cpu(thread, continuation, NULL);
2192 #endif /* KPERF */
2193
2194 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2195 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2196 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2197
2198 self->continuation = self->parameter = NULL;
2199
2200 (void) spllo();
2201
2202 call_continuation(continuation, parameter, self->wait_result);
2203 /*NOTREACHED*/
2204 }
2205 } else {
2206 /*
2207 * Check that the other thread has a stack
2208 */
2209 if (!thread->kernel_stack) {
2210 need_stack:
2211 if (!stack_alloc_try(thread)) {
2212 counter(c_thread_invoke_misses++);
2213 thread_unlock(thread);
2214 thread_stack_enqueue(thread);
2215 return (FALSE);
2216 }
2217 } else if (thread == self) {
2218 ast_context(self);
2219 counter(++c_thread_invoke_same);
2220 thread_unlock(self);
2221
2222 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2223 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2224 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2225
2226 return (TRUE);
2227 }
2228 }
2229
2230 /*
2231 * Context switch by full context save.
2232 */
2233 processor = current_processor();
2234 processor->active_thread = thread;
2235 processor->current_pri = thread->sched_pri;
2236 processor->current_thmode = thread->sched_mode;
2237 processor->current_sfi_class = thread->sfi_class;
2238 if (thread->last_processor != processor && thread->last_processor != NULL) {
2239 if (thread->last_processor->processor_set != processor->processor_set)
2240 thread->ps_switch++;
2241 thread->p_switch++;
2242 }
2243 thread->last_processor = processor;
2244 thread->c_switch++;
2245 ast_context(thread);
2246
2247 thread_unlock(thread);
2248
2249 counter(c_thread_invoke_csw++);
2250
2251 self->reason = reason;
2252
2253 processor->last_dispatch = ctime;
2254 self->last_run_time = ctime;
2255 thread_timer_event(ctime, &thread->system_timer);
2256 PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2257
2258 /*
2259 * Since non-precise user/kernel time doesn't update the state timer
2260 * during privilege transitions, synthesize an event now.
2261 */
2262 if (!thread->precise_user_kernel_time) {
2263 timer_switch(PROCESSOR_DATA(processor, current_state),
2264 ctime,
2265 PROCESSOR_DATA(processor, current_state));
2266 }
2267
2268 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2269 MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2270 self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2271
2272 if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
2273 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
2274 (uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2275 }
2276
2277 DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2278
2279 SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2280
2281 /*
2282 * This is where we actually switch register context,
2283 * and address space if required. We will next run
2284 * as a result of a subsequent context switch.
2285 *
2286 * Once registers are switched and the processor is running "thread",
2287 * the stack variables and non-volatile registers will contain whatever
2288 * was there the last time that thread blocked. No local variables should
2289 * be used after this point, except for the special case of "thread", which
2290 * the platform layer returns as the previous thread running on the processor
2291 * via the function call ABI as a return register, and "self", which may have
2292 * been stored on the stack or a non-volatile register, but a stale idea of
2293 * what was on the CPU is newly-accurate because that thread is again
2294 * running on the CPU.
2295 */
2296 assert(continuation == self->continuation);
2297 thread = machine_switch_context(self, continuation, thread);
2298 assert(self == current_thread());
2299 TLOG(1,"thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
2300
2301 DTRACE_SCHED(on__cpu);
2302
2303 #if KPERF
2304 kperf_on_cpu(self, NULL, __builtin_frame_address(0));
2305 #endif /* KPERF */
2306
2307 /*
2308 * We have been resumed and are set to run.
2309 */
2310 thread_dispatch(thread, self);
2311
2312 if (continuation) {
2313 self->continuation = self->parameter = NULL;
2314
2315 (void) spllo();
2316
2317 call_continuation(continuation, parameter, self->wait_result);
2318 /*NOTREACHED*/
2319 }
2320
2321 return (TRUE);
2322 }
2323
2324 #if defined(CONFIG_SCHED_DEFERRED_AST)
2325 /*
2326 * pset_cancel_deferred_dispatch:
2327 *
2328 * Cancels all ASTs that we can cancel for the given processor set
2329 * if the current processor is running the last runnable thread in the
2330 * system.
2331 *
2332 * This function assumes the current thread is runnable. This must
2333 * be called with the pset unlocked.
2334 */
2335 static void
2336 pset_cancel_deferred_dispatch(
2337 processor_set_t pset,
2338 processor_t processor)
2339 {
2340 processor_t active_processor = NULL;
2341 uint32_t sampled_sched_run_count;
2342
2343 pset_lock(pset);
2344 sampled_sched_run_count = (volatile uint32_t) sched_run_buckets[TH_BUCKET_RUN];
2345
2346 /*
2347 * If we have emptied the run queue, and our current thread is runnable, we
2348 * should tell any processors that are still DISPATCHING that they will
2349 * probably not have any work to do. In the event that there are no
2350 * pending signals that we can cancel, this is also uninteresting.
2351 *
2352 * In the unlikely event that another thread becomes runnable while we are
2353 * doing this (sched_run_count is atomically updated, not guarded), the
2354 * codepath making it runnable SHOULD (a dangerous word) need the pset lock
2355 * in order to dispatch it to a processor in our pset. So, the other
2356 * codepath will wait while we squash all cancelable ASTs, get the pset
2357 * lock, and then dispatch the freshly runnable thread. So this should be
2358 * correct (we won't accidentally have a runnable thread that hasn't been
2359 * dispatched to an idle processor), if not ideal (we may be restarting the
2360 * dispatch process, which could have some overhead).
2361 *
2362 */
2363 if ((sampled_sched_run_count == 1) &&
2364 (pset->pending_deferred_AST_cpu_mask)) {
2365 qe_foreach_element_safe(active_processor, &pset->active_queue, processor_queue) {
2366 /*
2367 * If a processor is DISPATCHING, it could be because of
2368 * a cancelable signal.
2369 *
2370 * IF the processor is not our
2371 * current processor (the current processor should not
2372 * be DISPATCHING, so this is a bit paranoid), AND there
2373 * is a cancelable signal pending on the processor, AND
2374 * there is no non-cancelable signal pending (as there is
2375 * no point trying to backtrack on bringing the processor
2376 * up if a signal we cannot cancel is outstanding), THEN
2377 * it should make sense to roll back the processor state
2378 * to the IDLE state.
2379 *
2380 * If the racey nature of this approach (as the signal
2381 * will be arbitrated by hardware, and can fire as we
2382 * roll back state) results in the core responding
2383 * despite being pushed back to the IDLE state, it
2384 * should be no different than if the core took some
2385 * interrupt while IDLE.
2386 */
2387 if ((active_processor->state == PROCESSOR_DISPATCHING) &&
2388 (pset->pending_deferred_AST_cpu_mask & (1ULL << active_processor->cpu_id)) &&
2389 (!(pset->pending_AST_cpu_mask & (1ULL << active_processor->cpu_id))) &&
2390 (active_processor != processor)) {
2391 /*
2392 * Squash all of the processor state back to some
2393 * reasonable facsimile of PROCESSOR_IDLE.
2394 *
2395 * TODO: What queue policy do we actually want here?
2396 * We want to promote selection of a good processor
2397 * to run on. Do we want to enqueue at the head?
2398 * The tail? At the (relative) old position in the
2399 * queue? Or something else entirely?
2400 */
2401 re_queue_head(&pset->idle_queue, &active_processor->processor_queue);
2402
2403 assert(active_processor->next_thread == THREAD_NULL);
2404
2405 active_processor->current_pri = IDLEPRI;
2406 active_processor->current_thmode = TH_MODE_FIXED;
2407 active_processor->current_sfi_class = SFI_CLASS_KERNEL;
2408 active_processor->deadline = UINT64_MAX;
2409 active_processor->state = PROCESSOR_IDLE;
2410 pset->pending_deferred_AST_cpu_mask &= ~(1ULL << active_processor->cpu_id);
2411 machine_signal_idle_cancel(active_processor);
2412 }
2413
2414 }
2415 }
2416
2417 pset_unlock(pset);
2418 }
2419 #else
2420 /* We don't support deferred ASTs; everything is candycanes and sunshine. */
2421 #endif
2422
2423 /*
2424 * thread_dispatch:
2425 *
2426 * Handle threads at context switch. Re-dispatch other thread
2427 * if still running, otherwise update run state and perform
2428 * special actions. Update quantum for other thread and begin
2429 * the quantum for ourselves.
2430 *
2431 * "thread" is the old thread that we have switched away from.
2432 * "self" is the new current thread that we have context switched to
2433 *
2434 * Called at splsched.
2435 */
2436 void
2437 thread_dispatch(
2438 thread_t thread,
2439 thread_t self)
2440 {
2441 processor_t processor = self->last_processor;
2442
2443 assert(processor == current_processor());
2444 assert(self == current_thread());
2445 assert(thread != self);
2446
2447 if (thread != THREAD_NULL) {
2448 /*
2449 * If blocked at a continuation, discard
2450 * the stack.
2451 */
2452 if (thread->continuation != NULL && thread->kernel_stack != 0)
2453 stack_free(thread);
2454
2455 if (thread->state & TH_IDLE) {
2456 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2457 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2458 (uintptr_t)thread_tid(thread), 0, thread->state,
2459 sched_run_buckets[TH_BUCKET_RUN], 0);
2460 } else {
2461 int64_t consumed;
2462 int64_t remainder = 0;
2463
2464 if (processor->quantum_end > processor->last_dispatch)
2465 remainder = processor->quantum_end -
2466 processor->last_dispatch;
2467
2468 consumed = thread->quantum_remaining - remainder;
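/*
 * Editorial worked example (hypothetical numbers; real values are in
 * mach_absolute_time units): if the thread went on core with a
 * quantum_remaining equivalent to 10ms, and quantum_end - last_dispatch
 * leaves a remainder equivalent to 4ms, then consumed corresponds to the
 * 6ms of CPU time billed to the task, thread and (optionally) bank
 * ledgers below.
 */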
2469
2470 if ((thread->reason & AST_LEDGER) == 0) {
2471 /*
2472 * Bill CPU time to both the task and
2473 * the individual thread.
2474 */
2475 ledger_credit(thread->t_ledger,
2476 task_ledgers.cpu_time, consumed);
2477 ledger_credit(thread->t_threadledger,
2478 thread_ledgers.cpu_time, consumed);
2479 #ifdef CONFIG_BANK
2480 if (thread->t_bankledger) {
2481 ledger_credit(thread->t_bankledger,
2482 bank_ledgers.cpu_time,
2483 (consumed - thread->t_deduct_bank_ledger_time));
2484
2485 }
2486 thread->t_deduct_bank_ledger_time = 0;
2487 #endif
2488 }
2489
2490 wake_lock(thread);
2491 thread_lock(thread);
2492
2493 /*
2494 * Apply a priority floor if the thread holds a kernel resource
2495 * Do this before checking starting_pri to avoid overpenalizing
2496 * repeated rwlock blockers.
2497 */
2498 if (__improbable(thread->rwlock_count != 0))
2499 lck_rw_set_promotion_locked(thread);
2500
2501 boolean_t keep_quantum = processor->first_timeslice;
2502
2503 /*
2504 * Treat a thread which has dropped priority since it got on core
2505 * as having expired its quantum.
2506 */
2507 if (processor->starting_pri > thread->sched_pri)
2508 keep_quantum = FALSE;
2509
2510 /* Compute remainder of current quantum. */
2511 if (keep_quantum &&
2512 processor->quantum_end > processor->last_dispatch)
2513 thread->quantum_remaining = (uint32_t)remainder;
2514 else
2515 thread->quantum_remaining = 0;
2516
2517 if (thread->sched_mode == TH_MODE_REALTIME) {
2518 /*
2519 * Cancel the deadline if the thread has
2520 * consumed the entire quantum.
2521 */
2522 if (thread->quantum_remaining == 0) {
2523 thread->realtime.deadline = UINT64_MAX;
2524 }
2525 } else {
2526 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
2527 /*
2528 * For non-realtime threads treat a tiny
2529 * remaining quantum as an expired quantum
2530 * but include what's left next time.
2531 */
2532 if (thread->quantum_remaining < min_std_quantum) {
2533 thread->reason |= AST_QUANTUM;
2534 thread->quantum_remaining += SCHED(initial_quantum_size)(thread);
2535 }
2536 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
2537 }
2538
2539 /*
2540 * If we are doing a direct handoff then
2541 * take the remainder of the quantum.
2542 */
2543 if ((thread->reason & (AST_HANDOFF|AST_QUANTUM)) == AST_HANDOFF) {
2544 self->quantum_remaining = thread->quantum_remaining;
2545 thread->reason |= AST_QUANTUM;
2546 thread->quantum_remaining = 0;
2547 } else {
2548 #if defined(CONFIG_SCHED_MULTIQ)
2549 if (SCHED(sched_groups_enabled) &&
2550 thread->sched_group == self->sched_group) {
2551 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2552 MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUANTUM_HANDOFF),
2553 self->reason, (uintptr_t)thread_tid(thread),
2554 self->quantum_remaining, thread->quantum_remaining, 0);
2555
2556 self->quantum_remaining = thread->quantum_remaining;
2557 thread->quantum_remaining = 0;
2558 /* Don't set AST_QUANTUM here - old thread might still want to preempt someone else */
2559 }
2560 #endif /* defined(CONFIG_SCHED_MULTIQ) */
2561 }
2562
2563 thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
2564
2565 if (!(thread->state & TH_WAIT)) {
2566 /*
2567 * Still runnable.
2568 */
2569 thread->last_made_runnable_time = mach_approximate_time();
2570
2571 machine_thread_going_off_core(thread, FALSE, processor->last_dispatch);
2572
2573 if (thread->reason & AST_QUANTUM)
2574 thread_setrun(thread, SCHED_TAILQ);
2575 else if (thread->reason & AST_PREEMPT)
2576 thread_setrun(thread, SCHED_HEADQ);
2577 else
2578 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
2579
2580 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2581 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2582 (uintptr_t)thread_tid(thread), thread->reason, thread->state,
2583 sched_run_buckets[TH_BUCKET_RUN], 0);
2584
2585 if (thread->wake_active) {
2586 thread->wake_active = FALSE;
2587 thread_unlock(thread);
2588
2589 thread_wakeup(&thread->wake_active);
2590 } else {
2591 thread_unlock(thread);
2592 }
2593
2594 wake_unlock(thread);
2595 } else {
2596 /*
2597 * Waiting.
2598 */
2599 boolean_t should_terminate = FALSE;
2600 uint32_t new_run_count;
2601
2602 /* Only the first call to thread_dispatch
2603 * after explicit termination should add
2604 * the thread to the termination queue
2605 */
2606 if ((thread->state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) {
2607 should_terminate = TRUE;
2608 thread->state |= TH_TERMINATE2;
2609 }
2610
2611 thread->state &= ~TH_RUN;
2612 thread->last_made_runnable_time = ~0ULL;
2613 thread->chosen_processor = PROCESSOR_NULL;
2614
2615 new_run_count = sched_run_decr(thread);
2616
2617 #if CONFIG_SCHED_SFI
2618 if ((thread->state & (TH_WAIT | TH_TERMINATE)) == TH_WAIT) {
2619 if (thread->reason & AST_SFI) {
2620 thread->wait_sfi_begin_time = processor->last_dispatch;
2621 }
2622 }
2623 #endif
2624
2625 machine_thread_going_off_core(thread, should_terminate, processor->last_dispatch);
2626
2627 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2628 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
2629 (uintptr_t)thread_tid(thread), thread->reason, thread->state,
2630 new_run_count, 0);
2631
2632 (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
2633
2634 if (thread->wake_active) {
2635 thread->wake_active = FALSE;
2636 thread_unlock(thread);
2637
2638 thread_wakeup(&thread->wake_active);
2639 } else {
2640 thread_unlock(thread);
2641 }
2642
2643 wake_unlock(thread);
2644
2645 if (should_terminate)
2646 thread_terminate_enqueue(thread);
2647 }
2648 }
2649 }
2650
2651 /* Update (new) current thread and reprogram quantum timer */
2652 thread_lock(self);
2653 if (!(self->state & TH_IDLE)) {
2654 uint64_t arg1, arg2;
2655 int urgency;
2656 uint64_t latency;
2657
2658 #if CONFIG_SCHED_SFI
2659 ast_t new_ast;
2660
2661 new_ast = sfi_thread_needs_ast(self, NULL);
2662
2663 if (new_ast != AST_NONE) {
2664 ast_on(new_ast);
2665 }
2666 #endif
2667
2668 assertf(processor->last_dispatch >= self->last_made_runnable_time, "Non-monotonic time? dispatch at 0x%llx, runnable at 0x%llx", processor->last_dispatch, self->last_made_runnable_time);
2669 latency = processor->last_dispatch - self->last_made_runnable_time;
2670
2671 urgency = thread_get_urgency(self, &arg1, &arg2);
2672
2673 thread_tell_urgency(urgency, arg1, arg2, latency, self);
2674
2675 machine_thread_going_on_core(self, urgency, latency, processor->last_dispatch);
2676
2677 /*
2678 * Get a new quantum if none remaining.
2679 */
2680 if (self->quantum_remaining == 0) {
2681 thread_quantum_init(self);
2682 }
2683
2684 /*
2685 * Set up quantum timer and timeslice.
2686 */
2687 processor->quantum_end = processor->last_dispatch + self->quantum_remaining;
2688 timer_call_enter1(&processor->quantum_timer, self, processor->quantum_end, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
2689
2690 processor->first_timeslice = TRUE;
2691 } else {
2692 timer_call_cancel(&processor->quantum_timer);
2693 processor->first_timeslice = FALSE;
2694
2695 thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, self);
2696 machine_thread_going_on_core(self, THREAD_URGENCY_NONE, 0, processor->last_dispatch);
2697 }
2698
2699 self->computation_epoch = processor->last_dispatch;
2700 self->reason = AST_NONE;
2701 processor->starting_pri = self->sched_pri;
2702
2703 thread_unlock(self);
2704
2705 #if defined(CONFIG_SCHED_DEFERRED_AST)
2706 /*
2707 * TODO: Can we state that redispatching our old thread is also
2708 * uninteresting?
2709 */
2710 if ((((volatile uint32_t)sched_run_buckets[TH_BUCKET_RUN]) == 1) &&
2711 !(self->state & TH_IDLE)) {
2712 pset_cancel_deferred_dispatch(processor->processor_set, processor);
2713 }
2714 #endif
2715
2716 }
2717
2718 /*
2719 * thread_block_reason:
2720 *
2721 * Forces a reschedule, blocking the caller if a wait
2722 * has been asserted.
2723 *
2724 * If a continuation is specified, then thread_invoke will
2725 * attempt to discard the thread's kernel stack. When the
2726 * thread resumes, it will execute the continuation function
2727 * on a new kernel stack.
2728 */
2729 counter(mach_counter_t c_thread_block_calls = 0;)
2730
2731 wait_result_t
2732 thread_block_reason(
2733 thread_continue_t continuation,
2734 void *parameter,
2735 ast_t reason)
2736 {
2737 thread_t self = current_thread();
2738 processor_t processor;
2739 thread_t new_thread;
2740 spl_t s;
2741
2742 counter(++c_thread_block_calls);
2743
2744 s = splsched();
2745
2746 processor = current_processor();
2747
2748 /* If we're explicitly yielding, force a subsequent quantum */
2749 if (reason & AST_YIELD)
2750 processor->first_timeslice = FALSE;
2751
2752 /* We're handling all scheduling ASTs */
2753 ast_off(AST_SCHEDULING);
2754
2755 #if PROC_REF_DEBUG
2756 if ((continuation != NULL) && (self->task != kernel_task)) {
2757 if (uthread_get_proc_refcount(self->uthread) != 0) {
2758 panic("thread_block_reason with continuation uthread %p with uu_proc_refcount != 0", self->uthread);
2759 }
2760 }
2761 #endif
2762
2763 self->continuation = continuation;
2764 self->parameter = parameter;
2765
2766 if (self->state & ~(TH_RUN | TH_IDLE)) {
2767 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2768 MACHDBG_CODE(DBG_MACH_SCHED,MACH_BLOCK),
2769 reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
2770 }
2771
2772 do {
2773 thread_lock(self);
2774 new_thread = thread_select(self, processor, reason);
2775 thread_unlock(self);
2776 } while (!thread_invoke(self, new_thread, reason));
2777
2778 splx(s);
2779
2780 return (self->wait_result);
2781 }
2782
2783 /*
2784 * thread_block:
2785 *
2786 * Block the current thread if a wait has been asserted.
2787 */
2788 wait_result_t
2789 thread_block(
2790 thread_continue_t continuation)
2791 {
2792 return thread_block_reason(continuation, NULL, AST_NONE);
2793 }
2794
2795 wait_result_t
2796 thread_block_parameter(
2797 thread_continue_t continuation,
2798 void *parameter)
2799 {
2800 return thread_block_reason(continuation, parameter, AST_NONE);
2801 }
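
/*
 * Editorial sketch: the comment on thread_block_reason() above notes that
 * supplying a continuation lets thread_invoke() discard the kernel stack
 * and resume the thread later on a fresh one. The fragment below is a
 * minimal, hypothetical illustration of that pattern; example_event,
 * example_wait and example_continuation are not part of this file, and the
 * block is guarded out so it is never compiled.
 */
#if 0
static int example_event;

static void example_continuation(void *parameter, wait_result_t wresult);

static void
example_wait(void *parameter)
{
	/* Assert the wait, then block with a continuation instead of keeping the stack. */
	assert_wait((event_t)&example_event, THREAD_UNINT);
	(void) thread_block_parameter(example_continuation, parameter);
	/*NOTREACHED - the stack is discarded and the continuation is called instead */
}

static void
example_continuation(void *parameter, wait_result_t wresult)
{
	(void) wresult;
	/*
	 * Resumed here, via call_continuation(), on a new kernel stack after
	 * thread_wakeup(&example_event). A continuation must not return, so
	 * simply wait again for the next event.
	 */
	example_wait(parameter);
	/*NOTREACHED*/
}
#endif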
2802
2803 /*
2804 * thread_run:
2805 *
2806 * Switch directly from the current thread to the
2807 * new thread, handing off our quantum if appropriate.
2808 *
2809 * New thread must be runnable, and not on a run queue.
2810 *
2811 * Called at splsched.
2812 */
2813 int
2814 thread_run(
2815 thread_t self,
2816 thread_continue_t continuation,
2817 void *parameter,
2818 thread_t new_thread)
2819 {
2820 ast_t handoff = AST_HANDOFF;
2821
2822 self->continuation = continuation;
2823 self->parameter = parameter;
2824
2825 while (!thread_invoke(self, new_thread, handoff)) {
2826 processor_t processor = current_processor();
2827
2828 thread_lock(self);
2829 new_thread = thread_select(self, processor, AST_NONE);
2830 thread_unlock(self);
2831 handoff = AST_NONE;
2832 }
2833
2834 return (self->wait_result);
2835 }
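
/*
 * Editorial note: the AST_HANDOFF reason passed above is consumed in
 * thread_dispatch(). When the handoff succeeds, the switched-to thread
 * inherits whatever remained of the handing-off thread's quantum, while
 * the old thread is marked AST_QUANTUM with zero quantum remaining and,
 * if it is still runnable, is requeued at the tail of its run queue.
 */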
2836
2837 /*
2838 * thread_continue:
2839 *
2840 * Called at splsched when a thread first receives
2841 * a new stack after a continuation.
2842 */
2843 void
2844 thread_continue(
2845 thread_t thread)
2846 {
2847 thread_t self = current_thread();
2848 thread_continue_t continuation;
2849 void *parameter;
2850
2851 DTRACE_SCHED(on__cpu);
2852
2853 continuation = self->continuation;
2854 parameter = self->parameter;
2855
2856 #if KPERF
2857 kperf_on_cpu(self, continuation, NULL);
2858 #endif
2859
2860 thread_dispatch(thread, self);
2861
2862 self->continuation = self->parameter = NULL;
2863
2864 if (thread != THREAD_NULL)
2865 (void)spllo();
2866
2867 TLOG(1, "thread_continue: calling call_continuation \n");
2868 call_continuation(continuation, parameter, self->wait_result);
2869 /*NOTREACHED*/
2870 }
2871
2872 void
2873 thread_quantum_init(thread_t thread)
2874 {
2875 if (thread->sched_mode == TH_MODE_REALTIME) {
2876 thread->quantum_remaining = thread->realtime.computation;
2877 } else {
2878 thread->quantum_remaining = SCHED(initial_quantum_size)(thread);
2879 }
2880 }
2881
2882 uint32_t
2883 sched_timeshare_initial_quantum_size(thread_t thread)
2884 {
2885 if ((thread != THREAD_NULL) && thread->th_sched_bucket == TH_BUCKET_SHARE_BG)
2886 return bg_quantum;
2887 else
2888 return std_quantum;
2889 }
2890
2891 /*
2892 * run_queue_init:
2893 *
2894 * Initialize a run queue before first use.
2895 */
2896 void
2897 run_queue_init(
2898 run_queue_t rq)
2899 {
2900 rq->highq = NOPRI;
2901 for (u_int i = 0; i < BITMAP_LEN(NRQS); i++)
2902 rq->bitmap[i] = 0;
2903 rq->urgency = rq->count = 0;
2904 for (int i = 0; i < NRQS; i++)
2905 queue_init(&rq->queues[i]);
2906 }
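
/*
 * Editorial note on the invariants maintained by the run_queue routines
 * below: bit i of rq->bitmap is set exactly when rq->queues[i] is
 * non-empty, rq->highq caches the highest set bit (NOPRI when the run
 * queue is empty), rq->count is the total number of enqueued threads, and
 * rq->urgency counts the enqueued threads whose priority satisfies
 * SCHED(priority_is_urgent)().
 */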
2907
2908 /*
2909 * run_queue_dequeue:
2910 *
2911 * Perform a dequeue operation on a run queue,
2912 * and return the resulting thread.
2913 *
2914 * The run queue must be locked (see thread_run_queue_remove()
2915 * for more info), and not empty.
2916 */
2917 thread_t
2918 run_queue_dequeue(
2919 run_queue_t rq,
2920 integer_t options)
2921 {
2922 thread_t thread;
2923 queue_t queue = &rq->queues[rq->highq];
2924
2925 if (options & SCHED_HEADQ) {
2926 thread = qe_dequeue_head(queue, struct thread, runq_links);
2927 } else {
2928 thread = qe_dequeue_tail(queue, struct thread, runq_links);
2929 }
2930
2931 assert(thread != THREAD_NULL);
2932 assert_thread_magic(thread);
2933
2934 thread->runq = PROCESSOR_NULL;
2935 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2936 rq->count--;
2937 if (SCHED(priority_is_urgent)(rq->highq)) {
2938 rq->urgency--; assert(rq->urgency >= 0);
2939 }
2940 if (queue_empty(queue)) {
2941 bitmap_clear(rq->bitmap, rq->highq);
2942 rq->highq = bitmap_first(rq->bitmap, NRQS);
2943 }
2944
2945 return thread;
2946 }
2947
2948 /*
2949 * run_queue_enqueue:
2950 *
2951 * Perform an enqueue operation on a run queue.
2952 *
2953 * The run queue must be locked (see thread_run_queue_remove()
2954 * for more info).
2955 */
2956 boolean_t
2957 run_queue_enqueue(
2958 run_queue_t rq,
2959 thread_t thread,
2960 integer_t options)
2961 {
2962 queue_t queue = &rq->queues[thread->sched_pri];
2963 boolean_t result = FALSE;
2964
2965 assert_thread_magic(thread);
2966
2967 if (queue_empty(queue)) {
2968 enqueue_tail(queue, &thread->runq_links);
2969
2970 rq_bitmap_set(rq->bitmap, thread->sched_pri);
2971 if (thread->sched_pri > rq->highq) {
2972 rq->highq = thread->sched_pri;
2973 result = TRUE;
2974 }
2975 } else {
2976 if (options & SCHED_TAILQ)
2977 enqueue_tail(queue, &thread->runq_links);
2978 else
2979 enqueue_head(queue, &thread->runq_links);
2980 }
2981 if (SCHED(priority_is_urgent)(thread->sched_pri))
2982 rq->urgency++;
2983 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2984 rq->count++;
2985
2986 return (result);
2987 }
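
/*
 * Usage note (editorial): the TRUE return above means the enqueued thread
 * raised rq->highq, i.e. it is now the highest-priority runnable thread in
 * this run queue, which typically prompts the caller to consider a
 * preemption check. SCHED_HEADQ places the thread ahead of its priority
 * peers (as thread_dispatch() does for AST_PREEMPT), while SCHED_TAILQ
 * places it behind them (as for AST_QUANTUM expirations).
 */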
2988
2989 /*
2990 * run_queue_remove:
2991 *
2992 * Remove a specific thread from a runqueue.
2993 *
2994 * The run queue must be locked.
2995 */
2996 void
2997 run_queue_remove(
2998 run_queue_t rq,
2999 thread_t thread)
3000 {
3001 assert(thread->runq != PROCESSOR_NULL);
3002 assert_thread_magic(thread);
3003
3004 remqueue(&thread->runq_links);
3005 SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3006 rq->count--;
3007 if (SCHED(priority_is_urgent)(thread->sched_pri)) {
3008 rq->urgency--; assert(rq->urgency >= 0);
3009 }
3010
3011 if (queue_empty(&rq->queues[thread->sched_pri])) {
3012 /* update run queue status */
3013 bitmap_clear(rq->bitmap, thread->sched_pri);
3014 rq->highq = bitmap_first(rq->bitmap, NRQS);
3015 }
3016
3017 thread->runq = PROCESSOR_NULL;
3018 }
3019
3020 /* Assumes RT lock is not held, and acquires splsched/rt_lock itself */
3021 void
3022 rt_runq_scan(sched_update_scan_context_t scan_context)
3023 {
3024 spl_t s;
3025 thread_t thread;
3026
3027 s = splsched();
3028 rt_lock_lock();
3029
3030 qe_foreach_element_safe(thread, &rt_runq.queue, runq_links) {
3031 if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
3032 scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
3033 }
3034 }
3035
3036 rt_lock_unlock();
3037 splx(s);
3038 }
3039
3040
3041 /*
3042 * realtime_queue_insert:
3043 *
3044 * Enqueue a thread for realtime execution.
3045 */
3046 static boolean_t
3047 realtime_queue_insert(thread_t thread)
3048 {
3049 queue_t queue = &rt_runq.queue;
3050 uint64_t deadline = thread->realtime.deadline;
3051 boolean_t preempt = FALSE;
3052
3053 rt_lock_lock();
3054
3055 if (queue_empty(queue)) {
3056 enqueue_tail(queue, &thread->runq_links);
3057 preempt = TRUE;
3058 } else {
3059 /* Insert into rt_runq in thread deadline order */
3060 queue_entry_t iter;
3061 qe_foreach(iter, queue) {
3062 thread_t iter_thread = qe_element(iter, struct thread, runq_links);
3063 assert_thread_magic(iter_thread);
3064
3065 if (deadline < iter_thread->realtime.deadline) {
3066 if (iter == queue_first(queue))
3067 preempt = TRUE;
3068 insque(&thread->runq_links, queue_prev(iter));
3069 break;
3070 } else if (iter == queue_last(queue)) {
3071 enqueue_tail(queue, &thread->runq_links);
3072 break;
3073 }
3074 }
3075 }
3076
3077 thread->runq = THREAD_ON_RT_RUNQ;
3078 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
3079 rt_runq.count++;
3080
3081 rt_lock_unlock();
3082
3083 return (preempt);
3084 }
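
/*
 * Worked example (editorial, hypothetical deadlines): if rt_runq already
 * holds threads with deadlines of 100us, 200us and 300us from now, a new
 * thread with a 150us deadline is inserted between the first two and
 * FALSE is returned; a new thread with a 50us deadline lands at the head
 * of the queue (or in an empty queue), and the TRUE return indicates that
 * the new thread is now the earliest-deadline entry in the EDF queue.
 */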
3085
3086 /*
3087 * realtime_setrun:
3088 *
3089 * Dispatch a thread for realtime execution.
3090 *
3091 * Thread must be locked. Associated pset must
3092 * be locked, and is returned unlocked.
3093 */
3094 static void
3095 realtime_setrun(
3096 processor_t processor,
3097 thread_t thread)
3098 {
3099 processor_set_t pset = processor->processor_set;
3100 ast_t preempt;
3101
3102 boolean_t do_signal_idle = FALSE, do_cause_ast = FALSE;
3103
3104 thread->chosen_processor = processor;
3105
3106 /* <rdar://problem/15102234> */
3107 assert(thread->bound_processor == PROCESSOR_NULL);
3108
3109 /*
3110 * Dispatch directly onto idle processor.
3111 */
3112 if ( (thread->bound_processor == processor)
3113 && processor->state == PROCESSOR_IDLE) {
3114 re_queue_tail(&pset->active_queue, &processor->processor_queue);
3115
3116 processor->next_thread = thread;
3117 processor->current_pri = thread->sched_pri;
3118 processor->current_thmode = thread->sched_mode;
3119 processor->current_sfi_class = thread->sfi_class;
3120 processor->deadline = thread->realtime.deadline;
3121 processor->state = PROCESSOR_DISPATCHING;
3122
3123 if (processor != current_processor()) {
3124 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3125 /* cleared on exit from main processor_idle() loop */
3126 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3127 do_signal_idle = TRUE;
3128 }
3129 }
3130 pset_unlock(pset);
3131
3132 if (do_signal_idle) {
3133 machine_signal_idle(processor);
3134 }
3135 return;
3136 }
3137
3138 if (processor->current_pri < BASEPRI_RTQUEUES)
3139 preempt = (AST_PREEMPT | AST_URGENT);
3140 else if (thread->realtime.deadline < processor->deadline)
3141 preempt = (AST_PREEMPT | AST_URGENT);
3142 else
3143 preempt = AST_NONE;
3144
3145 realtime_queue_insert(thread);
3146
3147 if (preempt != AST_NONE) {
3148 if (processor->state == PROCESSOR_IDLE) {
3149 re_queue_tail(&pset->active_queue, &processor->processor_queue);
3150
3151 processor->next_thread = THREAD_NULL;
3152 processor->current_pri = thread->sched_pri;
3153 processor->current_thmode = thread->sched_mode;
3154 processor->current_sfi_class = thread->sfi_class;
3155 processor->deadline = thread->realtime.deadline;
3156 processor->state = PROCESSOR_DISPATCHING;
3157 if (processor == current_processor()) {
3158 ast_on(preempt);
3159 } else {
3160 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3161 /* cleared on exit from main processor_idle() loop */
3162 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3163 do_signal_idle = TRUE;
3164 }
3165 }
3166 } else if (processor->state == PROCESSOR_DISPATCHING) {
3167 if ((processor->next_thread == THREAD_NULL) && ((processor->current_pri < thread->sched_pri) || (processor->deadline > thread->realtime.deadline))) {
3168 processor->current_pri = thread->sched_pri;
3169 processor->current_thmode = thread->sched_mode;
3170 processor->current_sfi_class = thread->sfi_class;
3171 processor->deadline = thread->realtime.deadline;
3172 }
3173 } else {
3174 if (processor == current_processor()) {
3175 ast_on(preempt);
3176 } else {
3177 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3178 /* cleared after IPI causes csw_check() to be called */
3179 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3180 do_cause_ast = TRUE;
3181 }
3182 }
3183 }
3184 } else {
3185 /* Selected processor was too busy; just keep the thread enqueued and let other processors drain it naturally. */
3186 }
3187
3188 pset_unlock(pset);
3189
3190 if (do_signal_idle) {
3191 machine_signal_idle(processor);
3192 } else if (do_cause_ast) {
3193 cause_ast_check(processor);
3194 }
3195 }
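
/*
 * Editorial summary of the paths above: an idle chosen processor is moved
 * to the active queue, marked DISPATCHING and signalled out of idle; a
 * processor already DISPATCHING merely has its current_pri/deadline
 * snapshot refreshed when the new thread beats it; a running processor is
 * sent an AST IPI so csw_check() can preempt; and when no preemption is
 * warranted the thread simply waits on rt_runq for a processor to drain it.
 */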
3196
3197
3198 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
3199
3200 boolean_t
3201 priority_is_urgent(int priority)
3202 {
3203 return bitmap_test(sched_preempt_pri, priority) ? TRUE : FALSE;
3204 }
3205
3206 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
3207
3208 /*
3209 * processor_setrun:
3210 *
3211 * Dispatch a thread for execution on a
3212 * processor.
3213 *
3214 * Thread must be locked. Associated pset must
3215 * be locked, and is returned unlocked.
3216 */
3217 static void
3218 processor_setrun(
3219 processor_t processor,
3220 thread_t thread,
3221 integer_t options)
3222 {
3223 processor_set_t pset = processor->processor_set;
3224 ast_t preempt;
3225 enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
3226 enum { eNoSignal, eDoSignal, eDoDeferredSignal } do_signal_idle = eNoSignal;
3227
3228 boolean_t do_cause_ast = FALSE;
3229
3230 thread->chosen_processor = processor;
3231
3232 /*
3233 * Dispatch directly onto idle processor.
3234 */
3235 if ( (SCHED(direct_dispatch_to_idle_processors) ||
3236 thread->bound_processor == processor)
3237 && processor->state == PROCESSOR_IDLE) {
3238
3239 re_queue_tail(&pset->active_queue, &processor->processor_queue);
3240
3241 processor->next_thread = thread;
3242 processor->current_pri = thread->sched_pri;
3243 processor->current_thmode = thread->sched_mode;
3244 processor->current_sfi_class = thread->sfi_class;
3245 processor->deadline = UINT64_MAX;
3246 processor->state = PROCESSOR_DISPATCHING;
3247
3248 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3249 /* cleared on exit from main processor_idle() loop */
3250 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3251 do_signal_idle = eDoSignal;
3252 }
3253
3254 pset_unlock(pset);
3255
3256 if (do_signal_idle == eDoSignal) {
3257 machine_signal_idle(processor);
3258 }
3259
3260 return;
3261 }
3262
3263 /*
3264 * Set preemption mode.
3265 */
3266 #if defined(CONFIG_SCHED_DEFERRED_AST)
3267 /* TODO: Do we need to care about urgency (see rdar://problem/20136239)? */
3268 #endif
3269 if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri)
3270 preempt = (AST_PREEMPT | AST_URGENT);
3271 else if(processor->active_thread && thread_eager_preemption(processor->active_thread))
3272 preempt = (AST_PREEMPT | AST_URGENT);
3273 else if ((thread->sched_mode == TH_MODE_TIMESHARE) && (thread->sched_pri < thread->base_pri)) {
3274 if(SCHED(priority_is_urgent)(thread->base_pri) && thread->sched_pri > processor->current_pri) {
3275 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
3276 } else {
3277 preempt = AST_NONE;
3278 }
3279 } else
3280 preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
3281
3282 SCHED(processor_enqueue)(processor, thread, options);
3283
3284 if (preempt != AST_NONE) {
3285 if (processor->state == PROCESSOR_IDLE) {
3286 re_queue_tail(&pset->active_queue, &processor->processor_queue);
3287
3288 processor->next_thread = THREAD_NULL;
3289 processor->current_pri = thread->sched_pri;
3290 processor->current_thmode = thread->sched_mode;
3291 processor->current_sfi_class = thread->sfi_class;
3292 processor->deadline = UINT64_MAX;
3293 processor->state = PROCESSOR_DISPATCHING;
3294
3295 ipi_action = eExitIdle;
3296 } else if ( processor->state == PROCESSOR_DISPATCHING) {
3297 if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) {
3298 processor->current_pri = thread->sched_pri;
3299 processor->current_thmode = thread->sched_mode;
3300 processor->current_sfi_class = thread->sfi_class;
3301 processor->deadline = UINT64_MAX;
3302 }
3303 } else if ( (processor->state == PROCESSOR_RUNNING ||
3304 processor->state == PROCESSOR_SHUTDOWN) &&
3305 (thread->sched_pri >= processor->current_pri)) {
3306 ipi_action = eInterruptRunning;
3307 }
3308 } else {
3309 /*
3310 * New thread is not important enough to preempt what is running, but
3311 * special processor states may need special handling
3312 */
3313 if (processor->state == PROCESSOR_SHUTDOWN &&
3314 thread->sched_pri >= processor->current_pri ) {
3315 ipi_action = eInterruptRunning;
3316 } else if (processor->state == PROCESSOR_IDLE) {
3317 re_queue_tail(&pset->active_queue, &processor->processor_queue);
3318
3319 processor->next_thread = THREAD_NULL;
3320 processor->current_pri = thread->sched_pri;
3321 processor->current_thmode = thread->sched_mode;
3322 processor->current_sfi_class = thread->sfi_class;
3323 processor->deadline = UINT64_MAX;
3324 processor->state = PROCESSOR_DISPATCHING;
3325
3326 ipi_action = eExitIdle;
3327 }
3328 }
3329
3330 switch (ipi_action) {
3331 case eDoNothing:
3332 break;
3333 case eExitIdle:
3334 if (processor == current_processor()) {
3335 if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
3336 ast_on(preempt);
3337 } else {
3338 #if defined(CONFIG_SCHED_DEFERRED_AST)
3339 if (!(pset->pending_deferred_AST_cpu_mask & (1ULL << processor->cpu_id)) &&
3340 !(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3341 /* cleared on exit from main processor_idle() loop */
3342 pset->pending_deferred_AST_cpu_mask |= (1ULL << processor->cpu_id);
3343 do_signal_idle = eDoDeferredSignal;
3344 }
3345 #else
3346 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3347 /* cleared on exit from main processor_idle() loop */
3348 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3349 do_signal_idle = eDoSignal;
3350 }
3351 #endif
3352 }
3353 break;
3354 case eInterruptRunning:
3355 if (processor == current_processor()) {
3356 if (csw_check_locked(processor, pset, AST_NONE) != AST_NONE)
3357 ast_on(preempt);
3358 } else {
3359 if (!(pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))) {
3360 /* cleared after IPI causes csw_check() to be called */
3361 pset->pending_AST_cpu_mask |= (1ULL << processor->cpu_id);
3362 do_cause_ast = TRUE;
3363 }
3364 }
3365 break;
3366 }
3367
3368 pset_unlock(pset);
3369
3370 if (do_signal_idle == eDoSignal) {
3371 machine_signal_idle(processor);
3372 }
3373 #if defined(CONFIG_SCHED_DEFERRED_AST)
3374 else if (do_signal_idle == eDoDeferredSignal) {
3375 /*
3376 * TODO: The ability to cancel this signal could make
3377 * sending it outside of the pset lock an issue. Do
3378 * we need to address this? Or would the only fallout
3379 * be that the core takes a signal? As long as we do
3380 * not run the risk of having a core marked as signal
3381 * outstanding, with no real signal outstanding, the
3382 * only result should be that we fail to cancel some
3383 * signals.
3384 */
3385 machine_signal_idle_deferred(processor);
3386 }
3387 #endif
3388 else if (do_cause_ast) {
3389 cause_ast_check(processor);
3390 }
3391 }
3392
3393 /*
3394 * choose_next_pset:
3395 *
3396 * Return the next sibling pset containing
3397 * available processors.
3398 *
3399 * Returns the original pset if none other is
3400 * suitable.
3401 */
3402 static processor_set_t
3403 choose_next_pset(
3404 processor_set_t pset)
3405 {
3406 processor_set_t nset = pset;
3407
3408 do {
3409 nset = next_pset(nset);
3410 } while (nset->online_processor_count < 1 && nset != pset);
3411
3412 return (nset);
3413 }
3414
3415 /*
3416 * choose_processor:
3417 *
3418 * Choose a processor for the thread, beginning at
3419 * the pset. Accepts an optional processor hint in
3420 * the pset.
3421 *
3422 * Returns a processor, possibly from a different pset.
3423 *
3424 * The thread must be locked. The pset must be locked,
3425 * and the resulting pset is locked on return.
3426 */
3427 processor_t
3428 choose_processor(
3429 processor_set_t pset,
3430 processor_t processor,
3431 thread_t thread)
3432 {
3433 processor_set_t nset, cset = pset;
3434
3435 assert(thread->sched_pri <= BASEPRI_RTQUEUES);
3436
3437 /*
3438 * Prefer the hinted processor, when appropriate.
3439 */
3440
3441 /* Fold last processor hint from secondary processor to its primary */
3442 if (processor != PROCESSOR_NULL) {
3443 processor = processor->processor_primary;
3444 }
3445
3446 /*
3447 * Only consult platform layer if pset is active, which
3448 * it may not be in some cases when a multi-set system
3449 * is going to sleep.
3450 */
3451 if (pset->online_processor_count) {
3452 if ((processor == PROCESSOR_NULL) || (processor->processor_set == pset && processor->state == PROCESSOR_IDLE)) {
3453 processor_t mc_processor = machine_choose_processor(pset, processor);
3454 if (mc_processor != PROCESSOR_NULL)
3455 processor = mc_processor->processor_primary;
3456 }
3457 }
3458
3459 /*
3460 * At this point, we may have a processor hint, and we may have
3461 * an initial starting pset. If the hint is not in the pset, or
3462 * if the hint is for a processor in an invalid state, discard
3463 * the hint.
3464 */
3465 if (processor != PROCESSOR_NULL) {
3466 if (processor->processor_set != pset) {
3467 processor = PROCESSOR_NULL;
3468 } else if (!processor->is_recommended) {
3469 processor = PROCESSOR_NULL;
3470 } else {
3471 switch (processor->state) {
3472 case PROCESSOR_START:
3473 case PROCESSOR_SHUTDOWN:
3474 case PROCESSOR_OFF_LINE:
3475 /*
3476 * Hint is for a processor that cannot support running new threads.
3477 */
3478 processor = PROCESSOR_NULL;
3479 break;
3480 case PROCESSOR_IDLE:
3481 /*
3482 * Hint is for an idle processor. Assume it is no worse than any other
3483 * idle processor. The platform layer had an opportunity to provide
3484 * the "least cost idle" processor above.
3485 */
3486 return (processor);
3487 case PROCESSOR_RUNNING:
3488 case PROCESSOR_DISPATCHING:
3489 /*
3490 * Hint is for an active CPU. This fast-path allows
3491 * realtime threads to preempt non-realtime threads
3492 * to regain their previous executing processor.
3493 */
3494 if ((thread->sched_pri >= BASEPRI_RTQUEUES) &&
3495 (processor->current_pri < BASEPRI_RTQUEUES))
3496 return (processor);
3497
3498 /* Otherwise, use hint as part of search below */
3499 break;
3500 default:
3501 processor = PROCESSOR_NULL;
3502 break;
3503 }
3504 }
3505 }
3506
3507 /*
3508 * Iterate through the processor sets to locate
3509 * an appropriate processor. Seed results with
3510 * a last-processor hint, if available, so that
3511 * a search must find something strictly better
3512 * to replace it.
3513 *
3514 * A primary/secondary pair of SMT processors are
3515 * "unpaired" if the primary is busy but its
3516 * corresponding secondary is idle (so the physical
3517 * core has full use of its resources).
3518 */
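
/*
 * Example (editorial, hypothetical CPU numbering): on a system where CPU0
 * and CPU1 share one physical core and CPU2/CPU3 share another, if CPU0 is
 * running a thread while its sibling CPU1 sits on the idle_secondary_queue,
 * the CPU0/CPU1 pair is "unpaired". CPU0 is then tracked below as
 * lp_unpaired_primary_processor and preferred as a preemption target,
 * since a thread placed there gets the whole physical core to itself.
 */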
3519
3520 integer_t lowest_priority = MAXPRI + 1;
3521 integer_t lowest_unpaired_primary_priority = MAXPRI + 1;
3522 integer_t lowest_count = INT_MAX;
3523 uint64_t furthest_deadline = 1;
3524 processor_t lp_processor = PROCESSOR_NULL;
3525 processor_t lp_unpaired_primary_processor = PROCESSOR_NULL;
3526 processor_t lp_unpaired_secondary_processor = PROCESSOR_NULL;
3527 processor_t lc_processor = PROCESSOR_NULL;
3528 processor_t fd_processor = PROCESSOR_NULL;
3529
3530 if (processor != PROCESSOR_NULL) {
3531 /* All other states should be enumerated above. */
3532 assert(processor->state == PROCESSOR_RUNNING || processor->state == PROCESSOR_DISPATCHING);
3533
3534 lowest_priority = processor->current_pri;
3535 lp_processor = processor;
3536
3537 if (processor->current_pri >= BASEPRI_RTQUEUES) {
3538 furthest_deadline = processor->deadline;
3539 fd_processor = processor;
3540 }
3541
3542 lowest_count = SCHED(processor_runq_count)(processor);
3543 lc_processor = processor;
3544 }
3545
3546 do {
3547
3548 /*
3549 * Choose an idle processor, in pset traversal order
3550 */
3551 qe_foreach_element(processor, &cset->idle_queue, processor_queue) {
3552 if (processor->is_recommended)
3553 return processor;
3554 }
3555
3556 /*
3557 * Otherwise, enumerate active and idle processors to find candidates
3558 * with lower priority/etc.
3559 */
3560
3561 qe_foreach_element(processor, &cset->active_queue, processor_queue) {
3562
3563 if (!processor->is_recommended) {
3564 continue;
3565 }
3566
3567 integer_t cpri = processor->current_pri;
3568 if (cpri < lowest_priority) {
3569 lowest_priority = cpri;
3570 lp_processor = processor;
3571 }
3572
3573 if ((cpri >= BASEPRI_RTQUEUES) && (processor->deadline > furthest_deadline)) {
3574 furthest_deadline = processor->deadline;
3575 fd_processor = processor;
3576 }
3577
3578 integer_t ccount = SCHED(processor_runq_count)(processor);
3579 if (ccount < lowest_count) {
3580 lowest_count = ccount;
3581 lc_processor = processor;
3582 }
3583 }
3584
3585 /*
3586 * For SMT configs, these idle secondary processors must have an active primary; otherwise
3587 * the idle primary would have short-circuited the loop above.
3588 */
3589 qe_foreach_element(processor, &cset->idle_secondary_queue, processor_queue) {
3590
3591 if (!processor->is_recommended) {
3592 continue;
3593 }
3594
3595 processor_t cprimary = processor->processor_primary;
3596
3597 /* If the primary processor is offline or starting up, it's not a candidate for this path */
3598 if (cprimary->state == PROCESSOR_RUNNING || cprimary->state == PROCESSOR_DISPATCHING) {
3599 integer_t primary_pri = cprimary->current_pri;
3600
3601 if (primary_pri < lowest_unpaired_primary_priority) {
3602 lowest_unpaired_primary_priority = primary_pri;
3603 lp_unpaired_primary_processor = cprimary;
3604 lp_unpaired_secondary_processor = processor;
3605 }
3606 }
3607 }
3608
3609
3610 if (thread->sched_pri >= BASEPRI_RTQUEUES) {
3611
3612 /*
3613 * For realtime threads, the most important aspect is
3614 * scheduling latency, so we attempt to assign threads
3615 * to good preemption candidates (assuming an idle primary
3616 * processor was not available above).
3617 */
3618
3619 if (thread->sched_pri > lowest_unpaired_primary_priority) {
3620 /* Move to end of active queue so that the next thread doesn't also pick it */
3621 re_queue_tail(&cset->active_queue, &lp_unpaired_primary_processor->processor_queue);
3622 return lp_unpaired_primary_processor;
3623 }
3624 if (thread->sched_pri > lowest_priority) {
3625 /* Move to end of active queue so that the next thread doesn't also pick it */
3626 re_queue_tail(&cset->active_queue, &lp_processor->processor_queue);
3627 return lp_processor;
3628 }
3629 if (thread->realtime.deadline < furthest_deadline)
3630 return fd_processor;
3631
3632 /*
3633 * If all primary and secondary CPUs are busy with realtime
3634 * threads whose deadlines are earlier than ours, move on to the
3635 * next pset.
3636 */
3637 }
3638 else {
3639
3640 if (thread->sched_pri > lowest_unpaired_primary_priority) {
3641 /* Move to end of active queue so that the next thread doesn't also pick it */
3642 re_queue_tail(&cset->active_queue, &lp_unpaired_primary_processor->processor_queue);
3643 return lp_unpaired_primary_processor;
3644 }
3645 if (thread->sched_pri > lowest_priority) {
3646 /* Move to end of active queue so that the next thread doesn't also pick it */
3647 re_queue_tail(&cset->active_queue, &lp_processor->processor_queue);
3648 return lp_processor;
3649 }
3650
3651 /*
3652 * If all primary processors in this pset are running a higher
3653 * priority thread, move on to the next pset. Only when we have
3654 * exhausted this search do we fall back to other heuristics.
3655 */
3656 }
3657
3658 /*
3659 * Move onto the next processor set.
3660 */
3661 nset = next_pset(cset);
3662
3663 if (nset != pset) {
3664 pset_unlock(cset);
3665
3666 cset = nset;
3667 pset_lock(cset);
3668 }
3669 } while (nset != pset);
3670
3671 /*
3672 * Make sure that we pick a running processor,
3673 * and that the correct processor set is locked.
3674 * Since we may have unlocked the candidate processor's
3675 * pset, it may have changed state.
3676 *
3677 * All primary processors are running higher-priority
3678 * threads, so the only options left are enqueuing on
3679 * the secondary whose busy primary is running the lowest-priority
3680 * thread, or on the least-busy primary.
3681 */
3682 do {
3683
3684 /* lowest_priority is evaluated in the main loops above */
3685 if (lp_unpaired_secondary_processor != PROCESSOR_NULL) {
3686 processor = lp_unpaired_secondary_processor;
3687 lp_unpaired_secondary_processor = PROCESSOR_NULL;
3688 } else if (lc_processor != PROCESSOR_NULL) {
3689 processor = lc_processor;
3690 lc_processor = PROCESSOR_NULL;
3691 } else {
3692 /*
3693 * All processors are executing higher
3694 * priority threads, and the lowest_count
3695 * candidate was not usable
3696 */
3697 processor = master_processor;
3698 }
3699
3700 /*
3701 * Check that the correct processor set is
3702 * returned locked.
3703 */
3704 if (cset != processor->processor_set) {
3705 pset_unlock(cset);
3706 cset = processor->processor_set;
3707 pset_lock(cset);
3708 }
3709
3710 /*
3711 * We must verify that the chosen processor is still available.
3712 * master_processor is an exception, since we may need to preempt
3713 * a running thread on it during processor shutdown (for sleep),
3714 * and that thread needs to be enqueued on its runqueue to run
3715 * when the processor is restarted.
3716 */
3717 if (processor != master_processor && (processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE))
3718 processor = PROCESSOR_NULL;
3719
3720 } while (processor == PROCESSOR_NULL);
3721
3722 return (processor);
3723 }
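/*
 * A minimal sketch of the calling convention for SCHED(choose_processor),
 * as exercised by thread_setrun() below (locals pset/thread/processor are
 * assumed to be set up by the caller): the candidate pset is passed in
 * locked, the routine may migrate across psets, and it returns with the
 * chosen processor's pset locked.
 *
 *	pset_lock(pset);
 *	processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
 *	...dispatch onto processor, then unlock processor->processor_set...
 */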
3724
3725 /*
3726 * thread_setrun:
3727 *
3728 * Dispatch thread for execution, onto an idle
3729 * processor or run queue, and signal a preemption
3730 * as appropriate.
3731 *
3732 * Thread must be locked.
3733 */
3734 void
3735 thread_setrun(
3736 thread_t thread,
3737 integer_t options)
3738 {
3739 processor_t processor;
3740 processor_set_t pset;
3741
3742 assert((thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_TERMINATE|TH_TERMINATE2)) == TH_RUN);
3743 assert(thread->runq == PROCESSOR_NULL);
3744
3745 /*
3746 * Update priority if needed.
3747 */
3748 if (SCHED(can_update_priority)(thread))
3749 SCHED(update_priority)(thread);
3750
3751 thread->sfi_class = sfi_thread_classify(thread);
3752
3753 assert(thread->runq == PROCESSOR_NULL);
3754
3755 #if __SMP__
3756 if (thread->bound_processor == PROCESSOR_NULL) {
3757 /*
3758 * Unbound case.
3759 */
3760 if (thread->affinity_set != AFFINITY_SET_NULL) {
3761 /*
3762 * Use affinity set policy hint.
3763 */
3764 pset = thread->affinity_set->aset_pset;
3765 pset_lock(pset);
3766
3767 processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
3768
3769 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3770 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
3771 } else if (thread->last_processor != PROCESSOR_NULL) {
3772 /*
3773 * Simple (last processor) affinity case.
3774 */
3775 processor = thread->last_processor;
3776 pset = processor->processor_set;
3777 pset_lock(pset);
3778 processor = SCHED(choose_processor)(pset, processor, thread);
3779
3780 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3781 (uintptr_t)thread_tid(thread), thread->last_processor->cpu_id, processor->cpu_id, processor->state, 0);
3782 } else {
3783 /*
3784 * No Affinity case:
3785 *
3786 * Utilize a per-task hint to spread threads
3787 * among the available processor sets.
3788 */
3789 task_t task = thread->task;
3790
3791 pset = task->pset_hint;
3792 if (pset == PROCESSOR_SET_NULL)
3793 pset = current_processor()->processor_set;
3794
3795 pset = choose_next_pset(pset);
3796 pset_lock(pset);
3797
3798 processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
3799 task->pset_hint = processor->processor_set;
3800
3801 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3802 (uintptr_t)thread_tid(thread), (uintptr_t)-1, processor->cpu_id, processor->state, 0);
3803 }
3804 } else {
3805 /*
3806 * Bound case:
3807 *
3808 * Unconditionally dispatch on the processor.
3809 */
3810 processor = thread->bound_processor;
3811 pset = processor->processor_set;
3812 pset_lock(pset);
3813
3814 SCHED_DEBUG_CHOOSE_PROCESSOR_KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHOOSE_PROCESSOR)|DBG_FUNC_NONE,
3815 (uintptr_t)thread_tid(thread), (uintptr_t)-2, processor->cpu_id, processor->state, 0);
3816 }
3817 #else /* !__SMP__ */
3818 /* Only one processor to choose */
3819 assert(thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == master_processor);
3820 processor = master_processor;
3821 pset = processor->processor_set;
3822 pset_lock(pset);
3823 #endif /* !__SMP__ */
3824
3825 /*
3826 * Dispatch the thread on the chosen processor.
3827 * TODO: This should be based on sched_mode, not sched_pri
3828 */
3829 if (thread->sched_pri >= BASEPRI_RTQUEUES)
3830 realtime_setrun(processor, thread);
3831 else
3832 processor_setrun(processor, thread, options);
3833 }
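/*
 * A minimal caller sketch for thread_setrun(), assuming a hypothetical
 * local 'thread' that is runnable (TH_RUN) and not on any run queue,
 * following the conventions used elsewhere in this file:
 *
 *	spl_t s = splsched();
 *	thread_lock(thread);
 *	assert(thread->runq == PROCESSOR_NULL);
 *	thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
 *	thread_unlock(thread);
 *	splx(s);
 */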
3834
3835 processor_set_t
3836 task_choose_pset(
3837 task_t task)
3838 {
3839 processor_set_t pset = task->pset_hint;
3840
3841 if (pset != PROCESSOR_SET_NULL)
3842 pset = choose_next_pset(pset);
3843
3844 return (pset);
3845 }
3846
3847 /*
3848 * Check for a preemption point in
3849 * the current context.
3850 *
3851 * Called at splsched with thread locked.
3852 */
3853 ast_t
3854 csw_check(
3855 processor_t processor,
3856 ast_t check_reason)
3857 {
3858 processor_set_t pset = processor->processor_set;
3859 ast_t result;
3860
3861 pset_lock(pset);
3862
3863 /* If we were sent a remote AST and interrupted a running processor, acknowledge it here with pset lock held */
3864 pset->pending_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
3865
3866 result = csw_check_locked(processor, pset, check_reason);
3867
3868 pset_unlock(pset);
3869
3870 return result;
3871 }
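/*
 * A minimal sketch of the preemption-check pattern used by set_sched_pri()
 * and thread_set_eager_preempt() below, assuming the caller is at splsched
 * with the current thread's state already updated:
 *
 *	ast_t preempt = csw_check(processor, AST_NONE);
 *	if (preempt != AST_NONE)
 *		ast_on(preempt);
 */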
3872
3873 /*
3874 * Check for preemption at splsched with
3875 * pset and thread locked
3876 */
3877 ast_t
3878 csw_check_locked(
3879 processor_t processor,
3880 processor_set_t pset __unused,
3881 ast_t check_reason)
3882 {
3883 ast_t result;
3884 thread_t thread = processor->active_thread;
3885
3886 if (processor->first_timeslice) {
3887 if (rt_runq.count > 0)
3888 return (check_reason | AST_PREEMPT | AST_URGENT);
3889 }
3890 else {
3891 if (rt_runq.count > 0) {
3892 if (BASEPRI_RTQUEUES > processor->current_pri)
3893 return (check_reason | AST_PREEMPT | AST_URGENT);
3894 else
3895 return (check_reason | AST_PREEMPT);
3896 }
3897 }
3898
3899 result = SCHED(processor_csw_check)(processor);
3900 if (result != AST_NONE)
3901 return (check_reason | result | (thread_eager_preemption(thread) ? AST_URGENT : AST_NONE));
3902
3903 #if __SMP__
3904
3905 /*
3906 * If the current thread is running on a processor that is no longer recommended, gently
3907 * (non-urgently) get to a point where it can block, at which point thread_select() should
3908 * try to idle the processor and re-dispatch the thread to a recommended processor.
3909 */
3910 if (!processor->is_recommended)
3911 return (check_reason | AST_PREEMPT);
3912
3913 /*
3914 * Even though we could continue executing on this processor, a
3915 * secondary SMT core should try to shed load to another primary core.
3916 *
3917 * TODO: Should this do the same check that thread_select() does? I.e.,
3918 * if no bound threads target this processor and idle primaries exist, preempt.
3919 * The case of RT threads existing is already taken care of above.
3920 * Consider Capri in this scenario.
3921 *
3922 * if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue))
3923 *
3924 * TODO: Alternatively - check if only primary is idle, or check if primary's pri is lower than mine.
3925 */
3926
3927 if (processor->current_pri < BASEPRI_RTQUEUES &&
3928 processor->processor_primary != processor)
3929 return (check_reason | AST_PREEMPT);
3930 #endif
3931
3932 if (thread->state & TH_SUSP)
3933 return (check_reason | AST_PREEMPT);
3934
3935 #if CONFIG_SCHED_SFI
3936 /*
3937 * Current thread may not need to be preempted, but maybe needs
3938 * an SFI wait?
3939 */
3940 result = sfi_thread_needs_ast(thread, NULL);
3941 if (result != AST_NONE)
3942 return (check_reason | result);
3943 #endif
3944
3945 return (AST_NONE);
3946 }
3947
3948 /*
3949 * set_sched_pri:
3950 *
3951 * Set the scheduled priority of the specified thread.
3952 *
3953 * This may cause the thread to change queues.
3954 *
3955 * Thread must be locked.
3956 */
3957 void
3958 set_sched_pri(
3959 thread_t thread,
3960 int priority)
3961 {
3962 thread_t cthread = current_thread();
3963 boolean_t is_current_thread = (thread == cthread) ? TRUE : FALSE;
3964 int curgency, nurgency;
3965 uint64_t urgency_param1, urgency_param2;
3966 boolean_t removed_from_runq = FALSE;
3967
3968 /* If we're already at this priority, no need to mess with the runqueue */
3969 if (priority == thread->sched_pri)
3970 return;
3971
3972 if (is_current_thread) {
3973 assert(thread->runq == PROCESSOR_NULL);
3974 curgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
3975 } else {
3976 removed_from_runq = thread_run_queue_remove(thread);
3977 }
3978
3979 thread->sched_pri = priority;
3980
3981 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
3982 (uintptr_t)thread_tid(thread),
3983 thread->base_pri,
3984 thread->sched_pri,
3985 0, /* eventually, 'reason' */
3986 0);
3987
3988 if (is_current_thread) {
3989 nurgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
3990 /*
3991 * set_sched_pri doesn't alter RT params. We expect direct base priority/QoS
3992 * class alterations from user space to occur relatively infrequently, hence
3993 * those are lazily handled. QoS classes have distinct priority bands, and QoS
3994 * inheritance is expected to involve priority changes.
3995 */
3996 if (nurgency != curgency) {
3997 thread_tell_urgency(nurgency, urgency_param1, urgency_param2, 0, thread);
3998 machine_thread_going_on_core(thread, nurgency, 0, 0);
3999 }
4000 }
4001
4002 /* TODO: Should this be TAILQ if it went down, HEADQ if it went up? */
4003 if (removed_from_runq)
4004 thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
4005 else if (thread->state & TH_RUN) {
4006 processor_t processor = thread->last_processor;
4007
4008 if (is_current_thread) {
4009 ast_t preempt;
4010
4011 processor->current_pri = priority;
4012 processor->current_thmode = thread->sched_mode;
4013 processor->current_sfi_class = thread->sfi_class = sfi_thread_classify(thread);
4014 if ((preempt = csw_check(processor, AST_NONE)) != AST_NONE)
4015 ast_on(preempt);
4016 } else if (processor != PROCESSOR_NULL && processor->active_thread == thread)
4017 cause_ast_check(processor);
4018 }
4019 }
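/*
 * A minimal usage sketch for set_sched_pri(), with 'thread' and 'new_pri'
 * as hypothetical caller-supplied values: the scheduled priority is only
 * changed with the thread locked at splsched, and the routine itself takes
 * care of run queue removal/reinsertion and any preemption check.
 *
 *	spl_t s = splsched();
 *	thread_lock(thread);
 *	set_sched_pri(thread, new_pri);
 *	thread_unlock(thread);
 *	splx(s);
 */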
4020
4021 /*
4022 * thread_run_queue_remove_for_handoff
4023 *
4024 * Pull a thread or its (recursive) push target out of the runqueue
4025 * so that it is ready for thread_run()
4026 *
4027 * Called at splsched
4028 *
4029 * Returns the thread that was pulled or THREAD_NULL if no thread could be pulled.
4030 * This may be different than the thread that was passed in.
4031 */
4032 thread_t
4033 thread_run_queue_remove_for_handoff(thread_t thread) {
4034
4035 thread_t pulled_thread = THREAD_NULL;
4036
4037 thread_lock(thread);
4038
4039 /*
4040 * Check that the thread is not bound
4041 * to a different processor, and that realtime
4042 * is not involved.
4043 *
4044 * Next, pull it off its run queue. If it
4045 * doesn't come, it's not eligible.
4046 */
4047
4048 processor_t processor = current_processor();
4049 if (processor->current_pri < BASEPRI_RTQUEUES && thread->sched_pri < BASEPRI_RTQUEUES &&
4050 (thread->bound_processor == PROCESSOR_NULL || thread->bound_processor == processor)) {
4051
4052 if (thread_run_queue_remove(thread))
4053 pulled_thread = thread;
4054 }
4055
4056 thread_unlock(thread);
4057
4058 return pulled_thread;
4059 }
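/*
 * A minimal handoff sketch (hypothetical locals 'target', 'self' and
 * 'continuation'), following the contract above and called at splsched:
 * if a thread could be pulled, hand the processor to it directly via
 * thread_run(); otherwise fall back to an ordinary block.
 *
 *	thread_t pulled = thread_run_queue_remove_for_handoff(target);
 *	if (pulled != THREAD_NULL)
 *		thread_run(self, continuation, NULL, pulled);
 *	else
 *		thread_block(continuation);
 */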
4060
4061 /*
4062 * thread_run_queue_remove:
4063 *
4064 * Remove a thread from its current run queue and
4065 * return TRUE if successful.
4066 *
4067 * Thread must be locked.
4068 *
4069 * If thread->runq is PROCESSOR_NULL, the thread will not re-enter the
4070 * run queues because the caller locked the thread. Otherwise
4071 * the thread is on a run queue, but could be chosen for dispatch
4072 * and removed by another processor under a different lock, which
4073 * will set thread->runq to PROCESSOR_NULL.
4074 *
4075 * Hence the thread select path must not rely on anything that could
4076 * be changed under the thread lock after calling this function,
4077 * most importantly thread->sched_pri.
4078 */
4079 boolean_t
4080 thread_run_queue_remove(
4081 thread_t thread)
4082 {
4083 boolean_t removed = FALSE;
4084 processor_t processor = thread->runq;
4085
4086 if ((thread->state & (TH_RUN|TH_WAIT)) == TH_WAIT) {
4087 /* Thread isn't runnable */
4088 assert(thread->runq == PROCESSOR_NULL);
4089 return FALSE;
4090 }
4091
4092 if (processor == PROCESSOR_NULL) {
4093 /*
4094 * The thread is either not on the runq,
4095 * or is in the midst of being removed from the runq.
4096 *
4097 * runq is set to NULL under the pset lock, not the thread
4098 * lock, so the thread may still be in the process of being dequeued
4099 * from the runq. It will wait in invoke for the thread lock to be
4100 * dropped.
4101 */
4102
4103 return FALSE;
4104 }
4105
4106 if (thread->sched_pri < BASEPRI_RTQUEUES) {
4107 return SCHED(processor_queue_remove)(processor, thread);
4108 }
4109
4110 rt_lock_lock();
4111
4112 if (thread->runq != PROCESSOR_NULL) {
4113 /*
4114 * Thread is on the RT run queue and we have a lock on
4115 * that run queue.
4116 */
4117
4118 assert(thread->runq == THREAD_ON_RT_RUNQ);
4119
4120 remqueue(&thread->runq_links);
4121 SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
4122 rt_runq.count--;
4123
4124 thread->runq = PROCESSOR_NULL;
4125
4126 removed = TRUE;
4127 }
4128
4129 rt_lock_unlock();
4130
4131 return (removed);
4132 }
4133
4134 /*
4135 * Put the thread back where it belongs after a thread_run_queue_remove()
4136 *
4137 * The thread must have been removed under the same hold of the thread lock
4138 *
4139 * Called with the thread locked, at splsched
4140 */
4141 void
4142 thread_run_queue_reinsert(thread_t thread, integer_t options)
4143 {
4144 assert(thread->runq == PROCESSOR_NULL);
4145
4146 assert(thread->state & (TH_RUN));
4147 thread_setrun(thread, options);
4148
4149 }
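/*
 * A minimal sketch of the remove/reinsert pairing, as used by
 * set_sched_pri() above: pull the thread off its run queue under the
 * thread lock, mutate its scheduling state, then put it back.
 *
 *	boolean_t removed = thread_run_queue_remove(thread);
 *	...update thread->sched_pri or related state...
 *	if (removed)
 *		thread_run_queue_reinsert(thread, SCHED_PREEMPT | SCHED_TAILQ);
 */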
4150
4151 void
4152 sys_override_cpu_throttle(int flag)
4153 {
4154 if (flag == CPU_THROTTLE_ENABLE)
4155 cpu_throttle_enabled = 1;
4156 if (flag == CPU_THROTTLE_DISABLE)
4157 cpu_throttle_enabled = 0;
4158 }
4159
4160 int
4161 thread_get_urgency(thread_t thread, uint64_t *arg1, uint64_t *arg2)
4162 {
4163 if (thread == NULL || (thread->state & TH_IDLE)) {
4164 *arg1 = 0;
4165 *arg2 = 0;
4166
4167 return (THREAD_URGENCY_NONE);
4168 } else if (thread->sched_mode == TH_MODE_REALTIME) {
4169 *arg1 = thread->realtime.period;
4170 *arg2 = thread->realtime.deadline;
4171
4172 return (THREAD_URGENCY_REAL_TIME);
4173 } else if (cpu_throttle_enabled &&
4174 ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
4175 /*
4176 * Background urgency applied when thread priority is MAXPRI_THROTTLE or lower and thread is not promoted
4177 */
4178 *arg1 = thread->sched_pri;
4179 *arg2 = thread->base_pri;
4180
4181 return (THREAD_URGENCY_BACKGROUND);
4182 } else {
4183 /* For otherwise unclassified threads, report throughput QoS
4184 * parameters
4185 */
4186 *arg1 = proc_get_effective_thread_policy(thread, TASK_POLICY_THROUGH_QOS);
4187 *arg2 = proc_get_effective_task_policy(thread->task, TASK_POLICY_THROUGH_QOS);
4188
4189 return (THREAD_URGENCY_NORMAL);
4190 }
4191 }
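/*
 * A minimal sketch of how the urgency result is consumed, following the
 * pattern in set_sched_pri() above (hypothetical caller at splsched with
 * the thread locked):
 *
 *	uint64_t arg1, arg2;
 *	int urgency = thread_get_urgency(thread, &arg1, &arg2);
 *	thread_tell_urgency(urgency, arg1, arg2, 0, thread);
 */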
4192
4193
4194 /*
4195 * This is the processor idle loop, which just looks for other threads
4196 * to execute. Processor idle threads invoke this without supplying a
4197 * current thread, to idle without an asserted wait state.
4198 *
4199 * Returns the next thread to execute if dispatched directly.
4200 */
4201
4202 #if 0
4203 #define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
4204 #else
4205 #define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
4206 #endif
4207
4208 thread_t
4209 processor_idle(
4210 thread_t thread,
4211 processor_t processor)
4212 {
4213 processor_set_t pset = processor->processor_set;
4214 thread_t new_thread;
4215 int state;
4216 (void)splsched();
4217
4218 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4219 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_START,
4220 (uintptr_t)thread_tid(thread), 0, 0, 0, 0);
4221
4222 SCHED_STATS_CPU_IDLE_START(processor);
4223
4224 timer_switch(&PROCESSOR_DATA(processor, system_state),
4225 mach_absolute_time(), &PROCESSOR_DATA(processor, idle_state));
4226 PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state);
4227
4228 while (1) {
4229 if (processor->state != PROCESSOR_IDLE) /* unsafe, but worst case we loop around once */
4230 break;
4231 if (pset->pending_AST_cpu_mask & (1ULL << processor->cpu_id))
4232 break;
4233 if (processor->is_recommended) {
4234 if (rt_runq.count)
4235 break;
4236 } else {
4237 if (SCHED(processor_bound_count)(processor))
4238 break;
4239 }
4240
4241 #if CONFIG_SCHED_IDLE_IN_PLACE
4242 if (thread != THREAD_NULL) {
4243 /* Did the idle-in-place thread wake up? */
4244 if ((thread->state & (TH_WAIT|TH_SUSP)) != TH_WAIT || thread->wake_active)
4245 break;
4246 }
4247 #endif
4248
4249 IDLE_KERNEL_DEBUG_CONSTANT(
4250 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -1, 0);
4251
4252 machine_track_platform_idle(TRUE);
4253
4254 machine_idle();
4255
4256 machine_track_platform_idle(FALSE);
4257
4258 (void)splsched();
4259
4260 IDLE_KERNEL_DEBUG_CONSTANT(
4261 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -2, 0);
4262
4263 if (!SCHED(processor_queue_empty)(processor)) {
4264 /* Secondary SMT processors respond to directed wakeups
4265 * exclusively. Some platforms induce 'spurious' SMT wakeups.
4266 */
4267 if (processor->processor_primary == processor)
4268 break;
4269 }
4270 }
4271
4272 timer_switch(&PROCESSOR_DATA(processor, idle_state),
4273 mach_absolute_time(), &PROCESSOR_DATA(processor, system_state));
4274 PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state);
4275
4276 pset_lock(pset);
4277
4278 /* If we were sent a remote AST and came out of idle, acknowledge it here with pset lock held */
4279 pset->pending_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
4280 #if defined(CONFIG_SCHED_DEFERRED_AST)
4281 pset->pending_deferred_AST_cpu_mask &= ~(1ULL << processor->cpu_id);
4282 #endif
4283
4284 state = processor->state;
4285 if (state == PROCESSOR_DISPATCHING) {
4286 /*
4287 * Common case -- cpu dispatched.
4288 */
4289 new_thread = processor->next_thread;
4290 processor->next_thread = THREAD_NULL;
4291 processor->state = PROCESSOR_RUNNING;
4292
4293 if ((new_thread != THREAD_NULL) && (SCHED(processor_queue_has_priority)(processor, new_thread->sched_pri, FALSE) ||
4294 (rt_runq.count > 0)) ) {
4295 /* Something higher priority has popped up on the runqueue - redispatch this thread elsewhere */
4296 processor->current_pri = IDLEPRI;
4297 processor->current_thmode = TH_MODE_FIXED;
4298 processor->current_sfi_class = SFI_CLASS_KERNEL;
4299 processor->deadline = UINT64_MAX;
4300
4301 pset_unlock(pset);
4302
4303 thread_lock(new_thread);
4304 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REDISPATCH), (uintptr_t)thread_tid(new_thread), new_thread->sched_pri, rt_runq.count, 0, 0);
4305 thread_setrun(new_thread, SCHED_HEADQ);
4306 thread_unlock(new_thread);
4307
4308 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4309 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4310 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4311
4312 return (THREAD_NULL);
4313 }
4314
4315 pset_unlock(pset);
4316
4317 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4318 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4319 (uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0);
4320
4321 return (new_thread);
4322
4323 } else if (state == PROCESSOR_IDLE) {
4324 re_queue_tail(&pset->active_queue, &processor->processor_queue);
4325
4326 processor->state = PROCESSOR_RUNNING;
4327 processor->current_pri = IDLEPRI;
4328 processor->current_thmode = TH_MODE_FIXED;
4329 processor->current_sfi_class = SFI_CLASS_KERNEL;
4330 processor->deadline = UINT64_MAX;
4331
4332 } else if (state == PROCESSOR_SHUTDOWN) {
4333 /*
4334 * Going off-line. Force a
4335 * reschedule.
4336 */
4337 if ((new_thread = processor->next_thread) != THREAD_NULL) {
4338 processor->next_thread = THREAD_NULL;
4339 processor->current_pri = IDLEPRI;
4340 processor->current_thmode = TH_MODE_FIXED;
4341 processor->current_sfi_class = SFI_CLASS_KERNEL;
4342 processor->deadline = UINT64_MAX;
4343
4344 pset_unlock(pset);
4345
4346 thread_lock(new_thread);
4347 thread_setrun(new_thread, SCHED_HEADQ);
4348 thread_unlock(new_thread);
4349
4350 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4351 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4352 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4353
4354 return (THREAD_NULL);
4355 }
4356 }
4357
4358 pset_unlock(pset);
4359
4360 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4361 MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4362 (uintptr_t)thread_tid(thread), state, 0, 0, 0);
4363
4364 return (THREAD_NULL);
4365 }
4366
4367 /*
4368 * Each processor has a dedicated thread which
4369 * executes the idle loop when there is no suitable
4370 * previous context.
4371 */
4372 void
4373 idle_thread(void)
4374 {
4375 processor_t processor = current_processor();
4376 thread_t new_thread;
4377
4378 new_thread = processor_idle(THREAD_NULL, processor);
4379 if (new_thread != THREAD_NULL) {
4380 thread_run(processor->idle_thread, (thread_continue_t)idle_thread, NULL, new_thread);
4381 /*NOTREACHED*/
4382 }
4383
4384 thread_block((thread_continue_t)idle_thread);
4385 /*NOTREACHED*/
4386 }
4387
4388 kern_return_t
4389 idle_thread_create(
4390 processor_t processor)
4391 {
4392 kern_return_t result;
4393 thread_t thread;
4394 spl_t s;
4395
4396 result = kernel_thread_create((thread_continue_t)idle_thread, NULL, MAXPRI_KERNEL, &thread);
4397 if (result != KERN_SUCCESS)
4398 return (result);
4399
4400 s = splsched();
4401 thread_lock(thread);
4402 thread->bound_processor = processor;
4403 processor->idle_thread = thread;
4404 thread->sched_pri = thread->base_pri = IDLEPRI;
4405 thread->state = (TH_RUN | TH_IDLE);
4406 thread->options |= TH_OPT_IDLE_THREAD;
4407 thread_unlock(thread);
4408 splx(s);
4409
4410 thread_deallocate(thread);
4411
4412 return (KERN_SUCCESS);
4413 }
4414
4415 /*
4416 * sched_startup:
4417 *
4418 * Kicks off scheduler services.
4419 *
4420 * Called at splsched.
4421 */
4422 void
4423 sched_startup(void)
4424 {
4425 kern_return_t result;
4426 thread_t thread;
4427
4428 simple_lock_init(&sched_vm_group_list_lock, 0);
4429
4430
4431 result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
4432 (void *)SCHED(maintenance_continuation), MAXPRI_KERNEL, &thread);
4433 if (result != KERN_SUCCESS)
4434 panic("sched_startup");
4435
4436 thread_deallocate(thread);
4437
4438 assert_thread_magic(thread);
4439
4440 /*
4441 * Yield to the sched_init_thread once, to
4442 * initialize our own thread after being switched
4443 * back to.
4444 *
4445 * The current thread is the only other thread
4446 * active at this point.
4447 */
4448 thread_block(THREAD_CONTINUE_NULL);
4449 }
4450
4451 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
4452
4453 static volatile uint64_t sched_maintenance_deadline;
4454 static uint64_t sched_tick_last_abstime;
4455 static uint64_t sched_tick_delta;
4456 uint64_t sched_tick_max_delta;
4457 /*
4458 * sched_timeshare_maintenance_continue:
4459 *
4460 * Perform periodic bookkeeping functions about ten
4461 * times per second.
4462 */
4463 void
4464 sched_timeshare_maintenance_continue(void)
4465 {
4466 uint64_t sched_tick_ctime, late_time;
4467
4468 struct sched_update_scan_context scan_context = {
4469 .earliest_bg_make_runnable_time = UINT64_MAX,
4470 .earliest_normal_make_runnable_time = UINT64_MAX,
4471 .earliest_rt_make_runnable_time = UINT64_MAX
4472 };
4473
4474 sched_tick_ctime = mach_absolute_time();
4475
4476 if (__improbable(sched_tick_last_abstime == 0)) {
4477 sched_tick_last_abstime = sched_tick_ctime;
4478 late_time = 0;
4479 sched_tick_delta = 1;
4480 } else {
4481 late_time = sched_tick_ctime - sched_tick_last_abstime;
4482 sched_tick_delta = late_time / sched_tick_interval;
4483 /* Ensure a delta of at least 1, since the interval could be slightly
4484 * smaller than the sched_tick_interval due to dispatch
4485 * latencies.
4486 */
4487 sched_tick_delta = MAX(sched_tick_delta, 1);
4488
4489 /* In the event interrupt latencies or platform
4490 * idle events that advanced the timebase resulted
4491 * in periods where no threads were dispatched,
4492 * cap the maximum "tick delta" at SCHED_TICK_MAX_DELTA
4493 * iterations.
4494 */
4495 sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA);
4496
4497 sched_tick_last_abstime = sched_tick_ctime;
4498 sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta);
4499 }
4500
4501 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE)|DBG_FUNC_START,
4502 sched_tick_delta, late_time, 0, 0, 0);
4503
4504 /* Add a number of pseudo-ticks corresponding to the elapsed interval.
4505 * This could be greater than 1 if substantial intervals occur where
4506 * all processors are idle, which rarely happens in practice.
4507 */
4508
4509 sched_tick += sched_tick_delta;
4510
4511 /*
4512 * Compute various averages.
4513 */
4514 compute_averages(sched_tick_delta);
4515
4516 /*
4517 * Scan the run queues for threads which
4518 * may need to be updated, and find the earliest runnable thread on the runqueue
4519 * to report its latency.
4520 */
4521 SCHED(thread_update_scan)(&scan_context);
4522
4523 rt_runq_scan(&scan_context);
4524
4525 uint64_t ctime = mach_absolute_time();
4526
4527 uint64_t bg_max_latency = (ctime > scan_context.earliest_bg_make_runnable_time) ?
4528 ctime - scan_context.earliest_bg_make_runnable_time : 0;
4529
4530 uint64_t default_max_latency = (ctime > scan_context.earliest_normal_make_runnable_time) ?
4531 ctime - scan_context.earliest_normal_make_runnable_time : 0;
4532
4533 uint64_t realtime_max_latency = (ctime > scan_context.earliest_rt_make_runnable_time) ?
4534 ctime - scan_context.earliest_rt_make_runnable_time : 0;
4535
4536 machine_max_runnable_latency(bg_max_latency, default_max_latency, realtime_max_latency);
4537
4538 /*
4539 * Check to see if the special sched VM group needs attention.
4540 */
4541 sched_vm_group_maintenance();
4542
4543
4544 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END,
4545 sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
4546 sched_pri_shifts[TH_BUCKET_SHARE_UT], 0, 0);
4547
4548 assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
4549 thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
4550 /*NOTREACHED*/
4551 }
4552
4553 static uint64_t sched_maintenance_wakeups;
4554
4555 /*
4556 * Determine if the set of routines formerly driven by a maintenance timer
4557 * must be invoked, based on a deadline comparison. Signals the scheduler
4558 * maintenance thread on deadline expiration. Must be invoked at an interval
4559 * lower than the "sched_tick_interval", currently accomplished by
4560 * invocation via the quantum expiration timer and at context switch time.
4561 * Performance matters: this routine reuses a timestamp approximating the
4562 * current absolute time received from the caller, and should perform
4563 * no more than a comparison against the deadline in the common case.
4564 */
4565 void
4566 sched_timeshare_consider_maintenance(uint64_t ctime) {
4567 uint64_t ndeadline, deadline = sched_maintenance_deadline;
4568
4569 if (__improbable(ctime >= deadline)) {
4570 if (__improbable(current_thread() == sched_maintenance_thread))
4571 return;
4572 OSMemoryBarrier();
4573
4574 ndeadline = ctime + sched_tick_interval;
4575
4576 if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) {
4577 thread_wakeup((event_t)sched_timeshare_maintenance_continue);
4578 sched_maintenance_wakeups++;
4579 }
4580 }
4581 }
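/*
 * A minimal caller sketch (hypothetical call site): per the comment above,
 * the quantum-expiration and context-switch paths pass in a timestamp they
 * already hold rather than reading the timebase again.
 *
 *	uint64_t ctime = mach_absolute_time();
 *	...other per-quantum accounting using ctime...
 *	sched_timeshare_consider_maintenance(ctime);
 */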
4582
4583 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
4584
4585 void
4586 sched_init_thread(void (*continuation)(void))
4587 {
4588 thread_block(THREAD_CONTINUE_NULL);
4589
4590 thread_t thread = current_thread();
4591
4592 thread_set_thread_name(thread, "sched_maintenance_thread");
4593
4594 sched_maintenance_thread = thread;
4595
4596 continuation();
4597
4598 /*NOTREACHED*/
4599 }
4600
4601 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
4602
4603 /*
4604 * thread_update_scan / runq_scan:
4605 *
4606 * Scan the run queues to account for timesharing threads
4607 * which need to be updated.
4608 *
4609 * Scanner runs in two passes. Pass one squirrels likely
4610 * threads away in an array, pass two does the update.
4611 *
4612 * This is necessary because the run queue is locked for
4613 * the candidate scan, but the thread is locked for the update.
4614 *
4615 * Array should be sized to make forward progress, without
4616 * disabling preemption for long periods.
4617 */
4618
4619 #define THREAD_UPDATE_SIZE 128
4620
4621 static thread_t thread_update_array[THREAD_UPDATE_SIZE];
4622 static uint32_t thread_update_count = 0;
4623
4624 /* Returns TRUE if thread was added, FALSE if thread_update_array is full */
4625 boolean_t
4626 thread_update_add_thread(thread_t thread)
4627 {
4628 if (thread_update_count == THREAD_UPDATE_SIZE)
4629 return (FALSE);
4630
4631 thread_update_array[thread_update_count++] = thread;
4632 thread_reference_internal(thread);
4633 return (TRUE);
4634 }
4635
4636 void
4637 thread_update_process_threads(void)
4638 {
4639 assert(thread_update_count <= THREAD_UPDATE_SIZE);
4640
4641 for (uint32_t i = 0 ; i < thread_update_count ; i++) {
4642 thread_t thread = thread_update_array[i];
4643 assert_thread_magic(thread);
4644 thread_update_array[i] = THREAD_NULL;
4645
4646 spl_t s = splsched();
4647 thread_lock(thread);
4648 if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) {
4649 SCHED(update_priority)(thread);
4650 }
4651 thread_unlock(thread);
4652 splx(s);
4653
4654 thread_deallocate(thread);
4655 }
4656
4657 thread_update_count = 0;
4658 }
4659
4660 /*
4661 * Scan a runq for candidate threads.
4662 *
4663 * Returns TRUE if retry is needed.
4664 */
4665 boolean_t
4666 runq_scan(
4667 run_queue_t runq,
4668 sched_update_scan_context_t scan_context)
4669 {
4670 int count = runq->count;
4671 int queue_index;
4672
4673 assert(count >= 0);
4674
4675 if (count == 0)
4676 return FALSE;
4677
4678 for (queue_index = bitmap_first(runq->bitmap, NRQS);
4679 queue_index >= 0;
4680 queue_index = bitmap_next(runq->bitmap, queue_index)) {
4681
4682 thread_t thread;
4683 queue_t queue = &runq->queues[queue_index];
4684
4685 qe_foreach_element(thread, queue, runq_links) {
4686 assert(count > 0);
4687 assert_thread_magic(thread);
4688
4689 if (thread->sched_stamp != sched_tick &&
4690 thread->sched_mode == TH_MODE_TIMESHARE) {
4691 if (thread_update_add_thread(thread) == FALSE)
4692 return TRUE;
4693 }
4694
4695 if (cpu_throttle_enabled && ((thread->sched_pri <= MAXPRI_THROTTLE) && (thread->base_pri <= MAXPRI_THROTTLE))) {
4696 if (thread->last_made_runnable_time < scan_context->earliest_bg_make_runnable_time) {
4697 scan_context->earliest_bg_make_runnable_time = thread->last_made_runnable_time;
4698 }
4699 } else {
4700 if (thread->last_made_runnable_time < scan_context->earliest_normal_make_runnable_time) {
4701 scan_context->earliest_normal_make_runnable_time = thread->last_made_runnable_time;
4702 }
4703 }
4704 count--;
4705 }
4706 }
4707
4708 return FALSE;
4709 }
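/*
 * A minimal sketch of the two-pass driver described above (the rough shape
 * a SCHED(thread_update_scan) implementation takes; 'runq' and the locking
 * steps are placeholders): pass one fills thread_update_array under the
 * run-queue lock, retrying whenever the array fills up, and pass two
 * updates the collected threads with only the thread lock held.
 *
 *	boolean_t restart;
 *	do {
 *		...lock the run queue at splsched...
 *		restart = runq_scan(runq, scan_context);
 *		...unlock the run queue...
 *		thread_update_process_threads();
 *	} while (restart);
 */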
4710
4711 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
4712
4713 boolean_t
4714 thread_eager_preemption(thread_t thread)
4715 {
4716 return ((thread->sched_flags & TH_SFLAG_EAGERPREEMPT) != 0);
4717 }
4718
4719 void
4720 thread_set_eager_preempt(thread_t thread)
4721 {
4722 spl_t x;
4723 processor_t p;
4724 ast_t ast = AST_NONE;
4725
4726 x = splsched();
4727 p = current_processor();
4728
4729 thread_lock(thread);
4730 thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;
4731
4732 if (thread == current_thread()) {
4733
4734 ast = csw_check(p, AST_NONE);
4735 thread_unlock(thread);
4736 if (ast != AST_NONE) {
4737 (void) thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
4738 }
4739 } else {
4740 p = thread->last_processor;
4741
4742 if (p != PROCESSOR_NULL && p->state == PROCESSOR_RUNNING &&
4743 p->active_thread == thread) {
4744 cause_ast_check(p);
4745 }
4746
4747 thread_unlock(thread);
4748 }
4749
4750 splx(x);
4751 }
4752
4753 void
4754 thread_clear_eager_preempt(thread_t thread)
4755 {
4756 spl_t x;
4757
4758 x = splsched();
4759 thread_lock(thread);
4760
4761 thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
4762
4763 thread_unlock(thread);
4764 splx(x);
4765 }
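/*
 * A minimal pairing sketch (hypothetical caller): mark the current thread
 * for eager preemption around a section that should relinquish the CPU as
 * soon as other work becomes runnable (csw_check_locked() above upgrades
 * such preemptions to AST_URGENT), then clear the flag afterwards.
 *
 *	thread_set_eager_preempt(current_thread());
 *	...section that should be preempted urgently if anything else runs...
 *	thread_clear_eager_preempt(current_thread());
 */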
4766
4767 /*
4768 * Scheduling statistics
4769 */
4770 void
4771 sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
4772 {
4773 struct processor_sched_statistics *stats;
4774 boolean_t to_realtime = FALSE;
4775
4776 stats = &processor->processor_data.sched_stats;
4777 stats->csw_count++;
4778
4779 if (otherpri >= BASEPRI_REALTIME) {
4780 stats->rt_sched_count++;
4781 to_realtime = TRUE;
4782 }
4783
4784 if ((reasons & AST_PREEMPT) != 0) {
4785 stats->preempt_count++;
4786
4787 if (selfpri >= BASEPRI_REALTIME) {
4788 stats->preempted_rt_count++;
4789 }
4790
4791 if (to_realtime) {
4792 stats->preempted_by_rt_count++;
4793 }
4794
4795 }
4796 }
4797
4798 void
4799 sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
4800 {
4801 uint64_t timestamp = mach_absolute_time();
4802
4803 stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
4804 stats->last_change_timestamp = timestamp;
4805 }
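/*
 * count_sum accumulates (run-queue depth x absolute time), so a
 * time-weighted average depth over an observation window can be derived
 * as follows (hypothetical sampling code; window_start and
 * window_start_sum are snapshots taken at the start of the window):
 *
 *	uint64_t elapsed = stats->last_change_timestamp - window_start;
 *	uint64_t avg_depth = elapsed ?
 *	    (stats->count_sum - window_start_sum) / elapsed : 0;
 */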
4806
4807 /*
4808 * For calls from assembly code
4809 */
4810 #undef thread_wakeup
4811 void
4812 thread_wakeup(
4813 event_t x);
4814
4815 void
4816 thread_wakeup(
4817 event_t x)
4818 {
4819 thread_wakeup_with_result(x, THREAD_AWAKENED);
4820 }
4821
4822 boolean_t
4823 preemption_enabled(void)
4824 {
4825 return (get_preemption_level() == 0 && ml_get_interrupts_enabled());
4826 }
4827
4828 static void
4829 sched_timer_deadline_tracking_init(void) {
4830 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
4831 nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
4832 }
4833
4834
4835 kern_return_t
4836 sched_work_interval_notify(thread_t thread, uint64_t work_interval_id, uint64_t start, uint64_t finish, uint64_t deadline, uint64_t next_start, uint32_t flags)
4837 {
4838 int urgency;
4839 uint64_t urgency_param1, urgency_param2;
4840 spl_t s;
4841
4842 if (work_interval_id == 0) {
4843 return (KERN_INVALID_ARGUMENT);
4844 }
4845
4846 assert(thread == current_thread());
4847
4848 thread_mtx_lock(thread);
4849 if (thread->work_interval_id != work_interval_id) {
4850 thread_mtx_unlock(thread);
4851 return (KERN_INVALID_ARGUMENT);
4852 }
4853 thread_mtx_unlock(thread);
4854
4855 s = splsched();
4856 thread_lock(thread);
4857 urgency = thread_get_urgency(thread, &urgency_param1, &urgency_param2);
4858 thread_unlock(thread);
4859 splx(s);
4860
4861 machine_work_interval_notify(thread, work_interval_id, start, finish, deadline, next_start, urgency, flags);
4862 return (KERN_SUCCESS);
4863 }
4864
4865 void thread_set_options(uint32_t thopt) {
4866 spl_t x;
4867 thread_t t = current_thread();
4868
4869 x = splsched();
4870 thread_lock(t);
4871
4872 t->options |= thopt;
4873
4874 thread_unlock(t);
4875 splx(x);
4876 }