/* apple/xnu: osfmk/kern/sfi.c */
/*
 * Copyright (c) 2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <mach/mach_types.h>
#include <kern/assert.h>
#include <kern/clock.h>
#include <kern/coalition.h>
#include <kern/debug.h>
#include <kern/host.h>
#include <kern/kalloc.h>
#include <kern/kern_types.h>
#include <kern/machine.h>
#include <kern/simple_lock.h>
#include <kern/misc_protos.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/sfi.h>
#include <kern/timer_call.h>
#include <kern/waitq.h>
#include <kern/ledger.h>
#include <kern/policy_internal.h>

#include <pexpert/pexpert.h>

#include <libkern/kernel_mach_header.h>

#include <sys/kdebug.h>

#if CONFIG_SCHED_SFI

#define SFI_DEBUG 0

#if SFI_DEBUG
#define dprintf(...) kprintf(__VA_ARGS__)
#else
#define dprintf(...) do { } while(0)
#endif

#ifdef MACH_BSD
extern sched_call_t workqueue_get_sched_callback(void);
#endif /* MACH_BSD */

/*
 * SFI (Selective Forced Idle) operates by enabling a global
 * timer on the SFI window interval. When it fires, all processors
 * running a thread that should be SFI-ed are sent an AST.
 * As threads become runnable while in their "off phase", they
 * are placed on a deferred ready queue. When a per-class
 * "on timer" fires, the ready threads for that class are
 * re-enqueued for running. As an optimization to avoid spurious
 * wakeups, the timer may be lazily programmed.
 */
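
/*
 * Illustrative timeline (hypothetical numbers): with a 1000 usec window and a
 * 200 usec off time configured for some class, the global "off" timer fires
 * roughly every 1000 usec, marks that class as being in its off phase, and
 * programs the class "on" timer 200 usec out. Threads of that class which
 * reach the AST boundary during those 200 usec park on the class wait queue;
 * when the "on" timer fires they are woken, and the class runs normally for
 * the remaining ~800 usec of the window.
 */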

/*
 * The "sfi_lock" simple lock guards access to static configuration
 * parameters (as specified by userspace), dynamic state changes
 * (as updated by the timer event routine), and timer data structures.
 * Since it can be taken with interrupts disabled in some cases, it must
 * always be acquired with interrupts disabled at splsched(). The
 * "sfi_lock" also guards the "sfi_wait_class" field of thread_t; that
 * field must only be accessed with the lock held.
 *
 * When an "on timer" fires, we must deterministically be able to drain
 * the wait queue, since if any threads are added to the queue afterwards,
 * they may never get woken out of SFI wait. So sfi_lock must be
 * taken before the wait queue's own spinlock.
 *
 * The wait queue will take the thread's scheduling lock. We may also take
 * the thread_lock directly to update the "sfi_class" field and determine
 * if the thread should block in the wait queue, but the lock will be
 * released before doing so.
 *
 * The pset lock may also be taken, but not while any other locks are held.
 *
 * The task and thread mutex may also be held while reevaluating sfi state.
 *
 * splsched ---> sfi_lock ---> waitq ---> thread_lock
 *        \  \              \__ thread_lock (*)
 *         \  \__ pset_lock
 *          \
 *           \__ thread_lock
 */

decl_simple_lock_data(static,sfi_lock);
static timer_call_data_t sfi_timer_call_entry;
volatile boolean_t sfi_is_enabled;

boolean_t sfi_window_is_set;
uint64_t sfi_window_usecs;
uint64_t sfi_window_interval;
uint64_t sfi_next_off_deadline;

typedef struct {
    sfi_class_id_t class_id;
    thread_continue_t class_continuation;
    const char * class_name;
    const char * class_ledger_name;
} sfi_class_registration_t;

/*
 * To add a new SFI class:
 *
 * 1) Raise MAX_SFI_CLASS_ID in mach/sfi_class.h
 * 2) Add a #define for it to mach/sfi_class.h. It need not be inserted in order of restrictiveness.
 * 3) Add a call to SFI_CLASS_REGISTER below
 * 4) Augment sfi_thread_classify to categorize threads in the new class, checking as early and as restrictively as possible.
 * 5) Modify thermald to use the SFI class
 */
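
/*
 * Illustrative sketch of steps 2 and 3 above (SFI_CLASS_EXAMPLE is a
 * hypothetical class used only for illustration):
 *
 *   // in mach/sfi_class.h, using the next unused id below the raised MAX_SFI_CLASS_ID:
 *   #define SFI_CLASS_EXAMPLE <new id>
 *
 *   // in this file, next to the existing registrations:
 *   SFI_CLASS_REGISTER(EXAMPLE, EXAMPLE)
 */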

static inline void _sfi_wait_cleanup(sched_call_t callback);

#define SFI_CLASS_REGISTER(class_id, ledger_name) \
extern char compile_time_assert_ ## class_id[SFI_CLASS_ ## class_id < MAX_SFI_CLASS_ID ? 1 : -1]; \
void __attribute__((noinline,noreturn)) SFI_ ## class_id ## _THREAD_IS_WAITING(void *callback, wait_result_t wret __unused); \
void SFI_ ## class_id ## _THREAD_IS_WAITING(void *callback, wait_result_t wret __unused) \
{ \
    _sfi_wait_cleanup(callback); \
    thread_exception_return(); \
} \
 \
sfi_class_registration_t SFI_ ## class_id ## _registration __attribute__((section("__DATA,__sfi_class_reg"),used)) = { SFI_CLASS_ ## class_id, SFI_ ## class_id ## _THREAD_IS_WAITING, "SFI_CLASS_" # class_id, "SFI_CLASS_" # ledger_name };
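
/*
 * Each use of SFI_CLASS_REGISTER() below expands to: a compile-time bounds
 * check against MAX_SFI_CLASS_ID, a per-class continuation that cleans up the
 * SFI wait and returns to user space, and a registration record placed in the
 * "__DATA,__sfi_class_reg" section, which sfi_get_registration_data() walks
 * at boot to populate sfi_classes[].
 */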

/* SFI_CLASS_UNSPECIFIED not included here */
SFI_CLASS_REGISTER(MAINTENANCE, MAINTENANCE)
SFI_CLASS_REGISTER(DARWIN_BG, DARWIN_BG)
SFI_CLASS_REGISTER(APP_NAP, APP_NAP)
SFI_CLASS_REGISTER(MANAGED_FOCAL, MANAGED)
SFI_CLASS_REGISTER(MANAGED_NONFOCAL, MANAGED)
SFI_CLASS_REGISTER(UTILITY, UTILITY)
SFI_CLASS_REGISTER(DEFAULT_FOCAL, DEFAULT)
SFI_CLASS_REGISTER(DEFAULT_NONFOCAL, DEFAULT)
SFI_CLASS_REGISTER(LEGACY_FOCAL, LEGACY)
SFI_CLASS_REGISTER(LEGACY_NONFOCAL, LEGACY)
SFI_CLASS_REGISTER(USER_INITIATED_FOCAL, USER_INITIATED)
SFI_CLASS_REGISTER(USER_INITIATED_NONFOCAL, USER_INITIATED)
SFI_CLASS_REGISTER(USER_INTERACTIVE_FOCAL, USER_INTERACTIVE)
SFI_CLASS_REGISTER(USER_INTERACTIVE_NONFOCAL, USER_INTERACTIVE)
SFI_CLASS_REGISTER(KERNEL, OPTED_OUT)
SFI_CLASS_REGISTER(OPTED_OUT, OPTED_OUT)

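/*
 * Per-class runtime state: the configured off time, the per-class "on" timer,
 * the current phase, and the wait queue where threads park during the off
 * phase. Fields are protected by sfi_lock; class_in_on_phase is additionally
 * read without the lock (hence volatile) as a best-effort hint when deciding
 * whether to set AST_SFI.
 */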
struct sfi_class_state {
    uint64_t off_time_usecs;
    uint64_t off_time_interval;

    timer_call_data_t on_timer;
    uint64_t on_timer_deadline;
    boolean_t on_timer_programmed;

    boolean_t class_sfi_is_enabled;
    volatile boolean_t class_in_on_phase;

    struct waitq waitq;             /* threads in ready state */
    thread_continue_t continuation;

    const char * class_name;
    const char * class_ledger_name;
};

/* Static configuration performed in sfi_early_init() */
struct sfi_class_state sfi_classes[MAX_SFI_CLASS_ID];

int sfi_enabled_class_count;

static void sfi_timer_global_off(
    timer_call_param_t param0,
    timer_call_param_t param1);

static void sfi_timer_per_class_on(
    timer_call_param_t param0,
    timer_call_param_t param1);

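/*
 * Returns the array of sfi_class_registration_t records emitted into the
 * "__DATA,__sfi_class_reg" section by SFI_CLASS_REGISTER(), along with the
 * record count; panics if the section is missing or mis-sized.
 */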
static sfi_class_registration_t *
sfi_get_registration_data(unsigned long *count)
{
    unsigned long sectlen = 0;
    void *sectdata;

    sectdata = getsectdatafromheader(&_mh_execute_header, "__DATA", "__sfi_class_reg", &sectlen);
    if (sectdata) {

        if (sectlen % sizeof(sfi_class_registration_t) != 0) {
            /* corrupt data? */
            panic("__sfi_class_reg section has invalid size %lu", sectlen);
            __builtin_unreachable();
        }

        *count = sectlen / sizeof(sfi_class_registration_t);
        return (sfi_class_registration_t *)sectdata;
    } else {
        panic("__sfi_class_reg section not found");
        __builtin_unreachable();
    }
}

/* Called early in boot, when kernel is single-threaded */
void sfi_early_init(void)
{
    unsigned long i, count;
    sfi_class_registration_t *registrations;

    registrations = sfi_get_registration_data(&count);
    for (i=0; i < count; i++) {
        sfi_class_id_t class_id = registrations[i].class_id;

        assert(class_id < MAX_SFI_CLASS_ID); /* should be caught at compile-time */
        if (class_id < MAX_SFI_CLASS_ID) {
            if (sfi_classes[class_id].continuation != NULL) {
                panic("Duplicate SFI registration for class 0x%x", class_id);
            }
            sfi_classes[class_id].class_sfi_is_enabled = FALSE;
            sfi_classes[class_id].class_in_on_phase = TRUE;
            sfi_classes[class_id].continuation = registrations[i].class_continuation;
            sfi_classes[class_id].class_name = registrations[i].class_name;
            sfi_classes[class_id].class_ledger_name = registrations[i].class_ledger_name;
        }
    }
}

void sfi_init(void)
{
    sfi_class_id_t i;
    kern_return_t kret;

    simple_lock_init(&sfi_lock, 0);
    timer_call_setup(&sfi_timer_call_entry, sfi_timer_global_off, NULL);
    sfi_window_is_set = FALSE;
    sfi_enabled_class_count = 0;
    sfi_is_enabled = FALSE;

    for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
        /* If the class was set up in sfi_early_init(), initialize remaining fields */
        if (sfi_classes[i].continuation) {
            timer_call_setup(&sfi_classes[i].on_timer, sfi_timer_per_class_on, (void *)(uintptr_t)i);
            sfi_classes[i].on_timer_programmed = FALSE;

            kret = waitq_init(&sfi_classes[i].waitq, SYNC_POLICY_FIFO|SYNC_POLICY_DISABLE_IRQ);
            assert(kret == KERN_SUCCESS);
        } else {
            /* The only allowed gap is for SFI_CLASS_UNSPECIFIED */
            if(i != SFI_CLASS_UNSPECIFIED) {
                panic("Gap in registered SFI classes");
            }
        }
    }
}

/* Can be called before sfi_init() by task initialization, but after sfi_early_init() */
sfi_class_id_t
sfi_get_ledger_alias_for_class(sfi_class_id_t class_id)
{
    sfi_class_id_t i;
    const char *ledger_name = NULL;

    ledger_name = sfi_classes[class_id].class_ledger_name;

    /* Find the first class in the registration table with this ledger name */
    if (ledger_name) {
        for (i = SFI_CLASS_UNSPECIFIED + 1; i < class_id; i++) {
            if (0 == strcmp(sfi_classes[i].class_ledger_name, ledger_name)) {
                dprintf("sfi_get_ledger_alias_for_class(0x%x) -> 0x%x\n", class_id, i);
                return i;
            }
        }

        /* This class is the primary one for the ledger, so there is no alias */
        dprintf("sfi_get_ledger_alias_for_class(0x%x) -> 0x%x\n", class_id, SFI_CLASS_UNSPECIFIED);
        return SFI_CLASS_UNSPECIFIED;
    }

    /* We are permissive on SFI class lookup failures. In sfi_init(), we assert more */
    return SFI_CLASS_UNSPECIFIED;
}

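/*
 * Adds a per-class SFI wait-time entry (accumulated in Mach absolute time
 * units, "MATUs") to the given ledger template; _sfi_wait_cleanup() credits
 * the entry when a thread leaves an SFI wait.
 */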
int
sfi_ledger_entry_add(ledger_template_t template, sfi_class_id_t class_id)
{
    const char *ledger_name = NULL;

    ledger_name = sfi_classes[class_id].class_ledger_name;

    dprintf("sfi_ledger_entry_add(%p, 0x%x) -> %s\n", template, class_id, ledger_name);
    return ledger_entry_add(template, ledger_name, "sfi", "MATUs");
}

static void sfi_timer_global_off(
    timer_call_param_t param0 __unused,
    timer_call_param_t param1 __unused)
{
    uint64_t now = mach_absolute_time();
    sfi_class_id_t i;
    processor_set_t pset, nset;
    processor_t processor;
    uint32_t needs_cause_ast_mask = 0x0;
    spl_t s;

    s = splsched();

    simple_lock(&sfi_lock);
    if (!sfi_is_enabled) {
        /* If SFI has been disabled, let all "on" timers drain naturally */
        KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_OFF_TIMER) | DBG_FUNC_NONE, 1, 0, 0, 0, 0);

        simple_unlock(&sfi_lock);
        splx(s);
        return;
    }

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_OFF_TIMER) | DBG_FUNC_START, 0, 0, 0, 0, 0);

    /* First set all configured classes into the off state, and program their "on" timer */
    for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
        if (sfi_classes[i].class_sfi_is_enabled) {
            uint64_t on_timer_deadline;

            sfi_classes[i].class_in_on_phase = FALSE;
            sfi_classes[i].on_timer_programmed = TRUE;

            /* Push out on-timer */
            on_timer_deadline = now + sfi_classes[i].off_time_interval;
            sfi_classes[i].on_timer_deadline = on_timer_deadline;

            timer_call_enter1(&sfi_classes[i].on_timer, NULL, on_timer_deadline, TIMER_CALL_SYS_CRITICAL);
        } else {
            /* If this class no longer needs SFI, make sure the timer is cancelled */
            sfi_classes[i].class_in_on_phase = TRUE;
            if (sfi_classes[i].on_timer_programmed) {
                sfi_classes[i].on_timer_programmed = FALSE;
                sfi_classes[i].on_timer_deadline = ~0ULL;
                timer_call_cancel(&sfi_classes[i].on_timer);
            }
        }
    }
    simple_unlock(&sfi_lock);

    /* Iterate over processors, call cause_ast_check() on ones running a thread that should be in an off phase */
    processor = processor_list;
    pset = processor->processor_set;

    pset_lock(pset);

    do {
        nset = processor->processor_set;
        if (nset != pset) {
            pset_unlock(pset);
            pset = nset;
            pset_lock(pset);
        }

        /* "processor" and its pset are locked */
        if (processor->state == PROCESSOR_RUNNING) {
            if (AST_NONE != sfi_processor_needs_ast(processor)) {
                needs_cause_ast_mask |= (1U << processor->cpu_id);
            }
        }
    } while ((processor = processor->processor_list) != NULL);

    pset_unlock(pset);

    processor = processor_list;
    do {
        if (needs_cause_ast_mask & (1U << processor->cpu_id)) {
            if (processor == current_processor())
                ast_on(AST_SFI);
            else
                cause_ast_check(processor);
        }
    } while ((processor = processor->processor_list) != NULL);

    /* Re-arm timer if still enabled */
    simple_lock(&sfi_lock);
    if (sfi_is_enabled) {
        clock_deadline_for_periodic_event(sfi_window_interval,
                                          now,
                                          &sfi_next_off_deadline);
        timer_call_enter1(&sfi_timer_call_entry,
                          NULL,
                          sfi_next_off_deadline,
                          TIMER_CALL_SYS_CRITICAL);
    }

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_OFF_TIMER) | DBG_FUNC_END, 0, 0, 0, 0, 0);

    simple_unlock(&sfi_lock);

    splx(s);
}

static void sfi_timer_per_class_on(
    timer_call_param_t param0,
    timer_call_param_t param1 __unused)
{
    sfi_class_id_t sfi_class_id = (sfi_class_id_t)(uintptr_t)param0;
    struct sfi_class_state *sfi_class = &sfi_classes[sfi_class_id];
    kern_return_t kret;
    spl_t s;

    s = splsched();

    simple_lock(&sfi_lock);

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_ON_TIMER) | DBG_FUNC_START, sfi_class_id, 0, 0, 0, 0);

    /*
     * Any threads that may have accumulated in the ready queue for this class should get re-enqueued.
     * Since we have the sfi_lock held and have changed "class_in_on_phase", we expect
     * no new threads to be put on this wait queue until the global "off timer" has fired.
     */

    sfi_class->class_in_on_phase = TRUE;
    sfi_class->on_timer_programmed = FALSE;

    kret = waitq_wakeup64_all(&sfi_class->waitq,
                              CAST_EVENT64_T(sfi_class_id),
                              THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
    assert(kret == KERN_SUCCESS || kret == KERN_NOT_WAITING);

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_ON_TIMER) | DBG_FUNC_END, 0, 0, 0, 0, 0);

    simple_unlock(&sfi_lock);

    splx(s);
}

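/*
 * Sets the global SFI window to window_usecs (clamped below by
 * MIN_SFI_WINDOW_USEC) and, once at least one class is enabled, arms or
 * re-arms the global "off" timer accordingly.
 */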
kern_return_t sfi_set_window(uint64_t window_usecs)
{
    uint64_t interval, deadline;
    uint64_t now = mach_absolute_time();
    sfi_class_id_t i;
    spl_t s;
    uint64_t largest_class_off_interval = 0;

    if (window_usecs < MIN_SFI_WINDOW_USEC)
        window_usecs = MIN_SFI_WINDOW_USEC;

    if (window_usecs > UINT32_MAX)
        return (KERN_INVALID_ARGUMENT);

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_SET_WINDOW), window_usecs, 0, 0, 0, 0);

    clock_interval_to_absolutetime_interval((uint32_t)window_usecs, NSEC_PER_USEC, &interval);
    deadline = now + interval;

    s = splsched();

    simple_lock(&sfi_lock);

    /* Check that we are not bringing in the SFI window smaller than any class */
    for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
        if (sfi_classes[i].class_sfi_is_enabled) {
            largest_class_off_interval = MAX(largest_class_off_interval, sfi_classes[i].off_time_interval);
        }
    }

    /*
     * The SFI window must be strictly longer than every enabled class's off time;
     * otherwise threads would build up on the deferred ready queue and never get to run.
     */
    if (interval <= largest_class_off_interval) {
        simple_unlock(&sfi_lock);
        splx(s);
        return (KERN_INVALID_ARGUMENT);
    }

    /*
     * If the new "off" deadline is further out than the current programmed timer,
     * just let the current one expire (and the new cadence will be established thereafter).
     * If the new "off" deadline is nearer than the current one, bring it in, so we
     * can start the new behavior sooner. Note that this may cause the "off" timer to
     * fire before some of the class "on" timers have fired.
     */
    sfi_window_usecs = window_usecs;
    sfi_window_interval = interval;
    sfi_window_is_set = TRUE;

    if (sfi_enabled_class_count == 0) {
        /* Can't program timer yet */
    } else if (!sfi_is_enabled) {
        sfi_is_enabled = TRUE;
        sfi_next_off_deadline = deadline;
        timer_call_enter1(&sfi_timer_call_entry,
                          NULL,
                          sfi_next_off_deadline,
                          TIMER_CALL_SYS_CRITICAL);
    } else if (deadline >= sfi_next_off_deadline) {
        sfi_next_off_deadline = deadline;
    } else {
        sfi_next_off_deadline = deadline;
        timer_call_enter1(&sfi_timer_call_entry,
                          NULL,
                          sfi_next_off_deadline,
                          TIMER_CALL_SYS_CRITICAL);
    }

    simple_unlock(&sfi_lock);
    splx(s);

    return (KERN_SUCCESS);
}

kern_return_t sfi_window_cancel(void)
{
    spl_t s;

    s = splsched();

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_CANCEL_WINDOW), 0, 0, 0, 0, 0);

    /* Disable globals so that global "off-timer" is not re-armed */
    simple_lock(&sfi_lock);
    sfi_window_is_set = FALSE;
    sfi_window_usecs = 0;
    sfi_window_interval = 0;
    sfi_next_off_deadline = 0;
    sfi_is_enabled = FALSE;
    simple_unlock(&sfi_lock);

    splx(s);

    return (KERN_SUCCESS);
}

/* Defers SFI off and per-class on timers (if live) by the specified interval
 * in Mach Absolute Time Units. Currently invoked to align with the global
 * forced idle mechanism. Making some simplifying assumptions, the iterative GFI
 * induced SFI on+off deferrals form a geometric series that converges to yield
 * an effective SFI duty cycle that is scaled by the GFI duty cycle. Initial phase
 * alignment and congruency of the SFI/GFI periods can distort this to some extent.
 */
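/*
 * Illustrative numbers (hypothetical): with a 100 ms SFI window, a single call
 * sfi_defer(defer_matus) with defer_matus equivalent to 20 ms pushes the next
 * global "off" deadline and any still-pending per-class "on" deadlines out by
 * 20 ms, stretching that window to roughly 120 ms while leaving each class's
 * absolute off time unchanged.
 */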

kern_return_t sfi_defer(uint64_t sfi_defer_matus)
{
    spl_t s;
    kern_return_t kr = KERN_FAILURE;
    s = splsched();

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_GLOBAL_DEFER), sfi_defer_matus, 0, 0, 0, 0);

    simple_lock(&sfi_lock);
    if (!sfi_is_enabled) {
        goto sfi_defer_done;
    }

    assert(sfi_next_off_deadline != 0);

    sfi_next_off_deadline += sfi_defer_matus;
    timer_call_enter1(&sfi_timer_call_entry, NULL, sfi_next_off_deadline, TIMER_CALL_SYS_CRITICAL);

    int i;
    for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
        if (sfi_classes[i].class_sfi_is_enabled) {
            if (sfi_classes[i].on_timer_programmed) {
                uint64_t new_on_deadline = sfi_classes[i].on_timer_deadline + sfi_defer_matus;
                sfi_classes[i].on_timer_deadline = new_on_deadline;
                timer_call_enter1(&sfi_classes[i].on_timer, NULL, new_on_deadline, TIMER_CALL_SYS_CRITICAL);
            }
        }
    }

    kr = KERN_SUCCESS;
sfi_defer_done:
    simple_unlock(&sfi_lock);

    splx(s);

    return (kr);
}

kern_return_t sfi_get_window(uint64_t *window_usecs)
{
    spl_t s;
    uint64_t off_window_us;

    s = splsched();
    simple_lock(&sfi_lock);

    off_window_us = sfi_window_usecs;

    simple_unlock(&sfi_lock);
    splx(s);

    *window_usecs = off_window_us;

    return (KERN_SUCCESS);
}

kern_return_t sfi_set_class_offtime(sfi_class_id_t class_id, uint64_t offtime_usecs)
{
    uint64_t interval;
    spl_t s;
    uint64_t off_window_interval;

    if (offtime_usecs < MIN_SFI_WINDOW_USEC)
        offtime_usecs = MIN_SFI_WINDOW_USEC;

    if (class_id == SFI_CLASS_UNSPECIFIED || class_id >= MAX_SFI_CLASS_ID)
        return (KERN_INVALID_ARGUMENT);

    if (offtime_usecs > UINT32_MAX)
        return (KERN_INVALID_ARGUMENT);

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_SET_CLASS_OFFTIME), offtime_usecs, class_id, 0, 0, 0);

    clock_interval_to_absolutetime_interval((uint32_t)offtime_usecs, NSEC_PER_USEC, &interval);

    s = splsched();

    simple_lock(&sfi_lock);
    off_window_interval = sfi_window_interval;

    /* Check that we are not bringing in class off-time larger than the SFI window */
    if (off_window_interval && (interval >= off_window_interval)) {
        simple_unlock(&sfi_lock);
        splx(s);
        return (KERN_INVALID_ARGUMENT);
    }

    /* We never re-program the per-class on-timer, but rather just let it expire naturally */
    if (!sfi_classes[class_id].class_sfi_is_enabled) {
        sfi_enabled_class_count++;
    }
    sfi_classes[class_id].off_time_usecs = offtime_usecs;
    sfi_classes[class_id].off_time_interval = interval;
    sfi_classes[class_id].class_sfi_is_enabled = TRUE;

    if (sfi_window_is_set && !sfi_is_enabled) {
        /* start global off timer */
        sfi_is_enabled = TRUE;
        sfi_next_off_deadline = mach_absolute_time() + sfi_window_interval;
        timer_call_enter1(&sfi_timer_call_entry,
                          NULL,
                          sfi_next_off_deadline,
                          TIMER_CALL_SYS_CRITICAL);
    }

    simple_unlock(&sfi_lock);

    splx(s);

    return (KERN_SUCCESS);
}

kern_return_t sfi_class_offtime_cancel(sfi_class_id_t class_id)
{
    spl_t s;

    if (class_id == SFI_CLASS_UNSPECIFIED || class_id >= MAX_SFI_CLASS_ID)
        return (KERN_INVALID_ARGUMENT);

    s = splsched();

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_CANCEL_CLASS_OFFTIME), class_id, 0, 0, 0, 0);

    simple_lock(&sfi_lock);

    /* We never re-program the per-class on-timer, but rather just let it expire naturally */
    if (sfi_classes[class_id].class_sfi_is_enabled) {
        sfi_enabled_class_count--;
    }
    sfi_classes[class_id].off_time_usecs = 0;
    sfi_classes[class_id].off_time_interval = 0;
    sfi_classes[class_id].class_sfi_is_enabled = FALSE;

    if (sfi_enabled_class_count == 0) {
        sfi_is_enabled = FALSE;
    }

    simple_unlock(&sfi_lock);

    splx(s);

    return (KERN_SUCCESS);
}

kern_return_t sfi_get_class_offtime(sfi_class_id_t class_id, uint64_t *offtime_usecs)
{
    uint64_t off_time_us;
    spl_t s;

    if (class_id == SFI_CLASS_UNSPECIFIED || class_id >= MAX_SFI_CLASS_ID)
        return (0);

    s = splsched();

    simple_lock(&sfi_lock);
    off_time_us = sfi_classes[class_id].off_time_usecs;
    simple_unlock(&sfi_lock);

    splx(s);

    *offtime_usecs = off_time_us;

    return (KERN_SUCCESS);
}

/*
 * sfi_thread_classify and sfi_processor_active_thread_classify perform the critical
 * role of quickly categorizing a thread into its SFI class so that an AST_SFI can be
 * set. As the thread is unwinding to userspace, sfi_ast() performs full locking
 * and determines whether the thread should enter an SFI wait state. Because of
 * the inherent races between the time the AST is set and when it is evaluated,
 * thread classification can be inaccurate (but should always be safe). This is
 * especially the case for sfi_processor_active_thread_classify, which must
 * classify the active thread on a remote processor without taking the thread lock.
 * When in doubt, classification should err on the side of *not* classifying a
 * thread at all, and wait for the thread itself to either hit a quantum expiration
 * or block inside the kernel.
 */
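/*
 * Rough order of checks in sfi_thread_classify() below: kernel threads, then
 * maintenance QoS, DARWIN_BG, App Nap (latency QoS tiers 5-6), opted-out
 * threads (realtime, fixed priority, graphics server), SFI-managed tasks,
 * utility QoS, and finally a focal/non-focal split keyed on thread QoS.
 */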

/*
 * Thread must be locked. Ultimately, the real decision to enter
 * SFI wait happens at the AST boundary.
 */
sfi_class_id_t sfi_thread_classify(thread_t thread)
{
    task_t task = thread->task;
    boolean_t is_kernel_thread = (task == kernel_task);
    sched_mode_t thmode = thread->sched_mode;
    boolean_t focal = FALSE;

    int task_role = proc_get_effective_task_policy(task, TASK_POLICY_ROLE);
    int latency_qos = proc_get_effective_task_policy(task, TASK_POLICY_LATENCY_QOS);
    int managed_task = proc_get_effective_task_policy(task, TASK_POLICY_SFI_MANAGED);

    int thread_qos = proc_get_effective_thread_policy(thread, TASK_POLICY_QOS);
    int thread_bg = proc_get_effective_thread_policy(thread, TASK_POLICY_DARWIN_BG);

    /* kernel threads never reach the user AST boundary, and are in a separate world for SFI */
    if (is_kernel_thread) {
        return SFI_CLASS_KERNEL;
    }

    if (thread_qos == THREAD_QOS_MAINTENANCE)
        return SFI_CLASS_MAINTENANCE;

    if (thread_bg || thread_qos == THREAD_QOS_BACKGROUND) {
        return SFI_CLASS_DARWIN_BG;
    }

    if (latency_qos != 0) {
        int latency_qos_wtf = latency_qos - 1;

        if ((latency_qos_wtf >= 4) && (latency_qos_wtf <= 5)) {
            return SFI_CLASS_APP_NAP;
        }
    }

    /*
     * Realtime and fixed priority threads express their duty cycle constraints
     * via other mechanisms, and are opted out of (most) forms of SFI
     */
    if (thmode == TH_MODE_REALTIME || thmode == TH_MODE_FIXED || task_role == TASK_GRAPHICS_SERVER) {
        return SFI_CLASS_OPTED_OUT;
    }

    /*
     * Threads with unspecified, legacy, or user-initiated QOS class can be individually managed.
     */
    switch (task_role) {
    case TASK_CONTROL_APPLICATION:
    case TASK_FOREGROUND_APPLICATION:
        focal = TRUE;
        break;
    case TASK_BACKGROUND_APPLICATION:
    case TASK_DEFAULT_APPLICATION:
    case TASK_THROTTLE_APPLICATION:
    case TASK_UNSPECIFIED:
        /* Focal if the task is in a coalition with a FG/focal app */
        if (task_coalition_focal_count(thread->task) > 0)
            focal = TRUE;
        break;
    default:
        break;
    }

    if (managed_task) {
        switch (thread_qos) {
        case THREAD_QOS_UNSPECIFIED:
        case THREAD_QOS_LEGACY:
        case THREAD_QOS_USER_INITIATED:
            if (focal)
                return SFI_CLASS_MANAGED_FOCAL;
            else
                return SFI_CLASS_MANAGED_NONFOCAL;
        default:
            break;
        }
    }

    if (thread_qos == THREAD_QOS_UTILITY)
        return SFI_CLASS_UTILITY;

    /*
     * Classify threads in non-managed tasks
     */
    if (focal) {
        switch (thread_qos) {
        case THREAD_QOS_USER_INTERACTIVE:
            return SFI_CLASS_USER_INTERACTIVE_FOCAL;
        case THREAD_QOS_USER_INITIATED:
            return SFI_CLASS_USER_INITIATED_FOCAL;
        case THREAD_QOS_LEGACY:
            return SFI_CLASS_LEGACY_FOCAL;
        default:
            return SFI_CLASS_DEFAULT_FOCAL;
        }
    } else {
        switch (thread_qos) {
        case THREAD_QOS_USER_INTERACTIVE:
            return SFI_CLASS_USER_INTERACTIVE_NONFOCAL;
        case THREAD_QOS_USER_INITIATED:
            return SFI_CLASS_USER_INITIATED_NONFOCAL;
        case THREAD_QOS_LEGACY:
            return SFI_CLASS_LEGACY_NONFOCAL;
        default:
            return SFI_CLASS_DEFAULT_NONFOCAL;
        }
    }
}

/*
 * pset must be locked.
 */
sfi_class_id_t sfi_processor_active_thread_classify(processor_t processor)
{
    return processor->current_sfi_class;
}

/*
 * Thread must be locked. This is inherently racy; the intent is that,
 * at the AST boundary, it will be fully evaluated whether we need to
 * perform an SFI wait.
 */
ast_t sfi_thread_needs_ast(thread_t thread, sfi_class_id_t *out_class)
{
    sfi_class_id_t class_id;

    class_id = sfi_thread_classify(thread);

    if (out_class)
        *out_class = class_id;

    /* No lock taken, so a stale value may be used. */
    if (!sfi_classes[class_id].class_in_on_phase)
        return AST_SFI;
    else
        return AST_NONE;
}

/*
 * pset must be locked. We take the SFI class for
 * the currently running thread which is cached on
 * the processor_t, and assume it is accurate. In the
 * worst case, the processor will get an IPI and be asked
 * to evaluate if the current running thread at that
 * later point in time should be in an SFI wait.
 */
ast_t sfi_processor_needs_ast(processor_t processor)
{
    sfi_class_id_t class_id;

    class_id = sfi_processor_active_thread_classify(processor);

    /* No lock taken, so a stale value may be used. */
    if (!sfi_classes[class_id].class_in_on_phase)
        return AST_SFI;
    else
        return AST_NONE;
}

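/*
 * Common cleanup run by the per-class continuations when a thread wakes from
 * an SFI wait: restore the thread's sched callback (if one was suppressed),
 * clear sfi_wait_class under the sfi_lock, and credit the time spent waiting
 * to the task's per-class SFI ledger entry.
 */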
static inline void _sfi_wait_cleanup(sched_call_t callback) {
    thread_t self = current_thread();
    sfi_class_id_t current_sfi_wait_class = SFI_CLASS_UNSPECIFIED;
    int64_t sfi_wait_time, sfi_wait_begin = 0;

    spl_t s = splsched();
    thread_lock(self);
    if (callback) {
        thread_sched_call(self, callback);
    }
    sfi_wait_begin = self->wait_sfi_begin_time;
    thread_unlock(self);

    simple_lock(&sfi_lock);
    sfi_wait_time = mach_absolute_time() - sfi_wait_begin;
    current_sfi_wait_class = self->sfi_wait_class;
    self->sfi_wait_class = SFI_CLASS_UNSPECIFIED;
    simple_unlock(&sfi_lock);
    splx(s);
    assert((SFI_CLASS_UNSPECIFIED < current_sfi_wait_class) && (current_sfi_wait_class < MAX_SFI_CLASS_ID));
    ledger_credit(self->task->ledger, task_ledgers.sfi_wait_times[current_sfi_wait_class], sfi_wait_time);
}

/*
 * Called at AST context to fully evaluate if the current thread
 * (which is obviously running) should instead block in an SFI wait.
 * We must take the sfi_lock to check whether we are in the "off" period
 * for the class, and if so, block.
 */
void sfi_ast(thread_t thread)
{
    sfi_class_id_t class_id;
    spl_t s;
    struct sfi_class_state *sfi_class;
    wait_result_t waitret;
    boolean_t did_wait = FALSE;
    uint64_t tid;
    thread_continue_t continuation;
    sched_call_t workq_callback = workqueue_get_sched_callback();

    s = splsched();

    simple_lock(&sfi_lock);

    if (!sfi_is_enabled) {
        /*
         * SFI is not enabled, or has recently been disabled.
         * There is no point putting this thread on a deferred ready
         * queue, even if it were classified as needing it, since
         * SFI will truly be off at the next global off timer
         */
        simple_unlock(&sfi_lock);
        splx(s);

        return;
    }

    thread_lock(thread);
    thread->sfi_class = class_id = sfi_thread_classify(thread);
    tid = thread_tid(thread);

    /*
     * Once the sfi_lock is taken and the thread's ->sfi_class field is updated, we
     * are committed to transitioning to whatever state is indicated by "->class_in_on_phase".
     * If another thread tries to call sfi_reevaluate() after this point, it will take the
     * sfi_lock and see the thread in this wait state. If another thread calls
     * sfi_reevaluate() before this point, it would see a runnable thread and at most
     * attempt to send an AST to this processor, but we would have the most accurate
     * classification.
     */

    /* Optimistically clear workq callback while thread is already locked */
    if (workq_callback && (thread->sched_call == workq_callback)) {
        thread_sched_call(thread, NULL);
    } else {
        workq_callback = NULL;
    }
    thread_unlock(thread);

    sfi_class = &sfi_classes[class_id];
    if (!sfi_class->class_in_on_phase) {
        /* Need to block thread in wait queue */
        KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_THREAD_DEFER), tid, class_id, 0, 0, 0);

        waitret = waitq_assert_wait64(&sfi_class->waitq,
                                      CAST_EVENT64_T(class_id),
                                      THREAD_INTERRUPTIBLE,
                                      0);
        if (waitret == THREAD_WAITING) {
            thread->sfi_wait_class = class_id;
            did_wait = TRUE;
            continuation = sfi_class->continuation;
        } else {
            /* thread may be exiting already, all other errors are unexpected */
            assert(waitret == THREAD_INTERRUPTED);
        }
    }
    simple_unlock(&sfi_lock);

    splx(s);

    if (did_wait) {
        thread_block_reason(continuation, workq_callback, AST_SFI);
    } else if (workq_callback) {
        thread_reenable_sched_call(thread, workq_callback);
    }
}

/* Thread must be unlocked */
void sfi_reevaluate(thread_t thread)
{
    kern_return_t kret;
    spl_t s;
    sfi_class_id_t class_id, current_class_id;
    ast_t sfi_ast;

    s = splsched();

    simple_lock(&sfi_lock);

    thread_lock(thread);
    sfi_ast = sfi_thread_needs_ast(thread, &class_id);
    thread->sfi_class = class_id;

    /*
     * This routine chiefly exists to boost threads out of an SFI wait
     * if their classification changes before the "on" timer fires.
     *
     * If we calculate that a thread is in a different ->sfi_wait_class
     * than we think it should be (including no-SFI-wait), we need to
     * correct that:
     *
     * If the thread is in SFI wait and should not be (or should be waiting
     * on a different class' "on" timer), we wake it up. If needed, the
     * thread may immediately block again in the different SFI wait state.
     *
     * If the thread is not in an SFI wait state and it should be, we need
     * to get that thread's attention, possibly by sending an AST to another
     * processor.
     */

    if ((current_class_id = thread->sfi_wait_class) != SFI_CLASS_UNSPECIFIED) {

        thread_unlock(thread); /* not needed anymore */

        assert(current_class_id < MAX_SFI_CLASS_ID);

        if ((sfi_ast == AST_NONE) || (class_id != current_class_id)) {
            struct sfi_class_state *sfi_class = &sfi_classes[current_class_id];

            KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_WAIT_CANCELED), thread_tid(thread), current_class_id, class_id, 0, 0);

            kret = waitq_wakeup64_thread(&sfi_class->waitq,
                                         CAST_EVENT64_T(current_class_id),
                                         thread,
                                         THREAD_AWAKENED);
            assert(kret == KERN_SUCCESS || kret == KERN_NOT_WAITING);
        }
    } else {
        /*
         * Thread's current SFI wait class is not set, and because we
         * have the sfi_lock, it won't get set.
         */

        if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
            if (sfi_ast != AST_NONE) {
                if (thread == current_thread())
                    ast_on(sfi_ast);
                else {
                    processor_t processor = thread->last_processor;

                    if (processor != PROCESSOR_NULL &&
                        processor->state == PROCESSOR_RUNNING &&
                        processor->active_thread == thread) {
                        cause_ast_check(processor);
                    } else {
                        /*
                         * Runnable thread that's not on a CPU currently. When a processor
                         * does context switch to it, the AST will get set based on whether
                         * the thread is in its "off time".
                         */
                    }
                }
            }
        }

        thread_unlock(thread);
    }

    simple_unlock(&sfi_lock);
    splx(s);
}

#else /* !CONFIG_SCHED_SFI */

kern_return_t sfi_set_window(uint64_t window_usecs __unused)
{
    return (KERN_NOT_SUPPORTED);
}

kern_return_t sfi_window_cancel(void)
{
    return (KERN_NOT_SUPPORTED);
}


kern_return_t sfi_get_window(uint64_t *window_usecs __unused)
{
    return (KERN_NOT_SUPPORTED);
}


kern_return_t sfi_set_class_offtime(sfi_class_id_t class_id __unused, uint64_t offtime_usecs __unused)
{
    return (KERN_NOT_SUPPORTED);
}

kern_return_t sfi_class_offtime_cancel(sfi_class_id_t class_id __unused)
{
    return (KERN_NOT_SUPPORTED);
}

kern_return_t sfi_get_class_offtime(sfi_class_id_t class_id __unused, uint64_t *offtime_usecs __unused)
{
    return (KERN_NOT_SUPPORTED);
}

void sfi_reevaluate(thread_t thread __unused)
{
    return;
}

sfi_class_id_t sfi_thread_classify(thread_t thread)
{
    task_t task = thread->task;
    boolean_t is_kernel_thread = (task == kernel_task);

    if (is_kernel_thread) {
        return SFI_CLASS_KERNEL;
    }

    return SFI_CLASS_OPTED_OUT;
}

#endif /* !CONFIG_SCHED_SFI */