1 /*
2 * Copyright (c) 2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <mach/mach_types.h>
29 #include <kern/assert.h>
30 #include <kern/clock.h>
31 #include <kern/debug.h>
32 #include <kern/host.h>
33 #include <kern/kalloc.h>
34 #include <kern/kern_types.h>
35 #include <kern/machine.h>
36 #include <kern/simple_lock.h>
37 #include <kern/misc_protos.h>
38 #include <kern/sched.h>
39 #include <kern/sched_prim.h>
40 #include <kern/sfi.h>
41 #include <kern/timer_call.h>
42 #include <kern/wait_queue.h>
43 #include <kern/ledger.h>
44 #include <pexpert/pexpert.h>
45
46 #include <libkern/kernel_mach_header.h>
47
48 #include <sys/kdebug.h>
49
50 #define SFI_DEBUG 0
51
52 #if SFI_DEBUG
53 #define dprintf(...) kprintf(__VA_ARGS__)
54 #else
55 #define dprintf(...) do { } while(0)
56 #endif
57
58 #ifdef MACH_BSD
59 extern sched_call_t workqueue_get_sched_callback(void);
60 #endif /* MACH_BSD */
61
62 /*
63 * SFI (Selective Forced Idle) operates by enabling a global
64 * timer on the SFI window interval. When it fires, all processors
65 * running a thread that should be SFI-ed are sent an AST.
66 * As threads become runnable while in their "off phase", they
67 * are placed on a deferred ready queue. When a per-class
68 * "on timer" fires, the ready threads for that class are
69 * re-enqueued for running. As an optimization to avoid spurious
70 * wakeups, the timer may be lazily programmed.
71 */
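/*
 * Illustrative timing, with assumed (not default) numbers: given a 100 ms SFI
 * window and a 20 ms off time configured for one class, the global "off timer"
 * fires every 100 ms, marks the class off, and programs that class's "on timer"
 * for 20 ms later. Threads of that class that reach the AST boundary during
 * those 20 ms are parked on the class's wait queue; when the "on timer" fires
 * they are woken and re-enqueued, so the class runs for roughly the remaining
 * 80 ms of each window (about an 80% duty cycle).
 */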
72
73 /*
74 * The "sfi_lock" simple lock guards access to static configuration
75 * parameters (as specified by userspace), dynamic state changes
76 * (as updated by the timer event routine), and timer data structures.
77 * Since it can be taken with interrupts disabled in some cases, all
78 * uses should be taken with interrupts disabled at splsched(). The
79 * "sfi_lock" also guards the "sfi_wait_class" field of thread_t, and
80 * must only be accessed with it held.
81 *
82 * When an "on timer" fires, we must deterministically be able to drain
83 * the wait queue, since if any threads are added to the queue afterwards,
84 * they may never get woken out of SFI wait. So sfi_lock must be
85 * taken before the wait queue's own spinlock.
86 *
87 * The wait queue will take the thread's scheduling lock. We may also take
88 * the thread_lock directly to update the "sfi_class" field and determine
89 * if the thread should block in the wait queue, but the lock will be
90 * released before doing so.
91 *
92 * The pset lock may also be taken, but not while any other locks are held.
93 *
94 * splsched ---> sfi_lock ---> wait_queue ---> thread_lock
95 *        \  \              \__ thread_lock (*)
96 *         \  \__ pset_lock
97 *          \
98 *           \__ thread_lock
99 */
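/*
 * A minimal sketch of the acquisition order implied by the diagram above
 * (illustrative only; see e.g. sfi_reevaluate() for a real instance):
 *
 *	spl_t s = splsched();		// interrupts disabled first
 *	simple_lock(&sfi_lock);		// then the SFI configuration/state lock
 *	thread_lock(thread);		// then an individual thread's lock
 *	// ... classify the thread, update ->sfi_class, etc. ...
 *	thread_unlock(thread);
 *	simple_unlock(&sfi_lock);
 *	splx(s);
 */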
100
101 decl_simple_lock_data(static,sfi_lock);
102 static timer_call_data_t sfi_timer_call_entry;
103 volatile boolean_t sfi_is_enabled;
104
105 boolean_t sfi_window_is_set;
106 uint64_t sfi_window_usecs;
107 uint64_t sfi_window_interval;
108 uint64_t sfi_next_off_deadline;
109
110 typedef struct {
111 sfi_class_id_t class_id;
112 thread_continue_t class_continuation;
113 const char * class_name;
114 const char * class_ledger_name;
115 } sfi_class_registration_t;
116
117 /*
118 * To add a new SFI class:
119 *
120 * 1) Raise MAX_SFI_CLASS_ID in mach/sfi_class.h
121 * 2) Add a #define for it to mach/sfi_class.h. It need not be inserted in order of restrictiveness.
122 * 3) Add a call to SFI_CLASS_REGISTER below
123 * 4) Augment sfi_thread_classify() so it categorizes threads into the new class, checking as early as possible so that the most restrictive applicable class wins (a hypothetical registration is sketched below).
124 * 5) Modify thermald to use the SFI class
125 */
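/*
 * Hypothetical example of steps 1-4 (SFI_CLASS_EXAMPLE does not exist; it is
 * shown only to illustrate the shape of a new registration): after adding
 * SFI_CLASS_EXAMPLE to mach/sfi_class.h and raising MAX_SFI_CLASS_ID, one
 * would add
 *
 *	SFI_CLASS_REGISTER(EXAMPLE, EXAMPLE)
 *
 * below, and teach sfi_thread_classify() to return SFI_CLASS_EXAMPLE for the
 * threads it should cover.
 */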
126
127 static inline void _sfi_wait_cleanup(sched_call_t callback);
128
129 #define SFI_CLASS_REGISTER(class_id, ledger_name) \
130 extern char compile_time_assert_ ## class_id[SFI_CLASS_ ## class_id < MAX_SFI_CLASS_ID ? 1 : -1]; \
131 void __attribute__((noinline,noreturn)) SFI_ ## class_id ## _THREAD_IS_WAITING(void *callback, wait_result_t wret __unused); \
132 void SFI_ ## class_id ## _THREAD_IS_WAITING(void *callback, wait_result_t wret __unused) \
133 { \
134 _sfi_wait_cleanup(callback); \
135 thread_exception_return(); \
136 } \
137 \
138 sfi_class_registration_t SFI_ ## class_id ## _registration __attribute__((section("__DATA,__sfi_class_reg"),used)) = { SFI_CLASS_ ## class_id, SFI_ ## class_id ## _THREAD_IS_WAITING, "SFI_CLASS_" # class_id, "SFI_CLASS_" # ledger_name };
139
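/*
 * For reference, each SFI_CLASS_REGISTER() use below expands (roughly) into:
 * a compile-time check (a negative array size if SFI_CLASS_<id> is not below
 * MAX_SFI_CLASS_ID), a continuation SFI_<id>_THREAD_IS_WAITING() that calls
 * _sfi_wait_cleanup() and thread_exception_return() once the thread is woken
 * from its SFI wait, and an sfi_class_registration_t placed in the
 * "__DATA,__sfi_class_reg" section that sfi_get_registration_data() walks at
 * boot.
 */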
140 /* SFI_CLASS_UNSPECIFIED not included here */
141 SFI_CLASS_REGISTER(MAINTENANCE, MAINTENANCE)
142 SFI_CLASS_REGISTER(DARWIN_BG, DARWIN_BG)
143 SFI_CLASS_REGISTER(APP_NAP, APP_NAP)
144 SFI_CLASS_REGISTER(MANAGED_FOCAL, MANAGED)
145 SFI_CLASS_REGISTER(MANAGED_NONFOCAL, MANAGED)
146 SFI_CLASS_REGISTER(UTILITY, UTILITY)
147 SFI_CLASS_REGISTER(DEFAULT_FOCAL, DEFAULT)
148 SFI_CLASS_REGISTER(DEFAULT_NONFOCAL, DEFAULT)
149 SFI_CLASS_REGISTER(LEGACY_FOCAL, LEGACY)
150 SFI_CLASS_REGISTER(LEGACY_NONFOCAL, LEGACY)
151 SFI_CLASS_REGISTER(USER_INITIATED_FOCAL, USER_INITIATED)
152 SFI_CLASS_REGISTER(USER_INITIATED_NONFOCAL, USER_INITIATED)
153 SFI_CLASS_REGISTER(USER_INTERACTIVE_FOCAL, USER_INTERACTIVE)
154 SFI_CLASS_REGISTER(USER_INTERACTIVE_NONFOCAL, USER_INTERACTIVE)
155 SFI_CLASS_REGISTER(KERNEL, OPTED_OUT)
156 SFI_CLASS_REGISTER(OPTED_OUT, OPTED_OUT)
157
158 struct sfi_class_state {
159 uint64_t off_time_usecs;
160 uint64_t off_time_interval;
161
162 timer_call_data_t on_timer;
163 boolean_t on_timer_programmed;
164
165 boolean_t class_sfi_is_enabled;
166 volatile boolean_t class_in_on_phase;
167
168 struct wait_queue wait_queue; /* threads in ready state */
169 thread_continue_t continuation;
170
171 const char * class_name;
172 const char * class_ledger_name;
173 };
174
175 /* Static configuration performed in sfi_early_init() */
176 struct sfi_class_state sfi_classes[MAX_SFI_CLASS_ID];
177
178 int sfi_enabled_class_count;
179
180 static void sfi_timer_global_off(
181 timer_call_param_t param0,
182 timer_call_param_t param1);
183
184 static void sfi_timer_per_class_on(
185 timer_call_param_t param0,
186 timer_call_param_t param1);
187
188 static sfi_class_registration_t *
189 sfi_get_registration_data(unsigned long *count)
190 {
191 unsigned long sectlen = 0;
192 void *sectdata;
193
194 sectdata = getsectdatafromheader(&_mh_execute_header, "__DATA", "__sfi_class_reg", &sectlen);
195 if (sectdata) {
196
197 if (sectlen % sizeof(sfi_class_registration_t) != 0) {
198 /* corrupt data? */
199 panic("__sfi_class_reg section has invalid size %lu", sectlen);
200 __builtin_unreachable();
201 }
202
203 *count = sectlen / sizeof(sfi_class_registration_t);
204 return (sfi_class_registration_t *)sectdata;
205 } else {
206 panic("__sfi_class_reg section not found");
207 __builtin_unreachable();
208 }
209 }
210
211 /* Called early in boot, when kernel is single-threaded */
212 void sfi_early_init(void)
213 {
214 unsigned long i, count;
215 sfi_class_registration_t *registrations;
216
217 registrations = sfi_get_registration_data(&count);
218 for (i=0; i < count; i++) {
219 sfi_class_id_t class_id = registrations[i].class_id;
220
221 assert(class_id < MAX_SFI_CLASS_ID); /* should be caught at compile-time */
222 if (class_id < MAX_SFI_CLASS_ID) {
223 if (sfi_classes[class_id].continuation != NULL) {
224 panic("Duplicate SFI registration for class 0x%x", class_id);
225 }
226 sfi_classes[class_id].class_sfi_is_enabled = FALSE;
227 sfi_classes[class_id].class_in_on_phase = TRUE;
228 sfi_classes[class_id].continuation = registrations[i].class_continuation;
229 sfi_classes[class_id].class_name = registrations[i].class_name;
230 sfi_classes[class_id].class_ledger_name = registrations[i].class_ledger_name;
231 }
232 }
233 }
234
235 void sfi_init(void)
236 {
237 sfi_class_id_t i;
238 kern_return_t kret;
239
240 simple_lock_init(&sfi_lock, 0);
241 timer_call_setup(&sfi_timer_call_entry, sfi_timer_global_off, NULL);
242 sfi_window_is_set = FALSE;
243 sfi_enabled_class_count = 0;
244 sfi_is_enabled = FALSE;
245
246 for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
247 /* If the class was set up in sfi_early_init(), initialize remaining fields */
248 if (sfi_classes[i].continuation) {
249 timer_call_setup(&sfi_classes[i].on_timer, sfi_timer_per_class_on, (void *)(uintptr_t)i);
250 sfi_classes[i].on_timer_programmed = FALSE;
251
252 kret = wait_queue_init(&sfi_classes[i].wait_queue, SYNC_POLICY_FIFO);
253 assert(kret == KERN_SUCCESS);
254 } else {
255 /* The only allowed gap is for SFI_CLASS_UNSPECIFIED */
256 if (i != SFI_CLASS_UNSPECIFIED) {
257 panic("Gap in registered SFI classes");
258 }
259 }
260 }
261 }
262
263 /* Can be called before sfi_init() by task initialization, but after sfi_early_init() */
264 sfi_class_id_t
265 sfi_get_ledger_alias_for_class(sfi_class_id_t class_id)
266 {
267 sfi_class_id_t i;
268 const char *ledger_name = NULL;
269
270 ledger_name = sfi_classes[class_id].class_ledger_name;
271
272 /* Find the first class in the registration table with this ledger name */
273 if (ledger_name) {
274 for (i = SFI_CLASS_UNSPECIFIED + 1; i < class_id; i++) {
275 if (0 == strcmp(sfi_classes[i].class_ledger_name, ledger_name)) {
276 dprintf("sfi_get_ledger_alias_for_class(0x%x) -> 0x%x\n", class_id, i);
277 return i;
278 }
279 }
280
281 /* This class is the primary one for the ledger, so there is no alias */
282 dprintf("sfi_get_ledger_alias_for_class(0x%x) -> 0x%x\n", class_id, SFI_CLASS_UNSPECIFIED);
283 return SFI_CLASS_UNSPECIFIED;
284 }
285
286 /* We are permissive about SFI class lookup failures here; sfi_init() asserts more strictly */
287 return SFI_CLASS_UNSPECIFIED;
288 }
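/*
 * Worked example, assuming (as the registration table above suggests) that
 * SFI_CLASS_MANAGED_FOCAL has a lower class ID than SFI_CLASS_MANAGED_NONFOCAL:
 * both register the ledger name "SFI_CLASS_MANAGED", so
 * sfi_get_ledger_alias_for_class(SFI_CLASS_MANAGED_NONFOCAL) returns
 * SFI_CLASS_MANAGED_FOCAL, while passing SFI_CLASS_MANAGED_FOCAL itself returns
 * SFI_CLASS_UNSPECIFIED because it is the primary class for that ledger.
 */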
289
290 int
291 sfi_ledger_entry_add(ledger_template_t template, sfi_class_id_t class_id)
292 {
293 const char *ledger_name = NULL;
294
295 ledger_name = sfi_classes[class_id].class_ledger_name;
296
297 dprintf("sfi_ledger_entry_add(%p, 0x%x) -> %s\n", template, class_id, ledger_name);
298 return ledger_entry_add(template, ledger_name, "sfi", "MATUs");
299 }
300
301 static void sfi_timer_global_off(
302 timer_call_param_t param0 __unused,
303 timer_call_param_t param1 __unused)
304 {
305 uint64_t now = mach_absolute_time();
306 sfi_class_id_t i;
307 processor_set_t pset, nset;
308 processor_t processor;
309 uint32_t needs_cause_ast_mask = 0x0;
310 spl_t s;
311
312 s = splsched();
313
314 simple_lock(&sfi_lock);
315 if (!sfi_is_enabled) {
316 /* If SFI has been disabled, let all "on" timers drain naturally */
317 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_OFF_TIMER) | DBG_FUNC_NONE, 1, 0, 0, 0, 0);
318
319 simple_unlock(&sfi_lock);
320 splx(s);
321 return;
322 }
323
324 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_OFF_TIMER) | DBG_FUNC_START, 0, 0, 0, 0, 0);
325
326 /* First, set all configured classes into the off state and program their "on" timers */
327 for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
328 if (sfi_classes[i].class_sfi_is_enabled) {
329 uint64_t on_timer_deadline;
330
331 sfi_classes[i].class_in_on_phase = FALSE;
332 sfi_classes[i].on_timer_programmed = TRUE;
333
334 /* Push out on-timer */
335 on_timer_deadline = now + sfi_classes[i].off_time_interval;
336 timer_call_enter1(&sfi_classes[i].on_timer, NULL, on_timer_deadline, TIMER_CALL_SYS_CRITICAL);
337 } else {
338 /* If this class no longer needs SFI, make sure the timer is cancelled */
339 sfi_classes[i].class_in_on_phase = TRUE;
340 if (sfi_classes[i].on_timer_programmed) {
341 sfi_classes[i].on_timer_programmed = FALSE;
342 timer_call_cancel(&sfi_classes[i].on_timer);
343 }
344 }
345 }
346 simple_unlock(&sfi_lock);
347
348 /* Iterate over processors, call cause_ast_check() on ones running a thread that should be in an off phase */
349 processor = processor_list;
350 pset = processor->processor_set;
351
352 pset_lock(pset);
353
354 do {
355 nset = processor->processor_set;
356 if (nset != pset) {
357 pset_unlock(pset);
358 pset = nset;
359 pset_lock(pset);
360 }
361
362 /* "processor" and its pset are locked */
363 if (processor->state == PROCESSOR_RUNNING) {
364 if (AST_NONE != sfi_processor_needs_ast(processor)) {
365 needs_cause_ast_mask |= (1U << processor->cpu_id);
366 }
367 }
368 } while ((processor = processor->processor_list) != NULL);
369
370 pset_unlock(pset);
371
372 processor = processor_list;
373 do {
374 if (needs_cause_ast_mask & (1U << processor->cpu_id)) {
375 if (processor == current_processor())
376 ast_on(AST_SFI);
377 else
378 cause_ast_check(processor);
379 }
380 } while ((processor = processor->processor_list) != NULL);
381
382 /* Re-arm timer if still enabled */
383 simple_lock(&sfi_lock);
384 if (sfi_is_enabled) {
385 clock_deadline_for_periodic_event(sfi_window_interval,
386 now,
387 &sfi_next_off_deadline);
388 timer_call_enter1(&sfi_timer_call_entry,
389 NULL,
390 sfi_next_off_deadline,
391 TIMER_CALL_SYS_CRITICAL);
392 }
393
394 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_OFF_TIMER) | DBG_FUNC_END, 0, 0, 0, 0, 0);
395
396 simple_unlock(&sfi_lock);
397
398 splx(s);
399 }
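/*
 * Design note on sfi_timer_global_off(): the CPUs that need attention are
 * first recorded in a bitmask while walking the psets under their locks, and
 * the ASTs are only raised in a second pass with no pset lock held, presumably
 * so that cause_ast_check() IPIs are not issued while holding a pset lock.
 */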
400
401 static void sfi_timer_per_class_on(
402 timer_call_param_t param0,
403 timer_call_param_t param1 __unused)
404 {
405 sfi_class_id_t sfi_class_id = (sfi_class_id_t)(uintptr_t)param0;
406 struct sfi_class_state *sfi_class = &sfi_classes[sfi_class_id];
407 kern_return_t kret;
408 spl_t s;
409
410 s = splsched();
411
412 simple_lock(&sfi_lock);
413
414 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_ON_TIMER) | DBG_FUNC_START, sfi_class_id, 0, 0, 0, 0);
415
416 /*
417 * Any threads that may have accumulated in the ready queue for this class should get re-enqueued.
418 * Since we have the sfi_lock held and have changed "class_in_on_phase", we expect
419 * no new threads to be put on this wait queue until the global "off timer" has fired.
420 */
421 sfi_class->class_in_on_phase = TRUE;
422 kret = wait_queue_wakeup64_all(&sfi_class->wait_queue,
423 CAST_EVENT64_T(sfi_class_id),
424 THREAD_AWAKENED);
425 assert(kret == KERN_SUCCESS || kret == KERN_NOT_WAITING);
426
427 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_ON_TIMER) | DBG_FUNC_END, 0, 0, 0, 0, 0);
428
429 simple_unlock(&sfi_lock);
430
431 splx(s);
432 }
433
434
435 kern_return_t sfi_set_window(uint64_t window_usecs)
436 {
437 uint64_t interval, deadline;
438 uint64_t now = mach_absolute_time();
439 sfi_class_id_t i;
440 spl_t s;
441 uint64_t largest_class_off_interval = 0;
442
443 if (window_usecs < MIN_SFI_WINDOW_USEC)
444 window_usecs = MIN_SFI_WINDOW_USEC;
445
446 if (window_usecs > UINT32_MAX)
447 return (KERN_INVALID_ARGUMENT);
448
449 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_SET_WINDOW), window_usecs, 0, 0, 0, 0);
450
451 clock_interval_to_absolutetime_interval((uint32_t)window_usecs, NSEC_PER_USEC, &interval);
452 deadline = now + interval;
453
454 s = splsched();
455
456 simple_lock(&sfi_lock);
457
458 /* Check that the new SFI window is not smaller than any enabled class's off time */
459 for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
460 if (sfi_classes[i].class_sfi_is_enabled) {
461 largest_class_off_interval = MAX(largest_class_off_interval, sfi_classes[i].off_time_interval);
462 }
463 }
464
465 /*
466 * The SFI window must be strictly greater than every enabled class's off time,
467 * otherwise threads would build up on the deferred ready queue and never get to run.
468 */
469 if (interval <= largest_class_off_interval) {
470 simple_unlock(&sfi_lock);
471 splx(s);
472 return (KERN_INVALID_ARGUMENT);
473 }
474
475 /*
476 * If the new "off" deadline is further out than the current programmed timer,
477 * just let the current one expire (and the new cadence will be established thereafter).
478 * If the new "off" deadline is nearer than the current one, bring it in, so we
479 * can start the new behavior sooner. Note that this may cause the "off" timer to
480 * fire before some of the class "on" timers have fired.
481 */
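/*
 * For example (illustrative numbers): if the window is currently 500 ms and
 * the armed "off" timer is due 200 ms from now, shrinking the window to 100 ms
 * reprograms that timer for 100 ms from now, while growing it to 1 s leaves
 * the armed timer alone (it still fires 200 ms from now) and the 1 s cadence
 * takes hold after that expiration.
 */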
482 sfi_window_usecs = window_usecs;
483 sfi_window_interval = interval;
484 sfi_window_is_set = TRUE;
485
486 if (sfi_enabled_class_count == 0) {
487 /* Can't program timer yet */
488 } else if (!sfi_is_enabled) {
489 sfi_is_enabled = TRUE;
490 sfi_next_off_deadline = deadline;
491 timer_call_enter1(&sfi_timer_call_entry,
492 NULL,
493 sfi_next_off_deadline,
494 TIMER_CALL_SYS_CRITICAL);
495 } else if (deadline >= sfi_next_off_deadline) {
496 sfi_next_off_deadline = deadline;
497 } else {
498 sfi_next_off_deadline = deadline;
499 timer_call_enter1(&sfi_timer_call_entry,
500 NULL,
501 sfi_next_off_deadline,
502 TIMER_CALL_SYS_CRITICAL);
503 }
504
505 simple_unlock(&sfi_lock);
506 splx(s);
507
508 return (KERN_SUCCESS);
509 }
510
511 kern_return_t sfi_window_cancel(void)
512 {
513 spl_t s;
514
515 s = splsched();
516
517 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_CANCEL_WINDOW), 0, 0, 0, 0, 0);
518
519 /* Disable globals so that global "off-timer" is not re-armed */
520 simple_lock(&sfi_lock);
521 sfi_window_is_set = FALSE;
522 sfi_window_usecs = 0;
523 sfi_window_interval = 0;
524 sfi_next_off_deadline = 0;
525 sfi_is_enabled = FALSE;
526 simple_unlock(&sfi_lock);
527
528 splx(s);
529
530 return (KERN_SUCCESS);
531 }
532
533
534 kern_return_t sfi_get_window(uint64_t *window_usecs)
535 {
536 spl_t s;
537 uint64_t off_window_us;
538
539 s = splsched();
540 simple_lock(&sfi_lock);
541
542 off_window_us = sfi_window_usecs;
543
544 simple_unlock(&sfi_lock);
545 splx(s);
546
547 *window_usecs = off_window_us;
548
549 return (KERN_SUCCESS);
550 }
551
552
553 kern_return_t sfi_set_class_offtime(sfi_class_id_t class_id, uint64_t offtime_usecs)
554 {
555 uint64_t interval;
556 spl_t s;
557 uint64_t off_window_interval;
558
559 if (offtime_usecs < MIN_SFI_WINDOW_USEC)
560 offtime_usecs = MIN_SFI_WINDOW_USEC;
561
562 if (class_id == SFI_CLASS_UNSPECIFIED || class_id >= MAX_SFI_CLASS_ID)
563 return (KERN_INVALID_ARGUMENT);
564
565 if (offtime_usecs > UINT32_MAX)
566 return (KERN_INVALID_ARGUMENT);
567
568 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_SET_CLASS_OFFTIME), offtime_usecs, class_id, 0, 0, 0);
569
570 clock_interval_to_absolutetime_interval((uint32_t)offtime_usecs, NSEC_PER_USEC, &interval);
571
572 s = splsched();
573
574 simple_lock(&sfi_lock);
575 off_window_interval = sfi_window_interval;
576
577 /* Check that the class off time is not being made as large as, or larger than, the SFI window */
578 if (off_window_interval && (interval >= off_window_interval)) {
579 simple_unlock(&sfi_lock);
580 splx(s);
581 return (KERN_INVALID_ARGUMENT);
582 }
583
584 /* We never re-program the per-class on-timer, but rather just let it expire naturally */
585 if (!sfi_classes[class_id].class_sfi_is_enabled) {
586 sfi_enabled_class_count++;
587 }
588 sfi_classes[class_id].off_time_usecs = offtime_usecs;
589 sfi_classes[class_id].off_time_interval = interval;
590 sfi_classes[class_id].class_sfi_is_enabled = TRUE;
591
592 if (sfi_window_is_set && !sfi_is_enabled) {
593 /* start global off timer */
594 sfi_is_enabled = TRUE;
595 sfi_next_off_deadline = mach_absolute_time() + sfi_window_interval;
596 timer_call_enter1(&sfi_timer_call_entry,
597 NULL,
598 sfi_next_off_deadline,
599 TIMER_CALL_SYS_CRITICAL);
600 }
601
602 simple_unlock(&sfi_lock);
603
604 splx(s);
605
606 return (KERN_SUCCESS);
607 }
608
609 kern_return_t sfi_class_offtime_cancel(sfi_class_id_t class_id)
610 {
611 spl_t s;
612
613 if (class_id == SFI_CLASS_UNSPECIFIED || class_id >= MAX_SFI_CLASS_ID)
614 return (KERN_INVALID_ARGUMENT);
615
616 s = splsched();
617
618 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_CANCEL_CLASS_OFFTIME), class_id, 0, 0, 0, 0);
619
620 simple_lock(&sfi_lock);
621
622 /* We never re-program the per-class on-timer, but rather just let it expire naturally */
623 if (sfi_classes[class_id].class_sfi_is_enabled) {
624 sfi_enabled_class_count--;
625 }
626 sfi_classes[class_id].off_time_usecs = 0;
627 sfi_classes[class_id].off_time_interval = 0;
628 sfi_classes[class_id].class_sfi_is_enabled = FALSE;
629
630 if (sfi_enabled_class_count == 0) {
631 sfi_is_enabled = FALSE;
632 }
633
634 simple_unlock(&sfi_lock);
635
636 splx(s);
637
638 return (KERN_SUCCESS);
639 }
640
641 kern_return_t sfi_get_class_offtime(sfi_class_id_t class_id, uint64_t *offtime_usecs)
642 {
643 uint64_t off_time_us;
644 spl_t s;
645
646 if (class_id == SFI_CLASS_UNSPECIFIED || class_id >= MAX_SFI_CLASS_ID)
647 return (KERN_INVALID_ARGUMENT);
648
649 s = splsched();
650
651 simple_lock(&sfi_lock);
652 off_time_us = sfi_classes[class_id].off_time_usecs;
653 simple_unlock(&sfi_lock);
654
655 splx(s);
656
657 *offtime_usecs = off_time_us;
658
659 return (KERN_SUCCESS);
660 }
661
662 /*
663 * sfi_thread_classify and sfi_processor_active_thread_classify perform the critical
664 * role of quickly categorizing a thread into its SFI class so that an AST_SFI can be
665 * set. As the thread is unwinding to userspace, sfi_ast() performs full locking
666 * and determines whether the thread should enter an SFI wait state. Because of
667 * the inherent races between the time the AST is set and when it is evaluated,
668 * thread classification can be inaccurate (but should always be safe). This is
669 * especially the case for sfi_processor_active_thread_classify, which must
670 * classify the active thread on a remote processor without taking the thread lock.
671 * When in doubt, classification should err on the side of *not* classifying a
672 * thread at all, and wait for the thread itself to either hit a quantum expiration
673 * or block inside the kernel.
674 */
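/*
 * A rough sketch of that flow (illustrative; not the literal call sites):
 *
 *	// scheduler/timer context, thread or processor already locked:
 *	if (sfi_thread_needs_ast(thread, NULL) != AST_NONE)
 *		ast_on(AST_SFI);	// or cause_ast_check() for a remote CPU
 *
 *	// AST context, as the thread unwinds toward userspace:
 *	sfi_ast(thread);		// takes sfi_lock, re-classifies, and may block
 *					// on the class wait queue until the per-class
 *					// "on" timer fires
 */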
675
676 /*
677 * Thread must be locked. Ultimately, the real decision to enter
678 * SFI wait happens at the AST boundary.
679 */
680 sfi_class_id_t sfi_thread_classify(thread_t thread)
681 {
682 task_t task = thread->task;
683 boolean_t is_kernel_thread = (task == kernel_task);
684 sched_mode_t thmode = thread->sched_mode;
685 int latency_qos = proc_get_effective_task_policy(task, TASK_POLICY_LATENCY_QOS);
686 int task_role = proc_get_effective_task_policy(task, TASK_POLICY_ROLE);
687 int thread_bg = proc_get_effective_thread_policy(thread, TASK_POLICY_DARWIN_BG);
688 int managed_task = proc_get_effective_task_policy(task, TASK_POLICY_SFI_MANAGED);
689 int thread_qos = proc_get_effective_thread_policy(thread, TASK_POLICY_QOS);
690
691 /* kernel threads never reach the user AST boundary, and are in a separate world for SFI */
692 if (is_kernel_thread) {
693 return SFI_CLASS_KERNEL;
694 }
695
696 if (thread_qos == THREAD_QOS_MAINTENANCE)
697 return SFI_CLASS_MAINTENANCE;
698
699 if (thread_bg || thread_qos == THREAD_QOS_BACKGROUND) {
700 return SFI_CLASS_DARWIN_BG;
701 }
702
703 if (latency_qos != 0) {
704 int latency_qos_tier = latency_qos - 1;
705
706 if ((latency_qos_tier >= 4) && (latency_qos_tier <= 5)) {
707 return SFI_CLASS_APP_NAP;
708 }
709 }
710
711 /*
712 * Realtime and fixed priority threads express their duty cycle constraints
713 * via other mechanisms, and are opted out of (most) forms of SFI
714 */
715 if (thmode == TH_MODE_REALTIME || thmode == TH_MODE_FIXED || task_role == TASK_GRAPHICS_SERVER) {
716 return SFI_CLASS_OPTED_OUT;
717 }
718
719 /*
720 * Threads with unspecified or legacy QOS class can be individually managed
721 */
722 if (managed_task &&
723 (thread_qos == THREAD_QOS_UNSPECIFIED || thread_qos == THREAD_QOS_LEGACY)) {
724 if (task_role == TASK_FOREGROUND_APPLICATION || task_role == TASK_CONTROL_APPLICATION)
725 return SFI_CLASS_MANAGED_FOCAL;
726 else
727 return SFI_CLASS_MANAGED_NONFOCAL;
728 }
729
730 if (thread_qos == THREAD_QOS_UTILITY)
731 return SFI_CLASS_UTILITY;
732
733 if (task_role == TASK_FOREGROUND_APPLICATION || task_role == TASK_CONTROL_APPLICATION) {
734 switch (thread_qos) {
735 case THREAD_QOS_USER_INTERACTIVE:
736 return SFI_CLASS_USER_INTERACTIVE_FOCAL;
737 case THREAD_QOS_USER_INITIATED:
738 return SFI_CLASS_USER_INITIATED_FOCAL;
739 case THREAD_QOS_LEGACY:
740 return SFI_CLASS_LEGACY_FOCAL;
741 default:
742 return SFI_CLASS_DEFAULT_FOCAL;
743 }
744 } else {
745 switch (thread_qos) {
746 case THREAD_QOS_USER_INTERACTIVE:
747 return SFI_CLASS_USER_INTERACTIVE_NONFOCAL;
748 case THREAD_QOS_USER_INITIATED:
749 return SFI_CLASS_USER_INITIATED_NONFOCAL;
750 case THREAD_QOS_LEGACY:
751 return SFI_CLASS_LEGACY_NONFOCAL;
752 default:
753 return SFI_CLASS_DEFAULT_NONFOCAL;
754 }
755 }
756 }
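/*
 * Worked example of the precedence above: a thread with DARWIN_BG set that
 * belongs to an SFI-managed foreground application classifies as
 * SFI_CLASS_DARWIN_BG rather than SFI_CLASS_MANAGED_FOCAL, because the
 * background check runs before the managed-task check; a realtime thread in
 * the same task classifies as SFI_CLASS_OPTED_OUT.
 */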
757
758 /*
759 * pset must be locked.
760 */
761 sfi_class_id_t sfi_processor_active_thread_classify(processor_t processor)
762 {
763 return processor->current_sfi_class;
764 }
765
766 /*
767 * Thread must be locked. This is inherently racy, with the intent that,
768 * at the AST boundary, it will be fully evaluated whether we need to
769 * put the thread into an SFI wait.
770 */
771 ast_t sfi_thread_needs_ast(thread_t thread, sfi_class_id_t *out_class)
772 {
773 sfi_class_id_t class_id;
774
775 class_id = sfi_thread_classify(thread);
776
777 if (out_class)
778 *out_class = class_id;
779
780 /* No lock taken, so a stale value may be used. */
781 if (!sfi_classes[class_id].class_in_on_phase)
782 return AST_SFI;
783 else
784 return AST_NONE;
785 }
786
787 /*
788 * pset must be locked. We take the SFI class of
789 * the currently running thread, which is cached on
790 * the processor_t, and assume it is accurate. In the
791 * worst case, the processor will get an IPI and be asked
792 * to re-evaluate whether the thread running at that
793 * later point in time should be in an SFI wait.
794 */
795 ast_t sfi_processor_needs_ast(processor_t processor)
796 {
797 sfi_class_id_t class_id;
798
799 class_id = sfi_processor_active_thread_classify(processor);
800
801 /* No lock taken, so a stale value may be used. */
802 if (!sfi_classes[class_id].class_in_on_phase)
803 return AST_SFI;
804 else
805 return AST_NONE;
806
807 }
808
809 static inline void _sfi_wait_cleanup(sched_call_t callback) {
810 thread_t self = current_thread();
811 sfi_class_id_t current_sfi_wait_class = SFI_CLASS_UNSPECIFIED;
812 int64_t sfi_wait_time, sfi_wait_begin = 0;
813
814 spl_t s = splsched();
815 thread_lock(self);
816 if (callback) {
817 thread_sched_call(self, callback);
818 }
819 sfi_wait_begin = self->wait_sfi_begin_time;
820 thread_unlock(self);
821
822 simple_lock(&sfi_lock);
823 sfi_wait_time = mach_absolute_time() - sfi_wait_begin;
824 current_sfi_wait_class = self->sfi_wait_class;
825 self->sfi_wait_class = SFI_CLASS_UNSPECIFIED;
826 simple_unlock(&sfi_lock);
827 splx(s);
828 assert((SFI_CLASS_UNSPECIFIED < current_sfi_wait_class) && (current_sfi_wait_class < MAX_SFI_CLASS_ID));
829 ledger_credit(self->task->ledger, task_ledgers.sfi_wait_times[current_sfi_wait_class], sfi_wait_time);
830 }
831
832 /*
833 * Called at AST context to fully evaluate if the current thread
834 * (which is obviously running) should instead block in an SFI wait.
835 * We must take the sfi_lock to check whether we are in the "off" period
836 * for the class, and if so, block.
837 */
838 void sfi_ast(thread_t thread)
839 {
840 sfi_class_id_t class_id;
841 spl_t s;
842 struct sfi_class_state *sfi_class;
843 wait_result_t waitret;
844 boolean_t did_wait = FALSE;
845 uint64_t tid;
846 thread_continue_t continuation;
847 sched_call_t workq_callback = workqueue_get_sched_callback();
848 boolean_t did_clear_wq = FALSE;
849
850 s = splsched();
851
852 simple_lock(&sfi_lock);
853
854 if (!sfi_is_enabled) {
855 /*
856 * SFI is not enabled, or has recently been disabled.
857 * There is no point putting this thread on a deferred ready
858 * queue, even if it were classified as needing it, since
859 * SFI will truly be off at the next global off timer
860 */
861 simple_unlock(&sfi_lock);
862 splx(s);
863
864 return;
865 }
866
867 thread_lock(thread);
868 thread->sfi_class = class_id = sfi_thread_classify(thread);
869 tid = thread_tid(thread);
870
871 /*
872 * Once the sfi_lock is taken and the thread's ->sfi_class field is updated, we
873 * are committed to transitioning to whatever state is indicated by "->class_in_on_phase".
874 * If another thread tries to call sfi_reevaluate() after this point, it will take the
875 * sfi_lock and see the thread in this wait state. If another thread calls
876 * sfi_reevaluate() before this point, it would see a runnable thread and at most
877 * attempt to send an AST to this processor, but we would have the most accurate
878 * classification.
879 */
880
881 /* Optimistically clear workq callback while thread is already locked */
882 if (workq_callback && (thread->sched_call == workq_callback)) {
883 thread_sched_call(thread, NULL);
884 did_clear_wq = TRUE;
885 }
886 thread_unlock(thread);
887
888 sfi_class = &sfi_classes[class_id];
889 if (!sfi_class->class_in_on_phase) {
890 /* Need to block thread in wait queue */
891 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_THREAD_DEFER), tid, class_id, 0, 0, 0);
892
893 waitret = wait_queue_assert_wait64(&sfi_class->wait_queue,
894 CAST_EVENT64_T(class_id),
895 THREAD_INTERRUPTIBLE,
896 0);
897 if (waitret == THREAD_WAITING) {
898 thread->sfi_wait_class = class_id;
899 did_wait = TRUE;
900 continuation = sfi_class->continuation;
901 } else {
902 /* thread may be exiting already, all other errors are unexpected */
903 assert(waitret == THREAD_INTERRUPTED);
904 }
905 }
906 simple_unlock(&sfi_lock);
907
908 splx(s);
909
910 if (did_wait) {
911 thread_block_reason(continuation, did_clear_wq ? workq_callback : NULL, AST_SFI);
912 } else {
913 if (did_clear_wq) {
914 s = splsched();
915 thread_lock(thread);
916 thread_sched_call(thread, workq_callback);
917 thread_unlock(thread);
918 splx(s);
919 }
920 }
921 }
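/*
 * Note on the continuation path above: thread_block_reason() is handed the
 * per-class continuation (SFI_<class>_THREAD_IS_WAITING from the registration
 * macro) and, as its parameter, the workqueue sched_call that was cleared.
 * When the per-class "on" timer wakes the thread, the continuation runs
 * instead of returning here: _sfi_wait_cleanup() re-installs that sched_call,
 * credits the SFI wait time to the task ledger, and the thread returns to
 * userspace via thread_exception_return().
 */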
922
923 /* Thread must be unlocked */
924 void sfi_reevaluate(thread_t thread)
925 {
926 kern_return_t kret;
927 spl_t s;
928 sfi_class_id_t class_id, current_class_id;
929 ast_t sfi_ast;
930
931 s = splsched();
932
933 simple_lock(&sfi_lock);
934
935 thread_lock(thread);
936 sfi_ast = sfi_thread_needs_ast(thread, &class_id);
937 thread->sfi_class = class_id;
938
939 /*
940 * This routine chiefly exists to boost threads out of an SFI wait
941 * if their classification changes before the "on" timer fires.
942 *
943 * If we calculate that a thread is in a different ->sfi_wait_class
944 * than we think it should be (including no-SFI-wait), we need to
945 * correct that:
946 *
947 * If the thread is in SFI wait and should not be (or should be waiting
948 * on a different class' "on" timer), we wake it up. If needed, the
949 * thread may immediately block again in the different SFI wait state.
950 *
951 * If the thread is not in an SFI wait state and it should be, we need
952 * to get that thread's attention, possibly by sending an AST to another
953 * processor.
954 */
955
956 if ((current_class_id = thread->sfi_wait_class) != SFI_CLASS_UNSPECIFIED) {
957
958 thread_unlock(thread); /* not needed anymore */
959
960 assert(current_class_id < MAX_SFI_CLASS_ID);
961
962 if ((sfi_ast == AST_NONE) || (class_id != current_class_id)) {
963 struct sfi_class_state *sfi_class = &sfi_classes[current_class_id];
964
965 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_WAIT_CANCELED), thread_tid(thread), current_class_id, class_id, 0, 0);
966
967 kret = wait_queue_wakeup64_thread(&sfi_class->wait_queue,
968 CAST_EVENT64_T(current_class_id),
969 thread,
970 THREAD_AWAKENED);
971 assert(kret == KERN_SUCCESS || kret == KERN_NOT_WAITING);
972 }
973 } else {
974 /*
975 * Thread's current SFI wait class is not set, and because we
976 * have the sfi_lock, it won't get set.
977 */
978
979 if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
980 if (sfi_ast != AST_NONE) {
981 if (thread == current_thread())
982 ast_on(sfi_ast);
983 else {
984 processor_t processor = thread->last_processor;
985
986 if (processor != PROCESSOR_NULL &&
987 processor->state == PROCESSOR_RUNNING &&
988 processor->active_thread == thread) {
989 cause_ast_check(processor);
990 } else {
991 /*
992 * Runnable thread that's not currently on a CPU. When a processor
993 * context switches to it, the AST will get set based on whether
994 * the thread is in its "off time".
995 */
996 }
997 }
998 }
999 }
1000
1001 thread_unlock(thread);
1002 }
1003
1004 simple_unlock(&sfi_lock);
1005 splx(s);
1006 }