apple/xnu: osfmk/kern/work_interval.c
1 /*
2 * Copyright (c) 2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29
30 #include <sys/work_interval.h>
31
32 #include <kern/work_interval.h>
33
34 #include <kern/thread.h>
35 #include <kern/sched_prim.h>
36 #include <kern/machine.h>
37 #include <kern/thread_group.h>
38 #include <kern/ipc_kobject.h>
39 #include <kern/task.h>
40 #include <kern/coalition.h>
41 #include <kern/policy_internal.h>
42 #include <kern/mpsc_queue.h>
43
44 #include <mach/kern_return.h>
45 #include <mach/notify.h>
46 #include <os/refcnt.h>
47
48 #include <stdatomic.h>
49
50 /*
51 * With the introduction of auto-join work intervals, it is possible
52 * to change the work interval (and related thread group) of a thread in a
53 * variety of contexts (thread termination, context switch, thread mode
54 * change etc.). In order to clearly specify the policy expectation and
55 * the locking behavior, all calls to thread_set_work_interval() pass
56 * in a set of flags.
57 */
58
59 __options_decl(thread_work_interval_options_t, uint32_t, {
60 /* Change the work interval using the explicit join rules */
61 THREAD_WI_EXPLICIT_JOIN_POLICY = 0x1,
62 /* Change the work interval using the auto-join rules */
63 THREAD_WI_AUTO_JOIN_POLICY = 0x2,
64 /* Caller already holds the thread lock */
65 THREAD_WI_THREAD_LOCK_HELD = 0x4,
66 /* Caller does not hold the thread lock */
67 THREAD_WI_THREAD_LOCK_NEEDED = 0x8,
68 /* Change the work interval from the context switch path (thread may not be running or on a runq) */
69 THREAD_WI_THREAD_CTX_SWITCH = 0x10,
70 });
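/*
 * Illustrative sketch (not part of the build): typical flag combinations used
 * by the callers later in this file. Explicit joins are always performed by
 * the current thread without the thread lock held; auto-join operations come
 * in from scheduler paths with the thread lock already held.
 *
 *     // Explicit join/un-join on behalf of the current thread:
 *     thread_set_work_interval(thread, work_interval,
 *         THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
 *
 *     // Auto-join from the context-switch path, thread lock already held:
 *     thread_set_work_interval(thread, work_interval,
 *         THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD |
 *         THREAD_WI_THREAD_CTX_SWITCH);
 */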
71
72 static kern_return_t thread_set_work_interval(thread_t, struct work_interval *, thread_work_interval_options_t);
73
74 #if CONFIG_SCHED_AUTO_JOIN
75 /* MPSC queue used to defer the deallocation of work intervals */
76 static struct mpsc_daemon_queue work_interval_deallocate_queue;
77
78 static void work_interval_deferred_release(struct work_interval *);
79
80 /*
81 * Work Interval Auto-Join Status
82 *
83 * work_interval_auto_join_status_t represents the state of auto-join for a given work interval.
84 * It packs the following information:
85 * - A bit indicating whether a "finish" is deferred on the work interval
86 * - The number of threads currently auto-joined to the work interval
87 */
88 #define WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK ((uint32_t)(1 << 31))
89 #define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK ((uint32_t)(WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK - 1))
90 #define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK
91 typedef uint32_t work_interval_auto_join_status_t;
92
93 static inline bool __unused
94 work_interval_status_deferred_finish(work_interval_auto_join_status_t status)
95 {
96 return (status & WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) ? true : false;
97 }
98
99 static inline uint32_t __unused
100 work_interval_status_auto_join_count(work_interval_auto_join_status_t status)
101 {
102 return (uint32_t)(status & WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK);
103 }
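/*
 * Worked example of the packed encoding (illustrative only): a status value of
 * (WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK | 3), i.e. 0x80000003, means
 * "a finish is deferred and 3 threads are currently auto-joined":
 *
 *     work_interval_auto_join_status_t status =
 *         WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK | 3;
 *     assert(work_interval_status_deferred_finish(status) == true);
 *     assert(work_interval_status_auto_join_count(status) == 3);
 */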
104
105 /*
106 * struct work_interval_deferred_finish_state
107 *
108 * Contains the parameters of the finish operation which is being deferred.
109 */
110 struct work_interval_deferred_finish_state {
111 uint64_t instance_id;
112 uint64_t start;
113 uint64_t deadline;
114 uint64_t complexity;
115 };
116
117 struct work_interval_auto_join_info {
118 struct work_interval_deferred_finish_state deferred_finish_state;
119 work_interval_auto_join_status_t _Atomic status;
120 };
121 #endif /* CONFIG_SCHED_AUTO_JOIN */
122
123 /*
124 * Work Interval structs
125 *
126 * This struct represents a thread group and/or work interval context;
127 * it is exposed as a work-interval kobject port when the interval is joinable.
128 *
129 * Every thread that has joined a WI has a +1 ref, and the port
130 * has a +1 ref as well.
131 *
132 * TODO: groups need to have a 'is for WI' flag
133 * and they need a flag to create that says 'for WI'
134 * This would allow CLPC to avoid allocating WI support
135 * data unless it is needed
136 *
137 * TODO: Enforce not having more than one non-group joinable work
138 * interval per thread group.
139 * CLPC only wants to see one WI-notify callout per group.
140 */
141
142 struct work_interval {
143 uint64_t wi_id;
144 struct os_refcnt wi_ref_count;
145 uint32_t wi_create_flags;
146
147 /* for debugging purposes only, does not hold a ref on port */
148 ipc_port_t wi_port;
149
150 /*
151 * holds uniqueid and version of creating process,
152 * used to permission-gate notify
153 * TODO: you'd think there would be a better way to do this
154 */
155 uint64_t wi_creator_uniqueid;
156 uint32_t wi_creator_pid;
157 int wi_creator_pidversion;
158
159 #if CONFIG_THREAD_GROUPS
160 struct thread_group *wi_group; /* holds +1 ref on group */
161 #endif /* CONFIG_THREAD_GROUPS */
162
163 #if CONFIG_SCHED_AUTO_JOIN
164 /* Information related to auto-join and deferred finish for work interval */
165 struct work_interval_auto_join_info wi_auto_join_info;
166
167 /*
168 * Since the deallocation of auto-join work intervals
169 * can happen in the scheduler when the last thread in
170 * the WI blocks and the thread lock is held, the deallocation
171 * might have to be done on a separate thread.
172 */
173 struct mpsc_queue_chain wi_deallocate_link;
174 #endif /* CONFIG_SCHED_AUTO_JOIN */
175 };
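/*
 * Reference-count lifecycle, summarized for illustration from the rules used
 * in this file: kern_work_interval_create() allocates the work interval with
 * a +1 ref. For a JOINABLE interval that ref moves to the kobject port; for a
 * legacy combined create-and-join it moves to the creating thread. Every
 * thread that joins (explicitly or via auto-join) holds its own +1 ref, which
 * thread_set_work_interval() drops when the thread un-joins or switches to a
 * different interval. When the last send right to the port goes away,
 * work_interval_port_notify() drops the port's ref.
 */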
176
177 #if CONFIG_SCHED_AUTO_JOIN
178
179 /*
180 * work_interval_perform_deferred_finish()
181 *
182 * Perform a deferred finish for a work interval. The routine accepts the deferred_finish_state as an
183 * argument rather than looking at the work_interval since the deferred finish can race with another
184 * start-finish cycle. To address that, the caller ensures that it gets a consistent snapshot of the
185 * deferred state before calling this routine. This allows the racing start-finish cycle to overwrite
186 * the deferred state without issues.
187 */
188 static inline void
189 work_interval_perform_deferred_finish(__unused struct work_interval_deferred_finish_state *deferred_finish_state,
190 __unused struct work_interval *work_interval, __unused thread_t thread)
191 {
192
193 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_DEFERRED_FINISH),
194 thread_tid(thread), thread_group_get_id(work_interval->wi_group));
195 }
196
197 /*
198 * work_interval_auto_join_increment()
199 *
200 * Routine to increment auto-join counter when a new thread is auto-joined to
201 * the work interval.
202 */
203 static void
204 work_interval_auto_join_increment(struct work_interval *work_interval)
205 {
206 struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
207 __assert_only work_interval_auto_join_status_t old_status = os_atomic_add_orig(&join_info->status, 1, relaxed);
208 assert(work_interval_status_auto_join_count(old_status) < WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX);
209 }
210
211 /*
212 * work_interval_auto_join_decrement()
213 *
214 * Routine to decrement the auto-join counter when a thread unjoins the work interval (due to
215 * blocking or termination). If this was the last auto-joined thread in the work interval and
216 * there was a deferred finish, performs the finish operation for the work interval.
217 */
218 static void
219 work_interval_auto_join_decrement(struct work_interval *work_interval, thread_t thread)
220 {
221 struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
222 work_interval_auto_join_status_t old_status, new_status;
223 struct work_interval_deferred_finish_state deferred_finish_state;
224 bool perform_finish;
225
226 /* Update the auto-join count for the work interval atomically */
227 os_atomic_rmw_loop(&join_info->status, old_status, new_status, acquire, {
228 perform_finish = false;
229 new_status = old_status;
230 assert(work_interval_status_auto_join_count(old_status) > 0);
231 new_status -= 1;
232 if (new_status == WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) {
233 /* No auto-joined threads remaining and finish is deferred */
234 new_status = 0;
235 perform_finish = true;
236 /*
237 * It's important to copy the deferred finish state here so that this works
238 * when racing with another start-finish cycle.
239 */
240 deferred_finish_state = join_info->deferred_finish_state;
241 }
242 });
243
244 if (perform_finish == true) {
245 /*
246 * Since work_interval_perform_deferred_finish() calls down to
247 * the machine layer callout for finish which gets the thread
248 * group from the thread passed in here, it is important to
249 * make sure that the thread still has the work interval thread
250 * group here.
251 */
252 assert(thread->thread_group == work_interval->wi_group);
253 work_interval_perform_deferred_finish(&deferred_finish_state, work_interval, thread);
254 }
255 }
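/*
 * Worked example of the decrement path (illustrative): suppose two threads are
 * auto-joined and a finish has been deferred, so the packed status is
 * (WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK | 2).
 *
 *     decrement #1: status becomes (MASK | 1) -> a thread remains, no finish
 *     decrement #2: status becomes exactly MASK -> the last thread left with a
 *                   deferred finish pending, so the status is reset to 0 and
 *                   work_interval_perform_deferred_finish() is called with the
 *                   snapshot of deferred_finish_state taken inside the rmw loop.
 */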
256
257 /*
258 * work_interval_auto_join_enabled()
259 *
260 * Helper routine to check if work interval has auto-join enabled.
261 */
262 static inline bool
263 work_interval_auto_join_enabled(struct work_interval *work_interval)
264 {
265 return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) != 0;
266 }
267
268 /*
269 * work_interval_deferred_finish_enabled()
270 *
271 * Helper routine to check if work interval has deferred finish enabled.
272 */
273 static inline bool __unused
274 work_interval_deferred_finish_enabled(struct work_interval *work_interval)
275 {
276 return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) != 0;
277 }
278
279 #endif /* CONFIG_SCHED_AUTO_JOIN */
280
281 static inline void
282 work_interval_retain(struct work_interval *work_interval)
283 {
284 /*
285 * Even though work_interval_retain() is called under a port lock, we have
286 * to use os_ref_retain instead of os_ref_retain_locked
287 * because work_interval_release() is not synchronized; it calls
288 * os_ref_release, which is unsafe to pair with os_ref_retain_locked.
289 */
290 os_ref_retain(&work_interval->wi_ref_count);
291 }
292
293 static inline void
294 work_interval_deallocate(struct work_interval *work_interval)
295 {
296 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_DESTROY),
297 work_interval->wi_id);
298 #if CONFIG_THREAD_GROUPS
299 thread_group_release(work_interval->wi_group);
300 work_interval->wi_group = NULL;
301 #endif /* CONFIG_THREAD_GROUPS */
302 kfree(work_interval, sizeof(struct work_interval));
303 }
304
305 /*
306 * work_interval_release()
307 *
308 * Routine to release a refcount on the work interval. If the refcount drops to
309 * zero, the work interval needs to be deallocated.
310 *
311 * Non auto-join work intervals are deallocated in this context.
312 *
313 * For auto-join work intervals, the deallocation cannot be done from this context,
314 * since it might need the kernel memory allocator lock. In that case, the
315 * deallocation is done via a thread-call based MPSC queue.
316 */
317 static void
318 work_interval_release(struct work_interval *work_interval, __unused thread_work_interval_options_t options)
319 {
320 if (os_ref_release(&work_interval->wi_ref_count) == 0) {
321 #if CONFIG_SCHED_AUTO_JOIN
322 if (options & THREAD_WI_THREAD_LOCK_HELD) {
323 work_interval_deferred_release(work_interval);
324 } else {
325 work_interval_deallocate(work_interval);
326 }
327 #else /* CONFIG_SCHED_AUTO_JOIN */
328 work_interval_deallocate(work_interval);
329 #endif /* CONFIG_SCHED_AUTO_JOIN */
330 }
331 }
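/*
 * Illustrative usage (not part of the build; "wi" is a placeholder variable):
 * callers select the release path through the options they already hold.
 *
 *     // From a context where no thread lock is held, the interval is freed
 *     // inline if this was the last reference:
 *     work_interval_release(wi, THREAD_WI_THREAD_LOCK_NEEDED);
 *
 *     // From scheduler context with the thread lock held (auto-join unwind),
 *     // the final free is pushed to work_interval_deallocate_queue instead
 *     // (when CONFIG_SCHED_AUTO_JOIN is configured):
 *     work_interval_release(wi, THREAD_WI_THREAD_LOCK_HELD);
 */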
332
333 #if CONFIG_SCHED_AUTO_JOIN
334
335 /*
336 * work_interval_deferred_release()
337 *
338 * Routine to enqueue the work interval on the deallocation mpsc queue.
339 */
340 static void
341 work_interval_deferred_release(struct work_interval *work_interval)
342 {
343 mpsc_daemon_enqueue(&work_interval_deallocate_queue,
344 &work_interval->wi_deallocate_link, MPSC_QUEUE_NONE);
345 }
346
347 /*
348 * work_interval_should_propagate()
349 *
350 * Main policy routine to decide if a thread should be auto-joined to
351 * another thread's work interval. The conditions are arranged such that
352 * the most common bailout conditions are checked earliest. This routine
353 * is called from scheduler context, so it needs to be efficient and
354 * careful when taking locks or performing wakeups.
355 */
356 inline bool
357 work_interval_should_propagate(thread_t cthread, thread_t thread)
358 {
359 /* Only allow propagation if the current thread has a work interval and the woken up thread does not */
360 if ((cthread->th_work_interval == NULL) || (thread->th_work_interval != NULL)) {
361 return false;
362 }
363
364 /* Only propagate work intervals which have auto-join enabled */
365 if (work_interval_auto_join_enabled(cthread->th_work_interval) == false) {
366 return false;
367 }
368
369 /* Work interval propagation is enabled for realtime threads only */
370 if ((cthread->sched_mode != TH_MODE_REALTIME) || (thread->sched_mode != TH_MODE_REALTIME)) {
371 return false;
372 }
373
374
375 /* Work interval propagation only works for threads with the same home thread group */
376 struct thread_group *thread_home_tg = thread_group_get_home_group(thread);
377 if (thread_group_get_home_group(cthread) != thread_home_tg) {
378 return false;
379 }
380
381 /* If the woken-up thread has adopted a voucher with another thread group, it does not get propagation */
382 if (thread->thread_group != thread_home_tg) {
383 return false;
384 }
385
386 /* If either thread is inactive (in the termination path), do not propagate auto-join */
387 if ((!cthread->active) || (!thread->active)) {
388 return false;
389 }
390
391 return true;
392 }
393
394 /*
395 * work_interval_auto_join_propagate()
396 *
397 * Routine to auto-join a thread into another thread's work interval
398 *
399 * Should only be invoked if work_interval_should_propagate() returns
400 * true. Also expects "from" thread to be current thread and "to" thread
401 * to be locked.
402 */
403 void
404 work_interval_auto_join_propagate(thread_t from, thread_t to)
405 {
406 assert(from == current_thread());
407 work_interval_retain(from->th_work_interval);
408 work_interval_auto_join_increment(from->th_work_interval);
409 __assert_only kern_return_t kr = thread_set_work_interval(to, from->th_work_interval,
410 THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
411 assert(kr == KERN_SUCCESS);
412 }
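/*
 * Expected calling pattern, sketched for illustration (the actual call sites
 * live in the scheduler wakeup path, outside this file):
 *
 *     // "from" is the current (waking) thread, "to" is locked
 *     if (work_interval_should_propagate(from, to)) {
 *         work_interval_auto_join_propagate(from, to);
 *     }
 */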
413
414 /*
415 * work_interval_auto_join_unwind()
416 *
417 * Routine to un-join an auto-joined work interval for a thread that is blocking.
418 *
419 * Expects thread to be locked.
420 */
421 void
422 work_interval_auto_join_unwind(thread_t thread)
423 {
424 __assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
425 THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
426 assert(kr == KERN_SUCCESS);
427 }
428
429 /*
430 * work_interval_auto_join_demote()
431 *
432 * Routine to un-join an auto-joined work interval when a thread is changing from
433 * realtime to non-realtime scheduling mode. This could happen for several
434 * reasons, such as an RT failsafe, thread backgrounding, or thread termination. Also,
435 * the thread being demoted may not be the current thread.
436 *
437 * Expects thread to be locked.
438 */
439 void
440 work_interval_auto_join_demote(thread_t thread)
441 {
442 __assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
443 THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD);
444 assert(kr == KERN_SUCCESS);
445 }
446
447 static void
448 work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e,
449 __assert_only mpsc_daemon_queue_t dq)
450 {
451 struct work_interval *work_interval = NULL;
452 work_interval = mpsc_queue_element(e, struct work_interval, wi_deallocate_link);
453 assert(dq == &work_interval_deallocate_queue);
454 assert(os_ref_get_count(&work_interval->wi_ref_count) == 0);
455 work_interval_deallocate(work_interval);
456 }
457
458 #endif /* CONFIG_SCHED_AUTO_JOIN */
459
460 void
461 work_interval_subsystem_init(void)
462 {
463 #if CONFIG_SCHED_AUTO_JOIN
464 /*
465 * The work interval deallocation queue must be a thread-call based queue
466 * because it is woken up from contexts where the thread lock is held. The
467 * only way to perform wakeups safely in those contexts is to wake up a
468 * thread call, which is guaranteed to be on a different waitq and would
469 * not hash onto the same global waitq that might be currently locked.
470 */
471 mpsc_daemon_queue_init_with_thread_call(&work_interval_deallocate_queue,
472 work_interval_deallocate_queue_invoke, THREAD_CALL_PRIORITY_KERNEL);
473 #endif /* CONFIG_SCHED_AUTO_JOIN */
474 }
475
476 /*
477 * work_interval_port_convert
478 *
479 * Called with port locked, returns reference to work interval
480 * if indeed the port is a work interval kobject port
481 */
482 static struct work_interval *
483 work_interval_port_convert_locked(ipc_port_t port)
484 {
485 struct work_interval *work_interval = NULL;
486
487 if (!IP_VALID(port)) {
488 return NULL;
489 }
490
491 if (!ip_active(port)) {
492 return NULL;
493 }
494
495 if (IKOT_WORK_INTERVAL != ip_kotype(port)) {
496 return NULL;
497 }
498
499 work_interval = (struct work_interval *) ip_get_kobject(port);
500
501 work_interval_retain(work_interval);
502
503 return work_interval;
504 }
505
506 /*
507 * port_name_to_work_interval
508 *
509 * Description: Obtain a +1 reference to the work_interval associated with a given port name.
510 *
511 * Parameters: name           A Mach port name to translate.
512 *             work_interval  Out-parameter that receives the reference on success.
513 *
514 * Returns: KERN_SUCCESS on success; otherwise an error (invalid name, dead port, or a port that does not denote a work_interval).
515 */
516 static kern_return_t
517 port_name_to_work_interval(mach_port_name_t name,
518 struct work_interval **work_interval)
519 {
520 if (!MACH_PORT_VALID(name)) {
521 return KERN_INVALID_NAME;
522 }
523
524 ipc_port_t port = IPC_PORT_NULL;
525 kern_return_t kr = KERN_SUCCESS;
526
527 kr = ipc_port_translate_send(current_space(), name, &port);
528 if (kr != KERN_SUCCESS) {
529 return kr;
530 }
531 /* port is locked */
532
533 assert(IP_VALID(port));
534
535 struct work_interval *converted_work_interval;
536
537 converted_work_interval = work_interval_port_convert_locked(port);
538
539 /* the port is valid, but doesn't denote a work_interval */
540 if (converted_work_interval == NULL) {
541 kr = KERN_INVALID_CAPABILITY;
542 }
543
544 ip_unlock(port);
545
546 if (kr == KERN_SUCCESS) {
547 *work_interval = converted_work_interval;
548 }
549
550 return kr;
551 }
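/*
 * Typical lookup pattern for callers of port_name_to_work_interval(), as used
 * by kern_work_interval_get_flags_from_port() below ("wi" is a placeholder):
 * the out-parameter carries a +1 reference that the caller must drop with
 * work_interval_release().
 *
 *     struct work_interval *wi;
 *     kern_return_t kr = port_name_to_work_interval(port_name, &wi);
 *     if (kr == KERN_SUCCESS) {
 *         // ... use wi ...
 *         work_interval_release(wi, THREAD_WI_THREAD_LOCK_NEEDED);
 *     }
 */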
552
553
554 /*
555 * work_interval_port_notify
556 *
557 * Description: Handle a no-senders notification for a work interval port.
558 * Destroys the port and releases its reference on the work interval.
559 *
560 * Parameters: msg A Mach no-senders notification message.
561 *
562 * Note: This assumes that there is only one create-right-from-work-interval point;
563 * if the ability to extract another send right after creation is added,
564 * this will have to change to handle make-send counts correctly.
565 */
566 void
567 work_interval_port_notify(mach_msg_header_t *msg)
568 {
569 mach_no_senders_notification_t *notification = (void *)msg;
570 ipc_port_t port = notification->not_header.msgh_remote_port;
571 struct work_interval *work_interval = NULL;
572
573 if (!IP_VALID(port)) {
574 panic("work_interval_port_notify(): invalid port");
575 }
576
577 ip_lock(port);
578
579 if (!ip_active(port)) {
580 panic("work_interval_port_notify(): inactive port %p", port);
581 }
582
583 if (ip_kotype(port) != IKOT_WORK_INTERVAL) {
584 panic("work_interval_port_notify(): not the right kobject: %p, %d\n",
585 port, ip_kotype(port));
586 }
587
588 if (port->ip_mscount != notification->not_count) {
589 panic("work_interval_port_notify(): unexpected make-send count: %p, %d, %d",
590 port, port->ip_mscount, notification->not_count);
591 }
592
593 if (port->ip_srights != 0) {
594 panic("work_interval_port_notify(): unexpected send right count: %p, %d",
595 port, port->ip_srights);
596 }
597
598 work_interval = (struct work_interval *) ip_get_kobject(port);
599
600 if (work_interval == NULL) {
601 panic("work_interval_port_notify(): missing kobject: %p", port);
602 }
603
604 ipc_kobject_set_atomically(port, IKO_NULL, IKOT_NONE);
605
606 work_interval->wi_port = MACH_PORT_NULL;
607
608 ip_unlock(port);
609
610 ipc_port_dealloc_kernel(port);
611 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
612 }
613
614 /*
615 * work_interval_port_type()
616 *
617 * Converts a port name into the work interval object and returns its type.
618 *
619 * For invalid ports, it returns WORK_INTERVAL_TYPE_LAST (which is not a
620 * valid type for work intervals).
621 */
622 static uint32_t
623 work_interval_port_type(mach_port_name_t port_name)
624 {
625 struct work_interval *work_interval = NULL;
626 kern_return_t kr;
627 uint32_t work_interval_type;
628
629 if (port_name == MACH_PORT_NULL) {
630 return WORK_INTERVAL_TYPE_LAST;
631 }
632
633 kr = port_name_to_work_interval(port_name, &work_interval);
634 if (kr != KERN_SUCCESS) {
635 return WORK_INTERVAL_TYPE_LAST;
636 }
637 /* work_interval has a +1 ref */
638
639 assert(work_interval != NULL);
640 work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
641 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
642 return work_interval_type;
643 }
644
645
646 /*
647 * thread_set_work_interval()
648 *
649 * Changes the thread's bound work interval to the passed-in work interval.
650 * Consumes a +1 ref on work_interval upon success.
651 *
652 * A NULL work_interval un-sets the work interval on the thread.
653 * Any old work interval on the thread is released.
654 * Returns an error if the thread does not satisfy the requirements to join the work interval.
655 *
656 * For non auto-join work intervals, the old work interval is released in this context.
657 * For auto-join work intervals, the routine may wake up the work interval deferred
658 * deallocation queue, since thread locks might be currently held.
659 */
660 static kern_return_t
661 thread_set_work_interval(thread_t thread,
662 struct work_interval *work_interval, thread_work_interval_options_t options)
663 {
664 /* All explicit work interval operations should always be from the current thread */
665 if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
666 assert(thread == current_thread());
667 }
668
669 /* All cases of needing the thread lock should be from explicit join scenarios */
670 if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
671 assert((options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0);
672 }
673
674 /* For all cases of auto join must come in with the thread lock held */
675 if (options & THREAD_WI_AUTO_JOIN_POLICY) {
676 assert((options & THREAD_WI_THREAD_LOCK_HELD) != 0);
677 }
678
679 if (work_interval) {
680 uint32_t work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
681
682 if ((work_interval_type == WORK_INTERVAL_TYPE_COREAUDIO) &&
683 (thread->sched_mode != TH_MODE_REALTIME) && (thread->saved_mode != TH_MODE_REALTIME)) {
684 return KERN_INVALID_ARGUMENT;
685 }
686 }
687
688 struct work_interval *old_th_wi = thread->th_work_interval;
689 #if CONFIG_SCHED_AUTO_JOIN
690 bool old_wi_auto_joined = ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0);
691
692 spl_t s;
693 /* Take the thread lock if needed */
694 if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
695 s = splsched();
696 thread_lock(thread);
697 }
698
699 /*
700 * Work interval auto-join leak to non-RT threads.
701 *
702 * If the thread might be running on a remote core and it's not in the context-switch path (where
703 * the thread is neither running, blocked, nor on a runq), it's not possible to update the
704 * work interval & thread group remotely, since it's not possible to update CLPC for a remote
705 * core. This situation might happen when a thread is transitioning from realtime to
706 * non-realtime due to backgrounding etc., which would mean that non-RT threads would now
707 * be part of the work interval.
708 *
709 * Since there is no immediate mitigation for this issue, the policy is to set a new
710 * flag on the thread which indicates that such a "leak" has happened. This flag will
711 * be cleared when the remote thread eventually blocks and unjoins from the work interval.
712 */
713 bool thread_on_remote_core = ((thread != current_thread()) && (thread->state & TH_RUN) && (thread->runq == PROCESSOR_NULL));
714
715 if (thread_on_remote_core && ((options & THREAD_WI_THREAD_CTX_SWITCH) == 0)) {
716 assert((options & THREAD_WI_THREAD_LOCK_NEEDED) == 0);
717 os_atomic_or(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
718 return KERN_SUCCESS;
719 }
720
721 old_wi_auto_joined = ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0);
722
723 if ((options & THREAD_WI_AUTO_JOIN_POLICY) || old_wi_auto_joined) {
724 __kdebug_only uint64_t old_tg_id = (old_th_wi) ? thread_group_get_id(old_th_wi->wi_group) : ~0;
725 __kdebug_only uint64_t new_tg_id = (work_interval) ? thread_group_get_id(work_interval->wi_group) : ~0;
726 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_AUTO_JOIN),
727 thread_tid(thread), old_tg_id, new_tg_id, options);
728 }
729
730 if (old_wi_auto_joined) {
731 /*
732 * If thread was auto-joined to a work interval and is not realtime, make sure it
733 * happened due to the "leak" described above.
734 */
735 if (thread->sched_mode != TH_MODE_REALTIME) {
736 assert((thread->th_work_interval_flags & TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK) != 0);
737 }
738
739 os_atomic_andnot(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
740 work_interval_auto_join_decrement(old_th_wi, thread);
741 thread->sched_flags &= ~TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
742 }
743
744 #endif /* CONFIG_SCHED_AUTO_JOIN */
745
746 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CHANGE),
747 thread_tid(thread), (old_th_wi ? old_th_wi->wi_id : 0), (work_interval ? work_interval->wi_id : 0), !!(options & THREAD_WI_AUTO_JOIN_POLICY));
748
749 /* transfer +1 ref to thread */
750 thread->th_work_interval = work_interval;
751
752 #if CONFIG_SCHED_AUTO_JOIN
753
754 if ((options & THREAD_WI_AUTO_JOIN_POLICY) && work_interval) {
755 assert(work_interval_auto_join_enabled(work_interval) == true);
756 thread->sched_flags |= TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
757 }
758
759 if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
760 thread_unlock(thread);
761 splx(s);
762 }
763 #endif /* CONFIG_SCHED_AUTO_JOIN */
764
765 #if CONFIG_THREAD_GROUPS
766 struct thread_group *new_tg = (work_interval) ? (work_interval->wi_group) : NULL;
767 thread_set_work_interval_thread_group(thread, new_tg, (options & THREAD_WI_AUTO_JOIN_POLICY));
768 #endif /* CONFIG_THREAD_GROUPS */
769
770 if (old_th_wi != NULL) {
771 work_interval_release(old_th_wi, options);
772 }
773
774 return KERN_SUCCESS;
775 }
776
777 static kern_return_t
778 thread_set_work_interval_explicit_join(thread_t thread, struct work_interval *work_interval)
779 {
780 assert(thread == current_thread());
781 return thread_set_work_interval(thread, work_interval, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
782 }
783
784 kern_return_t
785 work_interval_thread_terminate(thread_t thread)
786 {
787 assert(thread == current_thread());
788 if (thread->th_work_interval != NULL) {
789 return thread_set_work_interval(thread, NULL, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
790 }
791 return KERN_SUCCESS;
792 }
793
794 kern_return_t
795 kern_work_interval_notify(thread_t thread, struct kern_work_interval_args* kwi_args)
796 {
797 assert(thread == current_thread());
798 assert(kwi_args->work_interval_id != 0);
799
800 struct work_interval *work_interval = thread->th_work_interval;
801
802 if (work_interval == NULL ||
803 work_interval->wi_id != kwi_args->work_interval_id) {
804 /* This thread must have adopted the work interval to be able to notify */
805 return KERN_INVALID_ARGUMENT;
806 }
807
808 task_t notifying_task = current_task();
809
810 if (work_interval->wi_creator_uniqueid != get_task_uniqueid(notifying_task) ||
811 work_interval->wi_creator_pidversion != get_task_version(notifying_task)) {
812 /* Only the creating task can do a notify */
813 return KERN_INVALID_ARGUMENT;
814 }
815
816 spl_t s = splsched();
817
818 #if CONFIG_THREAD_GROUPS
819 assert(work_interval->wi_group == thread->thread_group);
820 #endif /* CONFIG_THREAD_GROUPS */
821
822 uint64_t urgency_param1, urgency_param2;
823 kwi_args->urgency = (uint16_t)thread_get_urgency(thread, &urgency_param1, &urgency_param2);
824
825 splx(s);
826
827 /* called without interrupts disabled */
828 machine_work_interval_notify(thread, kwi_args);
829
830 return KERN_SUCCESS;
831 }
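/*
 * Note on the permission gate above (for illustration): the work interval port
 * can be sent to another task and a thread there can join it, but only threads
 * of the creating task pass the wi_creator_uniqueid / wi_creator_pidversion
 * check, so only the creator can drive notifications for the interval.
 */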
832
833 /* Start at 1; 0 is not a valid work interval ID */
834 static _Atomic uint64_t unique_work_interval_id = 1;
835
836 kern_return_t
837 kern_work_interval_create(thread_t thread,
838 struct kern_work_interval_create_args *create_params)
839 {
840 assert(thread == current_thread());
841
842 uint32_t create_flags = create_params->wica_create_flags;
843
844 if (((create_flags & WORK_INTERVAL_FLAG_JOINABLE) == 0) &&
845 thread->th_work_interval != NULL) {
846 /*
847 * If the thread is doing a legacy combined create and join,
848 * it shouldn't already be part of a work interval.
849 *
850 * (Creating a joinable WI is allowed anytime.)
851 */
852 return KERN_FAILURE;
853 }
854
855 /*
856 * Check the validity of the create flags before allocating the work
857 * interval.
858 */
859 task_t creating_task = current_task();
860 if ((create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_CLIENT) {
861 /*
862 * CA_CLIENT work intervals do not create new thread groups.
863 * There can only be one CA_CLIENT work interval (created by UIKit or AppKit)
864 * per application task.
865 */
866 if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
867 return KERN_FAILURE;
868 }
869 if (!task_is_app(creating_task)) {
870 #if XNU_TARGET_OS_OSX
871 /*
872 * Soft-fail the case of a non-app pretending to be an
873 * app, by allowing it to press the buttons, but they're
874 * not actually connected to anything.
875 */
876 create_flags |= WORK_INTERVAL_FLAG_IGNORED;
877 #else
878 /*
879 * On iOS, it's a hard failure to get your apptype
880 * wrong and then try to render something.
881 */
882 return KERN_NOT_SUPPORTED;
883 #endif /* XNU_TARGET_OS_OSX */
884 }
885 if (task_set_ca_client_wi(creating_task, true) == false) {
886 return KERN_FAILURE;
887 }
888 }
889
890 #if CONFIG_SCHED_AUTO_JOIN
891 if (create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) {
892 uint32_t type = (create_flags & WORK_INTERVAL_TYPE_MASK);
893 if (type != WORK_INTERVAL_TYPE_COREAUDIO) {
894 return KERN_NOT_SUPPORTED;
895 }
896 if ((create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
897 return KERN_NOT_SUPPORTED;
898 }
899 }
900
901 if (create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) {
902 if ((create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) == 0) {
903 return KERN_NOT_SUPPORTED;
904 }
905 }
906 #endif /* CONFIG_SCHED_AUTO_JOIN */
907
908 struct work_interval *work_interval = kalloc_flags(sizeof(*work_interval),
909 Z_WAITOK | Z_ZERO);
910 assert(work_interval != NULL);
911
912 uint64_t work_interval_id = os_atomic_inc(&unique_work_interval_id, relaxed);
913
914 *work_interval = (struct work_interval) {
915 .wi_id = work_interval_id,
916 .wi_ref_count = {},
917 .wi_create_flags = create_flags,
918 .wi_creator_pid = pid_from_task(creating_task),
919 .wi_creator_uniqueid = get_task_uniqueid(creating_task),
920 .wi_creator_pidversion = get_task_version(creating_task),
921 };
922 os_ref_init(&work_interval->wi_ref_count, NULL);
923
924 __kdebug_only uint64_t tg_id = 0;
925 #if CONFIG_THREAD_GROUPS
926 struct thread_group *tg;
927 if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
928 /* create a new group for the interval to represent */
929 char name[THREAD_GROUP_MAXNAME] = "";
930
931 snprintf(name, sizeof(name), "WI[%d] #%lld",
932 work_interval->wi_creator_pid, work_interval_id);
933
934 tg = thread_group_create_and_retain();
935
936 thread_group_set_name(tg, name);
937
938 work_interval->wi_group = tg;
939 } else {
940 /* the interval represents the thread's home group */
941 tg = thread_group_get_home_group(thread);
942
943 thread_group_retain(tg);
944
945 work_interval->wi_group = tg;
946 }
947
948 /* Capture the tg_id for tracing purposes */
949 tg_id = thread_group_get_id(work_interval->wi_group);
950
951 #endif /* CONFIG_THREAD_GROUPS */
952
953 if (create_flags & WORK_INTERVAL_FLAG_JOINABLE) {
954 mach_port_name_t name = MACH_PORT_NULL;
955
956 /* work_interval has a +1 ref, moves to the port */
957 work_interval->wi_port = ipc_kobject_alloc_port(
958 (ipc_kobject_t)work_interval, IKOT_WORK_INTERVAL,
959 IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);
960
961 name = ipc_port_copyout_send(work_interval->wi_port, current_space());
962
963 if (!MACH_PORT_VALID(name)) {
964 /*
965 * copyout failed (port is already deallocated)
966 * Because of the port-destroyed magic,
967 * the work interval is already deallocated too.
968 */
969 return KERN_RESOURCE_SHORTAGE;
970 }
971
972 create_params->wica_port = name;
973 } else {
974 /* work_interval has a +1 ref, moves to the thread */
975 kern_return_t kr = thread_set_work_interval_explicit_join(thread, work_interval);
976 if (kr != KERN_SUCCESS) {
977 /* No other thread can join this work interval since it isn't
978 * JOINABLE, so release the reference on the work interval. */
979 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
980 return kr;
981 }
982 create_params->wica_port = MACH_PORT_NULL;
983 }
984
985 create_params->wica_id = work_interval_id;
986
987 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
988 work_interval_id, create_flags, pid_from_task(creating_task), tg_id);
989 return KERN_SUCCESS;
990 }
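/*
 * Illustrative end-to-end flow for a joinable interval, showing how the pieces
 * in this file fit together (a sketch only; real callers marshal these
 * arguments from the work-interval syscall layer outside this file, and
 * kern_work_interval_create_args has more fields than are shown here):
 *
 *     struct kern_work_interval_create_args args = {
 *         .wica_create_flags = WORK_INTERVAL_TYPE_COREAUDIO |
 *             WORK_INTERVAL_FLAG_JOINABLE | WORK_INTERVAL_FLAG_GROUP,
 *     };
 *     kern_work_interval_create(current_thread(), &args);
 *     // args.wica_port now names a send right in the creating task's space;
 *     // a thread holding that port name can join the interval:
 *     kern_work_interval_join(current_thread(), args.wica_port);
 */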
991
992 kern_return_t
993 kern_work_interval_get_flags_from_port(mach_port_name_t port_name, uint32_t *flags)
994 {
995 assert(flags != NULL);
996
997 kern_return_t kr;
998 struct work_interval *work_interval;
999
1000 kr = port_name_to_work_interval(port_name, &work_interval);
1001 if (kr != KERN_SUCCESS) {
1002 return kr;
1003 }
1004
1005 assert(work_interval != NULL);
1006 *flags = work_interval->wi_create_flags;
1007
1008 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1009
1010 return KERN_SUCCESS;
1011 }
1012
1013
1014 kern_return_t
1015 kern_work_interval_destroy(thread_t thread, uint64_t work_interval_id)
1016 {
1017 if (work_interval_id == 0) {
1018 return KERN_INVALID_ARGUMENT;
1019 }
1020
1021 if (thread->th_work_interval == NULL ||
1022 thread->th_work_interval->wi_id != work_interval_id) {
1023 /* work ID isn't valid or doesn't match joined work interval ID */
1024 return KERN_INVALID_ARGUMENT;
1025 }
1026
1027 return thread_set_work_interval_explicit_join(thread, NULL);
1028 }
1029
1030 kern_return_t
1031 kern_work_interval_join(thread_t thread,
1032 mach_port_name_t port_name)
1033 {
1034 struct work_interval *work_interval = NULL;
1035 kern_return_t kr;
1036
1037 if (port_name == MACH_PORT_NULL) {
1038 /* 'Un-join' the current work interval */
1039 return thread_set_work_interval_explicit_join(thread, NULL);
1040 }
1041
1042 kr = port_name_to_work_interval(port_name, &work_interval);
1043 if (kr != KERN_SUCCESS) {
1044 return kr;
1045 }
1046 /* work_interval has a +1 ref */
1047
1048 assert(work_interval != NULL);
1049
1050 kr = thread_set_work_interval_explicit_join(thread, work_interval);
1051 /* ref was consumed by passing it to the thread in the successful case */
1052 if (kr != KERN_SUCCESS) {
1053 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1054 }
1055 return kr;
1056 }
1057
1058 /*
1059 * work_interval_port_type_render_server()
1060 *
1061 * Helper routine to determine if the port points to a
1062 * WORK_INTERVAL_TYPE_CA_RENDER_SERVER work interval.
1063 */
1064 bool
1065 work_interval_port_type_render_server(mach_port_name_t port_name)
1066 {
1067 return work_interval_port_type(port_name) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER;
1068 }