]> git.saurik.com Git - apple/xnu.git/blob - osfmk/kern/work_interval.c
xnu-7195.101.1.tar.gz
[apple/xnu.git] / osfmk / kern / work_interval.c
1 /*
2 * Copyright (c) 2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29
30 #include <sys/work_interval.h>
31
32 #include <kern/work_interval.h>
33
34 #include <kern/thread.h>
35 #include <kern/sched_prim.h>
36 #include <kern/machine.h>
37 #include <kern/thread_group.h>
38 #include <kern/ipc_kobject.h>
39 #include <kern/task.h>
40 #include <kern/coalition.h>
41 #include <kern/policy_internal.h>
42 #include <kern/mpsc_queue.h>
43
44 #include <mach/kern_return.h>
45 #include <mach/notify.h>
46 #include <os/refcnt.h>
47
48 #include <stdatomic.h>
49
50 /*
51 * With the introduction of auto-join work intervals, it is possible
52 * to change the work interval (and related thread group) of a thread in a
53 * variety of contexts (thread termination, context switch, thread mode
54 * change etc.). In order to clearly specify the policy expectation and
55 * the locking behavior, all calls to thread_set_work_interval() pass
56 * in a set of flags.
57 */
58
/*
 * Flags passed on every thread_set_work_interval() call; they encode both
 * the join policy (explicit vs. auto-join) and the locking contract the
 * caller adheres to (thread lock already held vs. must be taken).
 */
__options_decl(thread_work_interval_options_t, uint32_t, {
	/* Change the work interval using the explicit join rules */
	THREAD_WI_EXPLICIT_JOIN_POLICY = 0x1,
	/* Change the work interval using the auto-join rules */
	THREAD_WI_AUTO_JOIN_POLICY = 0x2,
	/* Caller already holds the thread lock */
	THREAD_WI_THREAD_LOCK_HELD = 0x4,
	/* Caller does not hold the thread lock */
	THREAD_WI_THREAD_LOCK_NEEDED = 0x8,
	/* Change the work interval from the context switch path (thread may not be running or on a runq) */
	THREAD_WI_THREAD_CTX_SWITCH = 0x10,
});
71
72 static kern_return_t thread_set_work_interval(thread_t, struct work_interval *, thread_work_interval_options_t);
73
74 #if CONFIG_SCHED_AUTO_JOIN
/*
 * MPSC queue used to defer deallocate work intervals. Deallocation may be
 * requested from contexts that hold the thread lock (see
 * work_interval_release()), where freeing memory directly is not safe.
 */
static struct mpsc_daemon_queue work_interval_deallocate_queue;

/* Enqueues a work interval onto work_interval_deallocate_queue */
static void work_interval_deferred_release(struct work_interval *);
79
80 /*
81 * Work Interval Auto-Join Status
82 *
83 * work_interval_auto_join_status_t represents the state of auto-join for a given work interval.
84 * It packs the following information:
85 * - A bit representing if a "finish" is deferred on the work interval
86 * - Count of number of threads auto-joined to the work interval
87 */
/*
 * The status word packs the deferred-finish flag into bit 31 and the
 * auto-join thread count into bits 0-30.
 *
 * Note: the mask uses an unsigned shift (1u << 31); the previous
 * (1 << 31) left-shifts a signed int into its sign bit, which is
 * undefined behavior in C (CERT INT34-C).
 */
#define WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK ((uint32_t)(1u << 31))
#define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK ((uint32_t)(WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK - 1))
#define WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK
typedef uint32_t work_interval_auto_join_status_t;
92
93 static inline bool __unused
94 work_interval_status_deferred_finish(work_interval_auto_join_status_t status)
95 {
96 return (status & WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) ? true : false;
97 }
98
99 static inline uint32_t __unused
100 work_interval_status_auto_join_count(work_interval_auto_join_status_t status)
101 {
102 return (uint32_t)(status & WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MASK);
103 }
104
/*
 * struct work_interval_deferred_finish_state
 *
 * Contains the parameters of the finish operation which is being deferred.
 * A consistent snapshot of these fields is taken (inside the status rmw
 * loop in work_interval_auto_join_decrement()) before they are consumed,
 * so a racing start-finish cycle may safely overwrite the live copy.
 */
struct work_interval_deferred_finish_state {
	uint64_t instance_id;
	uint64_t start;
	uint64_t deadline;
	uint64_t complexity;
};

/*
 * Auto-join bookkeeping embedded in each work interval: the deferred
 * finish parameters plus the packed atomic status word (deferred-finish
 * bit and auto-join thread count -- see the masks above).
 */
struct work_interval_auto_join_info {
	struct work_interval_deferred_finish_state deferred_finish_state;
	work_interval_auto_join_status_t _Atomic status;
};
121 #endif /* CONFIG_SCHED_AUTO_JOIN */
122
123 /*
124 * Work Interval structs
125 *
126 * This struct represents a thread group and/or work interval context
127 * in a mechanism that is represented with a kobject.
128 *
129 * Every thread that has joined a WI has a +1 ref, and the port
130 * has a +1 ref as well.
131 *
132 * TODO: groups need to have a 'is for WI' flag
133 * and they need a flag to create that says 'for WI'
134 * This would allow CLPC to avoid allocating WI support
135 * data unless it is needed
136 *
137 * TODO: Enforce not having more than one non-group joinable work
138 * interval per thread group.
139 * CLPC only wants to see one WI-notify callout per group.
140 */
141
struct work_interval {
	uint64_t wi_id;                /* unique ID assigned at creation; 0 is never a valid ID */
	struct os_refcnt wi_ref_count; /* each joined thread holds +1, the port holds +1 */
	uint32_t wi_create_flags;      /* WORK_INTERVAL_FLAG_* | type, as passed at create time */

	/* for debugging purposes only, does not hold a ref on port */
	ipc_port_t wi_port;

	/*
	 * holds uniqueid and version of creating process,
	 * used to permission-gate notify
	 * TODO: you'd think there would be a better way to do this
	 */
	uint64_t wi_creator_uniqueid;
	uint32_t wi_creator_pid;
	int wi_creator_pidversion;

#if CONFIG_THREAD_GROUPS
	struct thread_group *wi_group; /* holds +1 ref on group */
#endif /* CONFIG_THREAD_GROUPS */

#if CONFIG_SCHED_AUTO_JOIN
	/* Information related to auto-join and deferred finish for work interval */
	struct work_interval_auto_join_info wi_auto_join_info;

	/*
	 * Since the deallocation of auto-join work intervals
	 * can happen in the scheduler when the last thread in
	 * the WI blocks and the thread lock is held, the deallocation
	 * might have to be done on a separate thread.
	 */
	struct mpsc_queue_chain wi_deallocate_link;
#endif /* CONFIG_SCHED_AUTO_JOIN */
};
176
177 #if CONFIG_SCHED_AUTO_JOIN
178
/*
 * work_interval_perform_deferred_finish()
 *
 * Perform a deferred finish for a work interval. The routine accepts the deferred_finish_state as an
 * argument rather than looking at the work_interval since the deferred finish can race with another
 * start-finish cycle. To address that, the caller ensures that it gets a consistent snapshot of the
 * deferred state before calling this routine. This allows the racing start-finish cycle to overwrite
 * the deferred state without issues.
 *
 * In this configuration the body only emits a tracepoint;
 * deferred_finish_state is not consumed here. NOTE(review): the __unused
 * annotations presumably cover builds where KDBG compiles away — confirm.
 */
static inline void
work_interval_perform_deferred_finish(__unused struct work_interval_deferred_finish_state *deferred_finish_state,
    __unused struct work_interval *work_interval, __unused thread_t thread)
{

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_DEFERRED_FINISH),
	    thread_tid(thread), thread_group_get_id(work_interval->wi_group));
}
196
197 /*
198 * work_interval_auto_join_increment()
199 *
200 * Routine to increment auto-join counter when a new thread is auto-joined to
201 * the work interval.
202 */
203 static void
204 work_interval_auto_join_increment(struct work_interval *work_interval)
205 {
206 struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
207 __assert_only work_interval_auto_join_status_t old_status = os_atomic_add_orig(&join_info->status, 1, relaxed);
208 assert(work_interval_status_auto_join_count(old_status) < WORK_INTERVAL_STATUS_AUTO_JOIN_COUNT_MAX);
209 }
210
/*
 * work_interval_auto_join_decrement()
 *
 * Routine to decrement the auto-join counter when a thread unjoins the work interval (due to
 * blocking or termination). If this was the last auto-joined thread in the work interval and
 * there was a deferred finish, performs the finish operation for the work interval.
 */
static void
work_interval_auto_join_decrement(struct work_interval *work_interval, thread_t thread)
{
	struct work_interval_auto_join_info *join_info = &work_interval->wi_auto_join_info;
	work_interval_auto_join_status_t old_status, new_status;
	struct work_interval_deferred_finish_state deferred_finish_state;
	bool perform_finish;

	/*
	 * Update the auto-join count for the work interval atomically.
	 * The loop body may run more than once on contention, so
	 * perform_finish/new_status are re-derived each attempt, and the
	 * deferred-state snapshot is taken inside the loop so it is
	 * consistent with the status value that ultimately wins the CAS.
	 */
	os_atomic_rmw_loop(&join_info->status, old_status, new_status, acquire, {
		perform_finish = false;
		new_status = old_status;
		assert(work_interval_status_auto_join_count(old_status) > 0);
		new_status -= 1;
		if (new_status == WORK_INTERVAL_STATUS_DEFERRED_FINISH_MASK) {
			/* No auto-joined threads remaining and finish is deferred */
			new_status = 0;
			perform_finish = true;
			/*
			 * Its important to copy the deferred finish state here so that this works
			 * when racing with another start-finish cycle.
			 */
			deferred_finish_state = join_info->deferred_finish_state;
		}
	});

	if (perform_finish == true) {
		/*
		 * Since work_interval_perform_deferred_finish() calls down to
		 * the machine layer callout for finish which gets the thread
		 * group from the thread passed in here, it is important to
		 * make sure that the thread still has the work interval thread
		 * group here.
		 */
		assert(thread->thread_group == work_interval->wi_group);
		work_interval_perform_deferred_finish(&deferred_finish_state, work_interval, thread);
	}
}
256
257 /*
258 * work_interval_auto_join_enabled()
259 *
260 * Helper routine to check if work interval has auto-join enabled.
261 */
262 static inline bool
263 work_interval_auto_join_enabled(struct work_interval *work_interval)
264 {
265 return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) != 0;
266 }
267
268 /*
269 * work_interval_deferred_finish_enabled()
270 *
271 * Helper routine to check if work interval has deferred finish enabled.
272 */
273 static inline bool __unused
274 work_interval_deferred_finish_enabled(struct work_interval *work_interval)
275 {
276 return (work_interval->wi_create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) != 0;
277 }
278
279 #endif /* CONFIG_SCHED_AUTO_JOIN */
280
/*
 * work_interval_retain()
 *
 * Takes a +1 ref on the work interval; paired with work_interval_release().
 */
static inline void
work_interval_retain(struct work_interval *work_interval)
{
	/*
	 * Even though wi_retain is called under a port lock, we have
	 * to use os_ref_retain instead of os_ref_retain_locked
	 * because wi_release is not synchronized. wi_release calls
	 * os_ref_release which is unsafe to pair with os_ref_retain_locked.
	 */
	os_ref_retain(&work_interval->wi_ref_count);
}
292
/*
 * work_interval_deallocate()
 *
 * Final teardown once the last reference is gone: emits the destroy
 * tracepoint, drops the +1 thread group ref held since creation, and
 * frees the struct. Must not be called with the thread lock held since
 * kfree() may need the allocator lock -- contexts holding the thread
 * lock go through work_interval_deferred_release() instead (see
 * work_interval_release()).
 */
static inline void
work_interval_deallocate(struct work_interval *work_interval)
{
	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_DESTROY),
	    work_interval->wi_id);
#if CONFIG_THREAD_GROUPS
	thread_group_release(work_interval->wi_group);
	work_interval->wi_group = NULL;
#endif /* CONFIG_THREAD_GROUPS */
	kfree(work_interval, sizeof(struct work_interval));
}
304
/*
 * work_interval_release()
 *
 * Routine to release a ref count on the work interval. If the refcount goes down
 * to zero, the work interval needs to be de-allocated.
 *
 * For non auto-join work intervals, they are de-allocated in this context.
 *
 * For auto-join work intervals, the de-allocation cannot be done from this context
 * since that might need the kernel memory allocator lock. In that case, the
 * deallocation is done via a thread-call based mpsc queue.
 */
static void
work_interval_release(struct work_interval *work_interval, __unused thread_work_interval_options_t options)
{
	if (os_ref_release(&work_interval->wi_ref_count) == 0) {
#if CONFIG_SCHED_AUTO_JOIN
		/*
		 * THREAD_WI_THREAD_LOCK_HELD means we may be on a scheduler
		 * path; defer the free to the mpsc daemon queue in that case.
		 */
		if (options & THREAD_WI_THREAD_LOCK_HELD) {
			work_interval_deferred_release(work_interval);
		} else {
			work_interval_deallocate(work_interval);
		}
#else /* CONFIG_SCHED_AUTO_JOIN */
		work_interval_deallocate(work_interval);
#endif /* CONFIG_SCHED_AUTO_JOIN */
	}
}
332
333 #if CONFIG_SCHED_AUTO_JOIN
334
/*
 * work_interval_deferred_release()
 *
 * Routine to enqueue the work interval on the deallocation mpsc queue.
 * Used when the releasing context may hold the thread lock and therefore
 * cannot call work_interval_deallocate() directly; the queue's thread
 * call performs the actual free (work_interval_deallocate_queue_invoke).
 */
static void
work_interval_deferred_release(struct work_interval *work_interval)
{
	mpsc_daemon_enqueue(&work_interval_deallocate_queue,
	    &work_interval->wi_deallocate_link, MPSC_QUEUE_NONE);
}
346
/*
 * work_interval_should_propagate()
 *
 * Main policy routine to decide if a thread should be auto-joined to
 * another thread's work interval. The conditions are arranged such that
 * the most common bailout condition are checked the earliest. This routine
 * is called from the scheduler context; so it needs to be efficient and
 * be careful when taking locks or performing wakeups.
 *
 * "cthread" is the current (waking) thread, "thread" the thread being
 * woken; returns true only if every propagation condition holds. Do not
 * reorder the checks casually -- the ordering is a performance decision.
 */
inline bool
work_interval_should_propagate(thread_t cthread, thread_t thread)
{
	/* Only allow propagation if the current thread has a work interval and the woken up thread does not */
	if ((cthread->th_work_interval == NULL) || (thread->th_work_interval != NULL)) {
		return false;
	}

	/* Only propagate work intervals which have auto-join enabled */
	if (work_interval_auto_join_enabled(cthread->th_work_interval) == false) {
		return false;
	}

	/* Work interval propagation is enabled for realtime threads only */
	if ((cthread->sched_mode != TH_MODE_REALTIME) || (thread->sched_mode != TH_MODE_REALTIME)) {
		return false;
	}


	/* Work interval propagation only works for threads with the same home thread group */
	struct thread_group *thread_home_tg = thread_group_get_home_group(thread);
	if (thread_group_get_home_group(cthread) != thread_home_tg) {
		return false;
	}

	/* If woken up thread has adopted vouchers and other thread groups, it does not get propagation */
	if (thread->thread_group != thread_home_tg) {
		return false;
	}

	/* If either thread is inactive (in the termination path), do not propagate auto-join */
	if ((!cthread->active) || (!thread->active)) {
		return false;
	}

	return true;
}
393
/*
 * work_interval_auto_join_propagate()
 *
 * Routine to auto-join a thread into another thread's work interval
 *
 * Should only be invoked if work_interval_should_propagate() returns
 * true. Also expects "from" thread to be current thread and "to" thread
 * to be locked.
 *
 * Takes a +1 ref and bumps the auto-join count on behalf of "to"; the
 * ref is consumed by thread_set_work_interval() and both are undone when
 * "to" later unjoins (unwind/demote paths via
 * work_interval_auto_join_decrement()).
 */
void
work_interval_auto_join_propagate(thread_t from, thread_t to)
{
	assert(from == current_thread());
	work_interval_retain(from->th_work_interval);
	work_interval_auto_join_increment(from->th_work_interval);
	__assert_only kern_return_t kr = thread_set_work_interval(to, from->th_work_interval,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
	assert(kr == KERN_SUCCESS);
}
413
/*
 * work_interval_auto_join_unwind()
 *
 * Routine to un-join an auto-joined work interval for a thread that is blocking.
 *
 * Expects thread to be locked. Passing NULL clears the interval; the
 * auto-join count/ref taken at propagation time are dropped inside
 * thread_set_work_interval().
 */
void
work_interval_auto_join_unwind(thread_t thread)
{
	__assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD | THREAD_WI_THREAD_CTX_SWITCH);
	assert(kr == KERN_SUCCESS);
}
428
/*
 * work_interval_auto_join_demote()
 *
 * Routine to un-join an auto-joined work interval when a thread is changing from
 * realtime to non-realtime scheduling mode. This could happen due to multiple
 * reasons such as RT failsafe, thread backgrounding or thread termination. Also,
 * the thread being demoted may not be the current thread.
 *
 * Expects thread to be locked. Unlike the unwind path, this does not pass
 * THREAD_WI_THREAD_CTX_SWITCH, so thread_set_work_interval() may detect a
 * remote-core "leak" and defer the unjoin (see the leak handling there).
 */
void
work_interval_auto_join_demote(thread_t thread)
{
	__assert_only kern_return_t kr = thread_set_work_interval(thread, NULL,
	    THREAD_WI_AUTO_JOIN_POLICY | THREAD_WI_THREAD_LOCK_HELD);
	assert(kr == KERN_SUCCESS);
}
446
447 static void
448 work_interval_deallocate_queue_invoke(mpsc_queue_chain_t e,
449 __assert_only mpsc_daemon_queue_t dq)
450 {
451 struct work_interval *work_interval = NULL;
452 work_interval = mpsc_queue_element(e, struct work_interval, wi_deallocate_link);
453 assert(dq == &work_interval_deallocate_queue);
454 assert(os_ref_get_count(&work_interval->wi_ref_count) == 0);
455 work_interval_deallocate(work_interval);
456 }
457
458 #endif /* CONFIG_SCHED_AUTO_JOIN */
459
/*
 * work_interval_subsystem_init()
 *
 * One-time initialization of work interval global state (currently just
 * the deferred-deallocation queue when auto-join is configured).
 */
void
work_interval_subsystem_init(void)
{
#if CONFIG_SCHED_AUTO_JOIN
	/*
	 * The work interval deallocation queue must be a thread call based queue
	 * because it is woken up from contexts where the thread lock is held. The
	 * only way to perform wakeups safely in those contexts is to wakeup a
	 * thread call which is guaranteed to be on a different waitq and would
	 * not hash onto the same global waitq which might be currently locked.
	 */
	mpsc_daemon_queue_init_with_thread_call(&work_interval_deallocate_queue,
	    work_interval_deallocate_queue_invoke, THREAD_CALL_PRIORITY_KERNEL);
#endif /* CONFIG_SCHED_AUTO_JOIN */
}
475
476 /*
477 * work_interval_port_convert
478 *
479 * Called with port locked, returns reference to work interval
480 * if indeed the port is a work interval kobject port
481 */
482 static struct work_interval *
483 work_interval_port_convert_locked(ipc_port_t port)
484 {
485 struct work_interval *work_interval = NULL;
486
487 if (!IP_VALID(port)) {
488 return NULL;
489 }
490
491 if (!ip_active(port)) {
492 return NULL;
493 }
494
495 if (IKOT_WORK_INTERVAL != ip_kotype(port)) {
496 return NULL;
497 }
498
499 work_interval = (struct work_interval *) ip_get_kobject(port);
500
501 work_interval_retain(work_interval);
502
503 return work_interval;
504 }
505
/*
 * port_name_to_work_interval
 *
 * Description: Obtain a reference to the work_interval associated with a given port.
 *
 * Parameters: name          A Mach port name to translate.
 *             work_interval Out-param; on KERN_SUCCESS holds a +1 ref.
 *
 * Returns: KERN_INVALID_NAME        name is not a valid port name.
 *          KERN_INVALID_CAPABILITY  The port does not denote a work_interval.
 *          KERN_SUCCESS             *work_interval set, with a +1 reference.
 *          (or any failure returned by ipc_port_translate_send())
 */
static kern_return_t
port_name_to_work_interval(mach_port_name_t name,
    struct work_interval **work_interval)
{
	if (!MACH_PORT_VALID(name)) {
		return KERN_INVALID_NAME;
	}

	ipc_port_t port = IPC_PORT_NULL;
	kern_return_t kr = KERN_SUCCESS;

	kr = ipc_port_translate_send(current_space(), name, &port);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* port is locked */

	assert(IP_VALID(port));

	struct work_interval *converted_work_interval;

	converted_work_interval = work_interval_port_convert_locked(port);

	/* the port is valid, but doesn't denote a work_interval */
	if (converted_work_interval == NULL) {
		kr = KERN_INVALID_CAPABILITY;
	}

	/* Drop the port lock taken by ipc_port_translate_send() */
	ip_unlock(port);

	if (kr == KERN_SUCCESS) {
		*work_interval = converted_work_interval;
	}

	return kr;
}
552
553
554 /*
555 * work_interval_port_notify
556 *
557 * Description: Handle a no-senders notification for a work interval port.
558 * Destroys the port and releases its reference on the work interval.
559 *
560 * Parameters: msg A Mach no-senders notification message.
561 *
562 * Note: This assumes that there is only one create-right-from-work-interval point,
563 * if the ability to extract another send right after creation is added,
564 * this will have to change to handle make-send counts correctly.
565 */
566 void
567 work_interval_port_notify(mach_msg_header_t *msg)
568 {
569 mach_no_senders_notification_t *notification = (void *)msg;
570 ipc_port_t port = notification->not_header.msgh_remote_port;
571 struct work_interval *work_interval = NULL;
572
573 if (!IP_VALID(port)) {
574 panic("work_interval_port_notify(): invalid port");
575 }
576
577 ip_lock(port);
578
579 if (!ip_active(port)) {
580 panic("work_interval_port_notify(): inactive port %p", port);
581 }
582
583 if (ip_kotype(port) != IKOT_WORK_INTERVAL) {
584 panic("work_interval_port_notify(): not the right kobject: %p, %d\n",
585 port, ip_kotype(port));
586 }
587
588 if (port->ip_mscount != notification->not_count) {
589 panic("work_interval_port_notify(): unexpected make-send count: %p, %d, %d",
590 port, port->ip_mscount, notification->not_count);
591 }
592
593 if (port->ip_srights != 0) {
594 panic("work_interval_port_notify(): unexpected send right count: %p, %d",
595 port, port->ip_srights);
596 }
597
598 work_interval = (struct work_interval *) ip_get_kobject(port);
599
600 if (work_interval == NULL) {
601 panic("work_interval_port_notify(): missing kobject: %p", port);
602 }
603
604 ipc_kobject_set_atomically(port, IKO_NULL, IKOT_NONE);
605
606 work_interval->wi_port = MACH_PORT_NULL;
607
608 ip_unlock(port);
609
610 ipc_port_dealloc_kernel(port);
611 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
612 }
613
614 /*
615 * work_interval_port_type()
616 *
617 * Converts a port name into the work interval object and returns its type.
618 *
619 * For invalid ports, it returns WORK_INTERVAL_TYPE_LAST (which is not a
620 * valid type for work intervals).
621 */
622 static uint32_t
623 work_interval_port_type(mach_port_name_t port_name)
624 {
625 struct work_interval *work_interval = NULL;
626 kern_return_t kr;
627 uint32_t work_interval_type;
628
629 if (port_name == MACH_PORT_NULL) {
630 return WORK_INTERVAL_TYPE_LAST;
631 }
632
633 kr = port_name_to_work_interval(port_name, &work_interval);
634 if (kr != KERN_SUCCESS) {
635 return WORK_INTERVAL_TYPE_LAST;
636 }
637 /* work_interval has a +1 ref */
638
639 assert(work_interval != NULL);
640 work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;
641 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
642 return work_interval_type;
643 }
644
645
/*
 * thread_set_work_interval()
 *
 * Change thread's bound work interval to the passed-in work interval
 * Consumes +1 ref on work_interval upon success.
 *
 * May also pass NULL to un-set work_interval on the thread
 * Will deallocate any old work interval on the thread
 * Return error if thread does not satisfy requirements to join work interval
 *
 * For non auto-join work intervals, deallocate any old work interval on the thread
 * For auto-join work intervals, the routine may wakeup the work interval deferred
 * deallocation queue since thread locks might be currently held.
 */
static kern_return_t
thread_set_work_interval(thread_t thread,
    struct work_interval *work_interval, thread_work_interval_options_t options)
{
	/* All explicit work interval operations should always be from the current thread */
	if (options & THREAD_WI_EXPLICIT_JOIN_POLICY) {
		assert(thread == current_thread());
	}

	/* All cases of needing the thread lock should be from explicit join scenarios */
	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
		assert((options & THREAD_WI_EXPLICIT_JOIN_POLICY) != 0);
	}

	/* For all cases of auto join must come in with the thread lock held */
	if (options & THREAD_WI_AUTO_JOIN_POLICY) {
		assert((options & THREAD_WI_THREAD_LOCK_HELD) != 0);
	}

	if (work_interval) {
		uint32_t work_interval_type = work_interval->wi_create_flags & WORK_INTERVAL_TYPE_MASK;

		/* COREAUDIO intervals may only be joined by realtime (or RT-demoted) threads */
		if ((work_interval_type == WORK_INTERVAL_TYPE_COREAUDIO) &&
		    (thread->sched_mode != TH_MODE_REALTIME) && (thread->saved_mode != TH_MODE_REALTIME)) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	struct work_interval *old_th_wi = thread->th_work_interval;
#if CONFIG_SCHED_AUTO_JOIN
	/*
	 * Pre-lock read only; re-read authoritatively below once the thread
	 * lock is guaranteed held.
	 */
	bool old_wi_auto_joined = ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0);

	spl_t s;
	/* Take the thread lock if needed */
	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
		s = splsched();
		thread_lock(thread);
	}

	/*
	 * Work interval auto-join leak to non-RT threads.
	 *
	 * If thread might be running on a remote core and it's not in the context switch path (where
	 * thread is neither running, blocked or in the runq), its not possible to update the
	 * work interval & thread group remotely since its not possible to update CLPC for a remote
	 * core. This situation might happen when a thread is transitioning from realtime to
	 * non-realtime due to backgrounding etc., which would mean that non-RT threads would now
	 * be part of the work interval.
	 *
	 * Since there is no immediate mitigation to this issue, the policy is to set a new
	 * flag on the thread which indicates that such a "leak" has happened. This flag will
	 * be cleared when the remote thread eventually blocks and unjoins from the work interval.
	 */
	bool thread_on_remote_core = ((thread != current_thread()) && (thread->state & TH_RUN) && (thread->runq == PROCESSOR_NULL));

	if (thread_on_remote_core && ((options & THREAD_WI_THREAD_CTX_SWITCH) == 0)) {
		assert((options & THREAD_WI_THREAD_LOCK_NEEDED) == 0);
		os_atomic_or(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
		return KERN_SUCCESS;
	}

	/* Authoritative re-read, now that the thread lock is held on all paths */
	old_wi_auto_joined = ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) != 0);

	if ((options & THREAD_WI_AUTO_JOIN_POLICY) || old_wi_auto_joined) {
		__kdebug_only uint64_t old_tg_id = (old_th_wi) ? thread_group_get_id(old_th_wi->wi_group) : ~0;
		__kdebug_only uint64_t new_tg_id = (work_interval) ? thread_group_get_id(work_interval->wi_group) : ~0;
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_WI_AUTO_JOIN),
		    thread_tid(thread), old_tg_id, new_tg_id, options);
	}

	if (old_wi_auto_joined) {
		/*
		 * If thread was auto-joined to a work interval and is not realtime, make sure it
		 * happened due to the "leak" described above.
		 */
		if (thread->sched_mode != TH_MODE_REALTIME) {
			assert((thread->th_work_interval_flags & TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK) != 0);
		}

		os_atomic_andnot(&thread->th_work_interval_flags, TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK, relaxed);
		/* Drop the auto-join count taken in work_interval_auto_join_propagate() */
		work_interval_auto_join_decrement(old_th_wi, thread);
		thread->sched_flags &= ~TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
	}

#endif /* CONFIG_SCHED_AUTO_JOIN */

	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CHANGE),
	    thread_tid(thread), (old_th_wi ? old_th_wi->wi_id : 0), (work_interval ? work_interval->wi_id : 0), !!(options & THREAD_WI_AUTO_JOIN_POLICY));

	/* transfer +1 ref to thread */
	thread->th_work_interval = work_interval;

#if CONFIG_SCHED_AUTO_JOIN

	if ((options & THREAD_WI_AUTO_JOIN_POLICY) && work_interval) {
		assert(work_interval_auto_join_enabled(work_interval) == true);
		thread->sched_flags |= TH_SFLAG_THREAD_GROUP_AUTO_JOIN;
	}

	if (options & THREAD_WI_THREAD_LOCK_NEEDED) {
		thread_unlock(thread);
		splx(s);
	}
#endif /* CONFIG_SCHED_AUTO_JOIN */

#if CONFIG_THREAD_GROUPS
	struct thread_group *new_tg = (work_interval) ? (work_interval->wi_group) : NULL;
	thread_set_work_interval_thread_group(thread, new_tg, (options & THREAD_WI_AUTO_JOIN_POLICY));
#endif /* CONFIG_THREAD_GROUPS */

	if (old_th_wi != NULL) {
		/* May defer the actual free if the thread lock is held (auto-join path) */
		work_interval_release(old_th_wi, options);
	}

	return KERN_SUCCESS;
}
776
/*
 * thread_set_work_interval_explicit_join()
 *
 * Thin wrapper for the explicit (caller-initiated) join path; the thread
 * lock is not held, so thread_set_work_interval() takes it. Consumes the
 * caller's +1 ref on work_interval upon success.
 */
static kern_return_t
thread_set_work_interval_explicit_join(thread_t thread, struct work_interval *work_interval)
{
	assert(thread == current_thread());
	return thread_set_work_interval(thread, work_interval, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
}
783
784 kern_return_t
785 work_interval_thread_terminate(thread_t thread)
786 {
787 assert(thread == current_thread());
788 if (thread->th_work_interval != NULL) {
789 return thread_set_work_interval(thread, NULL, THREAD_WI_EXPLICIT_JOIN_POLICY | THREAD_WI_THREAD_LOCK_NEEDED);
790 }
791 return KERN_SUCCESS;
792 }
793
/*
 * kern_work_interval_notify()
 *
 * Deliver a work interval notification for the current thread to the
 * machine layer. Permission-gated: the thread must currently have the
 * named work interval adopted, and only the creating task may notify.
 */
kern_return_t
kern_work_interval_notify(thread_t thread, struct kern_work_interval_args* kwi_args)
{
	assert(thread == current_thread());
	assert(kwi_args->work_interval_id != 0);

	struct work_interval *work_interval = thread->th_work_interval;

	if (work_interval == NULL ||
	    work_interval->wi_id != kwi_args->work_interval_id) {
		/* This thread must have adopted the work interval to be able to notify */
		return KERN_INVALID_ARGUMENT;
	}

	task_t notifying_task = current_task();

	if (work_interval->wi_creator_uniqueid != get_task_uniqueid(notifying_task) ||
	    work_interval->wi_creator_pidversion != get_task_version(notifying_task)) {
		/* Only the creating task can do a notify */
		return KERN_INVALID_ARGUMENT;
	}

	spl_t s = splsched();

#if CONFIG_THREAD_GROUPS
	assert(work_interval->wi_group == thread->thread_group);
#endif /* CONFIG_THREAD_GROUPS */

	/* Snapshot the thread's scheduling urgency for the machine-layer callout */
	uint64_t urgency_param1, urgency_param2;
	kwi_args->urgency = (uint16_t)thread_get_urgency(thread, &urgency_param1, &urgency_param2);

	splx(s);

	/* called without interrupts disabled */
	machine_work_interval_notify(thread, kwi_args);

	return KERN_SUCCESS;
}
832
/*
 * Work interval ID generator, incremented atomically at creation time.
 * Start at 1, 0 is not a valid work interval ID.
 */
static _Atomic uint64_t unique_work_interval_id = 1;
835
836 kern_return_t
837 kern_work_interval_create(thread_t thread,
838 struct kern_work_interval_create_args *create_params)
839 {
840 assert(thread == current_thread());
841
842 uint32_t create_flags = create_params->wica_create_flags;
843
844 if (((create_flags & WORK_INTERVAL_FLAG_JOINABLE) == 0) &&
845 thread->th_work_interval != NULL) {
846 /*
847 * If the thread is doing a legacy combined create and join,
848 * it shouldn't already be part of a work interval.
849 *
850 * (Creating a joinable WI is allowed anytime.)
851 */
852 return KERN_FAILURE;
853 }
854
855 /*
856 * Check the validity of the create flags before allocating the work
857 * interval.
858 */
859 task_t creating_task = current_task();
860 if ((create_flags & WORK_INTERVAL_TYPE_MASK) == WORK_INTERVAL_TYPE_CA_CLIENT) {
861 /*
862 * CA_CLIENT work intervals do not create new thread groups.
863 * There can only be one CA_CLIENT work interval (created by UIKit or AppKit)
864 * per each application task
865 */
866 if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
867 return KERN_FAILURE;
868 }
869 if (!task_is_app(creating_task)) {
870 #if XNU_TARGET_OS_OSX
871 /*
872 * Soft-fail the case of a non-app pretending to be an
873 * app, by allowing it to press the buttons, but they're
874 * not actually connected to anything.
875 */
876 create_flags |= WORK_INTERVAL_FLAG_IGNORED;
877 #else
878 /*
879 * On iOS, it's a hard failure to get your apptype
880 * wrong and then try to render something.
881 */
882 return KERN_NOT_SUPPORTED;
883 #endif /* XNU_TARGET_OS_OSX */
884 }
885 if (task_set_ca_client_wi(creating_task, true) == false) {
886 return KERN_FAILURE;
887 }
888 }
889
890 #if CONFIG_SCHED_AUTO_JOIN
891 if (create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) {
892 uint32_t type = (create_flags & WORK_INTERVAL_TYPE_MASK);
893 if (type != WORK_INTERVAL_TYPE_COREAUDIO) {
894 return KERN_NOT_SUPPORTED;
895 }
896 if ((create_flags & WORK_INTERVAL_FLAG_GROUP) == 0) {
897 return KERN_NOT_SUPPORTED;
898 }
899 }
900
901 if (create_flags & WORK_INTERVAL_FLAG_ENABLE_DEFERRED_FINISH) {
902 if ((create_flags & WORK_INTERVAL_FLAG_ENABLE_AUTO_JOIN) == 0) {
903 return KERN_NOT_SUPPORTED;
904 }
905 }
906 #endif /* CONFIG_SCHED_AUTO_JOIN */
907
908 struct work_interval *work_interval = kalloc_flags(sizeof(*work_interval),
909 Z_WAITOK | Z_ZERO);
910 assert(work_interval != NULL);
911
912 uint64_t work_interval_id = os_atomic_inc(&unique_work_interval_id, relaxed);
913
914 *work_interval = (struct work_interval) {
915 .wi_id = work_interval_id,
916 .wi_ref_count = {},
917 .wi_create_flags = create_flags,
918 .wi_creator_pid = pid_from_task(creating_task),
919 .wi_creator_uniqueid = get_task_uniqueid(creating_task),
920 .wi_creator_pidversion = get_task_version(creating_task),
921 };
922 os_ref_init(&work_interval->wi_ref_count, NULL);
923
924 __kdebug_only uint64_t tg_id = 0;
925 #if CONFIG_THREAD_GROUPS
926 struct thread_group *tg;
927 if (create_flags & WORK_INTERVAL_FLAG_GROUP) {
928 /* create a new group for the interval to represent */
929 char name[THREAD_GROUP_MAXNAME] = "";
930
931 snprintf(name, sizeof(name), "WI[%d] #%lld",
932 work_interval->wi_creator_pid, work_interval_id);
933
934 tg = thread_group_create_and_retain();
935
936 thread_group_set_name(tg, name);
937
938 work_interval->wi_group = tg;
939 } else {
940 /* the interval represents the thread's home group */
941 tg = thread_group_get_home_group(thread);
942
943 thread_group_retain(tg);
944
945 work_interval->wi_group = tg;
946 }
947
948 /* Capture the tg_id for tracing purposes */
949 tg_id = thread_group_get_id(work_interval->wi_group);
950
951 #endif /* CONFIG_THREAD_GROUPS */
952
953 if (create_flags & WORK_INTERVAL_FLAG_JOINABLE) {
954 mach_port_name_t name = MACH_PORT_NULL;
955
956 /* work_interval has a +1 ref, moves to the port */
957 work_interval->wi_port = ipc_kobject_alloc_port(
958 (ipc_kobject_t)work_interval, IKOT_WORK_INTERVAL,
959 IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);
960
961 name = ipc_port_copyout_send(work_interval->wi_port, current_space());
962
963 if (!MACH_PORT_VALID(name)) {
964 /*
965 * copyout failed (port is already deallocated)
966 * Because of the port-destroyed magic,
967 * the work interval is already deallocated too.
968 */
969 return KERN_RESOURCE_SHORTAGE;
970 }
971
972 create_params->wica_port = name;
973 } else {
974 /* work_interval has a +1 ref, moves to the thread */
975 kern_return_t kr = thread_set_work_interval_explicit_join(thread, work_interval);
976 if (kr != KERN_SUCCESS) {
977 /* No other thread can join this work interval since it isn't
978 * JOINABLE so release the reference on work interval */
979 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
980 return kr;
981 }
982 create_params->wica_port = MACH_PORT_NULL;
983 }
984
985 create_params->wica_id = work_interval_id;
986
987 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_WORKGROUP, WORKGROUP_INTERVAL_CREATE),
988 work_interval_id, create_flags, pid_from_task(creating_task), tg_id);
989 return KERN_SUCCESS;
990 }
991
992 kern_return_t
993 kern_work_interval_get_flags_from_port(mach_port_name_t port_name, uint32_t *flags)
994 {
995 assert(flags != NULL);
996
997 kern_return_t kr;
998 struct work_interval *work_interval;
999
1000 kr = port_name_to_work_interval(port_name, &work_interval);
1001 if (kr != KERN_SUCCESS) {
1002 return kr;
1003 }
1004
1005 assert(work_interval != NULL);
1006 *flags = work_interval->wi_create_flags;
1007
1008 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1009
1010 return KERN_SUCCESS;
1011 }
1012
1013
1014 kern_return_t
1015 kern_work_interval_destroy(thread_t thread, uint64_t work_interval_id)
1016 {
1017 if (work_interval_id == 0) {
1018 return KERN_INVALID_ARGUMENT;
1019 }
1020
1021 if (thread->th_work_interval == NULL ||
1022 thread->th_work_interval->wi_id != work_interval_id) {
1023 /* work ID isn't valid or doesn't match joined work interval ID */
1024 return KERN_INVALID_ARGUMENT;
1025 }
1026
1027 return thread_set_work_interval_explicit_join(thread, NULL);
1028 }
1029
1030 kern_return_t
1031 kern_work_interval_join(thread_t thread,
1032 mach_port_name_t port_name)
1033 {
1034 struct work_interval *work_interval = NULL;
1035 kern_return_t kr;
1036
1037 if (port_name == MACH_PORT_NULL) {
1038 /* 'Un-join' the current work interval */
1039 return thread_set_work_interval_explicit_join(thread, NULL);
1040 }
1041
1042 kr = port_name_to_work_interval(port_name, &work_interval);
1043 if (kr != KERN_SUCCESS) {
1044 return kr;
1045 }
1046 /* work_interval has a +1 ref */
1047
1048 assert(work_interval != NULL);
1049
1050 kr = thread_set_work_interval_explicit_join(thread, work_interval);
1051 /* ref was consumed by passing it to the thread in the successful case */
1052 if (kr != KERN_SUCCESS) {
1053 work_interval_release(work_interval, THREAD_WI_THREAD_LOCK_NEEDED);
1054 }
1055 return kr;
1056 }
1057
1058 /*
1059 * work_interval_port_type_render_server()
1060 *
1061 * Helper routine to determine if the port points to a
1062 * WORK_INTERVAL_TYPE_CA_RENDER_SERVER work interval.
1063 */
1064 bool
1065 work_interval_port_type_render_server(mach_port_name_t port_name)
1066 {
1067 return work_interval_port_type(port_name) == WORK_INTERVAL_TYPE_CA_RENDER_SERVER;
1068 }