/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995-2018 Apple, Inc. All Rights Reserved */
#include <sys/cdefs.h>

#include <kern/assert.h>
#include <kern/clock.h>
#include <kern/cpu_data.h>
#include <kern/kern_types.h>
#include <kern/policy_internal.h>
#include <kern/processor.h>
#include <kern/sched_prim.h>    /* for thread_exception_return */
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/zalloc.h>
#include <mach/kern_return.h>
#include <mach/mach_param.h>
#include <mach/mach_port.h>
#include <mach/mach_types.h>
#include <mach/mach_vm.h>
#include <mach/sync_policy.h>
#include <mach/task.h>
#include <mach/thread_act.h>    /* for thread_resume */
#include <mach/thread_policy.h>
#include <mach/thread_status.h>
#include <mach/vm_prot.h>
#include <mach/vm_statistics.h>
#include <machine/atomic.h>
#include <machine/machine_routines.h>
#include <vm/vm_map.h>
#include <vm/vm_protos.h>

#include <sys/eventvar.h>
#include <sys/kdebug.h>
#include <sys/kernel.h>
#include <sys/param.h>
#include <sys/proc_info.h>      /* for fill_procworkqueue */
#include <sys/proc_internal.h>
#include <sys/pthread_shims.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/ulock.h>          /* for ulock_owner_value_to_port_name */

#include <pthread/bsdthread_private.h>
#include <pthread/workqueue_syscalls.h>
#include <pthread/workqueue_internal.h>
#include <pthread/workqueue_trace.h>
static void workq_unpark_continue(void *uth, wait_result_t wr) __dead2;
static void workq_schedule_creator(proc_t p, struct workqueue *wq,
    workq_kern_threadreq_flags_t flags);

static bool workq_threadreq_admissible(struct workqueue *wq, struct uthread *uth,
    workq_threadreq_t req);

static uint32_t workq_constrained_allowance(struct workqueue *wq,
    thread_qos_t at_qos, struct uthread *uth, bool may_start_timer);

static bool workq_thread_is_busy(uint64_t cur_ts,
    _Atomic uint64_t *lastblocked_tsp);

static int workq_sysctl_handle_usecs SYSCTL_HANDLER_ARGS;
struct workq_usec_var {
    uint32_t usecs;
    uint64_t abstime;
};

#define WORKQ_SYSCTL_USECS(var, init) \
    static struct workq_usec_var var = { .usecs = init }; \
    SYSCTL_OID(_kern, OID_AUTO, var##_usecs, \
        CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &var, 0, \
        workq_sysctl_handle_usecs, "I", "")
static LCK_GRP_DECLARE(workq_lck_grp, "workq");
os_refgrp_decl(static, workq_refgrp, "workq", NULL);

static ZONE_DECLARE(workq_zone_workqueue, "workq.wq",
    sizeof(struct workqueue), ZC_NONE);
static ZONE_DECLARE(workq_zone_threadreq, "workq.threadreq",
    sizeof(struct workq_threadreq_s), ZC_CACHING);

static struct mpsc_daemon_queue workq_deallocate_queue;

WORKQ_SYSCTL_USECS(wq_stalled_window, WQ_STALLED_WINDOW_USECS);
WORKQ_SYSCTL_USECS(wq_reduce_pool_window, WQ_REDUCE_POOL_WINDOW_USECS);
WORKQ_SYSCTL_USECS(wq_max_timer_interval, WQ_MAX_TIMER_INTERVAL_USECS);
static uint32_t wq_max_threads = WORKQUEUE_MAXTHREADS;
static uint32_t wq_max_constrained_threads = WORKQUEUE_MAXTHREADS / 8;
static uint32_t wq_init_constrained_limit = 1;
static uint16_t wq_death_max_load;
static uint32_t wq_max_parallelism[WORKQ_NUM_QOS_BUCKETS];
static int
workq_sysctl_handle_usecs SYSCTL_HANDLER_ARGS
{
    struct workq_usec_var *v = arg1;
    int error = sysctl_handle_int(oidp, &v->usecs, 0, req);
    if (error || !req->newptr) {
        return error;
    }
    clock_interval_to_absolutetime_interval(v->usecs, NSEC_PER_USEC,
        &v->abstime);
    return 0;
}
SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
    &wq_max_threads, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
    &wq_max_constrained_threads, 0, "");
#define WQPTR_IS_INITING_VALUE ((struct workqueue *)~(uintptr_t)0)

static struct workqueue *
proc_get_wqptr_fast(struct proc *p)
{
    return os_atomic_load(&p->p_wqptr, relaxed);
}

static struct workqueue *
proc_get_wqptr(struct proc *p)
{
    struct workqueue *wq = proc_get_wqptr_fast(p);
    return wq == WQPTR_IS_INITING_VALUE ? NULL : wq;
}

static void
proc_set_wqptr(struct proc *p, struct workqueue *wq)
{
    wq = os_atomic_xchg(&p->p_wqptr, wq, release);
    if (wq == WQPTR_IS_INITING_VALUE) {
        proc_lock(p);
        thread_wakeup(&p->p_wqptr);
        proc_unlock(p);
    }
}

static bool
proc_init_wqptr_or_wait(struct proc *p)
{
    struct workqueue *wq;

    proc_lock(p);
    wq = os_atomic_load(&p->p_wqptr, relaxed);

    if (wq == NULL) {
        os_atomic_store(&p->p_wqptr, WQPTR_IS_INITING_VALUE, relaxed);
        proc_unlock(p);
        return true;
    }

    if (wq == WQPTR_IS_INITING_VALUE) {
        assert_wait(&p->p_wqptr, THREAD_UNINT);
        proc_unlock(p);
        thread_block(THREAD_CONTINUE_NULL);
    } else {
        proc_unlock(p);
    }
    return false;
}
static inline event_t
workq_parked_wait_event(struct uthread *uth)
{
    return (event_t)&uth->uu_workq_stackaddr;
}

static inline void
workq_thread_wakeup(struct uthread *uth)
{
    thread_wakeup_thread(workq_parked_wait_event(uth), uth->uu_thread);
}
#pragma mark wq_thactive

#if defined(__LP64__)
// Layout is:
//   127 - 115 : 13 bits of zeroes
//   114 - 112 : best QoS among all pending constrained requests
//   111 -   0 : MGR, AUI, UI, IN, DF, UT, BG+MT buckets every 16 bits
#define WQ_THACTIVE_BUCKET_WIDTH 16
#define WQ_THACTIVE_QOS_SHIFT    (7 * WQ_THACTIVE_BUCKET_WIDTH)
#else
// Layout is:
//   63 - 61 : best QoS among all pending constrained requests
//   60      : Manager bucket (0 or 1)
//   59 -  0 : AUI, UI, IN, DF, UT, BG+MT buckets every 10 bits
#define WQ_THACTIVE_BUCKET_WIDTH 10
#define WQ_THACTIVE_QOS_SHIFT    (6 * WQ_THACTIVE_BUCKET_WIDTH + 1)
#endif
#define WQ_THACTIVE_BUCKET_MASK  ((1U << WQ_THACTIVE_BUCKET_WIDTH) - 1)
#define WQ_THACTIVE_BUCKET_HALF  (1U << (WQ_THACTIVE_BUCKET_WIDTH - 1))

static_assert(sizeof(wq_thactive_t) * CHAR_BIT - WQ_THACTIVE_QOS_SHIFT >= 3,
    "Make sure we have space to encode a QoS");
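/*
 * Worked example (LP64): each QoS bucket owns a 16-bit counter inside
 * wq_thactive, so a thread becoming active in bucket 3 adds 1 << (3 * 16)
 * to the atomic value, and the "best pending constrained request" QoS sits
 * in the 3 bits starting at shift 7 * 16 = 112.  Buckets stay well below
 * their 16-bit range because workq_open() clamps wq_max_threads to
 * WQ_THACTIVE_BUCKET_HALF (32768), so increments can never spill into a
 * neighbouring bucket.
 */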
static inline wq_thactive_t
_wq_thactive(struct workqueue *wq)
{
    return os_atomic_load_wide(&wq->wq_thactive, relaxed);
}

static inline uint8_t
_wq_bucket(thread_qos_t qos)
{
    // Map both BG and MT to the same bucket by over-shifting down and
    // clamping MT and BG together.
    switch (qos) {
    case THREAD_QOS_MAINTENANCE:
        return 0;
    default:
        return qos - 2;
    }
}

#define WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(tha) \
    ((thread_qos_t)((tha) >> WQ_THACTIVE_QOS_SHIFT))

static inline thread_qos_t
_wq_thactive_best_constrained_req_qos(struct workqueue *wq)
{
    // Avoid expensive atomic operations: the three bits we're loading are in
    // a single byte, and always updated under the workqueue lock
    wq_thactive_t v = *(wq_thactive_t *)&wq->wq_thactive;
    return WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(v);
}
static void
_wq_thactive_refresh_best_constrained_req_qos(struct workqueue *wq)
{
    thread_qos_t old_qos, new_qos;
    workq_threadreq_t req;

    req = priority_queue_max(&wq->wq_constrained_queue,
        struct workq_threadreq_s, tr_entry);
    new_qos = req ? req->tr_qos : THREAD_QOS_UNSPECIFIED;
    old_qos = _wq_thactive_best_constrained_req_qos(wq);
    if (old_qos != new_qos) {
        long delta = (long)new_qos - (long)old_qos;
        wq_thactive_t v = (wq_thactive_t)delta << WQ_THACTIVE_QOS_SHIFT;
        /*
         * We can do an atomic add relative to the initial load because updates
         * to this qos are always serialized under the workqueue lock.
         */
        v = os_atomic_add(&wq->wq_thactive, v, relaxed);
#ifdef __LP64__
        WQ_TRACE_WQ(TRACE_wq_thactive_update, wq, (uint64_t)v,
            (uint64_t)(v >> 64), 0, 0);
#else
        WQ_TRACE_WQ(TRACE_wq_thactive_update, wq, v, 0, 0, 0);
#endif
    }
}
static inline wq_thactive_t
_wq_thactive_offset_for_qos(thread_qos_t qos)
{
    return (wq_thactive_t)1 << (_wq_bucket(qos) * WQ_THACTIVE_BUCKET_WIDTH);
}

static inline wq_thactive_t
_wq_thactive_inc(struct workqueue *wq, thread_qos_t qos)
{
    wq_thactive_t v = _wq_thactive_offset_for_qos(qos);
    return os_atomic_add_orig(&wq->wq_thactive, v, relaxed);
}

static inline wq_thactive_t
_wq_thactive_dec(struct workqueue *wq, thread_qos_t qos)
{
    wq_thactive_t v = _wq_thactive_offset_for_qos(qos);
    return os_atomic_sub_orig(&wq->wq_thactive, v, relaxed);
}

static void
_wq_thactive_move(struct workqueue *wq,
    thread_qos_t old_qos, thread_qos_t new_qos)
{
    wq_thactive_t v = _wq_thactive_offset_for_qos(new_qos) -
        _wq_thactive_offset_for_qos(old_qos);
    os_atomic_add(&wq->wq_thactive, v, relaxed);
    wq->wq_thscheduled_count[_wq_bucket(old_qos)]--;
    wq->wq_thscheduled_count[_wq_bucket(new_qos)]++;
}
static inline uint32_t
_wq_thactive_aggregate_downto_qos(struct workqueue *wq, wq_thactive_t v,
    thread_qos_t qos, uint32_t *busycount, uint32_t *max_busycount)
{
    uint32_t count = 0, active;
    uint64_t curtime;

    assert(WORKQ_THREAD_QOS_MIN <= qos && qos <= WORKQ_THREAD_QOS_MAX);

    if (busycount) {
        curtime = mach_absolute_time();
        *busycount = 0;
    }
    if (max_busycount) {
        *max_busycount = THREAD_QOS_LAST - qos;
    }

    int i = _wq_bucket(qos);
    v >>= i * WQ_THACTIVE_BUCKET_WIDTH;
    for (; i < WORKQ_NUM_QOS_BUCKETS; i++, v >>= WQ_THACTIVE_BUCKET_WIDTH) {
        active = v & WQ_THACTIVE_BUCKET_MASK;
        count += active;

        if (busycount && wq->wq_thscheduled_count[i] > active) {
            if (workq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i])) {
                /*
                 * We only consider the last blocked thread for a given bucket
                 * as busy because we don't want to take the list lock in each
                 * sched callback. However this is an approximation that could
                 * contribute to thread creation storms.
                 */
                (*busycount)++;
            }
        }
    }

    return count;
}
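/*
 * Example of the busy accounting above: if the DF bucket shows 2 active
 * threads in wq_thactive while wq_thscheduled_count[DF] is 3, one scheduled
 * thread is currently blocked.  When its last-blocked timestamp is within
 * wq_stalled_window (see workq_thread_is_busy()), the bucket reports one
 * busy thread and callers treat it as likely to resume soon instead of
 * immediately creating a replacement.
 */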
#pragma mark wq_flags

static inline uint32_t
_wq_flags(struct workqueue *wq)
{
    return os_atomic_load(&wq->wq_flags, relaxed);
}

static inline bool
_wq_exiting(struct workqueue *wq)
{
    return _wq_flags(wq) & WQ_EXITING;
}

bool
workq_is_exiting(struct proc *p)
{
    struct workqueue *wq = proc_get_wqptr(p);
    return !wq || _wq_exiting(wq);
}
#pragma mark workqueue lock

bool
workq_lock_spin_is_acquired_kdp(struct workqueue *wq)
{
    return kdp_lck_spin_is_acquired(&wq->wq_lock);
}

static void
workq_lock_spin(struct workqueue *wq)
{
    lck_spin_lock_grp(&wq->wq_lock, &workq_lck_grp);
}

static void
workq_lock_held(__assert_only struct workqueue *wq)
{
    LCK_SPIN_ASSERT(&wq->wq_lock, LCK_ASSERT_OWNED);
}

static bool
workq_lock_try(struct workqueue *wq)
{
    return lck_spin_try_lock_grp(&wq->wq_lock, &workq_lck_grp);
}

static void
workq_unlock(struct workqueue *wq)
{
    lck_spin_unlock(&wq->wq_lock);
}
#pragma mark idle thread lists

#define WORKQ_POLICY_INIT(qos) \
    (struct uu_workq_policy){ .qos_req = qos, .qos_bucket = qos }

static inline thread_qos_t
workq_pri_bucket(struct uu_workq_policy req)
{
    return MAX(MAX(req.qos_req, req.qos_max), req.qos_override);
}

static inline thread_qos_t
workq_pri_override(struct uu_workq_policy req)
{
    return MAX(workq_pri_bucket(req), req.qos_bucket);
}
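/*
 * Example: a thread whose requested QoS is UT but which currently carries a
 * dispatch override at IN has workq_pri_bucket() == IN.  workq_pri_override()
 * additionally folds in qos_bucket, so a thread that is still accounted
 * against a higher bucket keeps reporting at least that bucket until
 * workq_thread_update_bucket() moves it.
 */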
static bool
workq_thread_needs_params_change(workq_threadreq_t req, struct uthread *uth)
{
    workq_threadreq_param_t cur_trp, req_trp = { };

    cur_trp.trp_value = uth->uu_save.uus_workq_park_data.workloop_params;
    if (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS) {
        req_trp = kqueue_threadreq_workloop_param(req);
    }

    /*
     * CPU percent flags are handled separately from policy changes, so ignore
     * them for all of these checks.
     */
    uint16_t cur_flags = (cur_trp.trp_flags & ~TRP_CPUPERCENT);
    uint16_t req_flags = (req_trp.trp_flags & ~TRP_CPUPERCENT);

    if (!req_flags && !cur_flags) {
        return false;
    }

    if (req_flags != cur_flags) {
        return true;
    }

    if ((req_flags & TRP_PRIORITY) && req_trp.trp_pri != cur_trp.trp_pri) {
        return true;
    }

    if ((req_flags & TRP_POLICY) && req_trp.trp_pol != cur_trp.trp_pol) {
        return true;
    }

    return false;
}

static bool
workq_thread_needs_priority_change(workq_threadreq_t req, struct uthread *uth)
{
    if (workq_thread_needs_params_change(req, uth)) {
        return true;
    }

    return req->tr_qos != workq_pri_override(uth->uu_workq_pri);
}
static void
workq_thread_update_bucket(proc_t p, struct workqueue *wq, struct uthread *uth,
    struct uu_workq_policy old_pri, struct uu_workq_policy new_pri,
    bool force_run)
{
    thread_qos_t old_bucket = old_pri.qos_bucket;
    thread_qos_t new_bucket = workq_pri_bucket(new_pri);

    if (old_bucket != new_bucket) {
        _wq_thactive_move(wq, old_bucket, new_bucket);
    }

    new_pri.qos_bucket = new_bucket;
    uth->uu_workq_pri = new_pri;

    if (workq_pri_override(old_pri) != new_bucket) {
        thread_set_workq_override(uth->uu_thread, new_bucket);
    }

    if (wq->wq_reqcount && (old_bucket > new_bucket || force_run)) {
        int flags = WORKQ_THREADREQ_CAN_CREATE_THREADS;
        if (old_bucket > new_bucket) {
            /*
             * When lowering our bucket, we may unblock a thread request,
             * but we can't drop our priority before we have evaluated
             * whether this is the case, and if we ever drop the workqueue lock
             * that would cause a priority inversion.
             *
             * We hence have to disallow thread creation in that case.
             */
            flags = 0;
        }
        workq_schedule_creator(p, wq, flags);
    }
}
/*
 * Sets/resets the cpu percent limits on the current thread. We can't set
 * these limits from outside of the current thread, so this function needs
 * to be called when we're executing on the intended thread.
 */
static void
workq_thread_reset_cpupercent(workq_threadreq_t req, struct uthread *uth)
{
    assert(uth == current_uthread());
    workq_threadreq_param_t trp = { };

    if (req && (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS)) {
        trp = kqueue_threadreq_workloop_param(req);
    }

    if (uth->uu_workq_flags & UT_WORKQ_CPUPERCENT) {
        /*
         * Going through disable when we have an existing CPU percent limit
         * set will force the ledger to refill the token bucket of the current
         * thread, removing any penalty applied by previous thread use.
         */
        thread_set_cpulimit(THREAD_CPULIMIT_DISABLE, 0, 0);
        uth->uu_workq_flags &= ~UT_WORKQ_CPUPERCENT;
    }

    if (trp.trp_flags & TRP_CPUPERCENT) {
        thread_set_cpulimit(THREAD_CPULIMIT_BLOCK, trp.trp_cpupercent,
            (uint64_t)trp.trp_refillms * NSEC_PER_SEC);
        uth->uu_workq_flags |= UT_WORKQ_CPUPERCENT;
    }
}
static void
workq_thread_reset_pri(struct workqueue *wq, struct uthread *uth,
    workq_threadreq_t req, bool unpark)
{
    thread_t th = uth->uu_thread;
    thread_qos_t qos = req ? req->tr_qos : WORKQ_THREAD_QOS_CLEANUP;
    workq_threadreq_param_t trp = { };
    int priority = 31;
    int policy = POLICY_TIMESHARE;

    if (req && (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS)) {
        trp = kqueue_threadreq_workloop_param(req);
    }

    uth->uu_workq_pri = WORKQ_POLICY_INIT(qos);
    uth->uu_workq_flags &= ~UT_WORKQ_OUTSIDE_QOS;

    if (unpark) {
        uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value;
        // qos sent out to userspace (may differ from uu_workq_pri on param threads)
        uth->uu_save.uus_workq_park_data.qos = qos;
    }

    if (qos == WORKQ_THREAD_QOS_MANAGER) {
        uint32_t mgr_pri = wq->wq_event_manager_priority;
        assert(trp.trp_value == 0); // manager qos and thread policy don't mix

        if (mgr_pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) {
            mgr_pri &= _PTHREAD_PRIORITY_SCHED_PRI_MASK;
            thread_set_workq_pri(th, THREAD_QOS_UNSPECIFIED, mgr_pri,
                POLICY_TIMESHARE);
            return;
        }

        qos = _pthread_priority_thread_qos(mgr_pri);
    } else {
        if (trp.trp_flags & TRP_PRIORITY) {
            qos = THREAD_QOS_UNSPECIFIED;
            priority = trp.trp_pri;
            uth->uu_workq_flags |= UT_WORKQ_OUTSIDE_QOS;
        }

        if (trp.trp_flags & TRP_POLICY) {
            policy = trp.trp_pol;
        }
    }

    thread_set_workq_pri(th, qos, priority, policy);
}
/*
 * Called by kevent with the NOTE_WL_THREAD_REQUEST knote lock held,
 * every time a servicer is being told about a new max QoS.
 */
void
workq_thread_set_max_qos(struct proc *p, workq_threadreq_t kqr)
{
    struct uu_workq_policy old_pri, new_pri;
    struct uthread *uth = current_uthread();
    struct workqueue *wq = proc_get_wqptr_fast(p);
    thread_qos_t qos = kqr->tr_kq_qos_index;

    if (uth->uu_workq_pri.qos_max == qos) {
        return;
    }

    workq_lock_spin(wq);
    old_pri = new_pri = uth->uu_workq_pri;
    new_pri.qos_max = qos;
    workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false);
    workq_unlock(wq);
}
#pragma mark idle threads accounting and handling

static inline struct uthread *
workq_oldest_killable_idle_thread(struct workqueue *wq)
{
    struct uthread *uth = TAILQ_LAST(&wq->wq_thidlelist, workq_uthread_head);

    if (uth && !uth->uu_save.uus_workq_park_data.has_stack) {
        uth = TAILQ_PREV(uth, workq_uthread_head, uu_workq_entry);
        if (uth) {
            assert(uth->uu_save.uus_workq_park_data.has_stack);
        }
    }
    return uth;
}
static inline uint64_t
workq_kill_delay_for_idle_thread(struct workqueue *wq)
{
    uint64_t delay = wq_reduce_pool_window.abstime;
    uint16_t idle = wq->wq_thidlecount;

    /*
     * If we have less than wq_death_max_load threads, have a 5s timer.
     *
     * For the next wq_max_constrained_threads ones, decay linearly from
     * that full window down to a small fraction of it.
     */
    if (idle <= wq_death_max_load) {
        return delay;
    }

    if (wq_max_constrained_threads > idle - wq_death_max_load) {
        delay *= (wq_max_constrained_threads - (idle - wq_death_max_load));
    }
    return delay / wq_max_constrained_threads;
}
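/*
 * For illustration, assuming wq_death_max_load = 8 and
 * wq_max_constrained_threads = 64: with 40 idle threads the delay becomes
 * wq_reduce_pool_window * (64 - 32) / 64, i.e. half of the window, and it
 * keeps shrinking linearly as more idle threads accumulate.
 */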
static bool
workq_should_kill_idle_thread(struct workqueue *wq, struct uthread *uth,
    uint64_t now)
{
    uint64_t delay = workq_kill_delay_for_idle_thread(wq);
    return now - uth->uu_save.uus_workq_park_data.idle_stamp > delay;
}
static void
workq_death_call_schedule(struct workqueue *wq, uint64_t deadline)
{
    uint32_t wq_flags = os_atomic_load(&wq->wq_flags, relaxed);

    if (wq_flags & (WQ_EXITING | WQ_DEATH_CALL_SCHEDULED)) {
        return;
    }
    os_atomic_or(&wq->wq_flags, WQ_DEATH_CALL_SCHEDULED, relaxed);

    WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_NONE, wq, 1, 0, 0, 0);

    /*
     * <rdar://problem/13139182> Due to how long term timers work, the leeway
     * can't be too short, so use 500ms which is long enough that we will not
     * wake up the CPU for killing threads, but short enough that it doesn't
     * fall into long-term timer list shenanigans.
     */
    thread_call_enter_delayed_with_leeway(wq->wq_death_call, NULL, deadline,
        wq_reduce_pool_window.abstime / 10,
        THREAD_CALL_DELAY_LEEWAY | THREAD_CALL_DELAY_USER_BACKGROUND);
}
/*
 * `decrement` is set to the number of threads that are no longer dying:
 * - because they have been resuscitated just in time (workq_pop_idle_thread)
 * - or have been killed (workq_thread_terminate).
 */
static void
workq_death_policy_evaluate(struct workqueue *wq, uint16_t decrement)
{
    struct uthread *uth;

    assert(wq->wq_thdying_count >= decrement);
    if ((wq->wq_thdying_count -= decrement) > 0) {
        return;
    }

    if (wq->wq_thidlecount <= 1) {
        return;
    }

    if ((uth = workq_oldest_killable_idle_thread(wq)) == NULL) {
        return;
    }

    uint64_t now = mach_absolute_time();
    uint64_t delay = workq_kill_delay_for_idle_thread(wq);

    if (now - uth->uu_save.uus_workq_park_data.idle_stamp > delay) {
        WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_START,
            wq, wq->wq_thidlecount, 0, 0, 0);
        wq->wq_thdying_count++;
        uth->uu_workq_flags |= UT_WORKQ_DYING;
        if ((uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) == 0) {
            workq_thread_wakeup(uth);
        }
        return;
    }

    workq_death_call_schedule(wq,
        uth->uu_save.uus_workq_park_data.idle_stamp + delay);
}
void
workq_thread_terminate(struct proc *p, struct uthread *uth)
{
    struct workqueue *wq = proc_get_wqptr_fast(p);

    workq_lock_spin(wq);
    TAILQ_REMOVE(&wq->wq_thrunlist, uth, uu_workq_entry);
    if (uth->uu_workq_flags & UT_WORKQ_DYING) {
        WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_END,
            wq, wq->wq_thidlecount, 0, 0, 0);
        workq_death_policy_evaluate(wq, 1);
    }
    if (wq->wq_nthreads-- == wq_max_threads) {
        /*
         * We got under the thread limit again, which may have prevented
         * thread creation from happening, redrive if there are pending requests
         */
        if (wq->wq_reqcount) {
            workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
        }
    }
    workq_unlock(wq);

    thread_deallocate(uth->uu_thread);
}
static void
workq_kill_old_threads_call(void *param0, void *param1 __unused)
{
    struct workqueue *wq = param0;

    workq_lock_spin(wq);
    WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_START, wq, 0, 0, 0, 0);
    os_atomic_andnot(&wq->wq_flags, WQ_DEATH_CALL_SCHEDULED, relaxed);
    workq_death_policy_evaluate(wq, 0);
    WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_END, wq, 0, 0, 0, 0);
    workq_unlock(wq);
}
static struct uthread *
workq_pop_idle_thread(struct workqueue *wq, uint8_t uu_flags,
    bool *needs_wakeup)
{
    struct uthread *uth;

    if ((uth = TAILQ_FIRST(&wq->wq_thidlelist))) {
        TAILQ_REMOVE(&wq->wq_thidlelist, uth, uu_workq_entry);
    } else {
        uth = TAILQ_FIRST(&wq->wq_thnewlist);
        TAILQ_REMOVE(&wq->wq_thnewlist, uth, uu_workq_entry);
    }
    TAILQ_INSERT_TAIL(&wq->wq_thrunlist, uth, uu_workq_entry);

    assert((uth->uu_workq_flags & UT_WORKQ_RUNNING) == 0);
    uth->uu_workq_flags |= UT_WORKQ_RUNNING | uu_flags;
    if ((uu_flags & UT_WORKQ_OVERCOMMIT) == 0) {
        wq->wq_constrained_threads_scheduled++;
    }
    wq->wq_threads_scheduled++;
    wq->wq_thidlecount--;

    if (__improbable(uth->uu_workq_flags & UT_WORKQ_DYING)) {
        uth->uu_workq_flags ^= UT_WORKQ_DYING;
        workq_death_policy_evaluate(wq, 1);
        *needs_wakeup = false;
    } else if (uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) {
        *needs_wakeup = false;
    } else {
        *needs_wakeup = true;
    }
    return uth;
}
/*
 * Called by thread_create_workq_waiting() during thread initialization, before
 * assert_wait, before the thread has been started.
 */
event_t
workq_thread_init_and_wq_lock(task_t task, thread_t th)
{
    struct uthread *uth = get_bsdthread_info(th);

    uth->uu_workq_flags = UT_WORKQ_NEW;
    uth->uu_workq_pri = WORKQ_POLICY_INIT(THREAD_QOS_LEGACY);
    uth->uu_workq_thport = MACH_PORT_NULL;
    uth->uu_workq_stackaddr = 0;
    uth->uu_workq_pthread_kill_allowed = 0;

    thread_set_tag(th, THREAD_TAG_PTHREAD | THREAD_TAG_WORKQUEUE);
    thread_reset_workq_qos(th, THREAD_QOS_LEGACY);

    workq_lock_spin(proc_get_wqptr_fast(get_bsdtask_info(task)));
    return workq_parked_wait_event(uth);
}
/*
 * Try to add a new workqueue thread.
 *
 * - called with workq lock held
 * - dropped and retaken around thread creation
 * - return with workq lock held
 */
static bool
workq_add_new_idle_thread(proc_t p, struct workqueue *wq)
{
    mach_vm_offset_t th_stackaddr;
    kern_return_t kret;
    thread_t th;

    wq->wq_nthreads++;

    workq_unlock(wq);

    vm_map_t vmap = get_task_map(p->task);

    kret = pthread_functions->workq_create_threadstack(p, vmap, &th_stackaddr);
    if (kret != KERN_SUCCESS) {
        WQ_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq,
            kret, 1, 0, 0);
        goto out;
    }

    kret = thread_create_workq_waiting(p->task, workq_unpark_continue, &th);
    if (kret != KERN_SUCCESS) {
        WQ_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq,
            kret, 0, 0, 0);
        pthread_functions->workq_destroy_threadstack(p, vmap, th_stackaddr);
        goto out;
    }

    // thread_create_workq_waiting() will return with the wq lock held
    // on success, because it calls workq_thread_init_and_wq_lock() above

    struct uthread *uth = get_bsdthread_info(th);

    wq->wq_thidlecount++;
    uth->uu_workq_stackaddr = (user_addr_t)th_stackaddr;
    TAILQ_INSERT_TAIL(&wq->wq_thnewlist, uth, uu_workq_entry);

    WQ_TRACE_WQ(TRACE_wq_thread_create | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
    return true;

out:
    workq_lock_spin(wq);
    /*
     * Do not redrive here if we went under wq_max_threads again,
     * it is the responsibility of the callers of this function
     * to do so when it fails.
     */
    wq->wq_nthreads--;
    return false;
}
#define WORKQ_UNPARK_FOR_DEATH_WAS_IDLE 0x1

__attribute__((noreturn, noinline))
static void
workq_unpark_for_death_and_unlock(proc_t p, struct workqueue *wq,
    struct uthread *uth, uint32_t death_flags, uint32_t setup_flags)
{
    thread_qos_t qos = workq_pri_override(uth->uu_workq_pri);
    bool first_use = uth->uu_workq_flags & UT_WORKQ_NEW;

    if (qos > WORKQ_THREAD_QOS_CLEANUP) {
        workq_thread_reset_pri(wq, uth, NULL, /*unpark*/ true);
        qos = WORKQ_THREAD_QOS_CLEANUP;
    }

    workq_thread_reset_cpupercent(NULL, uth);

    if (death_flags & WORKQ_UNPARK_FOR_DEATH_WAS_IDLE) {
        wq->wq_thidlecount--;
        if (first_use) {
            TAILQ_REMOVE(&wq->wq_thnewlist, uth, uu_workq_entry);
        } else {
            TAILQ_REMOVE(&wq->wq_thidlelist, uth, uu_workq_entry);
        }
    }
    TAILQ_INSERT_TAIL(&wq->wq_thrunlist, uth, uu_workq_entry);

    workq_unlock(wq);

    if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) {
        __assert_only kern_return_t kr;
        kr = thread_set_voucher_name(MACH_PORT_NULL);
        assert(kr == KERN_SUCCESS);
    }

    uint32_t flags = WQ_FLAG_THREAD_NEWSPI | qos | WQ_FLAG_THREAD_PRIO_QOS;
    thread_t th = uth->uu_thread;
    vm_map_t vmap = get_task_map(p->task);

    if (!first_use) {
        flags |= WQ_FLAG_THREAD_REUSE;
    }

    pthread_functions->workq_setup_thread(p, th, vmap, uth->uu_workq_stackaddr,
        uth->uu_workq_thport, 0, WQ_SETUP_EXIT_THREAD, flags);
    __builtin_unreachable();
}
bool
workq_is_current_thread_updating_turnstile(struct workqueue *wq)
{
    return wq->wq_turnstile_updater == current_thread();
}

__attribute__((always_inline))
static inline void
workq_perform_turnstile_operation_locked(struct workqueue *wq,
    void (^operation)(void))
{
    workq_lock_held(wq);
    wq->wq_turnstile_updater = current_thread();
    operation();
    wq->wq_turnstile_updater = THREAD_NULL;
}
static void
workq_turnstile_update_inheritor(struct workqueue *wq,
    turnstile_inheritor_t inheritor,
    turnstile_update_flags_t flags)
{
    if (wq->wq_inheritor == inheritor) {
        return;
    }
    wq->wq_inheritor = inheritor;
    workq_perform_turnstile_operation_locked(wq, ^{
        turnstile_update_inheritor(wq->wq_turnstile, inheritor,
            flags | TURNSTILE_IMMEDIATE_UPDATE);
        turnstile_update_inheritor_complete(wq->wq_turnstile,
            TURNSTILE_INTERLOCK_HELD);
    });
}
static void
workq_push_idle_thread(proc_t p, struct workqueue *wq, struct uthread *uth,
    uint32_t setup_flags)
{
    uint64_t now = mach_absolute_time();
    bool is_creator = (uth == wq->wq_creator);

    if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) {
        wq->wq_constrained_threads_scheduled--;
    }
    uth->uu_workq_flags &= ~(UT_WORKQ_RUNNING | UT_WORKQ_OVERCOMMIT);
    TAILQ_REMOVE(&wq->wq_thrunlist, uth, uu_workq_entry);
    wq->wq_threads_scheduled--;

    if (is_creator) {
        wq->wq_creator = NULL;
        WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 3, 0,
            uth->uu_save.uus_workq_park_data.yields, 0);
    }

    if (wq->wq_inheritor == uth->uu_thread) {
        assert(wq->wq_creator == NULL);
        if (wq->wq_reqcount) {
            workq_turnstile_update_inheritor(wq, wq, TURNSTILE_INHERITOR_WORKQ);
        } else {
            workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
        }
    }

    if (uth->uu_workq_flags & UT_WORKQ_NEW) {
        assert(is_creator || (_wq_flags(wq) & WQ_EXITING));
        TAILQ_INSERT_TAIL(&wq->wq_thnewlist, uth, uu_workq_entry);
        wq->wq_thidlecount++;
        return;
    }

    if (!is_creator) {
        _wq_thactive_dec(wq, uth->uu_workq_pri.qos_bucket);
        wq->wq_thscheduled_count[_wq_bucket(uth->uu_workq_pri.qos_bucket)]--;
        uth->uu_workq_flags |= UT_WORKQ_IDLE_CLEANUP;
    }

    uth->uu_save.uus_workq_park_data.idle_stamp = now;

    struct uthread *oldest = workq_oldest_killable_idle_thread(wq);
    uint16_t cur_idle = wq->wq_thidlecount;

    if (cur_idle >= wq_max_constrained_threads ||
        (wq->wq_thdying_count == 0 && oldest &&
        workq_should_kill_idle_thread(wq, oldest, now))) {
        /*
         * Immediately kill threads if we have too many of them.
         *
         * And swap "place" with the oldest one we'd have woken up.
         * This is a relatively desperate situation where we really
         * need to kill threads quickly and it's best to kill
         * the one that's currently on core rather than context switching.
         */
        if (oldest) {
            oldest->uu_save.uus_workq_park_data.idle_stamp = now;
            TAILQ_REMOVE(&wq->wq_thidlelist, oldest, uu_workq_entry);
            TAILQ_INSERT_HEAD(&wq->wq_thidlelist, oldest, uu_workq_entry);
        }

        WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_START,
            wq, cur_idle, 0, 0, 0);
        wq->wq_thdying_count++;
        uth->uu_workq_flags |= UT_WORKQ_DYING;
        uth->uu_workq_flags &= ~UT_WORKQ_IDLE_CLEANUP;
        workq_unpark_for_death_and_unlock(p, wq, uth, 0, setup_flags);
        __builtin_unreachable();
    }

    struct uthread *tail = TAILQ_LAST(&wq->wq_thidlelist, workq_uthread_head);

    cur_idle += 1;
    wq->wq_thidlecount = cur_idle;

    if (cur_idle >= wq_death_max_load && tail &&
        tail->uu_save.uus_workq_park_data.has_stack) {
        uth->uu_save.uus_workq_park_data.has_stack = false;
        TAILQ_INSERT_TAIL(&wq->wq_thidlelist, uth, uu_workq_entry);
    } else {
        uth->uu_save.uus_workq_park_data.has_stack = true;
        TAILQ_INSERT_HEAD(&wq->wq_thidlelist, uth, uu_workq_entry);
    }

    if (!tail) {
        uint64_t delay = workq_kill_delay_for_idle_thread(wq);
        workq_death_call_schedule(wq, now + delay);
    }
}
#pragma mark thread requests

static int
workq_priority_for_req(workq_threadreq_t req)
{
    thread_qos_t qos = req->tr_qos;

    if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
        workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(req);
        assert(trp.trp_flags & TRP_PRIORITY);
        return trp.trp_pri;
    }
    return thread_workq_pri_for_qos(qos);
}
static inline struct priority_queue_sched_max *
workq_priority_queue_for_req(struct workqueue *wq, workq_threadreq_t req)
{
    if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
        return &wq->wq_special_queue;
    } else if (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
        return &wq->wq_overcommit_queue;
    } else {
        return &wq->wq_constrained_queue;
    }
}
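/*
 * Thread requests live in one of three priority queues: requests with
 * custom workloop scheduling parameters (WL_OUTSIDE_QOS) in wq_special_queue,
 * overcommit requests in wq_overcommit_queue, and everything else in
 * wq_constrained_queue.  Only the constrained queue is subject to the
 * constrained admission checks, and it is the queue the "best constrained
 * request QoS" bits of wq_thactive are refreshed from.
 */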
/*
 * returns true if the enqueued request is the highest priority item
 * in its priority queue.
 */
static bool
workq_threadreq_enqueue(struct workqueue *wq, workq_threadreq_t req)
{
    assert(req->tr_state == WORKQ_TR_STATE_NEW);

    req->tr_state = WORKQ_TR_STATE_QUEUED;
    wq->wq_reqcount += req->tr_count;

    if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
        assert(wq->wq_event_manager_threadreq == NULL);
        assert(req->tr_flags & WORKQ_TR_FLAG_KEVENT);
        assert(req->tr_count == 1);
        wq->wq_event_manager_threadreq = req;
        return true;
    }

    struct priority_queue_sched_max *q = workq_priority_queue_for_req(wq, req);
    priority_queue_entry_set_sched_pri(q, &req->tr_entry,
        workq_priority_for_req(req), false);

    if (priority_queue_insert(q, &req->tr_entry)) {
        if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
            _wq_thactive_refresh_best_constrained_req_qos(wq);
        }
        return true;
    }
    return false;
}
/*
 * returns true if the dequeued request was the highest priority item
 * in its priority queue.
 */
static bool
workq_threadreq_dequeue(struct workqueue *wq, workq_threadreq_t req)
{
    wq->wq_reqcount--;

    if (--req->tr_count == 0) {
        if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
            assert(wq->wq_event_manager_threadreq == req);
            assert(req->tr_count == 0);
            wq->wq_event_manager_threadreq = NULL;
            return true;
        }

        if (priority_queue_remove(workq_priority_queue_for_req(wq, req),
            &req->tr_entry)) {
            if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
                _wq_thactive_refresh_best_constrained_req_qos(wq);
            }
            return true;
        }
    }
    return false;
}
static void
workq_threadreq_destroy(proc_t p, workq_threadreq_t req)
{
    req->tr_state = WORKQ_TR_STATE_CANCELED;
    if (req->tr_flags & (WORKQ_TR_FLAG_WORKLOOP | WORKQ_TR_FLAG_KEVENT)) {
        kqueue_threadreq_cancel(p, req);
    } else {
        zfree(workq_zone_threadreq, req);
    }
}
#pragma mark workqueue thread creation thread calls

static inline bool
workq_thread_call_prepost(struct workqueue *wq, uint32_t sched, uint32_t pend,
    uint32_t fail_mask)
{
    uint32_t old_flags, new_flags;

    os_atomic_rmw_loop(&wq->wq_flags, old_flags, new_flags, acquire, {
        if (__improbable(old_flags & (WQ_EXITING | sched | pend | fail_mask))) {
            os_atomic_rmw_loop_give_up(return false);
        }
        if (__improbable(old_flags & WQ_PROC_SUSPENDED)) {
            new_flags = old_flags | pend;
        } else {
            new_flags = old_flags | sched;
        }
    });

    return (old_flags & WQ_PROC_SUSPENDED) == 0;
}
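/*
 * In other words: when the process is suspended the call is only marked
 * pending (WQ_*_CALL_PENDED) and workq_proc_resumed() re-issues it later;
 * otherwise it is marked scheduled and the caller goes on to actually arm
 * the thread call.  The return value tells the caller which case applied.
 */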
#define WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART 0x1

static bool
workq_schedule_delayed_thread_creation(struct workqueue *wq, int flags)
{
    assert(!preemption_enabled());

    if (!workq_thread_call_prepost(wq, WQ_DELAYED_CALL_SCHEDULED,
        WQ_DELAYED_CALL_PENDED, WQ_IMMEDIATE_CALL_PENDED |
        WQ_IMMEDIATE_CALL_SCHEDULED)) {
        return false;
    }

    uint64_t now = mach_absolute_time();

    if (flags & WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART) {
        /* do not change the window */
    } else if (now - wq->wq_thread_call_last_run <= wq->wq_timer_interval) {
        wq->wq_timer_interval *= 2;
        if (wq->wq_timer_interval > wq_max_timer_interval.abstime) {
            wq->wq_timer_interval = (uint32_t)wq_max_timer_interval.abstime;
        }
    } else if (now - wq->wq_thread_call_last_run > 2 * wq->wq_timer_interval) {
        wq->wq_timer_interval /= 2;
        if (wq->wq_timer_interval < wq_stalled_window.abstime) {
            wq->wq_timer_interval = (uint32_t)wq_stalled_window.abstime;
        }
    }

    WQ_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
        _wq_flags(wq), wq->wq_timer_interval, 0);

    thread_call_t call = wq->wq_delayed_call;
    uintptr_t arg = WQ_DELAYED_CALL_SCHEDULED;
    uint64_t deadline = now + wq->wq_timer_interval;
    if (thread_call_enter1_delayed(call, (void *)arg, deadline)) {
        panic("delayed_call was already enqueued");
    }
    return true;
}
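/*
 * Example of the adaptive window above: if the thread call keeps firing
 * within the current interval, the interval doubles each time up to
 * wq_max_timer_interval; once more than two intervals pass between runs it
 * is halved again, but never below wq_stalled_window.
 */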
static void
workq_schedule_immediate_thread_creation(struct workqueue *wq)
{
    assert(!preemption_enabled());

    if (workq_thread_call_prepost(wq, WQ_IMMEDIATE_CALL_SCHEDULED,
        WQ_IMMEDIATE_CALL_PENDED, 0)) {
        WQ_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
            _wq_flags(wq), 0, 0);

        uintptr_t arg = WQ_IMMEDIATE_CALL_SCHEDULED;
        if (thread_call_enter1(wq->wq_immediate_call, (void *)arg)) {
            panic("immediate_call was already enqueued");
        }
    }
}
void
workq_proc_suspended(struct proc *p)
{
    struct workqueue *wq = proc_get_wqptr(p);

    if (wq) {
        os_atomic_or(&wq->wq_flags, WQ_PROC_SUSPENDED, relaxed);
    }
}

void
workq_proc_resumed(struct proc *p)
{
    struct workqueue *wq = proc_get_wqptr(p);
    uint32_t wq_flags;

    if (!wq) {
        return;
    }

    wq_flags = os_atomic_andnot_orig(&wq->wq_flags, WQ_PROC_SUSPENDED |
        WQ_DELAYED_CALL_PENDED | WQ_IMMEDIATE_CALL_PENDED, relaxed);
    if ((wq_flags & WQ_EXITING) == 0) {
        disable_preemption();
        if (wq_flags & WQ_IMMEDIATE_CALL_PENDED) {
            workq_schedule_immediate_thread_creation(wq);
        } else if (wq_flags & WQ_DELAYED_CALL_PENDED) {
            workq_schedule_delayed_thread_creation(wq,
                WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART);
        }
        enable_preemption();
    }
}
/*
 * returns whether lastblocked_tsp is within wq_stalled_window usecs of now
 */
static bool
workq_thread_is_busy(uint64_t now, _Atomic uint64_t *lastblocked_tsp)
{
    uint64_t lastblocked_ts = os_atomic_load_wide(lastblocked_tsp, relaxed);
    if (now <= lastblocked_ts) {
        /*
         * Because the update of the timestamp when a thread blocks
         * isn't serialized against us looking at it (i.e. we don't hold
         * the workq lock), it's possible to have a timestamp that matches
         * the current time or that even looks to be in the future relative
         * to when we grabbed the current time...
         *
         * Just treat this as a busy thread since it must have just blocked.
         */
        return true;
    }
    return (now - lastblocked_ts) < wq_stalled_window.abstime;
}
static void
workq_add_new_threads_call(void *_p, void *flags)
{
    proc_t p = _p;
    struct workqueue *wq = proc_get_wqptr(p);
    uint32_t my_flag = (uint32_t)(uintptr_t)flags;

    /*
     * workq_exit() will set the workqueue to NULL before
     * it cancels thread calls.
     */
    if (!wq) {
        return;
    }

    assert((my_flag == WQ_DELAYED_CALL_SCHEDULED) ||
        (my_flag == WQ_IMMEDIATE_CALL_SCHEDULED));

    WQ_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_START, wq, _wq_flags(wq),
        wq->wq_nthreads, wq->wq_thidlecount, 0);

    workq_lock_spin(wq);

    wq->wq_thread_call_last_run = mach_absolute_time();
    os_atomic_andnot(&wq->wq_flags, my_flag, release);

    /* This can drop the workqueue lock, and take it again */
    workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);

    workq_unlock(wq);

    WQ_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_END, wq, 0,
        wq->wq_nthreads, wq->wq_thidlecount, 0);
}
#pragma mark thread state tracking

static void
workq_sched_callback(int type, thread_t thread)
{
    struct uthread *uth = get_bsdthread_info(thread);
    proc_t proc = get_bsdtask_info(get_threadtask(thread));
    struct workqueue *wq = proc_get_wqptr(proc);
    thread_qos_t req_qos, qos = uth->uu_workq_pri.qos_bucket;
    wq_thactive_t old_thactive;
    bool start_timer = false;

    if (qos == WORKQ_THREAD_QOS_MANAGER) {
        return;
    }

    switch (type) {
    case SCHED_CALL_BLOCK:
        old_thactive = _wq_thactive_dec(wq, qos);
        req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);

        /*
         * Remember the timestamp of the last thread that blocked in this
         * bucket, it is used by admission checks to ignore one thread
         * being inactive if this timestamp is recent enough.
         *
         * If we collide with another thread trying to update the
         * last_blocked (really unlikely since another thread would have to
         * get scheduled and then block after we start down this path), it's
         * not a problem. Either timestamp is adequate, so no need to retry
         */
        os_atomic_store_wide(&wq->wq_lastblocked_ts[_wq_bucket(qos)],
            thread_last_run_time(thread), relaxed);

        if (req_qos == THREAD_QOS_UNSPECIFIED) {
            /*
             * No pending request at the moment we could unblock, move on.
             */
        } else if (qos < req_qos) {
            /*
             * The blocking thread is at a lower QoS than the highest currently
             * pending constrained request, nothing has to be redriven
             */
        } else {
            uint32_t max_busycount, old_req_count;
            old_req_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
                req_qos, NULL, &max_busycount);
            /*
             * If it is possible that may_start_constrained_thread had refused
             * admission due to being over the max concurrency, we may need to
             * spin up a new thread.
             *
             * We take into account the maximum number of busy threads
             * that can affect may_start_constrained_thread as looking at the
             * actual number may_start_constrained_thread will see is racy.
             *
             * IOW at NCPU = 4, for IN (req_qos = 1), if the old req count is
             * between NCPU (4) and NCPU - 2 (2) we need to redrive.
             */
            uint32_t conc = wq_max_parallelism[_wq_bucket(qos)];
            if (old_req_count <= conc && conc <= old_req_count + max_busycount) {
                start_timer = workq_schedule_delayed_thread_creation(wq, 0);
            }
        }
        if (__improbable(kdebug_enable)) {
            __unused uint32_t old = _wq_thactive_aggregate_downto_qos(wq,
                old_thactive, qos, NULL, NULL);
            WQ_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_START, wq,
                old - 1, qos | (req_qos << 8),
                wq->wq_reqcount << 1 | start_timer, 0);
        }
        break;

    case SCHED_CALL_UNBLOCK:
        /*
         * we cannot take the workqueue_lock here...
         * an UNBLOCK can occur from a timer event which
         * is run from an interrupt context... if the workqueue_lock
         * is already held by this processor, we'll deadlock...
         * the thread lock for the thread being UNBLOCKED
         * is also held
         */
        old_thactive = _wq_thactive_inc(wq, qos);
        if (__improbable(kdebug_enable)) {
            __unused uint32_t old = _wq_thactive_aggregate_downto_qos(wq,
                old_thactive, qos, NULL, NULL);
            req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
            WQ_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_END, wq,
                old + 1, qos | (req_qos << 8),
                wq->wq_threads_scheduled, 0);
        }
        break;
    }
}
#pragma mark workq lifecycle

void
workq_reference(struct workqueue *wq)
{
    os_ref_retain(&wq->wq_refcnt);
}

static void
workq_deallocate_queue_invoke(mpsc_queue_chain_t e,
    __assert_only mpsc_daemon_queue_t dq)
{
    struct workqueue *wq;
    struct turnstile *ts;

    wq = mpsc_queue_element(e, struct workqueue, wq_destroy_link);
    assert(dq == &workq_deallocate_queue);

    turnstile_complete((uintptr_t)wq, &wq->wq_turnstile, &ts, TURNSTILE_WORKQS);
    assert(ts);
    turnstile_cleanup();
    turnstile_deallocate(ts);

    lck_spin_destroy(&wq->wq_lock, &workq_lck_grp);
    zfree(workq_zone_workqueue, wq);
}

void
workq_deallocate(struct workqueue *wq)
{
    if (os_ref_release_relaxed(&wq->wq_refcnt) == 0) {
        workq_deallocate_queue_invoke(&wq->wq_destroy_link,
            &workq_deallocate_queue);
    }
}

void
workq_deallocate_safe(struct workqueue *wq)
{
    if (__improbable(os_ref_release_relaxed(&wq->wq_refcnt) == 0)) {
        mpsc_daemon_enqueue(&workq_deallocate_queue, &wq->wq_destroy_link,
            MPSC_QUEUE_DISABLE_PREEMPTION);
    }
}
/**
 * Setup per-process state for the workqueue.
 */
int
workq_open(struct proc *p, __unused struct workq_open_args *uap,
    __unused int32_t *retval)
{
    struct workqueue *wq;
    int error = 0;

    if ((p->p_lflag & P_LREGISTER) == 0) {
        return EINVAL;
    }

    if (wq_init_constrained_limit) {
        uint32_t limit, num_cpus = ml_wait_max_cpus();

        /*
         * set up the limit for the constrained pool
         * this is a virtual pool in that we don't
         * maintain it on a separate idle and run list
         */
        limit = num_cpus * WORKQUEUE_CONSTRAINED_FACTOR;

        if (limit > wq_max_constrained_threads) {
            wq_max_constrained_threads = limit;
        }

        if (wq_max_threads > WQ_THACTIVE_BUCKET_HALF) {
            wq_max_threads = WQ_THACTIVE_BUCKET_HALF;
        }
        if (wq_max_threads > CONFIG_THREAD_MAX - 20) {
            wq_max_threads = CONFIG_THREAD_MAX - 20;
        }

        wq_death_max_load = (uint16_t)fls(num_cpus) + 1;

        for (thread_qos_t qos = WORKQ_THREAD_QOS_MIN; qos <= WORKQ_THREAD_QOS_MAX; qos++) {
            wq_max_parallelism[_wq_bucket(qos)] =
                qos_max_parallelism(qos, QOS_PARALLELISM_COUNT_LOGICAL);
        }

        wq_init_constrained_limit = 0;
    }

    if (proc_get_wqptr(p) == NULL) {
        if (proc_init_wqptr_or_wait(p) == FALSE) {
            assert(proc_get_wqptr(p) != NULL);
            goto out;
        }

        wq = (struct workqueue *)zalloc(workq_zone_workqueue);
        bzero(wq, sizeof(struct workqueue));

        os_ref_init_count(&wq->wq_refcnt, &workq_refgrp, 1);

        // Start the event manager at the priority hinted at by the policy engine
        thread_qos_t mgr_priority_hint = task_get_default_manager_qos(current_task());
        pthread_priority_t pp = _pthread_priority_make_from_thread_qos(mgr_priority_hint, 0, 0);
        wq->wq_event_manager_priority = (uint32_t)pp;
        wq->wq_timer_interval = (uint32_t)wq_stalled_window.abstime;

        turnstile_prepare((uintptr_t)wq, &wq->wq_turnstile, turnstile_alloc(),
            TURNSTILE_WORKQS);

        TAILQ_INIT(&wq->wq_thrunlist);
        TAILQ_INIT(&wq->wq_thnewlist);
        TAILQ_INIT(&wq->wq_thidlelist);
        priority_queue_init(&wq->wq_overcommit_queue);
        priority_queue_init(&wq->wq_constrained_queue);
        priority_queue_init(&wq->wq_special_queue);

        /* We are only using the delayed thread call for the constrained pool
         * which can't have work at >= UI QoS and so we can be fine with a
         * UI QoS thread call.
         */
        wq->wq_delayed_call = thread_call_allocate_with_qos(
            workq_add_new_threads_call, p, THREAD_QOS_USER_INTERACTIVE,
            THREAD_CALL_OPTIONS_ONCE);
        wq->wq_immediate_call = thread_call_allocate_with_options(
            workq_add_new_threads_call, p, THREAD_CALL_PRIORITY_KERNEL,
            THREAD_CALL_OPTIONS_ONCE);
        wq->wq_death_call = thread_call_allocate_with_options(
            workq_kill_old_threads_call, wq,
            THREAD_CALL_PRIORITY_USER, THREAD_CALL_OPTIONS_ONCE);

        lck_spin_init(&wq->wq_lock, &workq_lck_grp, LCK_ATTR_NULL);

        WQ_TRACE_WQ(TRACE_wq_create | DBG_FUNC_NONE, wq,
            VM_KERNEL_ADDRHIDE(wq), 0, 0, 0);
        proc_set_wqptr(p, wq);
    }

out:
    return error;
}
/*
 * Routine:	workq_mark_exiting
 *
 * Function:	Mark the work queue such that new threads will not be added to the
 *		work queue after we return.
 *
 * Conditions:	Called against the current process.
 */
void
workq_mark_exiting(struct proc *p)
{
    struct workqueue *wq = proc_get_wqptr(p);
    uint32_t wq_flags;
    workq_threadreq_t mgr_req;

    if (!wq) {
        return;
    }

    WQ_TRACE_WQ(TRACE_wq_pthread_exit | DBG_FUNC_START, wq, 0, 0, 0, 0);

    workq_lock_spin(wq);

    wq_flags = os_atomic_or_orig(&wq->wq_flags, WQ_EXITING, relaxed);
    if (__improbable(wq_flags & WQ_EXITING)) {
        panic("workq_mark_exiting called twice");
    }

    /*
     * Opportunistically try to cancel thread calls that are likely in flight.
     * workq_exit() will do the proper cleanup.
     */
    if (wq_flags & WQ_IMMEDIATE_CALL_SCHEDULED) {
        thread_call_cancel(wq->wq_immediate_call);
    }
    if (wq_flags & WQ_DELAYED_CALL_SCHEDULED) {
        thread_call_cancel(wq->wq_delayed_call);
    }
    if (wq_flags & WQ_DEATH_CALL_SCHEDULED) {
        thread_call_cancel(wq->wq_death_call);
    }

    mgr_req = wq->wq_event_manager_threadreq;
    wq->wq_event_manager_threadreq = NULL;
    wq->wq_reqcount = 0; /* workq_schedule_creator must not look at queues */
    wq->wq_creator = NULL;
    workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);

    workq_unlock(wq);

    if (mgr_req) {
        kqueue_threadreq_cancel(p, mgr_req);
    }
    /*
     * No one touches the priority queues once WQ_EXITING is set.
     * It is hence safe to do the tear down without holding any lock.
     */
    priority_queue_destroy(&wq->wq_overcommit_queue,
        struct workq_threadreq_s, tr_entry, ^(workq_threadreq_t e){
        workq_threadreq_destroy(p, e);
    });
    priority_queue_destroy(&wq->wq_constrained_queue,
        struct workq_threadreq_s, tr_entry, ^(workq_threadreq_t e){
        workq_threadreq_destroy(p, e);
    });
    priority_queue_destroy(&wq->wq_special_queue,
        struct workq_threadreq_s, tr_entry, ^(workq_threadreq_t e){
        workq_threadreq_destroy(p, e);
    });

    WQ_TRACE(TRACE_wq_pthread_exit | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
/*
 * Routine:	workq_exit
 *
 * Function:	clean up the work queue structure(s) now that there are no threads
 *		left running inside the work queue (except possibly current_thread).
 *
 * Conditions:	Called by the last thread in the process.
 *		Called against current process.
 */
void
workq_exit(struct proc *p)
{
    struct workqueue *wq;
    struct uthread *uth, *tmp;

    wq = os_atomic_xchg(&p->p_wqptr, NULL, relaxed);
    if (wq != NULL) {
        thread_t th = current_thread();

        WQ_TRACE_WQ(TRACE_wq_workqueue_exit | DBG_FUNC_START, wq, 0, 0, 0, 0);

        if (thread_get_tag(th) & THREAD_TAG_WORKQUEUE) {
            /*
             * <rdar://problem/40111515> Make sure we will no longer call the
             * sched call, if we ever block this thread, which the cancel_wait
             * below can do.
             */
            thread_sched_call(th, NULL);
        }

        /*
         * Thread calls are always scheduled by the proc itself or under the
         * workqueue spinlock if WQ_EXITING is not yet set.
         *
         * Either way, when this runs, the proc has no threads left beside
         * the one running this very code, so we know no thread call can be
         * dispatched anymore.
         */
        thread_call_cancel_wait(wq->wq_delayed_call);
        thread_call_cancel_wait(wq->wq_immediate_call);
        thread_call_cancel_wait(wq->wq_death_call);
        thread_call_free(wq->wq_delayed_call);
        thread_call_free(wq->wq_immediate_call);
        thread_call_free(wq->wq_death_call);

        /*
         * Clean up workqueue data structures for threads that exited and
         * didn't get a chance to clean up after themselves.
         *
         * idle/new threads should have been interrupted and died on their own
         */
        TAILQ_FOREACH_SAFE(uth, &wq->wq_thrunlist, uu_workq_entry, tmp) {
            thread_sched_call(uth->uu_thread, NULL);
            thread_deallocate(uth->uu_thread);
        }
        assert(TAILQ_EMPTY(&wq->wq_thnewlist));
        assert(TAILQ_EMPTY(&wq->wq_thidlelist));

        WQ_TRACE_WQ(TRACE_wq_destroy | DBG_FUNC_END, wq,
            VM_KERNEL_ADDRHIDE(wq), 0, 0, 0);

        workq_deallocate(wq);

        WQ_TRACE(TRACE_wq_workqueue_exit | DBG_FUNC_END, 0, 0, 0, 0, 0);
    }
}
#pragma mark bsd thread control

static bool
_pthread_priority_to_policy(pthread_priority_t priority,
    thread_qos_policy_data_t *data)
{
    data->qos_tier = _pthread_priority_thread_qos(priority);
    data->tier_importance = _pthread_priority_relpri(priority);
    if (data->qos_tier == THREAD_QOS_UNSPECIFIED || data->tier_importance > 0 ||
        data->tier_importance < THREAD_QOS_MIN_TIER_IMPORTANCE) {
        return false;
    }
    return true;
}
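/*
 * Rough sketch of the accepted encoding: the QoS tier must be a valid
 * thread QoS and the relative priority must lie in
 * [THREAD_QOS_MIN_TIER_IMPORTANCE, 0].  For example, a pthread_priority_t
 * carrying (utility QoS, relpri -4) converts successfully, while any
 * positive relative priority is rejected.
 */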
static int
bsdthread_set_self(proc_t p, thread_t th, pthread_priority_t priority,
    mach_port_name_t voucher, enum workq_set_self_flags flags)
{
    struct uthread *uth = get_bsdthread_info(th);
    struct workqueue *wq = proc_get_wqptr(p);

    kern_return_t kr;
    int unbind_rv = 0, qos_rv = 0, voucher_rv = 0, fixedpri_rv = 0;
    bool is_wq_thread = (thread_get_tag(th) & THREAD_TAG_WORKQUEUE);

    if (flags & WORKQ_SET_SELF_WQ_KEVENT_UNBIND) {
        if (!is_wq_thread) {
            unbind_rv = EINVAL;
            goto qos;
        }

        if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
            unbind_rv = EINVAL;
            goto qos;
        }

        workq_threadreq_t kqr = uth->uu_kqr_bound;
        if (kqr == NULL) {
            unbind_rv = EALREADY;
            goto qos;
        }

        if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
            unbind_rv = EINVAL;
            goto qos;
        }

        kqueue_threadreq_unbind(p, kqr);
    }

qos:
    if (flags & WORKQ_SET_SELF_QOS_FLAG) {
        thread_qos_policy_data_t new_policy;

        if (!_pthread_priority_to_policy(priority, &new_policy)) {
            qos_rv = EINVAL;
            goto voucher;
        }

        if (!is_wq_thread) {
            /*
             * Threads opted out of QoS can't change QoS
             */
            if (!thread_has_qos_policy(th)) {
                qos_rv = EPERM;
                goto voucher;
            }
        } else if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER ||
            uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_ABOVEUI) {
            /*
             * Workqueue manager threads or threads above UI can't change QoS
             */
            qos_rv = EINVAL;
            goto voucher;
        } else {
            /*
             * For workqueue threads, possibly adjust buckets and redrive thread
             * requests.
             */
            bool old_overcommit = uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT;
            bool new_overcommit = priority & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
            struct uu_workq_policy old_pri, new_pri;
            bool force_run = false;

            workq_lock_spin(wq);

            if (old_overcommit != new_overcommit) {
                uth->uu_workq_flags ^= UT_WORKQ_OVERCOMMIT;
                if (old_overcommit) {
                    wq->wq_constrained_threads_scheduled++;
                } else if (wq->wq_constrained_threads_scheduled-- ==
                    wq_max_constrained_threads) {
                    force_run = true;
                }
            }

            old_pri = new_pri = uth->uu_workq_pri;
            new_pri.qos_req = (thread_qos_t)new_policy.qos_tier;
            workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, force_run);
            workq_unlock(wq);
        }

        kr = thread_policy_set_internal(th, THREAD_QOS_POLICY,
            (thread_policy_t)&new_policy, THREAD_QOS_POLICY_COUNT);
        if (kr != KERN_SUCCESS) {
            qos_rv = EINVAL;
        }
    }

voucher:
    if (flags & WORKQ_SET_SELF_VOUCHER_FLAG) {
        kr = thread_set_voucher_name(voucher);
        if (kr != KERN_SUCCESS) {
            voucher_rv = ENOENT;
            goto fixedpri;
        }
    }

fixedpri:
    if (qos_rv) {
        goto done;
    }
    if (flags & WORKQ_SET_SELF_FIXEDPRIORITY_FLAG) {
        thread_extended_policy_data_t extpol = {.timeshare = 0};

        if (is_wq_thread) {
            /* Not allowed on workqueue threads */
            fixedpri_rv = ENOTSUP;
            goto done;
        }

        kr = thread_policy_set_internal(th, THREAD_EXTENDED_POLICY,
            (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
        if (kr != KERN_SUCCESS) {
            fixedpri_rv = EINVAL;
            goto done;
        }
    } else if (flags & WORKQ_SET_SELF_TIMESHARE_FLAG) {
        thread_extended_policy_data_t extpol = {.timeshare = 1};

        if (is_wq_thread) {
            /* Not allowed on workqueue threads */
            fixedpri_rv = ENOTSUP;
            goto done;
        }

        kr = thread_policy_set_internal(th, THREAD_EXTENDED_POLICY,
            (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
        if (kr != KERN_SUCCESS) {
            fixedpri_rv = EINVAL;
            goto done;
        }
    }

done:
    if (qos_rv && voucher_rv) {
        /* Both failed, give that a unique error. */
        return EBADMSG;
    }

    if (unbind_rv) {
        return unbind_rv;
    }

    if (qos_rv) {
        return qos_rv;
    }

    if (voucher_rv) {
        return voucher_rv;
    }

    if (fixedpri_rv) {
        return fixedpri_rv;
    }

    return 0;
}
static int
bsdthread_add_explicit_override(proc_t p, mach_port_name_t kport,
    pthread_priority_t pp, user_addr_t resource)
{
    thread_qos_t qos = _pthread_priority_thread_qos(pp);
    if (qos == THREAD_QOS_UNSPECIFIED) {
        return EINVAL;
    }

    thread_t th = port_name_to_thread(kport,
        PORT_TO_THREAD_IN_CURRENT_TASK);
    if (th == THREAD_NULL) {
        return ESRCH;
    }

    int rv = proc_thread_qos_add_override(p->task, th, 0, qos, TRUE,
        resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);

    thread_deallocate(th);
    return rv;
}
static int
bsdthread_remove_explicit_override(proc_t p, mach_port_name_t kport,
    user_addr_t resource)
{
    thread_t th = port_name_to_thread(kport,
        PORT_TO_THREAD_IN_CURRENT_TASK);
    if (th == THREAD_NULL) {
        return ESRCH;
    }

    int rv = proc_thread_qos_remove_override(p->task, th, 0, resource,
        THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);

    thread_deallocate(th);
    return rv;
}
static int
workq_thread_add_dispatch_override(proc_t p, mach_port_name_t kport,
    pthread_priority_t pp, user_addr_t ulock_addr)
{
    struct uu_workq_policy old_pri, new_pri;
    struct workqueue *wq = proc_get_wqptr(p);

    thread_qos_t qos_override = _pthread_priority_thread_qos(pp);
    if (qos_override == THREAD_QOS_UNSPECIFIED) {
        return EINVAL;
    }

    thread_t thread = port_name_to_thread(kport,
        PORT_TO_THREAD_IN_CURRENT_TASK);
    if (thread == THREAD_NULL) {
        return ESRCH;
    }

    struct uthread *uth = get_bsdthread_info(thread);
    if ((thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) == 0) {
        thread_deallocate(thread);
        return EPERM;
    }

    WQ_TRACE_WQ(TRACE_wq_override_dispatch | DBG_FUNC_NONE,
        wq, thread_tid(thread), 1, pp, 0);

    thread_mtx_lock(thread);

    if (ulock_addr) {
        uint32_t val;
        int rc;
        /*
         * Workaround lack of explicit support for 'no-fault copyin'
         * <rdar://problem/24999882>, as disabling preemption prevents paging in
         * userspace.
         */
        disable_preemption();
        rc = copyin_atomic32(ulock_addr, &val);
        enable_preemption();
        if (rc == 0 && ulock_owner_value_to_port_name(val) != kport) {
            goto out;
        }
    }

    workq_lock_spin(wq);

    old_pri = uth->uu_workq_pri;
    if (old_pri.qos_override >= qos_override) {
        /* Nothing to do */
    } else if (thread == current_thread()) {
        new_pri = old_pri;
        new_pri.qos_override = qos_override;
        workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false);
    } else {
        uth->uu_workq_pri.qos_override = qos_override;
        if (qos_override > workq_pri_override(old_pri)) {
            thread_set_workq_override(thread, qos_override);
        }
    }

    workq_unlock(wq);

out:
    thread_mtx_unlock(thread);
    thread_deallocate(thread);
    return 0;
}
static int
workq_thread_reset_dispatch_override(proc_t p, thread_t thread)
{
    struct uu_workq_policy old_pri, new_pri;
    struct workqueue *wq = proc_get_wqptr(p);
    struct uthread *uth = get_bsdthread_info(thread);

    if ((thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) == 0) {
        return EPERM;
    }

    WQ_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_NONE, wq, 0, 0, 0, 0);

    workq_lock_spin(wq);
    old_pri = new_pri = uth->uu_workq_pri;
    new_pri.qos_override = THREAD_QOS_UNSPECIFIED;
    workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false);
    workq_unlock(wq);
    return 0;
}
static int
workq_thread_allow_kill(__unused proc_t p, thread_t thread, bool enable)
{
    if (!(thread_get_tag(thread) & THREAD_TAG_WORKQUEUE)) {
        // If the thread isn't a workqueue thread, don't set the
        // kill_allowed bit; however, we still need to return 0
        // instead of an error code since this code is executed
        // on the abort path which needs to not depend on the
        // pthread_t (returning an error depends on the pthread_t).
        return 0;
    }
    struct uthread *uth = get_bsdthread_info(thread);
    uth->uu_workq_pthread_kill_allowed = enable;
    return 0;
}
static int
bsdthread_get_max_parallelism(thread_qos_t qos, unsigned long flags,
    int *retval)
{
    static_assert(QOS_PARALLELISM_COUNT_LOGICAL ==
        _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL, "logical");
    static_assert(QOS_PARALLELISM_REALTIME ==
        _PTHREAD_QOS_PARALLELISM_REALTIME, "realtime");

    if (flags & ~(QOS_PARALLELISM_REALTIME | QOS_PARALLELISM_COUNT_LOGICAL)) {
        return EINVAL;
    }

    if (flags & QOS_PARALLELISM_REALTIME) {
        if (qos) {
            return EINVAL;
        }
    } else if (qos == THREAD_QOS_UNSPECIFIED || qos >= THREAD_QOS_LAST) {
        return EINVAL;
    }

    *retval = qos_max_parallelism(qos, flags);
    return 0;
}
#define ENSURE_UNUSED(arg) \
    ({ if ((arg) != 0) { return EINVAL; } })

int
bsdthread_ctl(struct proc *p, struct bsdthread_ctl_args *uap, int *retval)
{
    switch (uap->cmd) {
    case BSDTHREAD_CTL_QOS_OVERRIDE_START:
        return bsdthread_add_explicit_override(p, (mach_port_name_t)uap->arg1,
                   (pthread_priority_t)uap->arg2, uap->arg3);
    case BSDTHREAD_CTL_QOS_OVERRIDE_END:
        ENSURE_UNUSED(uap->arg3);
        return bsdthread_remove_explicit_override(p, (mach_port_name_t)uap->arg1,
                   (user_addr_t)uap->arg2);

    case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH:
        return workq_thread_add_dispatch_override(p, (mach_port_name_t)uap->arg1,
                   (pthread_priority_t)uap->arg2, uap->arg3);
    case BSDTHREAD_CTL_QOS_OVERRIDE_RESET:
        return workq_thread_reset_dispatch_override(p, current_thread());

    case BSDTHREAD_CTL_SET_SELF:
        return bsdthread_set_self(p, current_thread(),
                   (pthread_priority_t)uap->arg1, (mach_port_name_t)uap->arg2,
                   (enum workq_set_self_flags)uap->arg3);

    case BSDTHREAD_CTL_QOS_MAX_PARALLELISM:
        ENSURE_UNUSED(uap->arg3);
        return bsdthread_get_max_parallelism((thread_qos_t)uap->arg1,
                   (unsigned long)uap->arg2, retval);
    case BSDTHREAD_CTL_WORKQ_ALLOW_KILL:
        ENSURE_UNUSED(uap->arg2);
        ENSURE_UNUSED(uap->arg3);
        return workq_thread_allow_kill(p, current_thread(), (bool)uap->arg1);

    case BSDTHREAD_CTL_SET_QOS:
    case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD:
    case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET:
        /* no longer supported */
        return ENOTSUP;

    default:
        return EINVAL;
    }
}
2106 #pragma mark workqueue thread manipulation
static void __dead2
workq_unpark_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
    struct uthread *uth, uint32_t setup_flags);

static void __dead2
workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
    struct uthread *uth, uint32_t setup_flags);

static void workq_setup_and_run(proc_t p, struct uthread *uth, int flags) __dead2;

#if KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD
static inline uint64_t
workq_trace_req_id(workq_threadreq_t req)
{
	struct kqworkloop *kqwl;
	if (req->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
		kqwl = __container_of(req, struct kqworkloop, kqwl_request);
		return kqwl->kqwl_dynamicid;
	}

	return VM_KERNEL_ADDRHIDE(req);
}
#endif
/*
 * Entry point for libdispatch to ask for threads
 */
static int
workq_reqthreads(struct proc *p, uint32_t reqcount, pthread_priority_t pp)
{
	thread_qos_t qos = _pthread_priority_thread_qos(pp);
	struct workqueue *wq = proc_get_wqptr(p);
	uint32_t unpaced, upcall_flags = WQ_FLAG_THREAD_NEWSPI;

	if (wq == NULL || reqcount <= 0 || reqcount > UINT16_MAX ||
	    qos == THREAD_QOS_UNSPECIFIED) {
		return EINVAL;
	}

	WQ_TRACE_WQ(TRACE_wq_wqops_reqthreads | DBG_FUNC_NONE,
	    wq, reqcount, pp, 0, 0);

	workq_threadreq_t req = zalloc(workq_zone_threadreq);
	priority_queue_entry_init(&req->tr_entry);
	req->tr_state = WORKQ_TR_STATE_NEW;
	req->tr_flags = 0;
	req->tr_qos = qos;

	if (pp & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) {
		req->tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
		upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
	}

	WQ_TRACE_WQ(TRACE_wq_thread_request_initiate | DBG_FUNC_NONE,
	    wq, workq_trace_req_id(req), req->tr_qos, reqcount, 0);

	workq_lock_spin(wq);
	do {
		if (_wq_exiting(wq)) {
			goto exiting;
		}

		/*
		 * When userspace is asking for parallelism, wakeup up to (reqcount - 1)
		 * threads without pacing, to inform the scheduler of that workload.
		 *
		 * The last requests, or the ones that failed the admission checks are
		 * enqueued and go through the regular creator codepath.
		 *
		 * If there aren't enough threads, add one, but re-evaluate everything
		 * as conditions may now have changed.
		 */
		if (reqcount > 1 && (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
			unpaced = workq_constrained_allowance(wq, qos, NULL, false);
			if (unpaced >= reqcount - 1) {
				unpaced = reqcount - 1;
			}
		} else {
			unpaced = reqcount - 1;
		}

		/*
		 * This path does not currently handle custom workloop parameters
		 * when creating threads for parallelism.
		 */
		assert(!(req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS));

		/*
		 * This is a trimmed down version of workq_threadreq_bind_and_unlock()
		 */
		while (unpaced > 0 && wq->wq_thidlecount) {
			struct uthread *uth;
			bool needs_wakeup;
			uint8_t uu_flags = UT_WORKQ_EARLY_BOUND;

			if (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
				uu_flags |= UT_WORKQ_OVERCOMMIT;
			}

			uth = workq_pop_idle_thread(wq, uu_flags, &needs_wakeup);

			_wq_thactive_inc(wq, qos);
			wq->wq_thscheduled_count[_wq_bucket(qos)]++;
			workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
			wq->wq_fulfilled++;

			uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags;
			uth->uu_save.uus_workq_park_data.thread_request = req;
			if (needs_wakeup) {
				workq_thread_wakeup(uth);
			}
			unpaced--;
			reqcount--;
		}
	} while (unpaced && wq->wq_nthreads < wq_max_threads &&
	    workq_add_new_idle_thread(p, wq));

	if (_wq_exiting(wq)) {
		goto exiting;
	}

	req->tr_count = (uint16_t)reqcount;
	if (workq_threadreq_enqueue(wq, req)) {
		/* This can drop the workqueue lock, and take it again */
		workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
	}
	workq_unlock(wq);
	return 0;

exiting:
	workq_unlock(wq);
	zfree(workq_zone_threadreq, req);
	return ECANCELED;
}
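/*
 * Illustrative sketch, not part of the original source: the path above is
 * driven from userspace by libdispatch through libpthread's __workq_kernreturn()
 * wrapper (names used here as an assumption about the userspace side), roughly:
 *
 *	// hypothetical: ask for `n` constrained threads at a QoS-encoded priority
 *	__workq_kernreturn(WQOPS_QUEUE_REQTHREADS, NULL, n, (int)encoded_priority);
 *
 * Each such call funnels into workq_reqthreads() above, which wakes up to
 * (n - 1) idle threads without pacing and enqueues the remainder as a thread
 * request serviced by the creator path.
 */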
bool
workq_kern_threadreq_initiate(struct proc *p, workq_threadreq_t req,
    struct turnstile *workloop_ts, thread_qos_t qos,
    workq_kern_threadreq_flags_t flags)
{
	struct workqueue *wq = proc_get_wqptr_fast(p);
	struct uthread *uth = NULL;

	assert(req->tr_flags & (WORKQ_TR_FLAG_WORKLOOP | WORKQ_TR_FLAG_KEVENT));

	if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
		workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(req);
		qos = thread_workq_qos_for_pri(trp.trp_pri);
		if (qos == THREAD_QOS_UNSPECIFIED) {
			qos = WORKQ_THREAD_QOS_ABOVEUI;
		}
	}

	assert(req->tr_state == WORKQ_TR_STATE_IDLE);
	priority_queue_entry_init(&req->tr_entry);
	req->tr_count = 1;
	req->tr_state = WORKQ_TR_STATE_NEW;
	req->tr_qos = qos;

	WQ_TRACE_WQ(TRACE_wq_thread_request_initiate | DBG_FUNC_NONE, wq,
	    workq_trace_req_id(req), qos, 1, 0);

	if (flags & WORKQ_THREADREQ_ATTEMPT_REBIND) {
		/*
		 * we're called back synchronously from the context of
		 * kqueue_threadreq_unbind from within workq_thread_return()
		 * we can try to match up this thread with this request !
		 */
		uth = current_uthread();
		assert(uth->uu_kqr_bound == NULL);
	}

	workq_lock_spin(wq);
	if (_wq_exiting(wq)) {
		req->tr_state = WORKQ_TR_STATE_IDLE;
		workq_unlock(wq);
		return false;
	}

	if (uth && workq_threadreq_admissible(wq, uth, req)) {
		assert(uth != wq->wq_creator);
		if (uth->uu_workq_pri.qos_bucket != req->tr_qos) {
			_wq_thactive_move(wq, uth->uu_workq_pri.qos_bucket, req->tr_qos);
			workq_thread_reset_pri(wq, uth, req, /*unpark*/ false);
		}
		/*
		 * We're called from workq_kern_threadreq_initiate()
		 * due to an unbind, with the kq req held.
		 */
		WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
		    workq_trace_req_id(req), 0, 0, 0);
		wq->wq_fulfilled++;
		kqueue_threadreq_bind(p, req, uth->uu_thread, 0);
	} else {
		if (workloop_ts) {
			workq_perform_turnstile_operation_locked(wq, ^{
				turnstile_update_inheritor(workloop_ts, wq->wq_turnstile,
				TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
				turnstile_update_inheritor_complete(workloop_ts,
				TURNSTILE_INTERLOCK_HELD);
			});
		}
		if (workq_threadreq_enqueue(wq, req)) {
			workq_schedule_creator(p, wq, flags);
		}
	}

	workq_unlock(wq);
	return true;
}
void
workq_kern_threadreq_modify(struct proc *p, workq_threadreq_t req,
    thread_qos_t qos, workq_kern_threadreq_flags_t flags)
{
	struct workqueue *wq = proc_get_wqptr_fast(p);
	bool make_overcommit = false;

	if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
		/* Requests outside-of-QoS shouldn't accept modify operations */
		return;
	}

	workq_lock_spin(wq);

	assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER);
	assert(req->tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP));

	if (req->tr_state == WORKQ_TR_STATE_BINDING) {
		kqueue_threadreq_bind(p, req, req->tr_thread, 0);
		workq_unlock(wq);
		return;
	}

	if (flags & WORKQ_THREADREQ_MAKE_OVERCOMMIT) {
		make_overcommit = (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0;
	}

	if (_wq_exiting(wq) || (req->tr_qos == qos && !make_overcommit)) {
		workq_unlock(wq);
		return;
	}

	assert(req->tr_count == 1);
	if (req->tr_state != WORKQ_TR_STATE_QUEUED) {
		panic("Invalid thread request (%p) state %d", req, req->tr_state);
	}

	WQ_TRACE_WQ(TRACE_wq_thread_request_modify | DBG_FUNC_NONE, wq,
	    workq_trace_req_id(req), qos, 0, 0);

	struct priority_queue_sched_max *pq = workq_priority_queue_for_req(wq, req);
	workq_threadreq_t req_max;

	/*
	 * Stage 1: Dequeue the request from its priority queue.
	 *
	 * If we dequeue the root item of the constrained priority queue,
	 * maintain the best constrained request qos invariant.
	 */
	if (priority_queue_remove(pq, &req->tr_entry)) {
		if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
			_wq_thactive_refresh_best_constrained_req_qos(wq);
		}
	}

	/*
	 * Stage 2: Apply changes to the thread request.
	 *
	 * If the item will not become the root of the priority queue it belongs to,
	 * then we need to wait in line, just enqueue and return quickly.
	 */
	if (__improbable(make_overcommit)) {
		req->tr_flags ^= WORKQ_TR_FLAG_OVERCOMMIT;
		pq = workq_priority_queue_for_req(wq, req);
	}
	req->tr_qos = qos;

	req_max = priority_queue_max(pq, struct workq_threadreq_s, tr_entry);
	if (req_max && req_max->tr_qos >= qos) {
		priority_queue_entry_set_sched_pri(pq, &req->tr_entry,
		    workq_priority_for_req(req), false);
		priority_queue_insert(pq, &req->tr_entry);
		workq_unlock(wq);
		return;
	}

	/*
	 * Stage 3: Reevaluate whether we should run the thread request.
	 *
	 * Pretend the thread request is new again:
	 * - adjust wq_reqcount to not count it anymore.
	 * - make its state WORKQ_TR_STATE_NEW (so that workq_threadreq_bind_and_unlock
	 *   properly attempts a synchronous bind)
	 */
	wq->wq_reqcount--;
	req->tr_state = WORKQ_TR_STATE_NEW;
	if (workq_threadreq_enqueue(wq, req)) {
		workq_schedule_creator(p, wq, flags);
	}
	workq_unlock(wq);
}
void
workq_kern_threadreq_lock(struct proc *p)
{
	workq_lock_spin(proc_get_wqptr_fast(p));
}

void
workq_kern_threadreq_unlock(struct proc *p)
{
	workq_unlock(proc_get_wqptr_fast(p));
}
void
workq_kern_threadreq_update_inheritor(struct proc *p, workq_threadreq_t req,
    thread_t owner, struct turnstile *wl_ts,
    turnstile_update_flags_t flags)
{
	struct workqueue *wq = proc_get_wqptr_fast(p);
	turnstile_inheritor_t inheritor;

	assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER);
	assert(req->tr_flags & WORKQ_TR_FLAG_WORKLOOP);
	workq_lock_held(wq);

	if (req->tr_state == WORKQ_TR_STATE_BINDING) {
		kqueue_threadreq_bind(p, req, req->tr_thread,
		    KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE);
		return;
	}

	if (_wq_exiting(wq)) {
		inheritor = TURNSTILE_INHERITOR_NULL;
	} else {
		if (req->tr_state != WORKQ_TR_STATE_QUEUED) {
			panic("Invalid thread request (%p) state %d", req, req->tr_state);
		}

		if (owner) {
			inheritor = owner;
			flags |= TURNSTILE_INHERITOR_THREAD;
		} else {
			inheritor = wq->wq_turnstile;
			flags |= TURNSTILE_INHERITOR_TURNSTILE;
		}
	}

	workq_perform_turnstile_operation_locked(wq, ^{
		turnstile_update_inheritor(wl_ts, inheritor, flags);
	});
}
void
workq_kern_threadreq_redrive(struct proc *p, workq_kern_threadreq_flags_t flags)
{
	struct workqueue *wq = proc_get_wqptr_fast(p);

	workq_lock_spin(wq);
	workq_schedule_creator(p, wq, flags);
	workq_unlock(wq);
}

void
workq_schedule_creator_turnstile_redrive(struct workqueue *wq, bool locked)
{
	if (locked) {
		workq_schedule_creator(NULL, wq, WORKQ_THREADREQ_NONE);
	} else {
		workq_schedule_immediate_thread_creation(wq);
	}
}
static int
workq_thread_return(struct proc *p, struct workq_kernreturn_args *uap,
    struct workqueue *wq)
{
	thread_t th = current_thread();
	struct uthread *uth = get_bsdthread_info(th);
	workq_threadreq_t kqr = uth->uu_kqr_bound;
	workq_threadreq_param_t trp = { };
	int nevents = uap->affinity, error;
	user_addr_t eventlist = uap->item;

	if (((thread_get_tag(th) & THREAD_TAG_WORKQUEUE) == 0) ||
	    (uth->uu_workq_flags & UT_WORKQ_DYING)) {
		return EINVAL;
	}

	if (eventlist && nevents && kqr == NULL) {
		return EINVAL;
	}

	/* reset signal mask on the workqueue thread to default state */
	if (uth->uu_sigmask != (sigset_t)(~workq_threadmask)) {
		proc_lock(p);
		uth->uu_sigmask = ~workq_threadmask;
		proc_unlock(p);
	}

	if (kqr && kqr->tr_flags & WORKQ_TR_FLAG_WL_PARAMS) {
		/*
		 * Ensure we store the threadreq param before unbinding
		 * the kqr from this thread.
		 */
		trp = kqueue_threadreq_workloop_param(kqr);
	}

	/*
	 * Freeze the base pri while we decide the fate of this thread.
	 *
	 * Either:
	 * - we return to user and kevent_cleanup will have unfrozen the base pri,
	 * - or we proceed to workq_select_threadreq_or_park_and_unlock() who will.
	 */
	thread_freeze_base_pri(th);

	if (kqr) {
		uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI | WQ_FLAG_THREAD_REUSE;
		if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
			upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
		} else {
			upcall_flags |= WQ_FLAG_THREAD_KEVENT;
		}
		if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
			upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
		} else {
			if (uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) {
				upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
			}
			if (uth->uu_workq_flags & UT_WORKQ_OUTSIDE_QOS) {
				upcall_flags |= WQ_FLAG_THREAD_OUTSIDEQOS;
			} else {
				upcall_flags |= uth->uu_workq_pri.qos_req |
				    WQ_FLAG_THREAD_PRIO_QOS;
			}
		}

		error = pthread_functions->workq_handle_stack_events(p, th,
		    get_task_map(p->task), uth->uu_workq_stackaddr,
		    uth->uu_workq_thport, eventlist, nevents, upcall_flags);
		if (error) {
			assert(uth->uu_kqr_bound == kqr);
			return error;
		}

		// pthread is supposed to pass KEVENT_FLAG_PARKING here
		// which should cause the above call to either:
		// - not return
		// - return an error
		// - return 0 and have unbound properly
		assert(uth->uu_kqr_bound == NULL);
	}

	WQ_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_END, wq, uap->options, 0, 0, 0);

	thread_sched_call(th, NULL);
	thread_will_park_or_terminate(th);
#if CONFIG_WORKLOOP_DEBUG
	UU_KEVENT_HISTORY_WRITE_ENTRY(uth, { .uu_error = -1, });
#endif

	workq_lock_spin(wq);
	WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0, 0);
	uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value;
	workq_select_threadreq_or_park_and_unlock(p, wq, uth,
	    WQ_SETUP_CLEAR_VOUCHER);
	__builtin_unreachable();
}
/*
 * Multiplexed call to interact with the workqueue mechanism
 */
int
workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, int32_t *retval)
{
	int options = uap->options;
	int arg2 = uap->affinity;
	int arg3 = uap->prio;
	struct workqueue *wq = proc_get_wqptr(p);
	int error = 0;

	if ((p->p_lflag & P_LREGISTER) == 0) {
		return EINVAL;
	}

	switch (options) {
	case WQOPS_QUEUE_NEWSPISUPP: {
		/*
		 * arg2 = offset of serialno into dispatch queue
		 * arg3 = kevent support
		 */
		int offset = arg2;
		if (arg3 & 0x01) {
			// If we get here, then userspace has indicated support for kevent delivery.
		}
		p->p_dispatchqueue_serialno_offset = (uint64_t)offset;
		break;
	}
	case WQOPS_QUEUE_REQTHREADS: {
		/*
		 * arg2 = number of threads to start
		 * arg3 = priority
		 */
		error = workq_reqthreads(p, arg2, arg3);
		break;
	}
	case WQOPS_SET_EVENT_MANAGER_PRIORITY: {
		/*
		 * arg2 = priority for the manager thread
		 *
		 * if _PTHREAD_PRIORITY_SCHED_PRI_FLAG is set,
		 * the low bits of the value contains a scheduling priority
		 * instead of a QOS value
		 */
		pthread_priority_t pri = arg2;

		if (wq == NULL) {
			error = EINVAL;
			break;
		}

		/*
		 * Normalize the incoming priority so that it is ordered numerically.
		 */
		if (pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) {
			pri &= (_PTHREAD_PRIORITY_SCHED_PRI_MASK |
			    _PTHREAD_PRIORITY_SCHED_PRI_FLAG);
		} else {
			thread_qos_t qos = _pthread_priority_thread_qos(pri);
			int relpri = _pthread_priority_relpri(pri);
			if (relpri > 0 || relpri < THREAD_QOS_MIN_TIER_IMPORTANCE ||
			    qos == THREAD_QOS_UNSPECIFIED) {
				error = EINVAL;
				break;
			}
			pri &= ~_PTHREAD_PRIORITY_FLAGS_MASK;
		}

		/*
		 * If userspace passes a scheduling priority, that wins over any QoS.
		 * Userspace should take care not to lower the priority this way.
		 */
		workq_lock_spin(wq);
		if (wq->wq_event_manager_priority < (uint32_t)pri) {
			wq->wq_event_manager_priority = (uint32_t)pri;
		}
		workq_unlock(wq);
		break;
	}
	case WQOPS_THREAD_KEVENT_RETURN:
	case WQOPS_THREAD_WORKLOOP_RETURN:
	case WQOPS_THREAD_RETURN: {
		error = workq_thread_return(p, uap, wq);
		break;
	}

	case WQOPS_SHOULD_NARROW: {
		/*
		 * arg2 = priority to test
		 * arg3 = unused
		 */
		thread_t th = current_thread();
		struct uthread *uth = get_bsdthread_info(th);
		if (((thread_get_tag(th) & THREAD_TAG_WORKQUEUE) == 0) ||
		    (uth->uu_workq_flags & (UT_WORKQ_DYING | UT_WORKQ_OVERCOMMIT))) {
			error = EINVAL;
			break;
		}

		thread_qos_t qos = _pthread_priority_thread_qos(arg2);
		if (qos == THREAD_QOS_UNSPECIFIED) {
			error = EINVAL;
			break;
		}
		workq_lock_spin(wq);
		bool should_narrow = !workq_constrained_allowance(wq, qos, uth, false);
		workq_unlock(wq);

		*retval = should_narrow;
		break;
	}
	case WQOPS_SETUP_DISPATCH: {
		/*
		 * item = pointer to workq_dispatch_config structure
		 * arg2 = sizeof(item)
		 */
		struct workq_dispatch_config cfg;
		bzero(&cfg, sizeof(cfg));

		error = copyin(uap->item, &cfg, MIN(sizeof(cfg), (unsigned long) arg2));
		if (error) {
			break;
		}

		if (cfg.wdc_flags & ~WORKQ_DISPATCH_SUPPORTED_FLAGS ||
		    cfg.wdc_version < WORKQ_DISPATCH_MIN_SUPPORTED_VERSION) {
			error = ENOTSUP;
			break;
		}

		/* Load fields from version 1 */
		p->p_dispatchqueue_serialno_offset = cfg.wdc_queue_serialno_offs;

		/* Load fields from version 2 */
		if (cfg.wdc_version >= 2) {
			p->p_dispatchqueue_label_offset = cfg.wdc_queue_label_offs;
		}

		break;
	}
	default:
		error = EINVAL;
		break;
	}

	return error;
}
/*
 * We have no work to do, park ourselves on the idle list.
 *
 * Consumes the workqueue lock and does not return.
 */
__attribute__((noreturn, noinline))
static void
workq_park_and_unlock(proc_t p, struct workqueue *wq, struct uthread *uth,
    uint32_t setup_flags)
{
	assert(uth == current_uthread());
	assert(uth->uu_kqr_bound == NULL);
	workq_push_idle_thread(p, wq, uth, setup_flags); // may not return

	workq_thread_reset_cpupercent(NULL, uth);

	if ((uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) &&
	    !(uth->uu_workq_flags & UT_WORKQ_DYING)) {
		workq_unlock(wq);

		/*
		 * workq_push_idle_thread() will unset `has_stack`
		 * if it wants us to free the stack before parking.
		 */
		if (!uth->uu_save.uus_workq_park_data.has_stack) {
			pthread_functions->workq_markfree_threadstack(p, uth->uu_thread,
			    get_task_map(p->task), uth->uu_workq_stackaddr);
		}

		/*
		 * When we remove the voucher from the thread, we may lose our importance
		 * causing us to get preempted, so we do this after putting the thread on
		 * the idle list.  Then, when we get our importance back we'll be able to
		 * use this thread from e.g. the kevent call out to deliver a boosting
		 * message.
		 */
		__assert_only kern_return_t kr;
		kr = thread_set_voucher_name(MACH_PORT_NULL);
		assert(kr == KERN_SUCCESS);

		workq_lock_spin(wq);
		uth->uu_workq_flags &= ~UT_WORKQ_IDLE_CLEANUP;
		setup_flags &= ~WQ_SETUP_CLEAR_VOUCHER;
	}

	WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0, 0);

	if (uth->uu_workq_flags & UT_WORKQ_RUNNING) {
		/*
		 * While we'd dropped the lock to unset our voucher, someone came
		 * around and made us runnable.  But because we weren't waiting on the
		 * event their thread_wakeup() was ineffectual.  To correct for that,
		 * we just run the continuation ourselves.
		 */
		workq_unpark_select_threadreq_or_park_and_unlock(p, wq, uth, setup_flags);
		__builtin_unreachable();
	}

	if (uth->uu_workq_flags & UT_WORKQ_DYING) {
		workq_unpark_for_death_and_unlock(p, wq, uth,
		    WORKQ_UNPARK_FOR_DEATH_WAS_IDLE, setup_flags);
		__builtin_unreachable();
	}

	thread_set_pending_block_hint(uth->uu_thread, kThreadWaitParkedWorkQueue);
	assert_wait(workq_parked_wait_event(uth), THREAD_INTERRUPTIBLE);
	workq_unlock(wq);
	thread_block(workq_unpark_continue);
	__builtin_unreachable();
}
static bool
workq_may_start_event_mgr_thread(struct workqueue *wq, struct uthread *uth)
{
	/*
	 * There's an event manager request and either:
	 * - no event manager currently running
	 * - we are re-using the event manager
	 */
	return wq->wq_thscheduled_count[_wq_bucket(WORKQ_THREAD_QOS_MANAGER)] == 0 ||
	       (uth && uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER);
}
static uint32_t
workq_constrained_allowance(struct workqueue *wq, thread_qos_t at_qos,
    struct uthread *uth, bool may_start_timer)
{
	assert(at_qos != WORKQ_THREAD_QOS_MANAGER);
	uint32_t count = 0;

	uint32_t max_count = wq->wq_constrained_threads_scheduled;
	if (uth && (uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) {
		/*
		 * don't count the current thread as scheduled
		 */
		assert(max_count > 0);
		max_count--;
	}
	if (max_count >= wq_max_constrained_threads) {
		WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 1,
		    wq->wq_constrained_threads_scheduled,
		    wq_max_constrained_threads, 0);
		/*
		 * we need 1 or more constrained threads to return to the kernel before
		 * we can dispatch additional work
		 */
		return 0;
	}
	max_count -= wq_max_constrained_threads;

	/*
	 * Compute a metric for how many threads are active.  We find the
	 * highest priority request outstanding and then add up the number of active
	 * threads in that and all higher-priority buckets.  We'll also add any
	 * "busy" threads which are not currently active but blocked recently enough
	 * that we can't be sure that they won't be unblocked soon and start
	 * being active again.
	 *
	 * We'll then compare this metric to our max concurrency to decide whether
	 * to add a new thread.
	 */

	uint32_t busycount, thactive_count;

	thactive_count = _wq_thactive_aggregate_downto_qos(wq, _wq_thactive(wq),
	    at_qos, &busycount, NULL);

	if (uth && uth->uu_workq_pri.qos_bucket != WORKQ_THREAD_QOS_MANAGER &&
	    at_qos <= uth->uu_workq_pri.qos_bucket) {
		/*
		 * Don't count this thread as currently active, but only if it's not
		 * a manager thread, as _wq_thactive_aggregate_downto_qos ignores active
		 * managers.
		 */
		assert(thactive_count > 0);
		thactive_count--;
	}

	count = wq_max_parallelism[_wq_bucket(at_qos)];
	if (count > thactive_count + busycount) {
		count -= thactive_count + busycount;
		WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 2,
		    thactive_count, busycount, 0);
		return MIN(count, max_count);
	} else {
		WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 3,
		    thactive_count, busycount, 0);
	}

	if (may_start_timer) {
		/*
		 * If this is called from the add timer, we won't have another timer
		 * fire when the thread exits the "busy" state, so rearm the timer.
		 */
		workq_schedule_delayed_thread_creation(wq, 0);
	}

	return 0;
}
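/*
 * Worked example for the admission metric above (illustrative numbers only,
 * not from the original source): with wq_max_parallelism[_wq_bucket(at_qos)]
 * == 8, thactive_count == 5 active threads at or above at_qos, and
 * busycount == 2 recently-blocked threads, the headroom is 8 - (5 + 2) == 1,
 * so at most one more constrained thread is admitted, further clamped by the
 * constrained-thread budget computed earlier in the function.
 */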
static bool
workq_threadreq_admissible(struct workqueue *wq, struct uthread *uth,
    workq_threadreq_t req)
{
	if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
		return workq_may_start_event_mgr_thread(wq, uth);
	}
	if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
		return workq_constrained_allowance(wq, req->tr_qos, uth, true);
	}
	return true;
}
static workq_threadreq_t
workq_threadreq_select_for_creator(struct workqueue *wq)
{
	workq_threadreq_t req_qos, req_pri, req_tmp, req_mgr;
	thread_qos_t qos = THREAD_QOS_UNSPECIFIED;
	uint8_t pri = 0;

	/*
	 * Compute the best priority request, and ignore the turnstile for now
	 */

	req_pri = priority_queue_max(&wq->wq_special_queue,
	    struct workq_threadreq_s, tr_entry);
	if (req_pri) {
		pri = (uint8_t)priority_queue_entry_sched_pri(&wq->wq_special_queue,
		    &req_pri->tr_entry);
	}

	/*
	 * Handle the manager thread request. The special queue might yield
	 * a higher priority, but the manager always beats the QoS world.
	 */

	req_mgr = wq->wq_event_manager_threadreq;
	if (req_mgr && workq_may_start_event_mgr_thread(wq, NULL)) {
		uint32_t mgr_pri = wq->wq_event_manager_priority;

		if (mgr_pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) {
			mgr_pri &= _PTHREAD_PRIORITY_SCHED_PRI_MASK;
		} else {
			mgr_pri = thread_workq_pri_for_qos(
				_pthread_priority_thread_qos(mgr_pri));
		}

		return mgr_pri >= pri ? req_mgr : req_pri;
	}

	/*
	 * Compute the best QoS Request, and check whether it beats the "pri" one
	 */

	req_qos = priority_queue_max(&wq->wq_overcommit_queue,
	    struct workq_threadreq_s, tr_entry);
	if (req_qos) {
		qos = req_qos->tr_qos;
	}

	req_tmp = priority_queue_max(&wq->wq_constrained_queue,
	    struct workq_threadreq_s, tr_entry);

	if (req_tmp && qos < req_tmp->tr_qos) {
		if (pri && pri >= thread_workq_pri_for_qos(req_tmp->tr_qos)) {
			return req_pri;
		}

		if (workq_constrained_allowance(wq, req_tmp->tr_qos, NULL, true)) {
			/*
			 * If the constrained thread request is the best one and passes
			 * the admission check, pick it.
			 */
			return req_tmp;
		}
	}

	if (pri && (!qos || pri >= thread_workq_pri_for_qos(qos))) {
		return req_pri;
	}

	if (req_qos) {
		return req_qos;
	}

	/*
	 * If we had no eligible request but we have a turnstile push,
	 * it must be a non overcommit thread request that failed
	 * the admission check.
	 *
	 * Just fake a BG thread request so that if the push stops the creator
	 * priority just drops to 4.
	 */
	if (turnstile_workq_proprietor_of_max_turnstile(wq->wq_turnstile, NULL)) {
		static struct workq_threadreq_s workq_sync_push_fake_req = {
			.tr_qos = THREAD_QOS_BACKGROUND,
		};

		return &workq_sync_push_fake_req;
	}

	return NULL;
}
static workq_threadreq_t
workq_threadreq_select(struct workqueue *wq, struct uthread *uth)
{
	workq_threadreq_t req_qos, req_pri, req_tmp, req_mgr;
	uintptr_t proprietor;
	thread_qos_t qos = THREAD_QOS_UNSPECIFIED;
	uint8_t pri = 0;

	if (uth == wq->wq_creator) {
		uth = NULL;
	}

	/*
	 * Compute the best priority request (special or turnstile)
	 */

	pri = (uint8_t)turnstile_workq_proprietor_of_max_turnstile(wq->wq_turnstile,
	    &proprietor);
	if (pri) {
		struct kqworkloop *kqwl = (struct kqworkloop *)proprietor;
		req_pri = &kqwl->kqwl_request;
		if (req_pri->tr_state != WORKQ_TR_STATE_QUEUED) {
			panic("Invalid thread request (%p) state %d",
			    req_pri, req_pri->tr_state);
		}
	} else {
		req_pri = NULL;
	}

	req_tmp = priority_queue_max(&wq->wq_special_queue,
	    struct workq_threadreq_s, tr_entry);
	if (req_tmp && pri < priority_queue_entry_sched_pri(&wq->wq_special_queue,
	    &req_tmp->tr_entry)) {
		req_pri = req_tmp;
		pri = (uint8_t)priority_queue_entry_sched_pri(&wq->wq_special_queue,
		    &req_tmp->tr_entry);
	}

	/*
	 * Handle the manager thread request. The special queue might yield
	 * a higher priority, but the manager always beats the QoS world.
	 */

	req_mgr = wq->wq_event_manager_threadreq;
	if (req_mgr && workq_may_start_event_mgr_thread(wq, uth)) {
		uint32_t mgr_pri = wq->wq_event_manager_priority;

		if (mgr_pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) {
			mgr_pri &= _PTHREAD_PRIORITY_SCHED_PRI_MASK;
		} else {
			mgr_pri = thread_workq_pri_for_qos(
				_pthread_priority_thread_qos(mgr_pri));
		}

		return mgr_pri >= pri ? req_mgr : req_pri;
	}

	/*
	 * Compute the best QoS Request, and check whether it beats the "pri" one
	 */

	req_qos = priority_queue_max(&wq->wq_overcommit_queue,
	    struct workq_threadreq_s, tr_entry);
	if (req_qos) {
		qos = req_qos->tr_qos;
	}

	req_tmp = priority_queue_max(&wq->wq_constrained_queue,
	    struct workq_threadreq_s, tr_entry);

	if (req_tmp && qos < req_tmp->tr_qos) {
		if (pri && pri >= thread_workq_pri_for_qos(req_tmp->tr_qos)) {
			return req_pri;
		}

		if (workq_constrained_allowance(wq, req_tmp->tr_qos, uth, true)) {
			/*
			 * If the constrained thread request is the best one and passes
			 * the admission check, pick it.
			 */
			return req_tmp;
		}
	}

	if (req_pri && (!qos || pri >= thread_workq_pri_for_qos(qos))) {
		return req_pri;
	}

	return req_qos;
}
/*
 * The creator is an anonymous thread that is used to make other threads; it is
 * counted as scheduled, but otherwise runs without its scheduler callback set
 * and is not tracked as active.
 *
 * When more requests are added or an existing one is hurried along,
 * a creator is elected and set up, or the existing one is overridden accordingly.
 *
 * While this creator is in flight, because no request has been dequeued,
 * already running threads have a chance at stealing thread requests, avoiding
 * useless context switches, and the creator once scheduled may not find any
 * work to do and will then just park again.
 *
 * The creator serves the dual purpose of informing the scheduler of work that
 * hasn't been materialized as threads yet, and also acts as a natural pacing
 * mechanism for thread creation.
 *
 * By being anonymous (and not bound to anything), thread requests can be stolen
 * from this creator by threads already on core, yielding more efficient
 * scheduling and reduced context switches.
 */
static void
workq_schedule_creator(proc_t p, struct workqueue *wq,
    workq_kern_threadreq_flags_t flags)
{
	workq_threadreq_t req;
	struct uthread *uth;
	bool needs_wakeup;

	workq_lock_held(wq);
	assert(p || (flags & WORKQ_THREADREQ_CAN_CREATE_THREADS) == 0);

again:
	uth = wq->wq_creator;

	if (!wq->wq_reqcount) {
		/*
		 * There is no thread request left.
		 *
		 * If there is a creator, leave everything in place, so that it cleans
		 * up itself in workq_push_idle_thread().
		 *
		 * Else, make sure the turnstile state is reset to no inheritor.
		 */
		if (uth == NULL) {
			workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
		}
		return;
	}

	req = workq_threadreq_select_for_creator(wq);
	if (req == NULL) {
		/*
		 * There isn't a thread request that passes the admission check.
		 *
		 * If there is a creator, do not touch anything, the creator will sort
		 * it out when it runs.
		 *
		 * Else, set the inheritor to "WORKQ" so that the turnstile propagation
		 * code calls us if anything changes.
		 */
		if (uth == NULL) {
			workq_turnstile_update_inheritor(wq, wq, TURNSTILE_INHERITOR_WORKQ);
		}
		return;
	}

	if (uth) {
		/*
		 * We need to maybe override the creator we already have
		 */
		if (workq_thread_needs_priority_change(req, uth)) {
			WQ_TRACE_WQ(TRACE_wq_creator_select | DBG_FUNC_NONE,
			    wq, 1, thread_tid(uth->uu_thread), req->tr_qos, 0);
			workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
		}
		assert(wq->wq_inheritor == uth->uu_thread);
	} else if (wq->wq_thidlecount) {
		/*
		 * We need to unpark a creator thread
		 */
		wq->wq_creator = uth = workq_pop_idle_thread(wq, UT_WORKQ_OVERCOMMIT,
		    &needs_wakeup);
		/* Always reset the priorities on the newly chosen creator */
		workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
		workq_turnstile_update_inheritor(wq, uth->uu_thread,
		    TURNSTILE_INHERITOR_THREAD);
		WQ_TRACE_WQ(TRACE_wq_creator_select | DBG_FUNC_NONE,
		    wq, 2, thread_tid(uth->uu_thread), req->tr_qos, 0);
		uth->uu_save.uus_workq_park_data.fulfilled_snapshot = wq->wq_fulfilled;
		uth->uu_save.uus_workq_park_data.yields = 0;
		if (needs_wakeup) {
			workq_thread_wakeup(uth);
		}
	} else {
		/*
		 * We need to allocate a thread...
		 */
		if (__improbable(wq->wq_nthreads >= wq_max_threads)) {
			/* out of threads, just go away */
			flags = WORKQ_THREADREQ_NONE;
		} else if (flags & WORKQ_THREADREQ_SET_AST_ON_FAILURE) {
			act_set_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ);
		} else if (!(flags & WORKQ_THREADREQ_CAN_CREATE_THREADS)) {
			/* This can drop the workqueue lock, and take it again */
			workq_schedule_immediate_thread_creation(wq);
		} else if (workq_add_new_idle_thread(p, wq)) {
			goto again;
		} else {
			workq_schedule_delayed_thread_creation(wq, 0);
		}

		/*
		 * If the current thread is the inheritor:
		 *
		 * If we set the AST, then the thread will stay the inheritor until
		 * either the AST calls workq_kern_threadreq_redrive(), or it parks
		 * and calls workq_push_idle_thread().
		 *
		 * Else, the responsibility of the thread creation is with a thread-call
		 * and we need to clear the inheritor.
		 */
		if ((flags & WORKQ_THREADREQ_SET_AST_ON_FAILURE) == 0 &&
		    wq->wq_inheritor == current_thread()) {
			workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
		}
	}
}
/*
 * Same as workq_unpark_select_threadreq_or_park_and_unlock,
 * but do not allow early binds.
 *
 * Called with the base pri frozen, will unfreeze it.
 */
__attribute__((noreturn, noinline))
static void
workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
    struct uthread *uth, uint32_t setup_flags)
{
	workq_threadreq_t req = NULL;
	bool is_creator = (wq->wq_creator == uth);
	bool schedule_creator = false;

	if (__improbable(_wq_exiting(wq))) {
		WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
		goto park;
	}

	if (wq->wq_reqcount == 0) {
		WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 1, 0, 0, 0);
		goto park;
	}

	req = workq_threadreq_select(wq, uth);
	if (__improbable(req == NULL)) {
		WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 2, 0, 0, 0);
		goto park;
	}

	uint8_t tr_flags = req->tr_flags;
	struct turnstile *req_ts = kqueue_threadreq_get_turnstile(req);

	/*
	 * Attempt to setup ourselves as the new thing to run, moving all priority
	 * pushes to ourselves.
	 *
	 * If the current thread is the creator, then the fact that we are presently
	 * running is proof that we'll do something useful, so keep going.
	 *
	 * For other cases, peek at the AST to know whether the scheduler wants
	 * to preempt us, if yes, park instead, and move the thread request
	 * turnstile back to the workqueue.
	 */
	if (req_ts) {
		workq_perform_turnstile_operation_locked(wq, ^{
			turnstile_update_inheritor(req_ts, uth->uu_thread,
			TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD);
			turnstile_update_inheritor_complete(req_ts,
			TURNSTILE_INTERLOCK_HELD);
		});
	}

	if (is_creator) {
		WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 4, 0,
		    uth->uu_save.uus_workq_park_data.yields, 0);
		wq->wq_creator = NULL;
		_wq_thactive_inc(wq, req->tr_qos);
		wq->wq_thscheduled_count[_wq_bucket(req->tr_qos)]++;
	} else if (uth->uu_workq_pri.qos_bucket != req->tr_qos) {
		_wq_thactive_move(wq, uth->uu_workq_pri.qos_bucket, req->tr_qos);
	}

	workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);

	if (__improbable(thread_unfreeze_base_pri(uth->uu_thread) && !is_creator)) {
		if (req_ts) {
			workq_perform_turnstile_operation_locked(wq, ^{
				turnstile_update_inheritor(req_ts, wq->wq_turnstile,
				TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
				turnstile_update_inheritor_complete(req_ts,
				TURNSTILE_INTERLOCK_HELD);
			});
		}
		WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 3, 0, 0, 0);
		goto park_thawed;
	}

	/*
	 * We passed all checks, dequeue the request, bind to it, and set it up
	 * to return to user.
	 */
	WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
	    workq_trace_req_id(req), 0, 0, 0);
	wq->wq_fulfilled++;
	schedule_creator = workq_threadreq_dequeue(wq, req);

	if (tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP)) {
		kqueue_threadreq_bind_prepost(p, req, uth);
		req = NULL;
	} else if (req->tr_count > 0) {
		req = NULL;
	}

	workq_thread_reset_cpupercent(req, uth);
	if (uth->uu_workq_flags & UT_WORKQ_NEW) {
		uth->uu_workq_flags ^= UT_WORKQ_NEW;
		setup_flags |= WQ_SETUP_FIRST_USE;
	}
	if (tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
		if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) {
			uth->uu_workq_flags |= UT_WORKQ_OVERCOMMIT;
			wq->wq_constrained_threads_scheduled--;
		}
	} else {
		if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) != 0) {
			uth->uu_workq_flags &= ~UT_WORKQ_OVERCOMMIT;
			wq->wq_constrained_threads_scheduled++;
		}
	}

	if (is_creator || schedule_creator) {
		/* This can drop the workqueue lock, and take it again */
		workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
	}

	workq_unlock(wq);

	if (req) {
		zfree(workq_zone_threadreq, req);
	}

	uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI;
	if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
		upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
	} else if (tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
		upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
	}
	if (tr_flags & WORKQ_TR_FLAG_KEVENT) {
		upcall_flags |= WQ_FLAG_THREAD_KEVENT;
	}
	if (tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
		upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
	}
	uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags;

	if (tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP)) {
		kqueue_threadreq_bind_commit(p, uth->uu_thread);
	}
	workq_setup_and_run(p, uth, setup_flags);
	__builtin_unreachable();

park:
	thread_unfreeze_base_pri(uth->uu_thread);
park_thawed:
	workq_park_and_unlock(p, wq, uth, setup_flags);
}
/*
 * Runs a thread request on a thread
 *
 * - if thread is THREAD_NULL, will find a thread and run the request there.
 *   Otherwise, the thread must be the current thread.
 *
 * - if req is NULL, will find the highest priority request and run that.  If
 *   it is not NULL, it must be a threadreq object in state NEW.  If it can not
 *   be run immediately, it will be enqueued and moved to state QUEUED.
 *
 *   Either way, the thread request object serviced will be moved to state
 *   BINDING and attached to the uthread.
 *
 * Should be called with the workqueue lock held.  Will drop it.
 * Should be called with the base pri not frozen.
 */
__attribute__((noreturn, noinline))
static void
workq_unpark_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
    struct uthread *uth, uint32_t setup_flags)
{
	if (uth->uu_workq_flags & UT_WORKQ_EARLY_BOUND) {
		if (uth->uu_workq_flags & UT_WORKQ_NEW) {
			setup_flags |= WQ_SETUP_FIRST_USE;
		}
		uth->uu_workq_flags &= ~(UT_WORKQ_NEW | UT_WORKQ_EARLY_BOUND);
		/*
		 * This pointer is possibly freed and only used for tracing purposes.
		 */
		workq_threadreq_t req = uth->uu_save.uus_workq_park_data.thread_request;
		workq_unlock(wq);
		WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
		    VM_KERNEL_ADDRHIDE(req), 0, 0, 0);
		workq_setup_and_run(p, uth, setup_flags);
		__builtin_unreachable();
	}

	thread_freeze_base_pri(uth->uu_thread);
	workq_select_threadreq_or_park_and_unlock(p, wq, uth, setup_flags);
}
static bool
workq_creator_should_yield(struct workqueue *wq, struct uthread *uth)
{
	thread_qos_t qos = workq_pri_override(uth->uu_workq_pri);

	if (qos >= THREAD_QOS_USER_INTERACTIVE) {
		return false;
	}

	uint32_t snapshot = uth->uu_save.uus_workq_park_data.fulfilled_snapshot;
	if (wq->wq_fulfilled == snapshot) {
		return false;
	}

	uint32_t cnt = 0, conc = wq_max_parallelism[_wq_bucket(qos)];
	if (wq->wq_fulfilled - snapshot > conc) {
		/* we fulfilled more than NCPU requests since being dispatched */
		WQ_TRACE_WQ(TRACE_wq_creator_yield, wq, 1,
		    wq->wq_fulfilled, snapshot, 0);
		return true;
	}

	for (int i = _wq_bucket(qos); i < WORKQ_NUM_QOS_BUCKETS; i++) {
		cnt += wq->wq_thscheduled_count[i];
	}
	if (conc <= cnt) {
		/* We fulfilled requests and have more than NCPU scheduled threads */
		WQ_TRACE_WQ(TRACE_wq_creator_yield, wq, 2,
		    wq->wq_fulfilled, snapshot, 0);
		return true;
	}

	return false;
}
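/*
 * Illustrative numbers for the yield heuristic above (not from the original
 * source): with conc == 8 (roughly NCPU worth of parallelism for this bucket),
 * the creator yields either when wq_fulfilled has advanced by more than 8
 * since the creator was dispatched, or when it has advanced at all and the
 * buckets counted by the loop above already hold 8 or more scheduled threads.
 */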
/*
 * parked thread wakes up
 */
__attribute__((noreturn, noinline))
static void
workq_unpark_continue(void *parameter __unused, wait_result_t wr __unused)
{
	thread_t th = current_thread();
	struct uthread *uth = get_bsdthread_info(th);
	proc_t p = current_proc();
	struct workqueue *wq = proc_get_wqptr_fast(p);

	workq_lock_spin(wq);

	if (wq->wq_creator == uth && workq_creator_should_yield(wq, uth)) {
		/*
		 * If the number of threads we have out are able to keep up with the
		 * demand, then we should avoid sending this creator thread to
		 * userspace.
		 */
		uth->uu_save.uus_workq_park_data.fulfilled_snapshot = wq->wq_fulfilled;
		uth->uu_save.uus_workq_park_data.yields++;
		workq_unlock(wq);
		thread_yield_with_continuation(workq_unpark_continue, NULL);
		__builtin_unreachable();
	}

	if (__probable(uth->uu_workq_flags & UT_WORKQ_RUNNING)) {
		workq_unpark_select_threadreq_or_park_and_unlock(p, wq, uth, WQ_SETUP_NONE);
		__builtin_unreachable();
	}

	if (__probable(wr == THREAD_AWAKENED)) {
		/*
		 * We were set running, but for the purposes of dying.
		 */
		assert(uth->uu_workq_flags & UT_WORKQ_DYING);
		assert((uth->uu_workq_flags & UT_WORKQ_NEW) == 0);
	} else {
		/*
		 * workaround for <rdar://problem/38647347>,
		 * in case we do hit userspace, make sure calling
		 * workq_thread_terminate() does the right thing here,
		 * and if we never call it, that workq_exit() will too because it sees
		 * this thread on the runlist.
		 */
		assert(wr == THREAD_INTERRUPTED);
		wq->wq_thdying_count++;
		uth->uu_workq_flags |= UT_WORKQ_DYING;
	}

	workq_unpark_for_death_and_unlock(p, wq, uth,
	    WORKQ_UNPARK_FOR_DEATH_WAS_IDLE, WQ_SETUP_NONE);
	__builtin_unreachable();
}
__attribute__((noreturn, noinline))
static void
workq_setup_and_run(proc_t p, struct uthread *uth, int setup_flags)
{
	thread_t th = uth->uu_thread;
	vm_map_t vmap = get_task_map(p->task);

	if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) {
		/*
		 * For preemption reasons, we want to reset the voucher as late as
		 * possible, so we do it in two places:
		 *   - Just before parking (i.e. in workq_park_and_unlock())
		 *   - Prior to doing the setup for the next workitem (i.e. here)
		 *
		 * Those two places are sufficient to ensure we always reset it before
		 * it goes back out to user space, but be careful to not break that
		 * invariant.
		 */
		__assert_only kern_return_t kr;
		kr = thread_set_voucher_name(MACH_PORT_NULL);
		assert(kr == KERN_SUCCESS);
	}

	uint32_t upcall_flags = uth->uu_save.uus_workq_park_data.upcall_flags;
	if (!(setup_flags & WQ_SETUP_FIRST_USE)) {
		upcall_flags |= WQ_FLAG_THREAD_REUSE;
	}

	if (uth->uu_workq_flags & UT_WORKQ_OUTSIDE_QOS) {
		/*
		 * For threads that have an outside-of-QoS thread priority, indicate
		 * to userspace that setting QoS should only affect the TSD and not
		 * change QOS in the kernel.
		 */
		upcall_flags |= WQ_FLAG_THREAD_OUTSIDEQOS;
	} else {
		/*
		 * Put the QoS class value into the lower bits of the reuse_thread
		 * register, this is where the thread priority used to be stored
		 * anyway.
		 */
		upcall_flags |= uth->uu_save.uus_workq_park_data.qos |
		    WQ_FLAG_THREAD_PRIO_QOS;
	}

	if (uth->uu_workq_thport == MACH_PORT_NULL) {
		/* convert_thread_to_port_pinned() consumes a reference */
		thread_reference(th);
		/* Convert to immovable/pinned thread port, but port is not pinned yet */
		ipc_port_t port = convert_thread_to_port_pinned(th);
		/* Atomically, pin and copy out the port */
		uth->uu_workq_thport = ipc_port_copyout_send_pinned(port,
		    get_task_ipcspace(p->task));
	}

	/*
	 * Call out to pthread, this sets up the thread, pulls in kevent structs
	 * onto the stack, sets up the thread state and then returns to userspace.
	 */
	WQ_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START,
	    proc_get_wqptr_fast(p), 0, 0, 0, 0);
	thread_sched_call(th, workq_sched_callback);
	pthread_functions->workq_setup_thread(p, th, vmap, uth->uu_workq_stackaddr,
	    uth->uu_workq_thport, 0, setup_flags, upcall_flags);

	__builtin_unreachable();
}
int
fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
{
	struct workqueue *wq = proc_get_wqptr(p);
	int error = 0;
	int activecount;

	if (wq == NULL) {
		return EINVAL;
	}

	/*
	 * This is sometimes called from interrupt context by the kperf sampler.
	 * In that case, it's not safe to spin trying to take the lock since we
	 * might already hold it.  So, we just try-lock it and error out if it's
	 * already held.  Since this is just a debugging aid, and all our callers
	 * are able to handle an error, that's fine.
	 */
	bool locked = workq_lock_try(wq);
	if (!locked) {
		return EBUSY;
	}

	wq_thactive_t act = _wq_thactive(wq);
	activecount = _wq_thactive_aggregate_downto_qos(wq, act,
	    WORKQ_THREAD_QOS_MIN, NULL, NULL);
	if (act & _wq_thactive_offset_for_qos(WORKQ_THREAD_QOS_MANAGER)) {
		activecount++;
	}
	pwqinfo->pwq_nthreads = wq->wq_nthreads;
	pwqinfo->pwq_runthreads = activecount;
	pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
	pwqinfo->pwq_state = 0;

	if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
		pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
	}

	if (wq->wq_nthreads >= wq_max_threads) {
		pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
	}

	workq_unlock(wq);
	return error;
}
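/*
 * Illustrative sketch, not part of the original source: the structure filled in
 * above is what userspace observes through proc_pidinfo() with the
 * PROC_PIDWORKQUEUEINFO flavor (via <libproc.h> and <sys/proc_info.h>), e.g.:
 *
 *	struct proc_workqueueinfo wqinfo;
 *	int ret = proc_pidinfo(pid, PROC_PIDWORKQUEUEINFO, 0,
 *	    &wqinfo, sizeof(wqinfo));
 *	if (ret == (int)sizeof(wqinfo) &&
 *	    (wqinfo.pwq_state & WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT)) {
 *		// the process has hit the constrained workqueue thread cap
 *	}
 */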
boolean_t
workqueue_get_pwq_exceeded(void *v, boolean_t *exceeded_total,
    boolean_t *exceeded_constrained)
{
	proc_t p = v;
	struct proc_workqueueinfo pwqinfo;
	int err;

	assert(p != NULL);
	assert(exceeded_total != NULL);
	assert(exceeded_constrained != NULL);

	err = fill_procworkqueue(p, &pwqinfo);
	if (err) {
		return FALSE;
	}
	if (!(pwqinfo.pwq_state & WQ_FLAGS_AVAILABLE)) {
		return FALSE;
	}

	*exceeded_total = (pwqinfo.pwq_state & WQ_EXCEEDED_TOTAL_THREAD_LIMIT);
	*exceeded_constrained = (pwqinfo.pwq_state & WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT);

	return TRUE;
}
uint32_t
workqueue_get_pwq_state_kdp(void * v)
{
	static_assert((WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT << 17) ==
	    kTaskWqExceededConstrainedThreadLimit);
	static_assert((WQ_EXCEEDED_TOTAL_THREAD_LIMIT << 17) ==
	    kTaskWqExceededTotalThreadLimit);
	static_assert((WQ_FLAGS_AVAILABLE << 17) == kTaskWqFlagsAvailable);
	static_assert((WQ_FLAGS_AVAILABLE | WQ_EXCEEDED_TOTAL_THREAD_LIMIT |
	    WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT) == 0x7);

	if (v == NULL) {
		return 0;
	}

	proc_t p = v;
	struct workqueue *wq = proc_get_wqptr(p);

	if (wq == NULL || workq_lock_spin_is_acquired_kdp(wq)) {
		return 0;
	}

	uint32_t pwq_state = WQ_FLAGS_AVAILABLE;

	if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
		pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
	}

	if (wq->wq_nthreads >= wq_max_threads) {
		pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
	}

	return pwq_state;
}
	clock_interval_to_absolutetime_interval(wq_stalled_window.usecs,
	    NSEC_PER_USEC, &wq_stalled_window.abstime);
	clock_interval_to_absolutetime_interval(wq_reduce_pool_window.usecs,
	    NSEC_PER_USEC, &wq_reduce_pool_window.abstime);
	clock_interval_to_absolutetime_interval(wq_max_timer_interval.usecs,
	    NSEC_PER_USEC, &wq_max_timer_interval.abstime);

	thread_deallocate_daemon_register_queue(&workq_deallocate_queue,
	    workq_deallocate_queue_invoke);
}