apple/xnu (xnu-6153.41.3): bsd/pthread/pthread_workqueue.c
1 /*
2 * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995-2018 Apple, Inc. All Rights Reserved */
29
30 #include <sys/cdefs.h>
31
32 #include <kern/assert.h>
33 #include <kern/ast.h>
34 #include <kern/clock.h>
35 #include <kern/cpu_data.h>
36 #include <kern/kern_types.h>
37 #include <kern/policy_internal.h>
38 #include <kern/processor.h>
39 #include <kern/sched_prim.h> /* for thread_exception_return */
40 #include <kern/task.h>
41 #include <kern/thread.h>
42 #include <kern/zalloc.h>
43 #include <mach/kern_return.h>
44 #include <mach/mach_param.h>
45 #include <mach/mach_port.h>
46 #include <mach/mach_types.h>
47 #include <mach/mach_vm.h>
48 #include <mach/sync_policy.h>
49 #include <mach/task.h>
50 #include <mach/thread_act.h> /* for thread_resume */
51 #include <mach/thread_policy.h>
52 #include <mach/thread_status.h>
53 #include <mach/vm_prot.h>
54 #include <mach/vm_statistics.h>
55 #include <machine/atomic.h>
56 #include <machine/machine_routines.h>
57 #include <vm/vm_map.h>
58 #include <vm/vm_protos.h>
59
60 #include <sys/eventvar.h>
61 #include <sys/kdebug.h>
62 #include <sys/kernel.h>
63 #include <sys/lock.h>
64 #include <sys/param.h>
65 #include <sys/proc_info.h> /* for fill_procworkqueue */
66 #include <sys/proc_internal.h>
67 #include <sys/pthread_shims.h>
68 #include <sys/resourcevar.h>
69 #include <sys/signalvar.h>
70 #include <sys/sysctl.h>
71 #include <sys/sysproto.h>
72 #include <sys/systm.h>
73 #include <sys/ulock.h> /* for ulock_owner_value_to_port_name */
74
75 #include <pthread/bsdthread_private.h>
76 #include <pthread/workqueue_syscalls.h>
77 #include <pthread/workqueue_internal.h>
78 #include <pthread/workqueue_trace.h>
79
80 #include <os/log.h>
81
82 static void workq_unpark_continue(void *uth, wait_result_t wr) __dead2;
83 static void workq_schedule_creator(proc_t p, struct workqueue *wq,
84 workq_kern_threadreq_flags_t flags);
85
86 static bool workq_threadreq_admissible(struct workqueue *wq, struct uthread *uth,
87 workq_threadreq_t req);
88
89 static uint32_t workq_constrained_allowance(struct workqueue *wq,
90 thread_qos_t at_qos, struct uthread *uth, bool may_start_timer);
91
92 static bool workq_thread_is_busy(uint64_t cur_ts,
93 _Atomic uint64_t *lastblocked_tsp);
94
95 static int workq_sysctl_handle_usecs SYSCTL_HANDLER_ARGS;
96
97 #pragma mark globals
98
99 struct workq_usec_var {
100 uint32_t usecs;
101 uint64_t abstime;
102 };
103
104 #define WORKQ_SYSCTL_USECS(var, init) \
105 static struct workq_usec_var var = { .usecs = init }; \
106 SYSCTL_OID(_kern, OID_AUTO, var##_usecs, \
107 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &var, 0, \
108 workq_sysctl_handle_usecs, "I", "")
109
110 static lck_grp_t *workq_lck_grp;
111 static lck_attr_t *workq_lck_attr;
112 static lck_grp_attr_t *workq_lck_grp_attr;
113 os_refgrp_decl(static, workq_refgrp, "workq", NULL);
114
115 static struct mpsc_daemon_queue workq_deallocate_queue;
116 static zone_t workq_zone_workqueue;
117 static zone_t workq_zone_threadreq;
118
119 WORKQ_SYSCTL_USECS(wq_stalled_window, WQ_STALLED_WINDOW_USECS);
120 WORKQ_SYSCTL_USECS(wq_reduce_pool_window, WQ_REDUCE_POOL_WINDOW_USECS);
121 WORKQ_SYSCTL_USECS(wq_max_timer_interval, WQ_MAX_TIMER_INTERVAL_USECS);
122 static uint32_t wq_max_threads = WORKQUEUE_MAXTHREADS;
123 static uint32_t wq_max_constrained_threads = WORKQUEUE_MAXTHREADS / 8;
124 static uint32_t wq_init_constrained_limit = 1;
125 static uint16_t wq_death_max_load;
126 static uint32_t wq_max_parallelism[WORKQ_NUM_QOS_BUCKETS];
127
128 #pragma mark sysctls
129
130 static int
131 workq_sysctl_handle_usecs SYSCTL_HANDLER_ARGS
132 {
133 #pragma unused(arg2)
134 struct workq_usec_var *v = arg1;
135 int error = sysctl_handle_int(oidp, &v->usecs, 0, req);
136 if (error || !req->newptr) {
137 return error;
138 }
139 clock_interval_to_absolutetime_interval(v->usecs, NSEC_PER_USEC,
140 &v->abstime);
141 return 0;
142 }
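/*
 * Illustrative note: every WORKQ_SYSCTL_USECS() instance declared above
 * (e.g. kern.wq_stalled_window_usecs) funnels through this handler, so
 * writing the sysctl updates the usecs value and recomputes the cached
 * mach absolute-time interval that the scheduling heuristics consume.
 */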
143
144 SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
145 &wq_max_threads, 0, "");
146
147 SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
148 &wq_max_constrained_threads, 0, "");
149
150 #pragma mark p_wqptr
151
152 #define WQPTR_IS_INITING_VALUE ((struct workqueue *)~(uintptr_t)0)
153
154 static struct workqueue *
155 proc_get_wqptr_fast(struct proc *p)
156 {
157 return os_atomic_load(&p->p_wqptr, relaxed);
158 }
159
160 static struct workqueue *
161 proc_get_wqptr(struct proc *p)
162 {
163 struct workqueue *wq = proc_get_wqptr_fast(p);
164 return wq == WQPTR_IS_INITING_VALUE ? NULL : wq;
165 }
166
167 static void
168 proc_set_wqptr(struct proc *p, struct workqueue *wq)
169 {
170 wq = os_atomic_xchg(&p->p_wqptr, wq, release);
171 if (wq == WQPTR_IS_INITING_VALUE) {
172 proc_lock(p);
173 thread_wakeup(&p->p_wqptr);
174 proc_unlock(p);
175 }
176 }
177
178 static bool
179 proc_init_wqptr_or_wait(struct proc *p)
180 {
181 struct workqueue *wq;
182
183 proc_lock(p);
184 wq = os_atomic_load(&p->p_wqptr, relaxed);
185
186 if (wq == NULL) {
187 os_atomic_store(&p->p_wqptr, WQPTR_IS_INITING_VALUE, relaxed);
188 proc_unlock(p);
189 return true;
190 }
191
192 if (wq == WQPTR_IS_INITING_VALUE) {
193 assert_wait(&p->p_wqptr, THREAD_UNINT);
194 proc_unlock(p);
195 thread_block(THREAD_CONTINUE_NULL);
196 } else {
197 proc_unlock(p);
198 }
199 return false;
200 }
201
202 static inline event_t
203 workq_parked_wait_event(struct uthread *uth)
204 {
205 return (event_t)&uth->uu_workq_stackaddr;
206 }
207
208 static inline void
209 workq_thread_wakeup(struct uthread *uth)
210 {
211 thread_wakeup_thread(workq_parked_wait_event(uth), uth->uu_thread);
212 }
213
214 #pragma mark wq_thactive
215
216 #if defined(__LP64__)
217 // Layout is:
218 // 127 - 115 : 13 bits of zeroes
219 // 114 - 112 : best QoS among all pending constrained requests
220 // 111 - 0 : MGR, AUI, UI, IN, DF, UT, BG+MT buckets every 16 bits
221 #define WQ_THACTIVE_BUCKET_WIDTH 16
222 #define WQ_THACTIVE_QOS_SHIFT (7 * WQ_THACTIVE_BUCKET_WIDTH)
223 #else
224 // Layout is:
225 // 63 - 61 : best QoS among all pending constrained requests
226 // 60 : Manager bucket (0 or 1)
227 // 59 - 0 : AUI, UI, IN, DF, UT, BG+MT buckets every 10 bits
228 #define WQ_THACTIVE_BUCKET_WIDTH 10
229 #define WQ_THACTIVE_QOS_SHIFT (6 * WQ_THACTIVE_BUCKET_WIDTH + 1)
230 #endif
231 #define WQ_THACTIVE_BUCKET_MASK ((1U << WQ_THACTIVE_BUCKET_WIDTH) - 1)
232 #define WQ_THACTIVE_BUCKET_HALF (1U << (WQ_THACTIVE_BUCKET_WIDTH - 1))
233
234 static_assert(sizeof(wq_thactive_t) * CHAR_BIT - WQ_THACTIVE_QOS_SHIFT >= 3,
235 "Make sure we have space to encode a QoS");
236
237 static inline wq_thactive_t
238 _wq_thactive(struct workqueue *wq)
239 {
240 return os_atomic_load_wide(&wq->wq_thactive, relaxed);
241 }
242
243 static inline int
244 _wq_bucket(thread_qos_t qos)
245 {
246 // Map both BG and MT to the same bucket by over-shifting down and
247 // clamping MT and BG together.
248 switch (qos) {
249 case THREAD_QOS_MAINTENANCE:
250 return 0;
251 default:
252 return qos - 2;
253 }
254 }
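/*
 * Illustrative mapping, assuming the usual mach/thread_policy.h QoS
 * numbering (MAINTENANCE == 1, BACKGROUND == 2, ..., USER_INTERACTIVE == 6):
 *   MT(1) -> bucket 0, BG(2) -> 0, UT(3) -> 1, DF(4) -> 2,
 *   IN(5) -> 3, UI(6) -> 4
 * i.e. maintenance and background share the lowest bucket.
 */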
255
256 #define WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(tha) \
257 ((tha) >> WQ_THACTIVE_QOS_SHIFT)
258
259 static inline thread_qos_t
260 _wq_thactive_best_constrained_req_qos(struct workqueue *wq)
261 {
262 // Avoid expensive atomic operations: the three bits we're loading are in
263 // a single byte, and always updated under the workqueue lock
264 wq_thactive_t v = *(wq_thactive_t *)&wq->wq_thactive;
265 return WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(v);
266 }
267
268 static void
269 _wq_thactive_refresh_best_constrained_req_qos(struct workqueue *wq)
270 {
271 thread_qos_t old_qos, new_qos;
272 workq_threadreq_t req;
273
274 req = priority_queue_max(&wq->wq_constrained_queue,
275 struct workq_threadreq_s, tr_entry);
276 new_qos = req ? req->tr_qos : THREAD_QOS_UNSPECIFIED;
277 old_qos = _wq_thactive_best_constrained_req_qos(wq);
278 if (old_qos != new_qos) {
279 long delta = (long)new_qos - (long)old_qos;
280 wq_thactive_t v = (wq_thactive_t)delta << WQ_THACTIVE_QOS_SHIFT;
281 /*
282 * We can do an atomic add relative to the initial load because updates
283 * to this qos are always serialized under the workqueue lock.
284 */
285 v = os_atomic_add(&wq->wq_thactive, v, relaxed);
286 #ifdef __LP64__
287 WQ_TRACE_WQ(TRACE_wq_thactive_update, wq, (uint64_t)v,
288 (uint64_t)(v >> 64), 0, 0);
289 #else
290 WQ_TRACE_WQ(TRACE_wq_thactive_update, wq, v, 0, 0, 0);
291 #endif
292 }
293 }
294
295 static inline wq_thactive_t
296 _wq_thactive_offset_for_qos(thread_qos_t qos)
297 {
298 return (wq_thactive_t)1 << (_wq_bucket(qos) * WQ_THACTIVE_BUCKET_WIDTH);
299 }
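/*
 * Example (LP64, illustrative): a thread going active at THREAD_QOS_UTILITY
 * maps to bucket 1, so _wq_thactive_inc() below adds (wq_thactive_t)1 << 16,
 * bumping only that 16-bit bucket counter while leaving the other buckets
 * and the best-constrained-request QoS bits untouched.
 */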
300
301 static inline wq_thactive_t
302 _wq_thactive_inc(struct workqueue *wq, thread_qos_t qos)
303 {
304 wq_thactive_t v = _wq_thactive_offset_for_qos(qos);
305 return os_atomic_add_orig(&wq->wq_thactive, v, relaxed);
306 }
307
308 static inline wq_thactive_t
309 _wq_thactive_dec(struct workqueue *wq, thread_qos_t qos)
310 {
311 wq_thactive_t v = _wq_thactive_offset_for_qos(qos);
312 return os_atomic_sub_orig(&wq->wq_thactive, v, relaxed);
313 }
314
315 static inline void
316 _wq_thactive_move(struct workqueue *wq,
317 thread_qos_t old_qos, thread_qos_t new_qos)
318 {
319 wq_thactive_t v = _wq_thactive_offset_for_qos(new_qos) -
320 _wq_thactive_offset_for_qos(old_qos);
321 os_atomic_add(&wq->wq_thactive, v, relaxed);
322 wq->wq_thscheduled_count[_wq_bucket(old_qos)]--;
323 wq->wq_thscheduled_count[_wq_bucket(new_qos)]++;
324 }
325
326 static inline uint32_t
327 _wq_thactive_aggregate_downto_qos(struct workqueue *wq, wq_thactive_t v,
328 thread_qos_t qos, uint32_t *busycount, uint32_t *max_busycount)
329 {
330 uint32_t count = 0, active;
331 uint64_t curtime;
332
333 assert(WORKQ_THREAD_QOS_MIN <= qos && qos <= WORKQ_THREAD_QOS_MAX);
334
335 if (busycount) {
336 curtime = mach_absolute_time();
337 *busycount = 0;
338 }
339 if (max_busycount) {
340 *max_busycount = THREAD_QOS_LAST - qos;
341 }
342
343 int i = _wq_bucket(qos);
344 v >>= i * WQ_THACTIVE_BUCKET_WIDTH;
345 for (; i < WORKQ_NUM_QOS_BUCKETS; i++, v >>= WQ_THACTIVE_BUCKET_WIDTH) {
346 active = v & WQ_THACTIVE_BUCKET_MASK;
347 count += active;
348
349 if (busycount && wq->wq_thscheduled_count[i] > active) {
350 if (workq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i])) {
351 /*
352 * We only consider the last blocked thread for a given bucket
353 * as busy because we don't want to take the list lock in each
354 * sched callback. However this is an approximation that could
355 * contribute to thread creation storms.
356 */
357 (*busycount)++;
358 }
359 }
360 }
361
362 return count;
363 }
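/*
 * Worked example (illustrative): with 2 threads active in the UI bucket and
 * 1 in the IN bucket, aggregating down to THREAD_QOS_USER_INITIATED starts
 * at the IN bucket and sums every bucket at or above it, returning 3. The
 * optional busycount additionally counts buckets whose scheduled count
 * exceeds their active count and whose last blocked thread blocked recently.
 */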
364
365 #pragma mark wq_flags
366
367 static inline uint32_t
368 _wq_flags(struct workqueue *wq)
369 {
370 return os_atomic_load(&wq->wq_flags, relaxed);
371 }
372
373 static inline bool
374 _wq_exiting(struct workqueue *wq)
375 {
376 return _wq_flags(wq) & WQ_EXITING;
377 }
378
379 bool
380 workq_is_exiting(struct proc *p)
381 {
382 struct workqueue *wq = proc_get_wqptr(p);
383 return !wq || _wq_exiting(wq);
384 }
385
386 #pragma mark workqueue lock
387
388 static bool
389 workq_lock_spin_is_acquired_kdp(struct workqueue *wq)
390 {
391 return kdp_lck_spin_is_acquired(&wq->wq_lock);
392 }
393
394 static inline void
395 workq_lock_spin(struct workqueue *wq)
396 {
397 lck_spin_lock_grp(&wq->wq_lock, workq_lck_grp);
398 }
399
400 static inline void
401 workq_lock_held(__assert_only struct workqueue *wq)
402 {
403 LCK_SPIN_ASSERT(&wq->wq_lock, LCK_ASSERT_OWNED);
404 }
405
406 static inline bool
407 workq_lock_try(struct workqueue *wq)
408 {
409 return lck_spin_try_lock_grp(&wq->wq_lock, workq_lck_grp);
410 }
411
412 static inline void
413 workq_unlock(struct workqueue *wq)
414 {
415 lck_spin_unlock(&wq->wq_lock);
416 }
417
418 #pragma mark idle thread lists
419
420 #define WORKQ_POLICY_INIT(qos) \
421 (struct uu_workq_policy){ .qos_req = qos, .qos_bucket = qos }
422
423 static inline thread_qos_t
424 workq_pri_bucket(struct uu_workq_policy req)
425 {
426 return MAX(MAX(req.qos_req, req.qos_max), req.qos_override);
427 }
428
429 static inline thread_qos_t
430 workq_pri_override(struct uu_workq_policy req)
431 {
432 return MAX(workq_pri_bucket(req), req.qos_bucket);
433 }
434
435 static inline bool
436 workq_thread_needs_params_change(workq_threadreq_t req, struct uthread *uth)
437 {
438 workq_threadreq_param_t cur_trp, req_trp = { };
439
440 cur_trp.trp_value = uth->uu_save.uus_workq_park_data.workloop_params;
441 if (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS) {
442 req_trp = kqueue_threadreq_workloop_param(req);
443 }
444
445 /*
446 * CPU percent flags are handled separately to policy changes, so ignore
447 * them for all of these checks.
448 */
449 uint16_t cur_flags = (cur_trp.trp_flags & ~TRP_CPUPERCENT);
450 uint16_t req_flags = (req_trp.trp_flags & ~TRP_CPUPERCENT);
451
452 if (!req_flags && !cur_flags) {
453 return false;
454 }
455
456 if (req_flags != cur_flags) {
457 return true;
458 }
459
460 if ((req_flags & TRP_PRIORITY) && req_trp.trp_pri != cur_trp.trp_pri) {
461 return true;
462 }
463
464 if ((req_flags & TRP_POLICY) && req_trp.trp_pol != cur_trp.trp_pol) {
465 return true;
466 }
467
468 return false;
469 }
470
471 static inline bool
472 workq_thread_needs_priority_change(workq_threadreq_t req, struct uthread *uth)
473 {
474 if (workq_thread_needs_params_change(req, uth)) {
475 return true;
476 }
477
478 return req->tr_qos != workq_pri_override(uth->uu_workq_pri);
479 }
480
481 static void
482 workq_thread_update_bucket(proc_t p, struct workqueue *wq, struct uthread *uth,
483 struct uu_workq_policy old_pri, struct uu_workq_policy new_pri,
484 bool force_run)
485 {
486 thread_qos_t old_bucket = old_pri.qos_bucket;
487 thread_qos_t new_bucket = workq_pri_bucket(new_pri);
488
489 if (old_bucket != new_bucket) {
490 _wq_thactive_move(wq, old_bucket, new_bucket);
491 }
492
493 new_pri.qos_bucket = new_bucket;
494 uth->uu_workq_pri = new_pri;
495
496 if (workq_pri_override(old_pri) != new_bucket) {
497 thread_set_workq_override(uth->uu_thread, new_bucket);
498 }
499
500 if (wq->wq_reqcount && (old_bucket > new_bucket || force_run)) {
501 int flags = WORKQ_THREADREQ_CAN_CREATE_THREADS;
502 if (old_bucket > new_bucket) {
503 /*
504 * When lowering our bucket, we may unblock a thread request,
505 * but we can't drop our priority before we have evaluated
506 * whether this is the case, and if we ever drop the workqueue lock
507 * that would cause a priority inversion.
508 *
509 * We hence have to disallow thread creation in that case.
510 */
511 flags = 0;
512 }
513 workq_schedule_creator(p, wq, flags);
514 }
515 }
516
517 /*
518 * Sets/resets the cpu percent limits on the current thread. We can't set
519 * these limits from outside of the current thread, so this function needs
520 * to be called when we're executing on the intended thread.
521 */
522 static void
523 workq_thread_reset_cpupercent(workq_threadreq_t req, struct uthread *uth)
524 {
525 assert(uth == current_uthread());
526 workq_threadreq_param_t trp = { };
527
528 if (req && (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS)) {
529 trp = kqueue_threadreq_workloop_param(req);
530 }
531
532 if (uth->uu_workq_flags & UT_WORKQ_CPUPERCENT) {
533 /*
534 * Going through disable when we have an existing CPU percent limit
535 * set will force the ledger to refill the token bucket of the current
536 * thread. Removing any penalty applied by previous thread use.
537 */
538 thread_set_cpulimit(THREAD_CPULIMIT_DISABLE, 0, 0);
539 uth->uu_workq_flags &= ~UT_WORKQ_CPUPERCENT;
540 }
541
542 if (trp.trp_flags & TRP_CPUPERCENT) {
543 thread_set_cpulimit(THREAD_CPULIMIT_BLOCK, trp.trp_cpupercent,
544 (uint64_t)trp.trp_refillms * NSEC_PER_SEC);
545 uth->uu_workq_flags |= UT_WORKQ_CPUPERCENT;
546 }
547 }
548
549 static void
550 workq_thread_reset_pri(struct workqueue *wq, struct uthread *uth,
551 workq_threadreq_t req, bool unpark)
552 {
553 thread_t th = uth->uu_thread;
554 thread_qos_t qos = req ? req->tr_qos : WORKQ_THREAD_QOS_CLEANUP;
555 workq_threadreq_param_t trp = { };
556 int priority = 31;
557 int policy = POLICY_TIMESHARE;
558
559 if (req && (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS)) {
560 trp = kqueue_threadreq_workloop_param(req);
561 }
562
563 uth->uu_workq_pri = WORKQ_POLICY_INIT(qos);
564 uth->uu_workq_flags &= ~UT_WORKQ_OUTSIDE_QOS;
565
566 if (unpark) {
567 uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value;
568 // qos sent out to userspace (may differ from uu_workq_pri on param threads)
569 uth->uu_save.uus_workq_park_data.qos = qos;
570 }
571
572 if (qos == WORKQ_THREAD_QOS_MANAGER) {
573 uint32_t mgr_pri = wq->wq_event_manager_priority;
574 assert(trp.trp_value == 0); // manager qos and thread policy don't mix
575
576 if (mgr_pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) {
577 mgr_pri &= _PTHREAD_PRIORITY_SCHED_PRI_MASK;
578 thread_set_workq_pri(th, THREAD_QOS_UNSPECIFIED, mgr_pri,
579 POLICY_TIMESHARE);
580 return;
581 }
582
583 qos = _pthread_priority_thread_qos(mgr_pri);
584 } else {
585 if (trp.trp_flags & TRP_PRIORITY) {
586 qos = THREAD_QOS_UNSPECIFIED;
587 priority = trp.trp_pri;
588 uth->uu_workq_flags |= UT_WORKQ_OUTSIDE_QOS;
589 }
590
591 if (trp.trp_flags & TRP_POLICY) {
592 policy = trp.trp_pol;
593 }
594 }
595
596 thread_set_workq_pri(th, qos, priority, policy);
597 }
598
599 /*
600 * Called by kevent with the NOTE_WL_THREAD_REQUEST knote lock held,
601 * every time a servicer is being told about a new max QoS.
602 */
603 void
604 workq_thread_set_max_qos(struct proc *p, workq_threadreq_t kqr)
605 {
606 struct uu_workq_policy old_pri, new_pri;
607 struct uthread *uth = current_uthread();
608 struct workqueue *wq = proc_get_wqptr_fast(p);
609 thread_qos_t qos = kqr->tr_kq_qos_index;
610
611 if (uth->uu_workq_pri.qos_max == qos) {
612 return;
613 }
614
615 workq_lock_spin(wq);
616 old_pri = new_pri = uth->uu_workq_pri;
617 new_pri.qos_max = qos;
618 workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false);
619 workq_unlock(wq);
620 }
621
622 #pragma mark idle threads accounting and handling
623
624 static inline struct uthread *
625 workq_oldest_killable_idle_thread(struct workqueue *wq)
626 {
627 struct uthread *uth = TAILQ_LAST(&wq->wq_thidlelist, workq_uthread_head);
628
629 if (uth && !uth->uu_save.uus_workq_park_data.has_stack) {
630 uth = TAILQ_PREV(uth, workq_uthread_head, uu_workq_entry);
631 if (uth) {
632 assert(uth->uu_save.uus_workq_park_data.has_stack);
633 }
634 }
635 return uth;
636 }
637
638 static inline uint64_t
639 workq_kill_delay_for_idle_thread(struct workqueue *wq)
640 {
641 uint64_t delay = wq_reduce_pool_window.abstime;
642 uint16_t idle = wq->wq_thidlecount;
643
644 /*
645 * If we have less than wq_death_max_load threads, have a 5s timer.
646 *
647 * For the next wq_max_constrained_threads ones, decay linearly from
648 * 5s to 50ms.
649 */
650 if (idle <= wq_death_max_load) {
651 return delay;
652 }
653
654 if (wq_max_constrained_threads > idle - wq_death_max_load) {
655 delay *= (wq_max_constrained_threads - (idle - wq_death_max_load));
656 }
657 return delay / wq_max_constrained_threads;
658 }
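/*
 * Worked example with hypothetical constants (wq_reduce_pool_window = 5s,
 * wq_death_max_load = 4, wq_max_constrained_threads = 64): at 20 idle
 * threads the excess is 16, so the delay becomes 5s * (64 - 16) / 64 = 3.75s;
 * once the excess reaches 64 the multiply is skipped and the delay bottoms
 * out at 5s / 64.
 */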
659
660 static inline bool
661 workq_should_kill_idle_thread(struct workqueue *wq, struct uthread *uth,
662 uint64_t now)
663 {
664 uint64_t delay = workq_kill_delay_for_idle_thread(wq);
665 return now - uth->uu_save.uus_workq_park_data.idle_stamp > delay;
666 }
667
668 static void
669 workq_death_call_schedule(struct workqueue *wq, uint64_t deadline)
670 {
671 uint32_t wq_flags = os_atomic_load(&wq->wq_flags, relaxed);
672
673 if (wq_flags & (WQ_EXITING | WQ_DEATH_CALL_SCHEDULED)) {
674 return;
675 }
676 os_atomic_or(&wq->wq_flags, WQ_DEATH_CALL_SCHEDULED, relaxed);
677
678 WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_NONE, wq, 1, 0, 0, 0);
679
680 /*
681 * <rdar://problem/13139182> Due to how long term timers work, the leeway
682 * can't be too short, so use 500ms which is long enough that we will not
683 * wake up the CPU for killing threads, but short enough that it doesn't
684 * fall into long-term timer list shenanigans.
685 */
686 thread_call_enter_delayed_with_leeway(wq->wq_death_call, NULL, deadline,
687 wq_reduce_pool_window.abstime / 10,
688 THREAD_CALL_DELAY_LEEWAY | THREAD_CALL_DELAY_USER_BACKGROUND);
689 }
690
691 /*
692 * `decrement` is set to the number of threads that are no longer dying:
693 * - because they have been resuscitated just in time (workq_pop_idle_thread)
694 * - or have been killed (workq_thread_terminate).
695 */
696 static void
697 workq_death_policy_evaluate(struct workqueue *wq, uint16_t decrement)
698 {
699 struct uthread *uth;
700
701 assert(wq->wq_thdying_count >= decrement);
702 if ((wq->wq_thdying_count -= decrement) > 0) {
703 return;
704 }
705
706 if (wq->wq_thidlecount <= 1) {
707 return;
708 }
709
710 if ((uth = workq_oldest_killable_idle_thread(wq)) == NULL) {
711 return;
712 }
713
714 uint64_t now = mach_absolute_time();
715 uint64_t delay = workq_kill_delay_for_idle_thread(wq);
716
717 if (now - uth->uu_save.uus_workq_park_data.idle_stamp > delay) {
718 WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_START,
719 wq, wq->wq_thidlecount, 0, 0, 0);
720 wq->wq_thdying_count++;
721 uth->uu_workq_flags |= UT_WORKQ_DYING;
722 if ((uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) == 0) {
723 workq_thread_wakeup(uth);
724 }
725 return;
726 }
727
728 workq_death_call_schedule(wq,
729 uth->uu_save.uus_workq_park_data.idle_stamp + delay);
730 }
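/*
 * Illustrative flow of this timer-driven path: once wq_thdying_count drains
 * back to zero and more than one idle thread remains, the oldest killable
 * idle thread is either woken up to die now (if it has been idle longer than
 * the computed delay) or the death call is rescheduled for when it will be.
 */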
731
732 void
733 workq_thread_terminate(struct proc *p, struct uthread *uth)
734 {
735 struct workqueue *wq = proc_get_wqptr_fast(p);
736
737 workq_lock_spin(wq);
738 TAILQ_REMOVE(&wq->wq_thrunlist, uth, uu_workq_entry);
739 if (uth->uu_workq_flags & UT_WORKQ_DYING) {
740 WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_END,
741 wq, wq->wq_thidlecount, 0, 0, 0);
742 workq_death_policy_evaluate(wq, 1);
743 }
744 if (wq->wq_nthreads-- == wq_max_threads) {
745 /*
746 * We got under the thread limit again, which may have prevented
747 * thread creation from happening, redrive if there are pending requests
748 */
749 if (wq->wq_reqcount) {
750 workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
751 }
752 }
753 workq_unlock(wq);
754
755 thread_deallocate(uth->uu_thread);
756 }
757
758 static void
759 workq_kill_old_threads_call(void *param0, void *param1 __unused)
760 {
761 struct workqueue *wq = param0;
762
763 workq_lock_spin(wq);
764 WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_START, wq, 0, 0, 0, 0);
765 os_atomic_andnot(&wq->wq_flags, WQ_DEATH_CALL_SCHEDULED, relaxed);
766 workq_death_policy_evaluate(wq, 0);
767 WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_END, wq, 0, 0, 0, 0);
768 workq_unlock(wq);
769 }
770
771 static struct uthread *
772 workq_pop_idle_thread(struct workqueue *wq, uint8_t uu_flags,
773 bool *needs_wakeup)
774 {
775 struct uthread *uth;
776
777 if ((uth = TAILQ_FIRST(&wq->wq_thidlelist))) {
778 TAILQ_REMOVE(&wq->wq_thidlelist, uth, uu_workq_entry);
779 } else {
780 uth = TAILQ_FIRST(&wq->wq_thnewlist);
781 TAILQ_REMOVE(&wq->wq_thnewlist, uth, uu_workq_entry);
782 }
783 TAILQ_INSERT_TAIL(&wq->wq_thrunlist, uth, uu_workq_entry);
784
785 assert((uth->uu_workq_flags & UT_WORKQ_RUNNING) == 0);
786 uth->uu_workq_flags |= UT_WORKQ_RUNNING | uu_flags;
787 if ((uu_flags & UT_WORKQ_OVERCOMMIT) == 0) {
788 wq->wq_constrained_threads_scheduled++;
789 }
790 wq->wq_threads_scheduled++;
791 wq->wq_thidlecount--;
792
793 if (__improbable(uth->uu_workq_flags & UT_WORKQ_DYING)) {
794 uth->uu_workq_flags ^= UT_WORKQ_DYING;
795 workq_death_policy_evaluate(wq, 1);
796 *needs_wakeup = false;
797 } else if (uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) {
798 *needs_wakeup = false;
799 } else {
800 *needs_wakeup = true;
801 }
802 return uth;
803 }
804
805 /*
806 * Called by thread_create_workq_waiting() during thread initialization, before
807 * assert_wait, before the thread has been started.
808 */
809 event_t
810 workq_thread_init_and_wq_lock(task_t task, thread_t th)
811 {
812 struct uthread *uth = get_bsdthread_info(th);
813
814 uth->uu_workq_flags = UT_WORKQ_NEW;
815 uth->uu_workq_pri = WORKQ_POLICY_INIT(THREAD_QOS_LEGACY);
816 uth->uu_workq_thport = MACH_PORT_NULL;
817 uth->uu_workq_stackaddr = 0;
818 uth->uu_workq_pthread_kill_allowed = 0;
819
820 thread_set_tag(th, THREAD_TAG_PTHREAD | THREAD_TAG_WORKQUEUE);
821 thread_reset_workq_qos(th, THREAD_QOS_LEGACY);
822
823 workq_lock_spin(proc_get_wqptr_fast(get_bsdtask_info(task)));
824 return workq_parked_wait_event(uth);
825 }
826
827 /**
828 * Try to add a new workqueue thread.
829 *
830 * - called with workq lock held
831 * - dropped and retaken around thread creation
832 * - return with workq lock held
833 */
834 static bool
835 workq_add_new_idle_thread(proc_t p, struct workqueue *wq)
836 {
837 mach_vm_offset_t th_stackaddr;
838 kern_return_t kret;
839 thread_t th;
840
841 wq->wq_nthreads++;
842
843 workq_unlock(wq);
844
845 vm_map_t vmap = get_task_map(p->task);
846
847 kret = pthread_functions->workq_create_threadstack(p, vmap, &th_stackaddr);
848 if (kret != KERN_SUCCESS) {
849 WQ_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq,
850 kret, 1, 0, 0);
851 goto out;
852 }
853
854 kret = thread_create_workq_waiting(p->task, workq_unpark_continue, &th);
855 if (kret != KERN_SUCCESS) {
856 WQ_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq,
857 kret, 0, 0, 0);
858 pthread_functions->workq_destroy_threadstack(p, vmap, th_stackaddr);
859 goto out;
860 }
861
862 // thread_create_workq_waiting() will return with the wq lock held
863 // on success, because it calls workq_thread_init_and_wq_lock() above
864
865 struct uthread *uth = get_bsdthread_info(th);
866
867 wq->wq_creations++;
868 wq->wq_thidlecount++;
869 uth->uu_workq_stackaddr = th_stackaddr;
870 TAILQ_INSERT_TAIL(&wq->wq_thnewlist, uth, uu_workq_entry);
871
872 WQ_TRACE_WQ(TRACE_wq_thread_create | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
873 return true;
874
875 out:
876 workq_lock_spin(wq);
877 /*
878 * Do not redrive here if we went under wq_max_threads again,
879 * it is the responsibility of the callers of this function
880 * to do so when it fails.
881 */
882 wq->wq_nthreads--;
883 return false;
884 }
885
886 #define WORKQ_UNPARK_FOR_DEATH_WAS_IDLE 0x1
887
888 __attribute__((noreturn, noinline))
889 static void
890 workq_unpark_for_death_and_unlock(proc_t p, struct workqueue *wq,
891 struct uthread *uth, uint32_t death_flags, uint32_t setup_flags)
892 {
893 thread_qos_t qos = workq_pri_override(uth->uu_workq_pri);
894 bool first_use = uth->uu_workq_flags & UT_WORKQ_NEW;
895
896 if (qos > WORKQ_THREAD_QOS_CLEANUP) {
897 workq_thread_reset_pri(wq, uth, NULL, /*unpark*/ true);
898 qos = WORKQ_THREAD_QOS_CLEANUP;
899 }
900
901 workq_thread_reset_cpupercent(NULL, uth);
902
903 if (death_flags & WORKQ_UNPARK_FOR_DEATH_WAS_IDLE) {
904 wq->wq_thidlecount--;
905 if (first_use) {
906 TAILQ_REMOVE(&wq->wq_thnewlist, uth, uu_workq_entry);
907 } else {
908 TAILQ_REMOVE(&wq->wq_thidlelist, uth, uu_workq_entry);
909 }
910 }
911 TAILQ_INSERT_TAIL(&wq->wq_thrunlist, uth, uu_workq_entry);
912
913 workq_unlock(wq);
914
915 if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) {
916 __assert_only kern_return_t kr;
917 kr = thread_set_voucher_name(MACH_PORT_NULL);
918 assert(kr == KERN_SUCCESS);
919 }
920
921 uint32_t flags = WQ_FLAG_THREAD_NEWSPI | qos | WQ_FLAG_THREAD_PRIO_QOS;
922 thread_t th = uth->uu_thread;
923 vm_map_t vmap = get_task_map(p->task);
924
925 if (!first_use) {
926 flags |= WQ_FLAG_THREAD_REUSE;
927 }
928
929 pthread_functions->workq_setup_thread(p, th, vmap, uth->uu_workq_stackaddr,
930 uth->uu_workq_thport, 0, WQ_SETUP_EXIT_THREAD, flags);
931 __builtin_unreachable();
932 }
933
934 bool
935 workq_is_current_thread_updating_turnstile(struct workqueue *wq)
936 {
937 return wq->wq_turnstile_updater == current_thread();
938 }
939
940 __attribute__((always_inline))
941 static inline void
942 workq_perform_turnstile_operation_locked(struct workqueue *wq,
943 void (^operation)(void))
944 {
945 workq_lock_held(wq);
946 wq->wq_turnstile_updater = current_thread();
947 operation();
948 wq->wq_turnstile_updater = THREAD_NULL;
949 }
950
951 static void
952 workq_turnstile_update_inheritor(struct workqueue *wq,
953 turnstile_inheritor_t inheritor,
954 turnstile_update_flags_t flags)
955 {
956 if (wq->wq_inheritor == inheritor) {
957 return;
958 }
959 wq->wq_inheritor = inheritor;
960 workq_perform_turnstile_operation_locked(wq, ^{
961 turnstile_update_inheritor(wq->wq_turnstile, inheritor,
962 flags | TURNSTILE_IMMEDIATE_UPDATE);
963 turnstile_update_inheritor_complete(wq->wq_turnstile,
964 TURNSTILE_INTERLOCK_HELD);
965 });
966 }
967
968 static void
969 workq_push_idle_thread(proc_t p, struct workqueue *wq, struct uthread *uth,
970 uint32_t setup_flags)
971 {
972 uint64_t now = mach_absolute_time();
973 bool is_creator = (uth == wq->wq_creator);
974
975 if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) {
976 wq->wq_constrained_threads_scheduled--;
977 }
978 uth->uu_workq_flags &= ~(UT_WORKQ_RUNNING | UT_WORKQ_OVERCOMMIT);
979 TAILQ_REMOVE(&wq->wq_thrunlist, uth, uu_workq_entry);
980 wq->wq_threads_scheduled--;
981
982 if (is_creator) {
983 wq->wq_creator = NULL;
984 WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 3, 0,
985 uth->uu_save.uus_workq_park_data.yields, 0);
986 }
987
988 if (wq->wq_inheritor == uth->uu_thread) {
989 assert(wq->wq_creator == NULL);
990 if (wq->wq_reqcount) {
991 workq_turnstile_update_inheritor(wq, wq, TURNSTILE_INHERITOR_WORKQ);
992 } else {
993 workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
994 }
995 }
996
997 if (uth->uu_workq_flags & UT_WORKQ_NEW) {
998 assert(is_creator || (_wq_flags(wq) & WQ_EXITING));
999 TAILQ_INSERT_TAIL(&wq->wq_thnewlist, uth, uu_workq_entry);
1000 wq->wq_thidlecount++;
1001 return;
1002 }
1003
1004 if (!is_creator) {
1005 _wq_thactive_dec(wq, uth->uu_workq_pri.qos_bucket);
1006 wq->wq_thscheduled_count[_wq_bucket(uth->uu_workq_pri.qos_bucket)]--;
1007 uth->uu_workq_flags |= UT_WORKQ_IDLE_CLEANUP;
1008 }
1009
1010 uth->uu_save.uus_workq_park_data.idle_stamp = now;
1011
1012 struct uthread *oldest = workq_oldest_killable_idle_thread(wq);
1013 uint16_t cur_idle = wq->wq_thidlecount;
1014
1015 if (cur_idle >= wq_max_constrained_threads ||
1016 (wq->wq_thdying_count == 0 && oldest &&
1017 workq_should_kill_idle_thread(wq, oldest, now))) {
1018 /*
1019 * Immediately kill threads if we have too many of them.
1020 *
1021 * And swap "place" with the oldest one we'd have woken up.
1022 * This is a relatively desperate situation where we really
1023 * need to kill threads quickly, and it's better to kill the one
1024 * that's currently on core than to context switch to another.
1025 */
1026 if (oldest) {
1027 oldest->uu_save.uus_workq_park_data.idle_stamp = now;
1028 TAILQ_REMOVE(&wq->wq_thidlelist, oldest, uu_workq_entry);
1029 TAILQ_INSERT_HEAD(&wq->wq_thidlelist, oldest, uu_workq_entry);
1030 }
1031
1032 WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_START,
1033 wq, cur_idle, 0, 0, 0);
1034 wq->wq_thdying_count++;
1035 uth->uu_workq_flags |= UT_WORKQ_DYING;
1036 uth->uu_workq_flags &= ~UT_WORKQ_IDLE_CLEANUP;
1037 workq_unpark_for_death_and_unlock(p, wq, uth, 0, setup_flags);
1038 __builtin_unreachable();
1039 }
1040
1041 struct uthread *tail = TAILQ_LAST(&wq->wq_thidlelist, workq_uthread_head);
1042
1043 cur_idle += 1;
1044 wq->wq_thidlecount = cur_idle;
1045
1046 if (cur_idle >= wq_death_max_load && tail &&
1047 tail->uu_save.uus_workq_park_data.has_stack) {
1048 uth->uu_save.uus_workq_park_data.has_stack = false;
1049 TAILQ_INSERT_TAIL(&wq->wq_thidlelist, uth, uu_workq_entry);
1050 } else {
1051 uth->uu_save.uus_workq_park_data.has_stack = true;
1052 TAILQ_INSERT_HEAD(&wq->wq_thidlelist, uth, uu_workq_entry);
1053 }
1054
1055 if (!tail) {
1056 uint64_t delay = workq_kill_delay_for_idle_thread(wq);
1057 workq_death_call_schedule(wq, now + delay);
1058 }
1059 }
1060
1061 #pragma mark thread requests
1062
1063 static inline int
1064 workq_priority_for_req(workq_threadreq_t req)
1065 {
1066 thread_qos_t qos = req->tr_qos;
1067
1068 if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
1069 workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(req);
1070 assert(trp.trp_flags & TRP_PRIORITY);
1071 return trp.trp_pri;
1072 }
1073 return thread_workq_pri_for_qos(qos);
1074 }
1075
1076 static inline struct priority_queue *
1077 workq_priority_queue_for_req(struct workqueue *wq, workq_threadreq_t req)
1078 {
1079 if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
1080 return &wq->wq_special_queue;
1081 } else if (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
1082 return &wq->wq_overcommit_queue;
1083 } else {
1084 return &wq->wq_constrained_queue;
1085 }
1086 }
1087
1088 /*
1089 * returns true if the enqueued request is the highest priority item
1090 * in its priority queue.
1091 */
1092 static bool
1093 workq_threadreq_enqueue(struct workqueue *wq, workq_threadreq_t req)
1094 {
1095 assert(req->tr_state == WORKQ_TR_STATE_NEW);
1096
1097 req->tr_state = WORKQ_TR_STATE_QUEUED;
1098 wq->wq_reqcount += req->tr_count;
1099
1100 if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
1101 assert(wq->wq_event_manager_threadreq == NULL);
1102 assert(req->tr_flags & WORKQ_TR_FLAG_KEVENT);
1103 assert(req->tr_count == 1);
1104 wq->wq_event_manager_threadreq = req;
1105 return true;
1106 }
1107 if (priority_queue_insert(workq_priority_queue_for_req(wq, req),
1108 &req->tr_entry, workq_priority_for_req(req),
1109 PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
1110 if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
1111 _wq_thactive_refresh_best_constrained_req_qos(wq);
1112 }
1113 return true;
1114 }
1115 return false;
1116 }
1117
1118 /*
1119 * returns true if the dequeued request was the highest priority item
1120 * in its priority queue.
1121 */
1122 static bool
1123 workq_threadreq_dequeue(struct workqueue *wq, workq_threadreq_t req)
1124 {
1125 wq->wq_reqcount--;
1126
1127 if (--req->tr_count == 0) {
1128 if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
1129 assert(wq->wq_event_manager_threadreq == req);
1130 assert(req->tr_count == 0);
1131 wq->wq_event_manager_threadreq = NULL;
1132 return true;
1133 }
1134 if (priority_queue_remove(workq_priority_queue_for_req(wq, req),
1135 &req->tr_entry, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
1136 if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
1137 _wq_thactive_refresh_best_constrained_req_qos(wq);
1138 }
1139 return true;
1140 }
1141 }
1142 return false;
1143 }
1144
1145 static void
1146 workq_threadreq_destroy(proc_t p, workq_threadreq_t req)
1147 {
1148 req->tr_state = WORKQ_TR_STATE_CANCELED;
1149 if (req->tr_flags & (WORKQ_TR_FLAG_WORKLOOP | WORKQ_TR_FLAG_KEVENT)) {
1150 kqueue_threadreq_cancel(p, req);
1151 } else {
1152 zfree(workq_zone_threadreq, req);
1153 }
1154 }
1155
1156 #pragma mark workqueue thread creation thread calls
1157
1158 static inline bool
1159 workq_thread_call_prepost(struct workqueue *wq, uint32_t sched, uint32_t pend,
1160 uint32_t fail_mask)
1161 {
1162 uint32_t old_flags, new_flags;
1163
1164 os_atomic_rmw_loop(&wq->wq_flags, old_flags, new_flags, acquire, {
1165 if (__improbable(old_flags & (WQ_EXITING | sched | pend | fail_mask))) {
1166 os_atomic_rmw_loop_give_up(return false);
1167 }
1168 if (__improbable(old_flags & WQ_PROC_SUSPENDED)) {
1169 new_flags = old_flags | pend;
1170 } else {
1171 new_flags = old_flags | sched;
1172 }
1173 });
1174
1175 return (old_flags & WQ_PROC_SUSPENDED) == 0;
1176 }
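/*
 * Illustrative: fail_mask lets a caller treat an already scheduled immediate
 * call as superseding a delayed one. If the process is suspended, only the
 * "pend" bit is recorded so workq_proc_resumed() can redrive the thread call
 * later; otherwise the "sched" bit is set and the caller enqueues it now.
 */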
1177
1178 #define WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART 0x1
1179
1180 static bool
1181 workq_schedule_delayed_thread_creation(struct workqueue *wq, int flags)
1182 {
1183 assert(!preemption_enabled());
1184
1185 if (!workq_thread_call_prepost(wq, WQ_DELAYED_CALL_SCHEDULED,
1186 WQ_DELAYED_CALL_PENDED, WQ_IMMEDIATE_CALL_PENDED |
1187 WQ_IMMEDIATE_CALL_SCHEDULED)) {
1188 return false;
1189 }
1190
1191 uint64_t now = mach_absolute_time();
1192
1193 if (flags & WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART) {
1194 /* do not change the window */
1195 } else if (now - wq->wq_thread_call_last_run <= wq->wq_timer_interval) {
1196 wq->wq_timer_interval *= 2;
1197 if (wq->wq_timer_interval > wq_max_timer_interval.abstime) {
1198 wq->wq_timer_interval = wq_max_timer_interval.abstime;
1199 }
1200 } else if (now - wq->wq_thread_call_last_run > 2 * wq->wq_timer_interval) {
1201 wq->wq_timer_interval /= 2;
1202 if (wq->wq_timer_interval < wq_stalled_window.abstime) {
1203 wq->wq_timer_interval = wq_stalled_window.abstime;
1204 }
1205 }
1206
1207 WQ_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1208 _wq_flags(wq), wq->wq_timer_interval, 0);
1209
1210 thread_call_t call = wq->wq_delayed_call;
1211 uintptr_t arg = WQ_DELAYED_CALL_SCHEDULED;
1212 uint64_t deadline = now + wq->wq_timer_interval;
1213 if (thread_call_enter1_delayed(call, (void *)arg, deadline)) {
1214 panic("delayed_call was already enqueued");
1215 }
1216 return true;
1217 }
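/*
 * Illustrative behavior: when this fires again within wq_timer_interval of
 * the previous run, the interval doubles (capped at wq_max_timer_interval);
 * when more than two intervals have elapsed since the last run, it halves
 * (floored at wq_stalled_window). The RESTART flag, used on process resume,
 * reschedules the call without touching the window.
 */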
1218
1219 static void
1220 workq_schedule_immediate_thread_creation(struct workqueue *wq)
1221 {
1222 assert(!preemption_enabled());
1223
1224 if (workq_thread_call_prepost(wq, WQ_IMMEDIATE_CALL_SCHEDULED,
1225 WQ_IMMEDIATE_CALL_PENDED, 0)) {
1226 WQ_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1227 _wq_flags(wq), 0, 0);
1228
1229 uintptr_t arg = WQ_IMMEDIATE_CALL_SCHEDULED;
1230 if (thread_call_enter1(wq->wq_immediate_call, (void *)arg)) {
1231 panic("immediate_call was already enqueued");
1232 }
1233 }
1234 }
1235
1236 void
1237 workq_proc_suspended(struct proc *p)
1238 {
1239 struct workqueue *wq = proc_get_wqptr(p);
1240
1241 if (wq) {
1242 os_atomic_or(&wq->wq_flags, WQ_PROC_SUSPENDED, relaxed);
1243 }
1244 }
1245
1246 void
1247 workq_proc_resumed(struct proc *p)
1248 {
1249 struct workqueue *wq = proc_get_wqptr(p);
1250 uint32_t wq_flags;
1251
1252 if (!wq) {
1253 return;
1254 }
1255
1256 wq_flags = os_atomic_andnot_orig(&wq->wq_flags, WQ_PROC_SUSPENDED |
1257 WQ_DELAYED_CALL_PENDED | WQ_IMMEDIATE_CALL_PENDED, relaxed);
1258 if ((wq_flags & WQ_EXITING) == 0) {
1259 disable_preemption();
1260 if (wq_flags & WQ_IMMEDIATE_CALL_PENDED) {
1261 workq_schedule_immediate_thread_creation(wq);
1262 } else if (wq_flags & WQ_DELAYED_CALL_PENDED) {
1263 workq_schedule_delayed_thread_creation(wq,
1264 WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART);
1265 }
1266 enable_preemption();
1267 }
1268 }
1269
1270 /**
1271 * returns whether lastblocked_tsp is within wq_stalled_window usecs of now
1272 */
1273 static bool
1274 workq_thread_is_busy(uint64_t now, _Atomic uint64_t *lastblocked_tsp)
1275 {
1276 uint64_t lastblocked_ts = os_atomic_load_wide(lastblocked_tsp, relaxed);
1277 if (now <= lastblocked_ts) {
1278 /*
1279 * Because the update of the timestamp when a thread blocks
1280 * isn't serialized against us looking at it (i.e. we don't hold
1281 * the workq lock), it's possible to have a timestamp that matches
1282 * the current time or that even looks to be in the future relative
1283 * to when we grabbed the current time...
1284 *
1285 * Just treat this as a busy thread since it must have just blocked.
1286 */
1287 return true;
1288 }
1289 return (now - lastblocked_ts) < wq_stalled_window.abstime;
1290 }
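/*
 * Example (illustrative): if the last thread in a bucket blocked 100us ago
 * and wq_stalled_window is 200us, that bucket still reports one "busy"
 * thread to the admission checks; a timestamp at or ahead of `now` (a benign
 * race with the sched callback) is treated the same way.
 */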
1291
1292 static void
1293 workq_add_new_threads_call(void *_p, void *flags)
1294 {
1295 proc_t p = _p;
1296 struct workqueue *wq = proc_get_wqptr(p);
1297 uint32_t my_flag = (uint32_t)(uintptr_t)flags;
1298
1299 /*
1300 * workq_exit() will set the workqueue to NULL before
1301 * it cancels thread calls.
1302 */
1303 if (!wq) {
1304 return;
1305 }
1306
1307 assert((my_flag == WQ_DELAYED_CALL_SCHEDULED) ||
1308 (my_flag == WQ_IMMEDIATE_CALL_SCHEDULED));
1309
1310 WQ_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_START, wq, _wq_flags(wq),
1311 wq->wq_nthreads, wq->wq_thidlecount, 0);
1312
1313 workq_lock_spin(wq);
1314
1315 wq->wq_thread_call_last_run = mach_absolute_time();
1316 os_atomic_andnot(&wq->wq_flags, my_flag, release);
1317
1318 /* This can drop the workqueue lock, and take it again */
1319 workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
1320
1321 workq_unlock(wq);
1322
1323 WQ_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_END, wq, 0,
1324 wq->wq_nthreads, wq->wq_thidlecount, 0);
1325 }
1326
1327 #pragma mark thread state tracking
1328
1329 static void
1330 workq_sched_callback(int type, thread_t thread)
1331 {
1332 struct uthread *uth = get_bsdthread_info(thread);
1333 proc_t proc = get_bsdtask_info(get_threadtask(thread));
1334 struct workqueue *wq = proc_get_wqptr(proc);
1335 thread_qos_t req_qos, qos = uth->uu_workq_pri.qos_bucket;
1336 wq_thactive_t old_thactive;
1337 bool start_timer = false;
1338
1339 if (qos == WORKQ_THREAD_QOS_MANAGER) {
1340 return;
1341 }
1342
1343 switch (type) {
1344 case SCHED_CALL_BLOCK:
1345 old_thactive = _wq_thactive_dec(wq, qos);
1346 req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
1347
1348 /*
1349 * Remember the timestamp of the last thread that blocked in this
1350 * bucket; it is used by admission checks to ignore one thread
1351 * being inactive if this timestamp is recent enough.
1352 *
1353 * If we collide with another thread trying to update the
1354 * last_blocked (really unlikely since another thread would have to
1355 * get scheduled and then block after we start down this path), it's
1356 * not a problem. Either timestamp is adequate, so no need to retry
1357 */
1358 os_atomic_store_wide(&wq->wq_lastblocked_ts[_wq_bucket(qos)],
1359 thread_last_run_time(thread), relaxed);
1360
1361 if (req_qos == THREAD_QOS_UNSPECIFIED) {
1362 /*
1363 * No pending request at the moment we could unblock, move on.
1364 */
1365 } else if (qos < req_qos) {
1366 /*
1367 * The blocking thread is at a lower QoS than the highest currently
1368 * pending constrained request, nothing has to be redriven
1369 */
1370 } else {
1371 uint32_t max_busycount, old_req_count;
1372 old_req_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
1373 req_qos, NULL, &max_busycount);
1374 /*
1375 * If it is possible that may_start_constrained_thread had refused
1376 * admission due to being over the max concurrency, we may need to
1377 * spin up a new thread.
1378 *
1379 * We take into account the maximum number of busy threads
1380 * that can affect may_start_constrained_thread as looking at the
1381 * actual number may_start_constrained_thread will see is racy.
1382 *
1383 * IOW at NCPU = 4, for IN (req_qos = 1), if the old req count is
1384 * between NCPU (4) and NCPU - 2 (2) we need to redrive.
1385 */
1386 uint32_t conc = wq_max_parallelism[_wq_bucket(qos)];
1387 if (old_req_count <= conc && conc <= old_req_count + max_busycount) {
1388 start_timer = workq_schedule_delayed_thread_creation(wq, 0);
1389 }
1390 }
1391 if (__improbable(kdebug_enable)) {
1392 __unused uint32_t old = _wq_thactive_aggregate_downto_qos(wq,
1393 old_thactive, qos, NULL, NULL);
1394 WQ_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_START, wq,
1395 old - 1, qos | (req_qos << 8),
1396 wq->wq_reqcount << 1 | start_timer, 0);
1397 }
1398 break;
1399
1400 case SCHED_CALL_UNBLOCK:
1401 /*
1402 * we cannot take the workqueue_lock here...
1403 * an UNBLOCK can occur from a timer event which
1404 * is run from an interrupt context... if the workqueue_lock
1405 * is already held by this processor, we'll deadlock...
1406 * the thread lock for the thread being UNBLOCKED
1407 * is also held
1408 */
1409 old_thactive = _wq_thactive_inc(wq, qos);
1410 if (__improbable(kdebug_enable)) {
1411 __unused uint32_t old = _wq_thactive_aggregate_downto_qos(wq,
1412 old_thactive, qos, NULL, NULL);
1413 req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
1414 WQ_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_END, wq,
1415 old + 1, qos | (req_qos << 8),
1416 wq->wq_threads_scheduled, 0);
1417 }
1418 break;
1419 }
1420 }
1421
1422 #pragma mark workq lifecycle
1423
1424 void
1425 workq_reference(struct workqueue *wq)
1426 {
1427 os_ref_retain(&wq->wq_refcnt);
1428 }
1429
1430 static void
1431 workq_deallocate_queue_invoke(mpsc_queue_chain_t e,
1432 __assert_only mpsc_daemon_queue_t dq)
1433 {
1434 struct workqueue *wq;
1435 struct turnstile *ts;
1436
1437 wq = mpsc_queue_element(e, struct workqueue, wq_destroy_link);
1438 assert(dq == &workq_deallocate_queue);
1439
1440 turnstile_complete((uintptr_t)wq, &wq->wq_turnstile, &ts, TURNSTILE_WORKQS);
1441 assert(ts);
1442 turnstile_cleanup();
1443 turnstile_deallocate(ts);
1444
1445 lck_spin_destroy(&wq->wq_lock, workq_lck_grp);
1446 zfree(workq_zone_workqueue, wq);
1447 }
1448
1449 static void
1450 workq_deallocate(struct workqueue *wq)
1451 {
1452 if (os_ref_release_relaxed(&wq->wq_refcnt) == 0) {
1453 workq_deallocate_queue_invoke(&wq->wq_destroy_link,
1454 &workq_deallocate_queue);
1455 }
1456 }
1457
1458 void
1459 workq_deallocate_safe(struct workqueue *wq)
1460 {
1461 if (__improbable(os_ref_release_relaxed(&wq->wq_refcnt) == 0)) {
1462 mpsc_daemon_enqueue(&workq_deallocate_queue, &wq->wq_destroy_link,
1463 MPSC_QUEUE_DISABLE_PREEMPTION);
1464 }
1465 }
1466
1467 /**
1468 * Setup per-process state for the workqueue.
1469 */
1470 int
1471 workq_open(struct proc *p, __unused struct workq_open_args *uap,
1472 __unused int32_t *retval)
1473 {
1474 struct workqueue *wq;
1475 int error = 0;
1476
1477 if ((p->p_lflag & P_LREGISTER) == 0) {
1478 return EINVAL;
1479 }
1480
1481 if (wq_init_constrained_limit) {
1482 uint32_t limit, num_cpus = ml_get_max_cpus();
1483
1484 /*
1485 * Set up the limit for the constrained pool.
1486 * This is a virtual pool in that we don't
1487 * maintain it on a separate idle and run list.
1488 */
1489 limit = num_cpus * WORKQUEUE_CONSTRAINED_FACTOR;
1490
1491 if (limit > wq_max_constrained_threads) {
1492 wq_max_constrained_threads = limit;
1493 }
1494
1495 if (wq_max_threads > WQ_THACTIVE_BUCKET_HALF) {
1496 wq_max_threads = WQ_THACTIVE_BUCKET_HALF;
1497 }
1498 if (wq_max_threads > CONFIG_THREAD_MAX - 20) {
1499 wq_max_threads = CONFIG_THREAD_MAX - 20;
1500 }
1501
1502 wq_death_max_load = (uint16_t)fls(num_cpus) + 1;
1503
1504 for (thread_qos_t qos = WORKQ_THREAD_QOS_MIN; qos <= WORKQ_THREAD_QOS_MAX; qos++) {
1505 wq_max_parallelism[_wq_bucket(qos)] =
1506 qos_max_parallelism(qos, QOS_PARALLELISM_COUNT_LOGICAL);
1507 }
1508
1509 wq_init_constrained_limit = 0;
1510 }
1511
1512 if (proc_get_wqptr(p) == NULL) {
1513 if (proc_init_wqptr_or_wait(p) == FALSE) {
1514 assert(proc_get_wqptr(p) != NULL);
1515 goto out;
1516 }
1517
1518 wq = (struct workqueue *)zalloc(workq_zone_workqueue);
1519 bzero(wq, sizeof(struct workqueue));
1520
1521 os_ref_init_count(&wq->wq_refcnt, &workq_refgrp, 1);
1522
1523 // Start the event manager at the priority hinted at by the policy engine
1524 thread_qos_t mgr_priority_hint = task_get_default_manager_qos(current_task());
1525 pthread_priority_t pp = _pthread_priority_make_from_thread_qos(mgr_priority_hint, 0, 0);
1526 wq->wq_event_manager_priority = (uint32_t)pp;
1527 wq->wq_timer_interval = wq_stalled_window.abstime;
1528 wq->wq_proc = p;
1529 turnstile_prepare((uintptr_t)wq, &wq->wq_turnstile, turnstile_alloc(),
1530 TURNSTILE_WORKQS);
1531
1532 TAILQ_INIT(&wq->wq_thrunlist);
1533 TAILQ_INIT(&wq->wq_thnewlist);
1534 TAILQ_INIT(&wq->wq_thidlelist);
1535 priority_queue_init(&wq->wq_overcommit_queue,
1536 PRIORITY_QUEUE_BUILTIN_MAX_HEAP);
1537 priority_queue_init(&wq->wq_constrained_queue,
1538 PRIORITY_QUEUE_BUILTIN_MAX_HEAP);
1539 priority_queue_init(&wq->wq_special_queue,
1540 PRIORITY_QUEUE_BUILTIN_MAX_HEAP);
1541
1542 wq->wq_delayed_call = thread_call_allocate_with_options(
1543 workq_add_new_threads_call, p, THREAD_CALL_PRIORITY_KERNEL,
1544 THREAD_CALL_OPTIONS_ONCE);
1545 wq->wq_immediate_call = thread_call_allocate_with_options(
1546 workq_add_new_threads_call, p, THREAD_CALL_PRIORITY_KERNEL,
1547 THREAD_CALL_OPTIONS_ONCE);
1548 wq->wq_death_call = thread_call_allocate_with_options(
1549 workq_kill_old_threads_call, wq,
1550 THREAD_CALL_PRIORITY_USER, THREAD_CALL_OPTIONS_ONCE);
1551
1552 lck_spin_init(&wq->wq_lock, workq_lck_grp, workq_lck_attr);
1553
1554 WQ_TRACE_WQ(TRACE_wq_create | DBG_FUNC_NONE, wq,
1555 VM_KERNEL_ADDRHIDE(wq), 0, 0, 0);
1556 proc_set_wqptr(p, wq);
1557 }
1558 out:
1559
1560 return error;
1561 }
1562
1563 /*
1564 * Routine: workq_mark_exiting
1565 *
1566 * Function: Mark the work queue such that new threads will not be added to the
1567 * work queue after we return.
1568 *
1569 * Conditions: Called against the current process.
1570 */
1571 void
1572 workq_mark_exiting(struct proc *p)
1573 {
1574 struct workqueue *wq = proc_get_wqptr(p);
1575 uint32_t wq_flags;
1576 workq_threadreq_t mgr_req;
1577
1578 if (!wq) {
1579 return;
1580 }
1581
1582 WQ_TRACE_WQ(TRACE_wq_pthread_exit | DBG_FUNC_START, wq, 0, 0, 0, 0);
1583
1584 workq_lock_spin(wq);
1585
1586 wq_flags = os_atomic_or_orig(&wq->wq_flags, WQ_EXITING, relaxed);
1587 if (__improbable(wq_flags & WQ_EXITING)) {
1588 panic("workq_mark_exiting called twice");
1589 }
1590
1591 /*
1592 * Opportunistically try to cancel thread calls that are likely in flight.
1593 * workq_exit() will do the proper cleanup.
1594 */
1595 if (wq_flags & WQ_IMMEDIATE_CALL_SCHEDULED) {
1596 thread_call_cancel(wq->wq_immediate_call);
1597 }
1598 if (wq_flags & WQ_DELAYED_CALL_SCHEDULED) {
1599 thread_call_cancel(wq->wq_delayed_call);
1600 }
1601 if (wq_flags & WQ_DEATH_CALL_SCHEDULED) {
1602 thread_call_cancel(wq->wq_death_call);
1603 }
1604
1605 mgr_req = wq->wq_event_manager_threadreq;
1606 wq->wq_event_manager_threadreq = NULL;
1607 wq->wq_reqcount = 0; /* workq_schedule_creator must not look at queues */
1608 wq->wq_creator = NULL;
1609 workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
1610
1611 workq_unlock(wq);
1612
1613 if (mgr_req) {
1614 kqueue_threadreq_cancel(p, mgr_req);
1615 }
1616 /*
1617 * No one touches the priority queues once WQ_EXITING is set.
1618 * It is hence safe to do the tear down without holding any lock.
1619 */
1620 priority_queue_destroy(&wq->wq_overcommit_queue,
1621 struct workq_threadreq_s, tr_entry, ^(void *e){
1622 workq_threadreq_destroy(p, e);
1623 });
1624 priority_queue_destroy(&wq->wq_constrained_queue,
1625 struct workq_threadreq_s, tr_entry, ^(void *e){
1626 workq_threadreq_destroy(p, e);
1627 });
1628 priority_queue_destroy(&wq->wq_special_queue,
1629 struct workq_threadreq_s, tr_entry, ^(void *e){
1630 workq_threadreq_destroy(p, e);
1631 });
1632
1633 WQ_TRACE(TRACE_wq_pthread_exit | DBG_FUNC_END, 0, 0, 0, 0, 0);
1634 }
1635
1636 /*
1637 * Routine: workq_exit
1638 *
1639 * Function: clean up the work queue structure(s) now that there are no threads
1640 * left running inside the work queue (except possibly current_thread).
1641 *
1642 * Conditions: Called by the last thread in the process.
1643 * Called against current process.
1644 */
1645 void
1646 workq_exit(struct proc *p)
1647 {
1648 struct workqueue *wq;
1649 struct uthread *uth, *tmp;
1650
1651 wq = os_atomic_xchg(&p->p_wqptr, NULL, relaxed);
1652 if (wq != NULL) {
1653 thread_t th = current_thread();
1654
1655 WQ_TRACE_WQ(TRACE_wq_workqueue_exit | DBG_FUNC_START, wq, 0, 0, 0, 0);
1656
1657 if (thread_get_tag(th) & THREAD_TAG_WORKQUEUE) {
1658 /*
1659 * <rdar://problem/40111515> Make sure we will no longer call the
1660 * sched call, if we ever block this thread, which the cancel_wait
1661 * below can do.
1662 */
1663 thread_sched_call(th, NULL);
1664 }
1665
1666 /*
1667 * Thread calls are always scheduled by the proc itself or under the
1668 * workqueue spinlock if WQ_EXITING is not yet set.
1669 *
1670 * Either way, when this runs, the proc has no threads left beside
1671 * the one running this very code, so we know no thread call can be
1672 * dispatched anymore.
1673 */
1674 thread_call_cancel_wait(wq->wq_delayed_call);
1675 thread_call_cancel_wait(wq->wq_immediate_call);
1676 thread_call_cancel_wait(wq->wq_death_call);
1677 thread_call_free(wq->wq_delayed_call);
1678 thread_call_free(wq->wq_immediate_call);
1679 thread_call_free(wq->wq_death_call);
1680
1681 /*
1682 * Clean up workqueue data structures for threads that exited and
1683 * didn't get a chance to clean up after themselves.
1684 *
1685 * idle/new threads should have been interrupted and died on their own
1686 */
1687 TAILQ_FOREACH_SAFE(uth, &wq->wq_thrunlist, uu_workq_entry, tmp) {
1688 thread_sched_call(uth->uu_thread, NULL);
1689 thread_deallocate(uth->uu_thread);
1690 }
1691 assert(TAILQ_EMPTY(&wq->wq_thnewlist));
1692 assert(TAILQ_EMPTY(&wq->wq_thidlelist));
1693
1694 WQ_TRACE_WQ(TRACE_wq_destroy | DBG_FUNC_END, wq,
1695 VM_KERNEL_ADDRHIDE(wq), 0, 0, 0);
1696
1697 workq_deallocate(wq);
1698
1699 WQ_TRACE(TRACE_wq_workqueue_exit | DBG_FUNC_END, 0, 0, 0, 0, 0);
1700 }
1701 }
1702
1703
1704 #pragma mark bsd thread control
1705
1706 static bool
1707 _pthread_priority_to_policy(pthread_priority_t priority,
1708 thread_qos_policy_data_t *data)
1709 {
1710 data->qos_tier = _pthread_priority_thread_qos(priority);
1711 data->tier_importance = _pthread_priority_relpri(priority);
1712 if (data->qos_tier == THREAD_QOS_UNSPECIFIED || data->tier_importance > 0 ||
1713 data->tier_importance < THREAD_QOS_MIN_TIER_IMPORTANCE) {
1714 return false;
1715 }
1716 return true;
1717 }
1718
1719 static int
1720 bsdthread_set_self(proc_t p, thread_t th, pthread_priority_t priority,
1721 mach_port_name_t voucher, enum workq_set_self_flags flags)
1722 {
1723 struct uthread *uth = get_bsdthread_info(th);
1724 struct workqueue *wq = proc_get_wqptr(p);
1725
1726 kern_return_t kr;
1727 int unbind_rv = 0, qos_rv = 0, voucher_rv = 0, fixedpri_rv = 0;
1728 bool is_wq_thread = (thread_get_tag(th) & THREAD_TAG_WORKQUEUE);
1729
1730 if (flags & WORKQ_SET_SELF_WQ_KEVENT_UNBIND) {
1731 if (!is_wq_thread) {
1732 unbind_rv = EINVAL;
1733 goto qos;
1734 }
1735
1736 if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
1737 unbind_rv = EINVAL;
1738 goto qos;
1739 }
1740
1741 workq_threadreq_t kqr = uth->uu_kqr_bound;
1742 if (kqr == NULL) {
1743 unbind_rv = EALREADY;
1744 goto qos;
1745 }
1746
1747 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
1748 unbind_rv = EINVAL;
1749 goto qos;
1750 }
1751
1752 kqueue_threadreq_unbind(p, kqr);
1753 }
1754
1755 qos:
1756 if (flags & WORKQ_SET_SELF_QOS_FLAG) {
1757 thread_qos_policy_data_t new_policy;
1758
1759 if (!_pthread_priority_to_policy(priority, &new_policy)) {
1760 qos_rv = EINVAL;
1761 goto voucher;
1762 }
1763
1764 if (!is_wq_thread) {
1765 /*
1766 * Threads opted out of QoS can't change QoS
1767 */
1768 if (!thread_has_qos_policy(th)) {
1769 qos_rv = EPERM;
1770 goto voucher;
1771 }
1772 } else if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER ||
1773 uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_ABOVEUI) {
1774 /*
1775 * Workqueue manager threads or threads above UI can't change QoS
1776 */
1777 qos_rv = EINVAL;
1778 goto voucher;
1779 } else {
1780 /*
1781 * For workqueue threads, possibly adjust buckets and redrive thread
1782 * requests.
1783 */
1784 bool old_overcommit = uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT;
1785 bool new_overcommit = priority & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
1786 struct uu_workq_policy old_pri, new_pri;
1787 bool force_run = false;
1788
1789 workq_lock_spin(wq);
1790
1791 if (old_overcommit != new_overcommit) {
1792 uth->uu_workq_flags ^= UT_WORKQ_OVERCOMMIT;
1793 if (old_overcommit) {
1794 wq->wq_constrained_threads_scheduled++;
1795 } else if (wq->wq_constrained_threads_scheduled-- ==
1796 wq_max_constrained_threads) {
1797 force_run = true;
1798 }
1799 }
1800
1801 old_pri = new_pri = uth->uu_workq_pri;
1802 new_pri.qos_req = new_policy.qos_tier;
1803 workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, force_run);
1804 workq_unlock(wq);
1805 }
1806
1807 kr = thread_policy_set_internal(th, THREAD_QOS_POLICY,
1808 (thread_policy_t)&new_policy, THREAD_QOS_POLICY_COUNT);
1809 if (kr != KERN_SUCCESS) {
1810 qos_rv = EINVAL;
1811 }
1812 }
1813
1814 voucher:
1815 if (flags & WORKQ_SET_SELF_VOUCHER_FLAG) {
1816 kr = thread_set_voucher_name(voucher);
1817 if (kr != KERN_SUCCESS) {
1818 voucher_rv = ENOENT;
1819 goto fixedpri;
1820 }
1821 }
1822
1823 fixedpri:
1824 if (qos_rv) {
1825 goto done;
1826 }
1827 if (flags & WORKQ_SET_SELF_FIXEDPRIORITY_FLAG) {
1828 thread_extended_policy_data_t extpol = {.timeshare = 0};
1829
1830 if (is_wq_thread) {
1831 /* Not allowed on workqueue threads */
1832 fixedpri_rv = ENOTSUP;
1833 goto done;
1834 }
1835
1836 kr = thread_policy_set_internal(th, THREAD_EXTENDED_POLICY,
1837 (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
1838 if (kr != KERN_SUCCESS) {
1839 fixedpri_rv = EINVAL;
1840 goto done;
1841 }
1842 } else if (flags & WORKQ_SET_SELF_TIMESHARE_FLAG) {
1843 thread_extended_policy_data_t extpol = {.timeshare = 1};
1844
1845 if (is_wq_thread) {
1846 /* Not allowed on workqueue threads */
1847 fixedpri_rv = ENOTSUP;
1848 goto done;
1849 }
1850
1851 kr = thread_policy_set_internal(th, THREAD_EXTENDED_POLICY,
1852 (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
1853 if (kr != KERN_SUCCESS) {
1854 fixedpri_rv = EINVAL;
1855 goto done;
1856 }
1857 }
1858
1859
1860 done:
1861 if (qos_rv && voucher_rv) {
1862 /* Both failed, give that a unique error. */
1863 return EBADMSG;
1864 }
1865
1866 if (unbind_rv) {
1867 return unbind_rv;
1868 }
1869
1870 if (qos_rv) {
1871 return qos_rv;
1872 }
1873
1874 if (voucher_rv) {
1875 return voucher_rv;
1876 }
1877
1878 if (fixedpri_rv) {
1879 return fixedpri_rv;
1880 }
1881
1882 return 0;
1883 }
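/*
 * Error reporting in bsdthread_set_self(): when both the QoS and the voucher
 * updates fail, the combination is collapsed into the unique EBADMSG;
 * otherwise the first failure wins, in the order unbind, QoS, voucher,
 * fixed-priority.
 */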
1884
1885 static int
1886 bsdthread_add_explicit_override(proc_t p, mach_port_name_t kport,
1887 pthread_priority_t pp, user_addr_t resource)
1888 {
1889 thread_qos_t qos = _pthread_priority_thread_qos(pp);
1890 if (qos == THREAD_QOS_UNSPECIFIED) {
1891 return EINVAL;
1892 }
1893
1894 thread_t th = port_name_to_thread(kport,
1895 PORT_TO_THREAD_IN_CURRENT_TASK);
1896 if (th == THREAD_NULL) {
1897 return ESRCH;
1898 }
1899
1900 int rv = proc_thread_qos_add_override(p->task, th, 0, qos, TRUE,
1901 resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);
1902
1903 thread_deallocate(th);
1904 return rv;
1905 }
1906
1907 static int
1908 bsdthread_remove_explicit_override(proc_t p, mach_port_name_t kport,
1909 user_addr_t resource)
1910 {
1911 thread_t th = port_name_to_thread(kport,
1912 PORT_TO_THREAD_IN_CURRENT_TASK);
1913 if (th == THREAD_NULL) {
1914 return ESRCH;
1915 }
1916
1917 int rv = proc_thread_qos_remove_override(p->task, th, 0, resource,
1918 THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);
1919
1920 thread_deallocate(th);
1921 return rv;
1922 }
1923
1924 static int
1925 workq_thread_add_dispatch_override(proc_t p, mach_port_name_t kport,
1926 pthread_priority_t pp, user_addr_t ulock_addr)
1927 {
1928 struct uu_workq_policy old_pri, new_pri;
1929 struct workqueue *wq = proc_get_wqptr(p);
1930
1931 thread_qos_t qos_override = _pthread_priority_thread_qos(pp);
1932 if (qos_override == THREAD_QOS_UNSPECIFIED) {
1933 return EINVAL;
1934 }
1935
1936 thread_t thread = port_name_to_thread(kport,
1937 PORT_TO_THREAD_IN_CURRENT_TASK);
1938 if (thread == THREAD_NULL) {
1939 return ESRCH;
1940 }
1941
1942 struct uthread *uth = get_bsdthread_info(thread);
1943 if ((thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) == 0) {
1944 thread_deallocate(thread);
1945 return EPERM;
1946 }
1947
1948 WQ_TRACE_WQ(TRACE_wq_override_dispatch | DBG_FUNC_NONE,
1949 wq, thread_tid(thread), 1, pp, 0);
1950
1951 thread_mtx_lock(thread);
1952
1953 if (ulock_addr) {
1954 uint32_t val;
1955 int rc;
1956 /*
1957 * Workaround lack of explicit support for 'no-fault copyin'
1958 * <rdar://problem/24999882>, as disabling preemption prevents paging in
1959 */
1960 disable_preemption();
1961 rc = copyin_atomic32(ulock_addr, &val);
1962 enable_preemption();
1963 if (rc == 0 && ulock_owner_value_to_port_name(val) != kport) {
1964 goto out;
1965 }
1966 }
1967
1968 workq_lock_spin(wq);
1969
1970 old_pri = uth->uu_workq_pri;
1971 if (old_pri.qos_override >= qos_override) {
1972 /* Nothing to do */
1973 } else if (thread == current_thread()) {
1974 new_pri = old_pri;
1975 new_pri.qos_override = qos_override;
1976 workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false);
1977 } else {
1978 uth->uu_workq_pri.qos_override = qos_override;
1979 if (qos_override > workq_pri_override(old_pri)) {
1980 thread_set_workq_override(thread, qos_override);
1981 }
1982 }
1983
1984 workq_unlock(wq);
1985
1986 out:
1987 thread_mtx_unlock(thread);
1988 thread_deallocate(thread);
1989 return 0;
1990 }
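/*
 * Note on the ulock check in workq_thread_add_dispatch_override(): when an
 * ulock address is passed, the preemption-disabled (hence no-fault) copyin
 * verifies that the lock is still owned by the target thread; if ownership
 * has already moved, the override is silently skipped.
 */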
1991
1992 static int
1993 workq_thread_reset_dispatch_override(proc_t p, thread_t thread)
1994 {
1995 struct uu_workq_policy old_pri, new_pri;
1996 struct workqueue *wq = proc_get_wqptr(p);
1997 struct uthread *uth = get_bsdthread_info(thread);
1998
1999 if ((thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) == 0) {
2000 return EPERM;
2001 }
2002
2003 WQ_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
2004
2005 workq_lock_spin(wq);
2006 old_pri = new_pri = uth->uu_workq_pri;
2007 new_pri.qos_override = THREAD_QOS_UNSPECIFIED;
2008 workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false);
2009 workq_unlock(wq);
2010 return 0;
2011 }
2012
2013 static int
2014 workq_thread_allow_kill(__unused proc_t p, thread_t thread, bool enable)
2015 {
2016 if (!(thread_get_tag(thread) & THREAD_TAG_WORKQUEUE)) {
2017 // If the thread isn't a workqueue thread, don't set the
2018 // kill_allowed bit; however, we still need to return 0
2019 // instead of an error code since this code is executed
2020 // on the abort path, which must not depend on the
2021 // pthread_t (returning an error depends on pthread_t via
2022 // cerror_nocancel)
2023 return 0;
2024 }
2025 struct uthread *uth = get_bsdthread_info(thread);
2026 uth->uu_workq_pthread_kill_allowed = enable;
2027 return 0;
2028 }
2029
2030 static int
2031 bsdthread_get_max_parallelism(thread_qos_t qos, unsigned long flags,
2032 int *retval)
2033 {
2034 static_assert(QOS_PARALLELISM_COUNT_LOGICAL ==
2035 _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL, "logical");
2036 static_assert(QOS_PARALLELISM_REALTIME ==
2037 _PTHREAD_QOS_PARALLELISM_REALTIME, "realtime");
2038
2039 if (flags & ~(QOS_PARALLELISM_REALTIME | QOS_PARALLELISM_COUNT_LOGICAL)) {
2040 return EINVAL;
2041 }
2042
2043 if (flags & QOS_PARALLELISM_REALTIME) {
2044 if (qos) {
2045 return EINVAL;
2046 }
2047 } else if (qos == THREAD_QOS_UNSPECIFIED || qos >= THREAD_QOS_LAST) {
2048 return EINVAL;
2049 }
2050
2051 *retval = qos_max_parallelism(qos, flags);
2052 return 0;
2053 }
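/*
 * Hedged usage sketch (userspace convention assumed, not part of this file):
 * this is reached through bsdthread_ctl(BSDTHREAD_CTL_QOS_MAX_PARALLELISM,
 * qos, flags, 0), with arg3 required to be zero.  Realtime queries require an
 * unspecified qos, while logical-core queries require a valid QoS tier.
 */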
2054
2055 #define ENSURE_UNUSED(arg) \
2056 ({ if ((arg) != 0) { return EINVAL; } })
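/*
 * ENSURE_UNUSED relies on a statement expression: the embedded return exits
 * the enclosing bsdthread_ctl() with EINVAL whenever a reserved argument is
 * non-zero, keeping the switch below free of repetitive validation.
 */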
2057
2058 int
2059 bsdthread_ctl(struct proc *p, struct bsdthread_ctl_args *uap, int *retval)
2060 {
2061 switch (uap->cmd) {
2062 case BSDTHREAD_CTL_QOS_OVERRIDE_START:
2063 return bsdthread_add_explicit_override(p, (mach_port_name_t)uap->arg1,
2064 (pthread_priority_t)uap->arg2, uap->arg3);
2065 case BSDTHREAD_CTL_QOS_OVERRIDE_END:
2066 ENSURE_UNUSED(uap->arg3);
2067 return bsdthread_remove_explicit_override(p, (mach_port_name_t)uap->arg1,
2068 (user_addr_t)uap->arg2);
2069
2070 case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH:
2071 return workq_thread_add_dispatch_override(p, (mach_port_name_t)uap->arg1,
2072 (pthread_priority_t)uap->arg2, uap->arg3);
2073 case BSDTHREAD_CTL_QOS_OVERRIDE_RESET:
2074 return workq_thread_reset_dispatch_override(p, current_thread());
2075
2076 case BSDTHREAD_CTL_SET_SELF:
2077 return bsdthread_set_self(p, current_thread(),
2078 (pthread_priority_t)uap->arg1, (mach_port_name_t)uap->arg2,
2079 (enum workq_set_self_flags)uap->arg3);
2080
2081 case BSDTHREAD_CTL_QOS_MAX_PARALLELISM:
2082 ENSURE_UNUSED(uap->arg3);
2083 return bsdthread_get_max_parallelism((thread_qos_t)uap->arg1,
2084 (unsigned long)uap->arg2, retval);
2085 case BSDTHREAD_CTL_WORKQ_ALLOW_KILL:
2086 ENSURE_UNUSED(uap->arg2);
2087 ENSURE_UNUSED(uap->arg3);
2088 return workq_thread_allow_kill(p, current_thread(), (bool)uap->arg1);
2089
2090 case BSDTHREAD_CTL_SET_QOS:
2091 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD:
2092 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET:
2093 /* no longer supported */
2094 return ENOTSUP;
2095
2096 default:
2097 return EINVAL;
2098 }
2099 }
2100
2101 #pragma mark workqueue thread manipulation
2102
2103 static void __dead2
2104 workq_unpark_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
2105 struct uthread *uth, uint32_t setup_flags);
2106
2107 static void __dead2
2108 workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
2109 struct uthread *uth, uint32_t setup_flags);
2110
2111 static void workq_setup_and_run(proc_t p, struct uthread *uth, int flags) __dead2;
2112
2113 #if KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD
2114 static inline uint64_t
2115 workq_trace_req_id(workq_threadreq_t req)
2116 {
2117 struct kqworkloop *kqwl;
2118 if (req->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
2119 kqwl = __container_of(req, struct kqworkloop, kqwl_request);
2120 return kqwl->kqwl_dynamicid;
2121 }
2122
2123 return VM_KERNEL_ADDRHIDE(req);
2124 }
2125 #endif
2126
2127 /**
2128 * Entry point for libdispatch to ask for threads
2129 */
2130 static int
2131 workq_reqthreads(struct proc *p, uint32_t reqcount, pthread_priority_t pp)
2132 {
2133 thread_qos_t qos = _pthread_priority_thread_qos(pp);
2134 struct workqueue *wq = proc_get_wqptr(p);
2135 uint32_t unpaced, upcall_flags = WQ_FLAG_THREAD_NEWSPI;
2136
2137 if (wq == NULL || reqcount <= 0 || reqcount > UINT16_MAX ||
2138 qos == THREAD_QOS_UNSPECIFIED) {
2139 return EINVAL;
2140 }
2141
2142 WQ_TRACE_WQ(TRACE_wq_wqops_reqthreads | DBG_FUNC_NONE,
2143 wq, reqcount, pp, 0, 0);
2144
2145 workq_threadreq_t req = zalloc(workq_zone_threadreq);
2146 priority_queue_entry_init(&req->tr_entry);
2147 req->tr_state = WORKQ_TR_STATE_NEW;
2148 req->tr_flags = 0;
2149 req->tr_qos = qos;
2150
2151 if (pp & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) {
2152 req->tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
2153 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
2154 }
2155
2156 WQ_TRACE_WQ(TRACE_wq_thread_request_initiate | DBG_FUNC_NONE,
2157 wq, workq_trace_req_id(req), req->tr_qos, reqcount, 0);
2158
2159 workq_lock_spin(wq);
2160 do {
2161 if (_wq_exiting(wq)) {
2162 goto exiting;
2163 }
2164
2165 /*
2166 * When userspace is asking for parallelism, wake up to (reqcount - 1)
2167 * threads without pacing, to inform the scheduler of that workload.
2168 *
2169 * The last requests, or the ones that failed the admission checks, are
2170 * enqueued and go through the regular creator codepath.
2171 *
2172 * If there aren't enough threads, add one, but re-evaluate everything
2173 * as conditions may now have changed.
2174 */
2175 if (reqcount > 1 && (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
2176 unpaced = workq_constrained_allowance(wq, qos, NULL, false);
2177 if (unpaced >= reqcount - 1) {
2178 unpaced = reqcount - 1;
2179 }
2180 } else {
2181 unpaced = reqcount - 1;
2182 }
2183
2184 /*
2185 * This path does not currently handle custom workloop parameters
2186 * when creating threads for parallelism.
2187 */
2188 assert(!(req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS));
2189
2190 /*
2191 * This is a trimmed down version of workq_threadreq_bind_and_unlock()
2192 */
2193 while (unpaced > 0 && wq->wq_thidlecount) {
2194 struct uthread *uth;
2195 bool needs_wakeup;
2196 uint8_t uu_flags = UT_WORKQ_EARLY_BOUND;
2197
2198 if (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
2199 uu_flags |= UT_WORKQ_OVERCOMMIT;
2200 }
2201
2202 uth = workq_pop_idle_thread(wq, uu_flags, &needs_wakeup);
2203
2204 _wq_thactive_inc(wq, qos);
2205 wq->wq_thscheduled_count[_wq_bucket(qos)]++;
2206 workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
2207 wq->wq_fulfilled++;
2208
2209 uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags;
2210 uth->uu_save.uus_workq_park_data.thread_request = req;
2211 if (needs_wakeup) {
2212 workq_thread_wakeup(uth);
2213 }
2214 unpaced--;
2215 reqcount--;
2216 }
2217 } while (unpaced && wq->wq_nthreads < wq_max_threads &&
2218 workq_add_new_idle_thread(p, wq));
2219
2220 if (_wq_exiting(wq)) {
2221 goto exiting;
2222 }
2223
2224 req->tr_count = reqcount;
2225 if (workq_threadreq_enqueue(wq, req)) {
2226 /* This can drop the workqueue lock, and take it again */
2227 workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
2228 }
2229 workq_unlock(wq);
2230 return 0;
2231
2232 exiting:
2233 workq_unlock(wq);
2234 zfree(workq_zone_threadreq, req);
2235 return ECANCELED;
2236 }
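/*
 * Worked example for the pacing above (illustrative, numbers assumed): for a
 * constrained request with reqcount == 4, a constrained allowance of at least
 * 3, and 2 idle threads, the loop wakes both idle threads right away, tries
 * to add one more idle thread to cover the remaining unpaced slot, and
 * whatever is left of reqcount is enqueued as a single thread request that
 * the creator path will fulfill.
 */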
2237
2238 bool
2239 workq_kern_threadreq_initiate(struct proc *p, workq_threadreq_t req,
2240 struct turnstile *workloop_ts, thread_qos_t qos,
2241 workq_kern_threadreq_flags_t flags)
2242 {
2243 struct workqueue *wq = proc_get_wqptr_fast(p);
2244 struct uthread *uth = NULL;
2245
2246 assert(req->tr_flags & (WORKQ_TR_FLAG_WORKLOOP | WORKQ_TR_FLAG_KEVENT));
2247
2248 if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
2249 workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(req);
2250 qos = thread_workq_qos_for_pri(trp.trp_pri);
2251 if (qos == THREAD_QOS_UNSPECIFIED) {
2252 qos = WORKQ_THREAD_QOS_ABOVEUI;
2253 }
2254 }
2255
2256 assert(req->tr_state == WORKQ_TR_STATE_IDLE);
2257 priority_queue_entry_init(&req->tr_entry);
2258 req->tr_count = 1;
2259 req->tr_state = WORKQ_TR_STATE_NEW;
2260 req->tr_qos = qos;
2261
2262 WQ_TRACE_WQ(TRACE_wq_thread_request_initiate | DBG_FUNC_NONE, wq,
2263 workq_trace_req_id(req), qos, 1, 0);
2264
2265 if (flags & WORKQ_THREADREQ_ATTEMPT_REBIND) {
2266 /*
2267 * We're called back synchronously from the context of
2268 * kqueue_threadreq_unbind(), from within workq_thread_return(),
2269 * so we can try to match up this thread with this request!
2270 */
2271 uth = current_uthread();
2272 assert(uth->uu_kqr_bound == NULL);
2273 }
2274
2275 workq_lock_spin(wq);
2276 if (_wq_exiting(wq)) {
2277 req->tr_state = WORKQ_TR_STATE_IDLE;
2278 workq_unlock(wq);
2279 return false;
2280 }
2281
2282 if (uth && workq_threadreq_admissible(wq, uth, req)) {
2283 assert(uth != wq->wq_creator);
2284 if (uth->uu_workq_pri.qos_bucket != req->tr_qos) {
2285 _wq_thactive_move(wq, uth->uu_workq_pri.qos_bucket, req->tr_qos);
2286 workq_thread_reset_pri(wq, uth, req, /*unpark*/ false);
2287 }
2288 /*
2289 * We're called from workq_kern_threadreq_initiate()
2290 * due to an unbind, with the kq req held.
2291 */
2292 WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
2293 workq_trace_req_id(req), 0, 0, 0);
2294 wq->wq_fulfilled++;
2295 kqueue_threadreq_bind(p, req, uth->uu_thread, 0);
2296 } else {
2297 if (workloop_ts) {
2298 workq_perform_turnstile_operation_locked(wq, ^{
2299 turnstile_update_inheritor(workloop_ts, wq->wq_turnstile,
2300 TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
2301 turnstile_update_inheritor_complete(workloop_ts,
2302 TURNSTILE_INTERLOCK_HELD);
2303 });
2304 }
2305 if (workq_threadreq_enqueue(wq, req)) {
2306 workq_schedule_creator(p, wq, flags);
2307 }
2308 }
2309
2310 workq_unlock(wq);
2311
2312 return true;
2313 }
2314
2315 void
2316 workq_kern_threadreq_modify(struct proc *p, workq_threadreq_t req,
2317 thread_qos_t qos, workq_kern_threadreq_flags_t flags)
2318 {
2319 struct workqueue *wq = proc_get_wqptr_fast(p);
2320 bool make_overcommit = false;
2321
2322 if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
2323 /* Requests outside-of-QoS shouldn't accept modify operations */
2324 return;
2325 }
2326
2327 workq_lock_spin(wq);
2328
2329 assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER);
2330 assert(req->tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP));
2331
2332 if (req->tr_state == WORKQ_TR_STATE_BINDING) {
2333 kqueue_threadreq_bind(p, req, req->tr_thread, 0);
2334 workq_unlock(wq);
2335 return;
2336 }
2337
2338 if (flags & WORKQ_THREADREQ_MAKE_OVERCOMMIT) {
2339 make_overcommit = (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0;
2340 }
2341
2342 if (_wq_exiting(wq) || (req->tr_qos == qos && !make_overcommit)) {
2343 workq_unlock(wq);
2344 return;
2345 }
2346
2347 assert(req->tr_count == 1);
2348 if (req->tr_state != WORKQ_TR_STATE_QUEUED) {
2349 panic("Invalid thread request (%p) state %d", req, req->tr_state);
2350 }
2351
2352 WQ_TRACE_WQ(TRACE_wq_thread_request_modify | DBG_FUNC_NONE, wq,
2353 workq_trace_req_id(req), qos, 0, 0);
2354
2355 struct priority_queue *pq = workq_priority_queue_for_req(wq, req);
2356 workq_threadreq_t req_max;
2357
2358 /*
2359 * Stage 1: Dequeue the request from its priority queue.
2360 *
2361 * If we dequeue the root item of the constrained priority queue,
2362 * maintain the best constrained request qos invariant.
2363 */
2364 if (priority_queue_remove(pq, &req->tr_entry,
2365 PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
2366 if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
2367 _wq_thactive_refresh_best_constrained_req_qos(wq);
2368 }
2369 }
2370
2371 /*
2372 * Stage 2: Apply changes to the thread request
2373 *
2374 * If the item will not become the root of the priority queue it belongs to,
2375 * then it needs to wait in line: just enqueue and return quickly.
2376 */
2377 if (__improbable(make_overcommit)) {
2378 req->tr_flags ^= WORKQ_TR_FLAG_OVERCOMMIT;
2379 pq = workq_priority_queue_for_req(wq, req);
2380 }
2381 req->tr_qos = qos;
2382
2383 req_max = priority_queue_max(pq, struct workq_threadreq_s, tr_entry);
2384 if (req_max && req_max->tr_qos >= qos) {
2385 priority_queue_insert(pq, &req->tr_entry, workq_priority_for_req(req),
2386 PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
2387 workq_unlock(wq);
2388 return;
2389 }
2390
2391 /*
2392 * Stage 3: Reevaluate whether we should run the thread request.
2393 *
2394 * Pretend the thread request is new again:
2395 * - adjust wq_reqcount to not count it anymore.
2396 * - make its state WORKQ_TR_STATE_NEW (so that workq_threadreq_bind_and_unlock
2397 * properly attempts a synchronous bind)
2398 */
2399 wq->wq_reqcount--;
2400 req->tr_state = WORKQ_TR_STATE_NEW;
2401 if (workq_threadreq_enqueue(wq, req)) {
2402 workq_schedule_creator(p, wq, flags);
2403 }
2404 workq_unlock(wq);
2405 }
2406
2407 void
2408 workq_kern_threadreq_lock(struct proc *p)
2409 {
2410 workq_lock_spin(proc_get_wqptr_fast(p));
2411 }
2412
2413 void
2414 workq_kern_threadreq_unlock(struct proc *p)
2415 {
2416 workq_unlock(proc_get_wqptr_fast(p));
2417 }
2418
2419 void
2420 workq_kern_threadreq_update_inheritor(struct proc *p, workq_threadreq_t req,
2421 thread_t owner, struct turnstile *wl_ts,
2422 turnstile_update_flags_t flags)
2423 {
2424 struct workqueue *wq = proc_get_wqptr_fast(p);
2425 turnstile_inheritor_t inheritor;
2426
2427 assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER);
2428 assert(req->tr_flags & WORKQ_TR_FLAG_WORKLOOP);
2429 workq_lock_held(wq);
2430
2431 if (req->tr_state == WORKQ_TR_STATE_BINDING) {
2432 kqueue_threadreq_bind(p, req, req->tr_thread,
2433 KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE);
2434 return;
2435 }
2436
2437 if (_wq_exiting(wq)) {
2438 inheritor = TURNSTILE_INHERITOR_NULL;
2439 } else {
2440 if (req->tr_state != WORKQ_TR_STATE_QUEUED) {
2441 panic("Invalid thread request (%p) state %d", req, req->tr_state);
2442 }
2443
2444 if (owner) {
2445 inheritor = owner;
2446 flags |= TURNSTILE_INHERITOR_THREAD;
2447 } else {
2448 inheritor = wq->wq_turnstile;
2449 flags |= TURNSTILE_INHERITOR_TURNSTILE;
2450 }
2451 }
2452
2453 workq_perform_turnstile_operation_locked(wq, ^{
2454 turnstile_update_inheritor(wl_ts, inheritor, flags);
2455 });
2456 }
2457
2458 void
2459 workq_kern_threadreq_redrive(struct proc *p, workq_kern_threadreq_flags_t flags)
2460 {
2461 struct workqueue *wq = proc_get_wqptr_fast(p);
2462
2463 workq_lock_spin(wq);
2464 workq_schedule_creator(p, wq, flags);
2465 workq_unlock(wq);
2466 }
2467
2468 void
2469 workq_schedule_creator_turnstile_redrive(struct workqueue *wq, bool locked)
2470 {
2471 if (locked) {
2472 workq_schedule_creator(NULL, wq, WORKQ_THREADREQ_NONE);
2473 } else {
2474 workq_schedule_immediate_thread_creation(wq);
2475 }
2476 }
2477
2478 static int
2479 workq_thread_return(struct proc *p, struct workq_kernreturn_args *uap,
2480 struct workqueue *wq)
2481 {
2482 thread_t th = current_thread();
2483 struct uthread *uth = get_bsdthread_info(th);
2484 workq_threadreq_t kqr = uth->uu_kqr_bound;
2485 workq_threadreq_param_t trp = { };
2486 int nevents = uap->affinity, error;
2487 user_addr_t eventlist = uap->item;
2488
2489 if (((thread_get_tag(th) & THREAD_TAG_WORKQUEUE) == 0) ||
2490 (uth->uu_workq_flags & UT_WORKQ_DYING)) {
2491 return EINVAL;
2492 }
2493
2494 if (eventlist && nevents && kqr == NULL) {
2495 return EINVAL;
2496 }
2497
2498 /* reset signal mask on the workqueue thread to default state */
2499 if (uth->uu_sigmask != (sigset_t)(~workq_threadmask)) {
2500 proc_lock(p);
2501 uth->uu_sigmask = ~workq_threadmask;
2502 proc_unlock(p);
2503 }
2504
2505 if (kqr && kqr->tr_flags & WORKQ_TR_FLAG_WL_PARAMS) {
2506 /*
2507 * Ensure we store the threadreq param before unbinding
2508 * the kqr from this thread.
2509 */
2510 trp = kqueue_threadreq_workloop_param(kqr);
2511 }
2512
2513 /*
2514 * Freeze the base pri while we decide the fate of this thread.
2515 *
2516 * Either:
2517 * - we return to user and kevent_cleanup will have unfrozen the base pri,
2518 * - or we proceed to workq_select_threadreq_or_park_and_unlock(), which will.
2519 */
2520 thread_freeze_base_pri(th);
2521
2522 if (kqr) {
2523 uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI | WQ_FLAG_THREAD_REUSE;
2524 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
2525 upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
2526 } else {
2527 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
2528 }
2529 if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
2530 upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
2531 } else {
2532 if (uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) {
2533 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
2534 }
2535 if (uth->uu_workq_flags & UT_WORKQ_OUTSIDE_QOS) {
2536 upcall_flags |= WQ_FLAG_THREAD_OUTSIDEQOS;
2537 } else {
2538 upcall_flags |= uth->uu_workq_pri.qos_req |
2539 WQ_FLAG_THREAD_PRIO_QOS;
2540 }
2541 }
2542
2543 error = pthread_functions->workq_handle_stack_events(p, th,
2544 get_task_map(p->task), uth->uu_workq_stackaddr,
2545 uth->uu_workq_thport, eventlist, nevents, upcall_flags);
2546 if (error) {
2547 assert(uth->uu_kqr_bound == kqr);
2548 return error;
2549 }
2550
2551 // pthread is supposed to pass KEVENT_FLAG_PARKING here
2552 // which should cause the above call to either:
2553 // - not return
2554 // - return an error
2555 // - return 0 and have unbound properly
2556 assert(uth->uu_kqr_bound == NULL);
2557 }
2558
2559 WQ_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_END, wq, uap->options, 0, 0, 0);
2560
2561 thread_sched_call(th, NULL);
2562 thread_will_park_or_terminate(th);
2563 #if CONFIG_WORKLOOP_DEBUG
2564 UU_KEVENT_HISTORY_WRITE_ENTRY(uth, { .uu_error = -1, });
2565 #endif
2566
2567 workq_lock_spin(wq);
2568 WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0, 0);
2569 uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value;
2570 workq_select_threadreq_or_park_and_unlock(p, wq, uth,
2571 WQ_SETUP_CLEAR_VOUCHER);
2572 __builtin_unreachable();
2573 }
2574
2575 /**
2576 * Multiplexed call to interact with the workqueue mechanism
2577 */
2578 int
2579 workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, int32_t *retval)
2580 {
2581 int options = uap->options;
2582 int arg2 = uap->affinity;
2583 int arg3 = uap->prio;
2584 struct workqueue *wq = proc_get_wqptr(p);
2585 int error = 0;
2586
2587 if ((p->p_lflag & P_LREGISTER) == 0) {
2588 return EINVAL;
2589 }
2590
2591 switch (options) {
2592 case WQOPS_QUEUE_NEWSPISUPP: {
2593 /*
2594 * arg2 = offset of serialno into dispatch queue
2595 * arg3 = kevent support
2596 */
2597 int offset = arg2;
2598 if (arg3 & 0x01) {
2599 // If we get here, then userspace has indicated support for kevent delivery.
2600 }
2601
2602 p->p_dispatchqueue_serialno_offset = (uint64_t)offset;
2603 break;
2604 }
2605 case WQOPS_QUEUE_REQTHREADS: {
2606 /*
2607 * arg2 = number of threads to start
2608 * arg3 = priority
2609 */
2610 error = workq_reqthreads(p, arg2, arg3);
2611 break;
2612 }
2613 case WQOPS_SET_EVENT_MANAGER_PRIORITY: {
2614 /*
2615 * arg2 = priority for the manager thread
2616 *
2617 * if _PTHREAD_PRIORITY_SCHED_PRI_FLAG is set,
2618 * the low bits of the value contains a scheduling priority
2619 * instead of a QOS value
2620 */
2621 pthread_priority_t pri = arg2;
2622
2623 if (wq == NULL) {
2624 error = EINVAL;
2625 break;
2626 }
2627
2628 /*
2629 * Normalize the incoming priority so that it is ordered numerically.
2630 */
2631 if (pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) {
2632 pri &= (_PTHREAD_PRIORITY_SCHED_PRI_MASK |
2633 _PTHREAD_PRIORITY_SCHED_PRI_FLAG);
2634 } else {
2635 thread_qos_t qos = _pthread_priority_thread_qos(pri);
2636 int relpri = _pthread_priority_relpri(pri);
2637 if (relpri > 0 || relpri < THREAD_QOS_MIN_TIER_IMPORTANCE ||
2638 qos == THREAD_QOS_UNSPECIFIED) {
2639 error = EINVAL;
2640 break;
2641 }
2642 pri &= ~_PTHREAD_PRIORITY_FLAGS_MASK;
2643 }
2644
2645 /*
2646 * If userspace passes a scheduling priority, that wins over any QoS.
2647 * Userspace should take care not to lower the priority this way.
2648 */
2649 workq_lock_spin(wq);
2650 if (wq->wq_event_manager_priority < (uint32_t)pri) {
2651 wq->wq_event_manager_priority = (uint32_t)pri;
2652 }
2653 workq_unlock(wq);
2654 break;
2655 }
2656 case WQOPS_THREAD_KEVENT_RETURN:
2657 case WQOPS_THREAD_WORKLOOP_RETURN:
2658 case WQOPS_THREAD_RETURN: {
2659 error = workq_thread_return(p, uap, wq);
2660 break;
2661 }
2662
2663 case WQOPS_SHOULD_NARROW: {
2664 /*
2665 * arg2 = priority to test
2666 * arg3 = unused
2667 */
2668 thread_t th = current_thread();
2669 struct uthread *uth = get_bsdthread_info(th);
2670 if (((thread_get_tag(th) & THREAD_TAG_WORKQUEUE) == 0) ||
2671 (uth->uu_workq_flags & (UT_WORKQ_DYING | UT_WORKQ_OVERCOMMIT))) {
2672 error = EINVAL;
2673 break;
2674 }
2675
2676 thread_qos_t qos = _pthread_priority_thread_qos(arg2);
2677 if (qos == THREAD_QOS_UNSPECIFIED) {
2678 error = EINVAL;
2679 break;
2680 }
2681 workq_lock_spin(wq);
2682 bool should_narrow = !workq_constrained_allowance(wq, qos, uth, false);
2683 workq_unlock(wq);
2684
2685 *retval = should_narrow;
2686 break;
2687 }
2688 case WQOPS_SETUP_DISPATCH: {
2689 /*
2690 * item = pointer to workq_dispatch_config structure
2691 * arg2 = sizeof(item)
2692 */
2693 struct workq_dispatch_config cfg;
2694 bzero(&cfg, sizeof(cfg));
2695
2696 error = copyin(uap->item, &cfg, MIN(sizeof(cfg), (unsigned long) arg2));
2697 if (error) {
2698 break;
2699 }
2700
2701 if (cfg.wdc_flags & ~WORKQ_DISPATCH_SUPPORTED_FLAGS ||
2702 cfg.wdc_version < WORKQ_DISPATCH_MIN_SUPPORTED_VERSION) {
2703 error = ENOTSUP;
2704 break;
2705 }
2706
2707 /* Load fields from version 1 */
2708 p->p_dispatchqueue_serialno_offset = cfg.wdc_queue_serialno_offs;
2709
2710 /* Load fields from version 2 */
2711 if (cfg.wdc_version >= 2) {
2712 p->p_dispatchqueue_label_offset = cfg.wdc_queue_label_offs;
2713 }
2714
2715 break;
2716 }
2717 default:
2718 error = EINVAL;
2719 break;
2720 }
2721
2722 return error;
2723 }
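/*
 * Hedged userspace sketch (names assumed from libpthread, not part of this
 * file): the WQOPS_QUEUE_REQTHREADS case above is typically reached through
 * something like
 *
 *     __workq_kernreturn(WQOPS_QUEUE_REQTHREADS, NULL, nthreads, priority);
 *
 * where the third argument carries the thread count and the fourth the packed
 * pthread_priority_t, matching the arg2/arg3 decoding in workq_kernreturn().
 */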
2724
2725 /*
2726 * We have no work to do, park ourselves on the idle list.
2727 *
2728 * Consumes the workqueue lock and does not return.
2729 */
2730 __attribute__((noreturn, noinline))
2731 static void
2732 workq_park_and_unlock(proc_t p, struct workqueue *wq, struct uthread *uth,
2733 uint32_t setup_flags)
2734 {
2735 assert(uth == current_uthread());
2736 assert(uth->uu_kqr_bound == NULL);
2737 workq_push_idle_thread(p, wq, uth, setup_flags); // may not return
2738
2739 workq_thread_reset_cpupercent(NULL, uth);
2740
2741 if ((uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) &&
2742 !(uth->uu_workq_flags & UT_WORKQ_DYING)) {
2743 workq_unlock(wq);
2744
2745 /*
2746 * workq_push_idle_thread() will unset `has_stack`
2747 * if it wants us to free the stack before parking.
2748 */
2749 if (!uth->uu_save.uus_workq_park_data.has_stack) {
2750 pthread_functions->workq_markfree_threadstack(p, uth->uu_thread,
2751 get_task_map(p->task), uth->uu_workq_stackaddr);
2752 }
2753
2754 /*
2755 * When we remove the voucher from the thread, we may lose our importance
2756 * causing us to get preempted, so we do this after putting the thread on
2757 * the idle list. Then, when we get our importance back we'll be able to
2758 * use this thread from e.g. the kevent call out to deliver a boosting
2759 * message.
2760 */
2761 __assert_only kern_return_t kr;
2762 kr = thread_set_voucher_name(MACH_PORT_NULL);
2763 assert(kr == KERN_SUCCESS);
2764
2765 workq_lock_spin(wq);
2766 uth->uu_workq_flags &= ~UT_WORKQ_IDLE_CLEANUP;
2767 setup_flags &= ~WQ_SETUP_CLEAR_VOUCHER;
2768 }
2769
2770 if (uth->uu_workq_flags & UT_WORKQ_RUNNING) {
2771 /*
2772 * While we had dropped the lock to unset our voucher, someone came
2773 * around and made us runnable. But because we weren't waiting on the
2774 * event, their thread_wakeup() was ineffectual. To correct for that,
2775 * we just run the continuation ourselves.
2776 */
2777 WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0, 0);
2778 workq_unpark_select_threadreq_or_park_and_unlock(p, wq, uth, setup_flags);
2779 __builtin_unreachable();
2780 }
2781
2782 if (uth->uu_workq_flags & UT_WORKQ_DYING) {
2783 workq_unpark_for_death_and_unlock(p, wq, uth,
2784 WORKQ_UNPARK_FOR_DEATH_WAS_IDLE, setup_flags);
2785 __builtin_unreachable();
2786 }
2787
2788 thread_set_pending_block_hint(uth->uu_thread, kThreadWaitParkedWorkQueue);
2789 assert_wait(workq_parked_wait_event(uth), THREAD_INTERRUPTIBLE);
2790 workq_unlock(wq);
2791 WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0, 0);
2792 thread_block(workq_unpark_continue);
2793 __builtin_unreachable();
2794 }
2795
2796 static inline bool
2797 workq_may_start_event_mgr_thread(struct workqueue *wq, struct uthread *uth)
2798 {
2799 /*
2800 * There's an event manager request and either:
2801 * - no event manager currently running
2802 * - we are re-using the event manager
2803 */
2804 return wq->wq_thscheduled_count[_wq_bucket(WORKQ_THREAD_QOS_MANAGER)] == 0 ||
2805 (uth && uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER);
2806 }
2807
2808 static uint32_t
2809 workq_constrained_allowance(struct workqueue *wq, thread_qos_t at_qos,
2810 struct uthread *uth, bool may_start_timer)
2811 {
2812 assert(at_qos != WORKQ_THREAD_QOS_MANAGER);
2813 uint32_t count = 0;
2814
2815 uint32_t max_count = wq->wq_constrained_threads_scheduled;
2816 if (uth && (uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) {
2817 /*
2818 * don't count the current thread as scheduled
2819 */
2820 assert(max_count > 0);
2821 max_count--;
2822 }
2823 if (max_count >= wq_max_constrained_threads) {
2824 WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 1,
2825 wq->wq_constrained_threads_scheduled,
2826 wq_max_constrained_threads, 0);
2827 /*
2828 * we need 1 or more constrained threads to return to the kernel before
2829 * we can dispatch additional work
2830 */
2831 return 0;
2832 }
2833 max_count -= wq_max_constrained_threads;
2834
2835 /*
2836 * Compute a metric for how many threads are active. We find the
2837 * highest priority request outstanding and then add up the number of
2838 * active threads in that and all higher-priority buckets. We'll also add
2839 * any "busy" threads which are not active but blocked recently enough that
2840 * we can't be sure they've gone idle yet. We'll then compare this metric
2841 * to our max concurrency to decide whether to add a new thread.
2842 */
2843
2844 uint32_t busycount, thactive_count;
2845
2846 thactive_count = _wq_thactive_aggregate_downto_qos(wq, _wq_thactive(wq),
2847 at_qos, &busycount, NULL);
2848
2849 if (uth && uth->uu_workq_pri.qos_bucket != WORKQ_THREAD_QOS_MANAGER &&
2850 at_qos <= uth->uu_workq_pri.qos_bucket) {
2851 /*
2852 * Don't count this thread as currently active, but only if it's not
2853 * a manager thread, as _wq_thactive_aggregate_downto_qos ignores active
2854 * managers.
2855 */
2856 assert(thactive_count > 0);
2857 thactive_count--;
2858 }
2859
2860 count = wq_max_parallelism[_wq_bucket(at_qos)];
2861 if (count > thactive_count + busycount) {
2862 count -= thactive_count + busycount;
2863 WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 2,
2864 thactive_count, busycount, 0);
2865 return MIN(count, max_count);
2866 } else {
2867 WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 3,
2868 thactive_count, busycount, 0);
2869 }
2870
2871 if (busycount && may_start_timer) {
2872 /*
2873 * If this is called from the add timer, we won't have another timer
2874 * fire when the thread exits the "busy" state, so rearm the timer.
2875 */
2876 workq_schedule_delayed_thread_creation(wq, 0);
2877 }
2878
2879 return 0;
2880 }
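/*
 * Illustrative numbers for the admission metric above (assumed): if
 * wq_max_parallelism for this bucket is 8, with 5 threads active at or above
 * at_qos and 1 recently-blocked "busy" thread, the headroom is 8 - (5 + 1) = 2
 * and the function returns MIN(2, max_count).  If active + busy already
 * reaches 8, it returns 0 and may rearm the delayed-creation timer instead.
 */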
2881
2882 static bool
2883 workq_threadreq_admissible(struct workqueue *wq, struct uthread *uth,
2884 workq_threadreq_t req)
2885 {
2886 if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
2887 return workq_may_start_event_mgr_thread(wq, uth);
2888 }
2889 if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
2890 return workq_constrained_allowance(wq, req->tr_qos, uth, true);
2891 }
2892 return true;
2893 }
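/*
 * workq_threadreq_admissible() in short: manager requests are admitted only
 * while no manager is running (or when uth already is the manager),
 * constrained requests go through workq_constrained_allowance(), and
 * overcommit requests are always admissible.
 */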
2894
2895 static workq_threadreq_t
2896 workq_threadreq_select_for_creator(struct workqueue *wq)
2897 {
2898 workq_threadreq_t req_qos, req_pri, req_tmp;
2899 thread_qos_t qos = THREAD_QOS_UNSPECIFIED;
2900 uint8_t pri = 0;
2901
2902 req_tmp = wq->wq_event_manager_threadreq;
2903 if (req_tmp && workq_may_start_event_mgr_thread(wq, NULL)) {
2904 return req_tmp;
2905 }
2906
2907 /*
2908 * Compute the best priority request, and ignore the turnstile for now
2909 */
2910
2911 req_pri = priority_queue_max(&wq->wq_special_queue,
2912 struct workq_threadreq_s, tr_entry);
2913 if (req_pri) {
2914 pri = priority_queue_entry_key(&wq->wq_special_queue, &req_pri->tr_entry);
2915 }
2916
2917 /*
2918 * Compute the best QoS Request, and check whether it beats the "pri" one
2919 */
2920
2921 req_qos = priority_queue_max(&wq->wq_overcommit_queue,
2922 struct workq_threadreq_s, tr_entry);
2923 if (req_qos) {
2924 qos = req_qos->tr_qos;
2925 }
2926
2927 req_tmp = priority_queue_max(&wq->wq_constrained_queue,
2928 struct workq_threadreq_s, tr_entry);
2929
2930 if (req_tmp && qos < req_tmp->tr_qos) {
2931 if (pri && pri >= thread_workq_pri_for_qos(req_tmp->tr_qos)) {
2932 return req_pri;
2933 }
2934
2935 if (workq_constrained_allowance(wq, req_tmp->tr_qos, NULL, true)) {
2936 /*
2937 * If the constrained thread request is the best one and passes
2938 * the admission check, pick it.
2939 */
2940 return req_tmp;
2941 }
2942 }
2943
2944 if (pri && (!qos || pri >= thread_workq_pri_for_qos(qos))) {
2945 return req_pri;
2946 }
2947
2948 if (req_qos) {
2949 return req_qos;
2950 }
2951
2952 /*
2953 * If we had no eligible request but we have a turnstile push,
2954 * it must be a non-overcommit thread request that failed
2955 * the admission check.
2956 *
2957 * Just fake a BG thread request so that if the push stops, the creator
2958 * priority just drops to 4.
2959 */
2960 if (turnstile_workq_proprietor_of_max_turnstile(wq->wq_turnstile, NULL)) {
2961 static struct workq_threadreq_s workq_sync_push_fake_req = {
2962 .tr_qos = THREAD_QOS_BACKGROUND,
2963 };
2964
2965 return &workq_sync_push_fake_req;
2966 }
2967
2968 return NULL;
2969 }
2970
2971 static workq_threadreq_t
2972 workq_threadreq_select(struct workqueue *wq, struct uthread *uth)
2973 {
2974 workq_threadreq_t req_qos, req_pri, req_tmp;
2975 uintptr_t proprietor;
2976 thread_qos_t qos = THREAD_QOS_UNSPECIFIED;
2977 uint8_t pri = 0;
2978
2979 if (uth == wq->wq_creator) {
2980 uth = NULL;
2981 }
2982
2983 req_tmp = wq->wq_event_manager_threadreq;
2984 if (req_tmp && workq_may_start_event_mgr_thread(wq, uth)) {
2985 return req_tmp;
2986 }
2987
2988 /*
2989 * Compute the best priority request (special or turnstile)
2990 */
2991
2992 pri = turnstile_workq_proprietor_of_max_turnstile(wq->wq_turnstile,
2993 &proprietor);
2994 if (pri) {
2995 struct kqworkloop *kqwl = (struct kqworkloop *)proprietor;
2996 req_pri = &kqwl->kqwl_request;
2997 if (req_pri->tr_state != WORKQ_TR_STATE_QUEUED) {
2998 panic("Invalid thread request (%p) state %d",
2999 req_pri, req_pri->tr_state);
3000 }
3001 } else {
3002 req_pri = NULL;
3003 }
3004
3005 req_tmp = priority_queue_max(&wq->wq_special_queue,
3006 struct workq_threadreq_s, tr_entry);
3007 if (req_tmp && pri < priority_queue_entry_key(&wq->wq_special_queue,
3008 &req_tmp->tr_entry)) {
3009 req_pri = req_tmp;
3010 pri = priority_queue_entry_key(&wq->wq_special_queue, &req_tmp->tr_entry);
3011 }
3012
3013 /*
3014 * Compute the best QoS Request, and check whether it beats the "pri" one
3015 */
3016
3017 req_qos = priority_queue_max(&wq->wq_overcommit_queue,
3018 struct workq_threadreq_s, tr_entry);
3019 if (req_qos) {
3020 qos = req_qos->tr_qos;
3021 }
3022
3023 req_tmp = priority_queue_max(&wq->wq_constrained_queue,
3024 struct workq_threadreq_s, tr_entry);
3025
3026 if (req_tmp && qos < req_tmp->tr_qos) {
3027 if (pri && pri >= thread_workq_pri_for_qos(req_tmp->tr_qos)) {
3028 return req_pri;
3029 }
3030
3031 if (workq_constrained_allowance(wq, req_tmp->tr_qos, uth, true)) {
3032 /*
3033 * If the constrained thread request is the best one and passes
3034 * the admission check, pick it.
3035 */
3036 return req_tmp;
3037 }
3038 }
3039
3040 if (req_pri && (!qos || pri >= thread_workq_pri_for_qos(qos))) {
3041 return req_pri;
3042 }
3043
3044 return req_qos;
3045 }
3046
3047 /*
3048 * The creator is an anonymous thread, used to make other threads, that is
3049 * counted as scheduled but otherwise has no scheduler callback set and is
3050 * not tracked as active.
3051 *
3052 * When more requests are added or an existing one is hurried along,
3053 * a creator is elected and setup, or the existing one overridden accordingly.
3054 *
3055 * While this creator is in flight, because no request has been dequeued,
3056 * already running threads have a chance at stealing thread requests, avoiding
3057 * useless context switches, and the creator, once scheduled, may not find any
3058 * work to do and will then just park again.
3059 *
3060 * The creator serves the dual purpose of informing the scheduler of work that
3061 * hasn't been materialized as threads yet, and of acting as a natural pacing
3062 * mechanism for thread creation.
3063 *
3064 * By being anonymous (and not bound to anything) it means that thread requests
3065 * can be stolen from this creator by threads already on core yielding more
3066 * efficient scheduling and reduced context switches.
3067 */
3068 static void
3069 workq_schedule_creator(proc_t p, struct workqueue *wq,
3070 workq_kern_threadreq_flags_t flags)
3071 {
3072 workq_threadreq_t req;
3073 struct uthread *uth;
3074 bool needs_wakeup;
3075
3076 workq_lock_held(wq);
3077 assert(p || (flags & WORKQ_THREADREQ_CAN_CREATE_THREADS) == 0);
3078
3079 again:
3080 uth = wq->wq_creator;
3081
3082 if (!wq->wq_reqcount) {
3083 /*
3084 * There is no thread request left.
3085 *
3086 * If there is a creator, leave everything in place, so that it cleans
3087 * up itself in workq_push_idle_thread().
3088 *
3089 * Else, make sure the turnstile state is reset to no inheritor.
3090 */
3091 if (uth == NULL) {
3092 workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
3093 }
3094 return;
3095 }
3096
3097 req = workq_threadreq_select_for_creator(wq);
3098 if (req == NULL) {
3099 /*
3100 * There isn't a thread request that passes the admission check.
3101 *
3102 * If there is a creator, do not touch anything, the creator will sort
3103 * it out when it runs.
3104 *
3105 * Else, set the inheritor to "WORKQ" so that the turnstile propagation
3106 * code calls us if anything changes.
3107 */
3108 if (uth == NULL) {
3109 workq_turnstile_update_inheritor(wq, wq, TURNSTILE_INHERITOR_WORKQ);
3110 }
3111 return;
3112 }
3113
3114 if (uth) {
3115 /*
3116 * We need to maybe override the creator we already have
3117 */
3118 if (workq_thread_needs_priority_change(req, uth)) {
3119 WQ_TRACE_WQ(TRACE_wq_creator_select | DBG_FUNC_NONE,
3120 wq, 1, thread_tid(uth->uu_thread), req->tr_qos, 0);
3121 workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
3122 }
3123 assert(wq->wq_inheritor == uth->uu_thread);
3124 } else if (wq->wq_thidlecount) {
3125 /*
3126 * We need to unpark a creator thread
3127 */
3128 wq->wq_creator = uth = workq_pop_idle_thread(wq, UT_WORKQ_OVERCOMMIT,
3129 &needs_wakeup);
3130 if (workq_thread_needs_priority_change(req, uth)) {
3131 workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
3132 }
3133 workq_turnstile_update_inheritor(wq, uth->uu_thread,
3134 TURNSTILE_INHERITOR_THREAD);
3135 WQ_TRACE_WQ(TRACE_wq_creator_select | DBG_FUNC_NONE,
3136 wq, 2, thread_tid(uth->uu_thread), req->tr_qos, 0);
3137 uth->uu_save.uus_workq_park_data.fulfilled_snapshot = wq->wq_fulfilled;
3138 uth->uu_save.uus_workq_park_data.yields = 0;
3139 if (needs_wakeup) {
3140 workq_thread_wakeup(uth);
3141 }
3142 } else {
3143 /*
3144 * We need to allocate a thread...
3145 */
3146 if (__improbable(wq->wq_nthreads >= wq_max_threads)) {
3147 /* out of threads, just go away */
3148 flags = WORKQ_THREADREQ_NONE;
3149 } else if (flags & WORKQ_THREADREQ_SET_AST_ON_FAILURE) {
3150 act_set_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ);
3151 } else if (!(flags & WORKQ_THREADREQ_CAN_CREATE_THREADS)) {
3152 /* This can drop the workqueue lock, and take it again */
3153 workq_schedule_immediate_thread_creation(wq);
3154 } else if (workq_add_new_idle_thread(p, wq)) {
3155 goto again;
3156 } else {
3157 workq_schedule_delayed_thread_creation(wq, 0);
3158 }
3159
3160 /*
3161 * If the current thread is the inheritor:
3162 *
3163 * If we set the AST, then the thread will stay the inheritor until
3164 * either the AST calls workq_kern_threadreq_redrive(), or it parks
3165 * and calls workq_push_idle_thread().
3166 *
3167 * Else, the responsibility for thread creation lies with a thread-call
3168 * and we need to clear the inheritor.
3169 */
3170 if ((flags & WORKQ_THREADREQ_SET_AST_ON_FAILURE) == 0 &&
3171 wq->wq_inheritor == current_thread()) {
3172 workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
3173 }
3174 }
3175 }
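/*
 * In short, when requests exist, workq_schedule_creator() either
 * re-prioritizes the existing creator, unparks an idle thread as the new
 * creator, or arranges for a thread to be created (immediately, through a
 * delayed thread call, or by setting an AST), and retries from the top when
 * it managed to add a new idle thread.
 */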
3176
3177 /**
3178 * Same as workq_unpark_select_threadreq_or_park_and_unlock,
3179 * but do not allow early binds.
3180 *
3181 * Called with the base pri frozen, will unfreeze it.
3182 */
3183 __attribute__((noreturn, noinline))
3184 static void
3185 workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
3186 struct uthread *uth, uint32_t setup_flags)
3187 {
3188 workq_threadreq_t req = NULL;
3189 bool is_creator = (wq->wq_creator == uth);
3190 bool schedule_creator = false;
3191
3192 if (__improbable(_wq_exiting(wq))) {
3193 WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
3194 goto park;
3195 }
3196
3197 if (wq->wq_reqcount == 0) {
3198 WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 1, 0, 0, 0);
3199 goto park;
3200 }
3201
3202 req = workq_threadreq_select(wq, uth);
3203 if (__improbable(req == NULL)) {
3204 WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 2, 0, 0, 0);
3205 goto park;
3206 }
3207
3208 uint8_t tr_flags = req->tr_flags;
3209 struct turnstile *req_ts = kqueue_threadreq_get_turnstile(req);
3210
3211 /*
3212 * Attempt to set ourselves up as the new thing to run, moving all priority
3213 * pushes to ourselves.
3214 *
3215 * If the current thread is the creator, then the fact that we are presently
3216 * running is proof that we'll do something useful, so keep going.
3217 *
3218 * For other cases, peek at the AST to know whether the scheduler wants
3219 * to preempt us; if so, park instead and move the thread request
3220 * turnstile back to the workqueue.
3221 */
3222 if (req_ts) {
3223 workq_perform_turnstile_operation_locked(wq, ^{
3224 turnstile_update_inheritor(req_ts, uth->uu_thread,
3225 TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD);
3226 turnstile_update_inheritor_complete(req_ts,
3227 TURNSTILE_INTERLOCK_HELD);
3228 });
3229 }
3230
3231 if (is_creator) {
3232 WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 4, 0,
3233 uth->uu_save.uus_workq_park_data.yields, 0);
3234 wq->wq_creator = NULL;
3235 _wq_thactive_inc(wq, req->tr_qos);
3236 wq->wq_thscheduled_count[_wq_bucket(req->tr_qos)]++;
3237 } else if (uth->uu_workq_pri.qos_bucket != req->tr_qos) {
3238 _wq_thactive_move(wq, uth->uu_workq_pri.qos_bucket, req->tr_qos);
3239 }
3240
3241 workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
3242
3243 thread_unfreeze_base_pri(uth->uu_thread);
3244 #if 0 // <rdar://problem/55259863> to turn this back on
3245 if (__improbable(thread_unfreeze_base_pri(uth->uu_thread) && !is_creator)) {
3246 if (req_ts) {
3247 workq_perform_turnstile_operation_locked(wq, ^{
3248 turnstile_update_inheritor(req_ts, wq->wq_turnstile,
3249 TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
3250 turnstile_update_inheritor_complete(req_ts,
3251 TURNSTILE_INTERLOCK_HELD);
3252 });
3253 }
3254 WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 3, 0, 0, 0);
3255 goto park_thawed;
3256 }
3257 #endif
3258
3259 /*
3260 * We passed all checks; dequeue the request, bind to it, and set it up
3261 * to return to user.
3262 */
3263 WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
3264 workq_trace_req_id(req), 0, 0, 0);
3265 wq->wq_fulfilled++;
3266 schedule_creator = workq_threadreq_dequeue(wq, req);
3267
3268 if (tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP)) {
3269 kqueue_threadreq_bind_prepost(p, req, uth);
3270 req = NULL;
3271 } else if (req->tr_count > 0) {
3272 req = NULL;
3273 }
3274
3275 workq_thread_reset_cpupercent(req, uth);
3276 if (uth->uu_workq_flags & UT_WORKQ_NEW) {
3277 uth->uu_workq_flags ^= UT_WORKQ_NEW;
3278 setup_flags |= WQ_SETUP_FIRST_USE;
3279 }
3280 if (tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
3281 if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) {
3282 uth->uu_workq_flags |= UT_WORKQ_OVERCOMMIT;
3283 wq->wq_constrained_threads_scheduled--;
3284 }
3285 } else {
3286 if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) != 0) {
3287 uth->uu_workq_flags &= ~UT_WORKQ_OVERCOMMIT;
3288 wq->wq_constrained_threads_scheduled++;
3289 }
3290 }
3291
3292 if (is_creator || schedule_creator) {
3293 /* This can drop the workqueue lock, and take it again */
3294 workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
3295 }
3296
3297 workq_unlock(wq);
3298
3299 if (req) {
3300 zfree(workq_zone_threadreq, req);
3301 }
3302
3303 /*
3304 * Run Thread, Run!
3305 */
3306 uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI;
3307 if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
3308 upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
3309 } else if (tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
3310 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
3311 }
3312 if (tr_flags & WORKQ_TR_FLAG_KEVENT) {
3313 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
3314 }
3315 if (tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
3316 upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
3317 }
3318 uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags;
3319
3320 if (tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP)) {
3321 kqueue_threadreq_bind_commit(p, uth->uu_thread);
3322 }
3323 workq_setup_and_run(p, uth, setup_flags);
3324 __builtin_unreachable();
3325
3326 park:
3327 thread_unfreeze_base_pri(uth->uu_thread);
3328 #if 0 // <rdar://problem/55259863>
3329 park_thawed:
3330 #endif
3331 workq_park_and_unlock(p, wq, uth, setup_flags);
3332 }
3333
3334 /**
3335 * Runs a thread request on a thread
3336 *
3337 * - if thread is THREAD_NULL, will find a thread and run the request there.
3338 * Otherwise, the thread must be the current thread.
3339 *
3340 * - if req is NULL, will find the highest priority request and run that. If
3341 * it is not NULL, it must be a threadreq object in state NEW. If it cannot
3342 * be run immediately, it will be enqueued and moved to state QUEUED.
3343 *
3344 * Either way, the thread request object serviced will be moved to state
3345 * BINDING and attached to the uthread.
3346 *
3347 * Should be called with the workqueue lock held. Will drop it.
3348 * Should be called with the base pri not frozen.
3349 */
3350 __attribute__((noreturn, noinline))
3351 static void
3352 workq_unpark_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
3353 struct uthread *uth, uint32_t setup_flags)
3354 {
3355 if (uth->uu_workq_flags & UT_WORKQ_EARLY_BOUND) {
3356 if (uth->uu_workq_flags & UT_WORKQ_NEW) {
3357 setup_flags |= WQ_SETUP_FIRST_USE;
3358 }
3359 uth->uu_workq_flags &= ~(UT_WORKQ_NEW | UT_WORKQ_EARLY_BOUND);
3360 /*
3361 * This pointer is possibly freed and only used for tracing purposes.
3362 */
3363 workq_threadreq_t req = uth->uu_save.uus_workq_park_data.thread_request;
3364 workq_unlock(wq);
3365 WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
3366 VM_KERNEL_ADDRHIDE(req), 0, 0, 0);
3367 (void)req;
3368 workq_setup_and_run(p, uth, setup_flags);
3369 __builtin_unreachable();
3370 }
3371
3372 thread_freeze_base_pri(uth->uu_thread);
3373 workq_select_threadreq_or_park_and_unlock(p, wq, uth, setup_flags);
3374 }
3375
3376 static bool
3377 workq_creator_should_yield(struct workqueue *wq, struct uthread *uth)
3378 {
3379 thread_qos_t qos = workq_pri_override(uth->uu_workq_pri);
3380
3381 if (qos >= THREAD_QOS_USER_INTERACTIVE) {
3382 return false;
3383 }
3384
3385 uint32_t snapshot = uth->uu_save.uus_workq_park_data.fulfilled_snapshot;
3386 if (wq->wq_fulfilled == snapshot) {
3387 return false;
3388 }
3389
3390 uint32_t cnt = 0, conc = wq_max_parallelism[_wq_bucket(qos)];
3391 if (wq->wq_fulfilled - snapshot > conc) {
3392 /* we fulfilled more than NCPU requests since being dispatched */
3393 WQ_TRACE_WQ(TRACE_wq_creator_yield, wq, 1,
3394 wq->wq_fulfilled, snapshot, 0);
3395 return true;
3396 }
3397
3398 for (int i = _wq_bucket(qos); i < WORKQ_NUM_QOS_BUCKETS; i++) {
3399 cnt += wq->wq_thscheduled_count[i];
3400 }
3401 if (conc <= cnt) {
3402 /* We fulfilled requests and have more than NCPU scheduled threads */
3403 WQ_TRACE_WQ(TRACE_wq_creator_yield, wq, 2,
3404 wq->wq_fulfilled, snapshot, 0);
3405 return true;
3406 }
3407
3408 return false;
3409 }
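/*
 * Illustrative example (numbers assumed): with a concurrency of 8 for the
 * creator's bucket, the creator yields once it has fulfilled more than 8
 * requests since it was last dispatched, or once 8 or more threads are
 * already scheduled at or above its QoS bucket; otherwise it proceeds to
 * userspace.
 */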
3410
3411 /**
3412 * parked thread wakes up
3413 */
3414 __attribute__((noreturn, noinline))
3415 static void
3416 workq_unpark_continue(void *parameter __unused, wait_result_t wr __unused)
3417 {
3418 thread_t th = current_thread();
3419 struct uthread *uth = get_bsdthread_info(th);
3420 proc_t p = current_proc();
3421 struct workqueue *wq = proc_get_wqptr_fast(p);
3422
3423 workq_lock_spin(wq);
3424
3425 if (wq->wq_creator == uth && workq_creator_should_yield(wq, uth)) {
3426 /*
3427 * If the number of threads we have out is able to keep up with the
3428 * demand, then we should avoid sending this creator thread to
3429 * userspace.
3430 */
3431 uth->uu_save.uus_workq_park_data.fulfilled_snapshot = wq->wq_fulfilled;
3432 uth->uu_save.uus_workq_park_data.yields++;
3433 workq_unlock(wq);
3434 thread_yield_with_continuation(workq_unpark_continue, NULL);
3435 __builtin_unreachable();
3436 }
3437
3438 if (__probable(uth->uu_workq_flags & UT_WORKQ_RUNNING)) {
3439 workq_unpark_select_threadreq_or_park_and_unlock(p, wq, uth, WQ_SETUP_NONE);
3440 __builtin_unreachable();
3441 }
3442
3443 if (__probable(wr == THREAD_AWAKENED)) {
3444 /*
3445 * We were set running, but for the purposes of dying.
3446 */
3447 assert(uth->uu_workq_flags & UT_WORKQ_DYING);
3448 assert((uth->uu_workq_flags & UT_WORKQ_NEW) == 0);
3449 } else {
3450 /*
3451 * workaround for <rdar://problem/38647347>,
3452 * in case we do hit userspace, make sure that calling
3453 * workq_thread_terminate() does the right thing here,
3454 * and that, if we never call it, workq_exit() will too, because it sees
3455 * this thread on the runlist.
3456 */
3457 assert(wr == THREAD_INTERRUPTED);
3458 wq->wq_thdying_count++;
3459 uth->uu_workq_flags |= UT_WORKQ_DYING;
3460 }
3461
3462 workq_unpark_for_death_and_unlock(p, wq, uth,
3463 WORKQ_UNPARK_FOR_DEATH_WAS_IDLE, WQ_SETUP_NONE);
3464 __builtin_unreachable();
3465 }
3466
3467 __attribute__((noreturn, noinline))
3468 static void
3469 workq_setup_and_run(proc_t p, struct uthread *uth, int setup_flags)
3470 {
3471 thread_t th = uth->uu_thread;
3472 vm_map_t vmap = get_task_map(p->task);
3473
3474 if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) {
3475 /*
3476 * For preemption reasons, we want to reset the voucher as late as
3477 * possible, so we do it in two places:
3478 * - Just before parking (i.e. in workq_park_and_unlock())
3479 * - Prior to doing the setup for the next workitem (i.e. here)
3480 *
3481 * Those two places are sufficient to ensure we always reset it before
3482 * it goes back out to user space, but be careful to not break that
3483 * guarantee.
3484 */
3485 __assert_only kern_return_t kr;
3486 kr = thread_set_voucher_name(MACH_PORT_NULL);
3487 assert(kr == KERN_SUCCESS);
3488 }
3489
3490 uint32_t upcall_flags = uth->uu_save.uus_workq_park_data.upcall_flags;
3491 if (!(setup_flags & WQ_SETUP_FIRST_USE)) {
3492 upcall_flags |= WQ_FLAG_THREAD_REUSE;
3493 }
3494
3495 if (uth->uu_workq_flags & UT_WORKQ_OUTSIDE_QOS) {
3496 /*
3497 * For threads that have an outside-of-QoS thread priority, indicate
3498 * to userspace that setting QoS should only affect the TSD and not
3499 * change QOS in the kernel.
3500 */
3501 upcall_flags |= WQ_FLAG_THREAD_OUTSIDEQOS;
3502 } else {
3503 /*
3504 * Put the QoS class value into the lower bits of the reuse_thread
3505 * register; this is where the thread priority used to be stored
3506 * anyway.
3507 */
3508 upcall_flags |= uth->uu_save.uus_workq_park_data.qos |
3509 WQ_FLAG_THREAD_PRIO_QOS;
3510 }
3511
3512 if (uth->uu_workq_thport == MACH_PORT_NULL) {
3513 /* convert_thread_to_port() consumes a reference */
3514 thread_reference(th);
3515 ipc_port_t port = convert_thread_to_port(th);
3516 uth->uu_workq_thport = ipc_port_copyout_send(port, get_task_ipcspace(p->task));
3517 }
3518
3519 /*
3520 * Call out to pthread, this sets up the thread, pulls in kevent structs
3521 * onto the stack, sets up the thread state and then returns to userspace.
3522 */
3523 WQ_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START,
3524 proc_get_wqptr_fast(p), 0, 0, 0, 0);
3525 thread_sched_call(th, workq_sched_callback);
3526 pthread_functions->workq_setup_thread(p, th, vmap, uth->uu_workq_stackaddr,
3527 uth->uu_workq_thport, 0, setup_flags, upcall_flags);
3528
3529 __builtin_unreachable();
3530 }
3531
3532 #pragma mark misc
3533
3534 int
3535 fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
3536 {
3537 struct workqueue *wq = proc_get_wqptr(p);
3538 int error = 0;
3539 int activecount;
3540
3541 if (wq == NULL) {
3542 return EINVAL;
3543 }
3544
3545 /*
3546 * This is sometimes called from interrupt context by the kperf sampler.
3547 * In that case, it's not safe to spin trying to take the lock since we
3548 * might already hold it. So, we just try-lock it and error out if it's
3549 * already held. Since this is just a debugging aid, and all our callers
3550 * are able to handle an error, that's fine.
3551 */
3552 bool locked = workq_lock_try(wq);
3553 if (!locked) {
3554 return EBUSY;
3555 }
3556
3557 wq_thactive_t act = _wq_thactive(wq);
3558 activecount = _wq_thactive_aggregate_downto_qos(wq, act,
3559 WORKQ_THREAD_QOS_MIN, NULL, NULL);
3560 if (act & _wq_thactive_offset_for_qos(WORKQ_THREAD_QOS_MANAGER)) {
3561 activecount++;
3562 }
3563 pwqinfo->pwq_nthreads = wq->wq_nthreads;
3564 pwqinfo->pwq_runthreads = activecount;
3565 pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
3566 pwqinfo->pwq_state = 0;
3567
3568 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
3569 pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
3570 }
3571
3572 if (wq->wq_nthreads >= wq_max_threads) {
3573 pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
3574 }
3575
3576 workq_unlock(wq);
3577 return error;
3578 }
3579
3580 boolean_t
3581 workqueue_get_pwq_exceeded(void *v, boolean_t *exceeded_total,
3582 boolean_t *exceeded_constrained)
3583 {
3584 proc_t p = v;
3585 struct proc_workqueueinfo pwqinfo;
3586 int err;
3587
3588 assert(p != NULL);
3589 assert(exceeded_total != NULL);
3590 assert(exceeded_constrained != NULL);
3591
3592 err = fill_procworkqueue(p, &pwqinfo);
3593 if (err) {
3594 return FALSE;
3595 }
3596 if (!(pwqinfo.pwq_state & WQ_FLAGS_AVAILABLE)) {
3597 return FALSE;
3598 }
3599
3600 *exceeded_total = (pwqinfo.pwq_state & WQ_EXCEEDED_TOTAL_THREAD_LIMIT);
3601 *exceeded_constrained = (pwqinfo.pwq_state & WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT);
3602
3603 return TRUE;
3604 }
3605
3606 uint32_t
3607 workqueue_get_pwq_state_kdp(void * v)
3608 {
3609 static_assert((WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT << 17) ==
3610 kTaskWqExceededConstrainedThreadLimit);
3611 static_assert((WQ_EXCEEDED_TOTAL_THREAD_LIMIT << 17) ==
3612 kTaskWqExceededTotalThreadLimit);
3613 static_assert((WQ_FLAGS_AVAILABLE << 17) == kTaskWqFlagsAvailable);
3614 static_assert((WQ_FLAGS_AVAILABLE | WQ_EXCEEDED_TOTAL_THREAD_LIMIT |
3615 WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT) == 0x7);
3616
3617 if (v == NULL) {
3618 return 0;
3619 }
3620
3621 proc_t p = v;
3622 struct workqueue *wq = proc_get_wqptr(p);
3623
3624 if (wq == NULL || workq_lock_spin_is_acquired_kdp(wq)) {
3625 return 0;
3626 }
3627
3628 uint32_t pwq_state = WQ_FLAGS_AVAILABLE;
3629
3630 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
3631 pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
3632 }
3633
3634 if (wq->wq_nthreads >= wq_max_threads) {
3635 pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
3636 }
3637
3638 return pwq_state;
3639 }
3640
3641 void
3642 workq_init(void)
3643 {
3644 workq_lck_grp_attr = lck_grp_attr_alloc_init();
3645 workq_lck_attr = lck_attr_alloc_init();
3646 workq_lck_grp = lck_grp_alloc_init("workq", workq_lck_grp_attr);
3647
3648 workq_zone_workqueue = zinit(sizeof(struct workqueue),
3649 1024 * sizeof(struct workqueue), 8192, "workq.wq");
3650 workq_zone_threadreq = zinit(sizeof(struct workq_threadreq_s),
3651 1024 * sizeof(struct workq_threadreq_s), 8192, "workq.threadreq");
3652
3653 clock_interval_to_absolutetime_interval(wq_stalled_window.usecs,
3654 NSEC_PER_USEC, &wq_stalled_window.abstime);
3655 clock_interval_to_absolutetime_interval(wq_reduce_pool_window.usecs,
3656 NSEC_PER_USEC, &wq_reduce_pool_window.abstime);
3657 clock_interval_to_absolutetime_interval(wq_max_timer_interval.usecs,
3658 NSEC_PER_USEC, &wq_max_timer_interval.abstime);
3659
3660 thread_deallocate_daemon_register_queue(&workq_deallocate_queue,
3661 workq_deallocate_queue_invoke);
3662 }