[apple/xnu.git] / bsd / pthread / pthread_workqueue.c (xnu-6153.11.26)
1/*
2 * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995-2018 Apple, Inc. All Rights Reserved */
29
30#include <sys/cdefs.h>
31
32#include <kern/assert.h>
33#include <kern/ast.h>
34#include <kern/clock.h>
35#include <kern/cpu_data.h>
36#include <kern/kern_types.h>
37#include <kern/policy_internal.h>
38#include <kern/processor.h>
39#include <kern/sched_prim.h> /* for thread_exception_return */
40#include <kern/task.h>
41#include <kern/thread.h>
42#include <kern/zalloc.h>
43#include <mach/kern_return.h>
44#include <mach/mach_param.h>
45#include <mach/mach_port.h>
46#include <mach/mach_types.h>
47#include <mach/mach_vm.h>
48#include <mach/sync_policy.h>
49#include <mach/task.h>
50#include <mach/thread_act.h> /* for thread_resume */
51#include <mach/thread_policy.h>
52#include <mach/thread_status.h>
53#include <mach/vm_prot.h>
54#include <mach/vm_statistics.h>
55#include <machine/atomic.h>
56#include <machine/machine_routines.h>
57#include <vm/vm_map.h>
58#include <vm/vm_protos.h>
59
60#include <sys/eventvar.h>
61#include <sys/kdebug.h>
62#include <sys/kernel.h>
63#include <sys/lock.h>
64#include <sys/param.h>
65#include <sys/proc_info.h> /* for fill_procworkqueue */
66#include <sys/proc_internal.h>
67#include <sys/pthread_shims.h>
68#include <sys/resourcevar.h>
69#include <sys/signalvar.h>
70#include <sys/sysctl.h>
71#include <sys/sysproto.h>
72#include <sys/systm.h>
73#include <sys/ulock.h> /* for ulock_owner_value_to_port_name */
74
75#include <pthread/bsdthread_private.h>
76#include <pthread/workqueue_syscalls.h>
77#include <pthread/workqueue_internal.h>
78#include <pthread/workqueue_trace.h>
79
80#include <os/log.h>
81
82static void workq_unpark_continue(void *uth, wait_result_t wr) __dead2;
83static void workq_schedule_creator(proc_t p, struct workqueue *wq,
84 workq_kern_threadreq_flags_t flags);
85
86static bool workq_threadreq_admissible(struct workqueue *wq, struct uthread *uth,
87 workq_threadreq_t req);
88
89static uint32_t workq_constrained_allowance(struct workqueue *wq,
90 thread_qos_t at_qos, struct uthread *uth, bool may_start_timer);
91
92static bool workq_thread_is_busy(uint64_t cur_ts,
93 _Atomic uint64_t *lastblocked_tsp);
94
95static int workq_sysctl_handle_usecs SYSCTL_HANDLER_ARGS;
96
97#pragma mark globals
98
99struct workq_usec_var {
100 uint32_t usecs;
101 uint64_t abstime;
102};
103
104#define WORKQ_SYSCTL_USECS(var, init) \
105 static struct workq_usec_var var = { .usecs = init }; \
106 SYSCTL_OID(_kern, OID_AUTO, var##_usecs, \
107 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &var, 0, \
108 workq_sysctl_handle_usecs, "I", "")
109
110static lck_grp_t *workq_lck_grp;
111static lck_attr_t *workq_lck_attr;
112static lck_grp_attr_t *workq_lck_grp_attr;
113os_refgrp_decl(static, workq_refgrp, "workq", NULL);
114
115static struct mpsc_daemon_queue workq_deallocate_queue;
116static zone_t workq_zone_workqueue;
117static zone_t workq_zone_threadreq;
118
119WORKQ_SYSCTL_USECS(wq_stalled_window, WQ_STALLED_WINDOW_USECS);
120WORKQ_SYSCTL_USECS(wq_reduce_pool_window, WQ_REDUCE_POOL_WINDOW_USECS);
121WORKQ_SYSCTL_USECS(wq_max_timer_interval, WQ_MAX_TIMER_INTERVAL_USECS);
122static uint32_t wq_max_threads = WORKQUEUE_MAXTHREADS;
123static uint32_t wq_max_constrained_threads = WORKQUEUE_MAXTHREADS / 8;
124static uint32_t wq_init_constrained_limit = 1;
125static uint16_t wq_death_max_load;
126static uint32_t wq_max_parallelism[WORKQ_NUM_QOS_BUCKETS];
127
128#pragma mark sysctls
129
130static int
131workq_sysctl_handle_usecs SYSCTL_HANDLER_ARGS
132{
133#pragma unused(arg2)
134 struct workq_usec_var *v = arg1;
135 int error = sysctl_handle_int(oidp, &v->usecs, 0, req);
136 if (error || !req->newptr) {
137 return error;
138 }
139 clock_interval_to_absolutetime_interval(v->usecs, NSEC_PER_USEC,
140 &v->abstime);
141 return 0;
142}
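/*
 * Illustrative note, not part of the original source: WORKQ_SYSCTL_USECS
 * publishes each tunable as kern.<var>_usecs, and this handler converts the
 * microsecond value to mach absolute time on every write. Assuming a root
 * shell, the stalled-window tunable could be inspected and changed with:
 *
 *   sysctl kern.wq_stalled_window_usecs
 *   sysctl -w kern.wq_stalled_window_usecs=200
 *
 * sysctl_handle_int() runs first, then v->abstime is refreshed, so the
 * abstime copy always tracks the latest usecs value.
 */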
143
144SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
145 &wq_max_threads, 0, "");
146
147SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
148 &wq_max_constrained_threads, 0, "");
149
150#pragma mark p_wqptr
151
152#define WQPTR_IS_INITING_VALUE ((struct workqueue *)~(uintptr_t)0)
153
154static struct workqueue *
155proc_get_wqptr_fast(struct proc *p)
156{
157 return os_atomic_load(&p->p_wqptr, relaxed);
158}
159
160static struct workqueue *
161proc_get_wqptr(struct proc *p)
162{
163 struct workqueue *wq = proc_get_wqptr_fast(p);
164 return wq == WQPTR_IS_INITING_VALUE ? NULL : wq;
165}
166
167static void
168proc_set_wqptr(struct proc *p, struct workqueue *wq)
169{
170 wq = os_atomic_xchg(&p->p_wqptr, wq, release);
171 if (wq == WQPTR_IS_INITING_VALUE) {
172 proc_lock(p);
173 thread_wakeup(&p->p_wqptr);
174 proc_unlock(p);
175 }
176}
177
178static bool
179proc_init_wqptr_or_wait(struct proc *p)
180{
181 struct workqueue *wq;
182
183 proc_lock(p);
184 wq = os_atomic_load(&p->p_wqptr, relaxed);
185
186 if (wq == NULL) {
187 os_atomic_store(&p->p_wqptr, WQPTR_IS_INITING_VALUE, relaxed);
188 proc_unlock(p);
189 return true;
190 }
191
192 if (wq == WQPTR_IS_INITING_VALUE) {
193 assert_wait(&p->p_wqptr, THREAD_UNINT);
194 proc_unlock(p);
195 thread_block(THREAD_CONTINUE_NULL);
196 } else {
197 proc_unlock(p);
198 }
199 return false;
200}
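/*
 * Illustrative sketch of the intended caller pattern, mirroring workq_open()
 * further down in this file (not part of the original source):
 *
 *   if (proc_get_wqptr(p) == NULL) {
 *       if (proc_init_wqptr_or_wait(p)) {
 *           // won the race: allocate and initialize the workqueue, then
 *           // publish it, which wakes anyone parked in the wait above
 *           proc_set_wqptr(p, wq);
 *       } else {
 *           // someone else initialized it (or is about to finish doing so)
 *           assert(proc_get_wqptr(p) != NULL);
 *       }
 *   }
 */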
201
202static inline event_t
203workq_parked_wait_event(struct uthread *uth)
204{
205 return (event_t)&uth->uu_workq_stackaddr;
206}
207
208static inline void
209workq_thread_wakeup(struct uthread *uth)
210{
211 thread_wakeup_thread(workq_parked_wait_event(uth), uth->uu_thread);
212}
213
214#pragma mark wq_thactive
215
216#if defined(__LP64__)
217// Layout is:
218// 127 - 115 : 13 bits of zeroes
219// 114 - 112 : best QoS among all pending constrained requests
220// 111 - 0 : MGR, AUI, UI, IN, DF, UT, BG+MT buckets every 16 bits
221#define WQ_THACTIVE_BUCKET_WIDTH 16
222#define WQ_THACTIVE_QOS_SHIFT (7 * WQ_THACTIVE_BUCKET_WIDTH)
223#else
224// Layout is:
225// 63 - 61 : best QoS among all pending constrained requests
226// 60 : Manager bucket (0 or 1)
227// 59 - 0 : AUI, UI, IN, DF, UT, BG+MT buckets every 10 bits
228#define WQ_THACTIVE_BUCKET_WIDTH 10
229#define WQ_THACTIVE_QOS_SHIFT (6 * WQ_THACTIVE_BUCKET_WIDTH + 1)
230#endif
231#define WQ_THACTIVE_BUCKET_MASK ((1U << WQ_THACTIVE_BUCKET_WIDTH) - 1)
232#define WQ_THACTIVE_BUCKET_HALF (1U << (WQ_THACTIVE_BUCKET_WIDTH - 1))
233
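/*
 * Worked example, not part of the original source, assuming the __LP64__
 * layout above: each QoS bucket owns a 16-bit counter inside the 128-bit
 * wq_thactive_t, so the active count for a bucket can be read back with a
 * shift and a mask (using the _wq_bucket() helper defined below):
 *
 *   uint32_t active = (uint32_t)(v >> (_wq_bucket(qos) *
 *       WQ_THACTIVE_BUCKET_WIDTH)) & WQ_THACTIVE_BUCKET_MASK;
 *
 * while the best pending constrained QoS sits in the 3 bits starting at
 * WQ_THACTIVE_QOS_SHIFT, which is what
 * WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS() below extracts.
 */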
234static_assert(sizeof(wq_thactive_t) * CHAR_BIT - WQ_THACTIVE_QOS_SHIFT >= 3,
235 "Make sure we have space to encode a QoS");
236
237static inline wq_thactive_t
238_wq_thactive(struct workqueue *wq)
239{
240 return os_atomic_load_wide(&wq->wq_thactive, relaxed);
241}
242
243static inline int
244_wq_bucket(thread_qos_t qos)
245{
246 // Map both BG and MT to the same bucket by over-shifting down and
247 // clamping MT and BG together.
248 switch (qos) {
249 case THREAD_QOS_MAINTENANCE:
250 return 0;
251 default:
252 return qos - 2;
253 }
254}
255
256#define WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(tha) \
257 ((tha) >> WQ_THACTIVE_QOS_SHIFT)
258
259static inline thread_qos_t
260_wq_thactive_best_constrained_req_qos(struct workqueue *wq)
261{
262 // Avoid expensive atomic operations: the three bits we're loading are in
263 // a single byte, and always updated under the workqueue lock
264 wq_thactive_t v = *(wq_thactive_t *)&wq->wq_thactive;
265 return WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(v);
266}
267
268static void
269_wq_thactive_refresh_best_constrained_req_qos(struct workqueue *wq)
270{
271 thread_qos_t old_qos, new_qos;
272 workq_threadreq_t req;
273
274 req = priority_queue_max(&wq->wq_constrained_queue,
275 struct workq_threadreq_s, tr_entry);
276 new_qos = req ? req->tr_qos : THREAD_QOS_UNSPECIFIED;
277 old_qos = _wq_thactive_best_constrained_req_qos(wq);
278 if (old_qos != new_qos) {
279 long delta = (long)new_qos - (long)old_qos;
280 wq_thactive_t v = (wq_thactive_t)delta << WQ_THACTIVE_QOS_SHIFT;
281 /*
282 * We can do an atomic add relative to the initial load because updates
283 * to this qos are always serialized under the workqueue lock.
284 */
285 v = os_atomic_add(&wq->wq_thactive, v, relaxed);
286#ifdef __LP64__
287 WQ_TRACE_WQ(TRACE_wq_thactive_update, wq, (uint64_t)v,
288 (uint64_t)(v >> 64), 0, 0);
289#else
290 WQ_TRACE_WQ(TRACE_wq_thactive_update, wq, v, 0, 0, 0);
291#endif
292 }
293}
294
295static inline wq_thactive_t
296_wq_thactive_offset_for_qos(thread_qos_t qos)
297{
298 return (wq_thactive_t)1 << (_wq_bucket(qos) * WQ_THACTIVE_BUCKET_WIDTH);
299}
300
301static inline wq_thactive_t
302_wq_thactive_inc(struct workqueue *wq, thread_qos_t qos)
303{
304 wq_thactive_t v = _wq_thactive_offset_for_qos(qos);
305 return os_atomic_add_orig(&wq->wq_thactive, v, relaxed);
306}
307
308static inline wq_thactive_t
309_wq_thactive_dec(struct workqueue *wq, thread_qos_t qos)
310{
311 wq_thactive_t v = _wq_thactive_offset_for_qos(qos);
312 return os_atomic_sub_orig(&wq->wq_thactive, v, relaxed);
313}
314
315static inline void
316_wq_thactive_move(struct workqueue *wq,
317 thread_qos_t old_qos, thread_qos_t new_qos)
318{
319 wq_thactive_t v = _wq_thactive_offset_for_qos(new_qos) -
320 _wq_thactive_offset_for_qos(old_qos);
321 os_atomic_add(&wq->wq_thactive, v, relaxed);
322 wq->wq_thscheduled_count[_wq_bucket(old_qos)]--;
323 wq->wq_thscheduled_count[_wq_bucket(new_qos)]++;
324}
325
326static inline uint32_t
327_wq_thactive_aggregate_downto_qos(struct workqueue *wq, wq_thactive_t v,
328 thread_qos_t qos, uint32_t *busycount, uint32_t *max_busycount)
329{
330 uint32_t count = 0, active;
331 uint64_t curtime;
332
333 assert(WORKQ_THREAD_QOS_MIN <= qos && qos <= WORKQ_THREAD_QOS_MAX);
334
335 if (busycount) {
336 curtime = mach_absolute_time();
337 *busycount = 0;
338 }
339 if (max_busycount) {
340 *max_busycount = THREAD_QOS_LAST - qos;
341 }
342
343 int i = _wq_bucket(qos);
344 v >>= i * WQ_THACTIVE_BUCKET_WIDTH;
345 for (; i < WORKQ_NUM_QOS_BUCKETS; i++, v >>= WQ_THACTIVE_BUCKET_WIDTH) {
346 active = v & WQ_THACTIVE_BUCKET_MASK;
347 count += active;
348
349 if (busycount && wq->wq_thscheduled_count[i] > active) {
350 if (workq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i])) {
351 /*
352 * We only consider the last blocked thread for a given bucket
353 * as busy because we don't want to take the list lock in each
354 * sched callback. However this is an approximation that could
355 * contribute to thread creation storms.
356 */
357 (*busycount)++;
358 }
359 }
360 }
361
362 return count;
363}
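/*
 * Illustrative note, not part of the original source: the loop above walks
 * every bucket at or above the requested QoS and sums the per-bucket active
 * counts packed in v. A bucket only contributes to *busycount when it has
 * more threads scheduled than currently active and its last-blocked
 * timestamp is recent (see workq_thread_is_busy() below), so busycount can
 * grow by at most one per scanned bucket; *max_busycount reports the
 * matching upper bound to the caller.
 */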
364
365#pragma mark wq_flags
366
367static inline uint32_t
368_wq_flags(struct workqueue *wq)
369{
370 return os_atomic_load(&wq->wq_flags, relaxed);
371}
372
373static inline bool
374_wq_exiting(struct workqueue *wq)
375{
376 return _wq_flags(wq) & WQ_EXITING;
377}
378
379bool
380workq_is_exiting(struct proc *p)
381{
382 struct workqueue *wq = proc_get_wqptr(p);
383 return !wq || _wq_exiting(wq);
384}
385
386#pragma mark workqueue lock
387
388static bool
389workq_lock_spin_is_acquired_kdp(struct workqueue *wq)
390{
391 return kdp_lck_spin_is_acquired(&wq->wq_lock);
392}
393
394static inline void
395workq_lock_spin(struct workqueue *wq)
396{
397 lck_spin_lock_grp(&wq->wq_lock, workq_lck_grp);
398}
399
400static inline void
401workq_lock_held(__assert_only struct workqueue *wq)
402{
403 LCK_SPIN_ASSERT(&wq->wq_lock, LCK_ASSERT_OWNED);
404}
405
406static inline bool
407workq_lock_try(struct workqueue *wq)
408{
409 return lck_spin_try_lock_grp(&wq->wq_lock, workq_lck_grp);
410}
411
412static inline void
413workq_unlock(struct workqueue *wq)
414{
415 lck_spin_unlock(&wq->wq_lock);
416}
417
418#pragma mark idle thread lists
419
420#define WORKQ_POLICY_INIT(qos) \
421 (struct uu_workq_policy){ .qos_req = qos, .qos_bucket = qos }
422
423static inline thread_qos_t
424workq_pri_bucket(struct uu_workq_policy req)
425{
426 return MAX(MAX(req.qos_req, req.qos_max), req.qos_override);
427}
428
429static inline thread_qos_t
430workq_pri_override(struct uu_workq_policy req)
431{
432 return MAX(workq_pri_bucket(req), req.qos_bucket);
433}
434
435static inline bool
436workq_thread_needs_params_change(workq_threadreq_t req, struct uthread *uth)
437{
438 workq_threadreq_param_t cur_trp, req_trp = { };
439
440 cur_trp.trp_value = uth->uu_save.uus_workq_park_data.workloop_params;
441 if (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS) {
442 req_trp = kqueue_threadreq_workloop_param(req);
443 }
444
445 /*
446 * CPU percent flags are handled separately to policy changes, so ignore
447 * them for all of these checks.
448 */
449 uint16_t cur_flags = (cur_trp.trp_flags & ~TRP_CPUPERCENT);
450 uint16_t req_flags = (req_trp.trp_flags & ~TRP_CPUPERCENT);
451
452 if (!req_flags && !cur_flags) {
453 return false;
454 }
455
456 if (req_flags != cur_flags) {
457 return true;
458 }
459
460 if ((req_flags & TRP_PRIORITY) && req_trp.trp_pri != cur_trp.trp_pri) {
461 return true;
462 }
463
464 if ((req_flags & TRP_POLICY) && req_trp.trp_pol != cur_trp.trp_pol) {
465 return true;
466 }
467
468 return false;
469}
470
471static inline bool
472workq_thread_needs_priority_change(workq_threadreq_t req, struct uthread *uth)
473{
474 if (workq_thread_needs_params_change(req, uth)) {
475 return true;
476 }
477
478 return req->tr_qos != workq_pri_override(uth->uu_workq_pri);
479}
480
481static void
482workq_thread_update_bucket(proc_t p, struct workqueue *wq, struct uthread *uth,
483 struct uu_workq_policy old_pri, struct uu_workq_policy new_pri,
484 bool force_run)
485{
486 thread_qos_t old_bucket = old_pri.qos_bucket;
487 thread_qos_t new_bucket = workq_pri_bucket(new_pri);
488
489 if (old_bucket != new_bucket) {
490 _wq_thactive_move(wq, old_bucket, new_bucket);
491 }
492
493 new_pri.qos_bucket = new_bucket;
494 uth->uu_workq_pri = new_pri;
495
496 if (workq_pri_override(old_pri) != new_bucket) {
497 thread_set_workq_override(uth->uu_thread, new_bucket);
498 }
499
500 if (wq->wq_reqcount && (old_bucket > new_bucket || force_run)) {
501 int flags = WORKQ_THREADREQ_CAN_CREATE_THREADS;
502 if (old_bucket > new_bucket) {
503 /*
504 * When lowering our bucket, we may unblock a thread request,
505 * but we can't drop our priority before we have evaluated
506 * whether this is the case, and if we ever drop the workqueue lock
507 * that would cause a priority inversion.
508 *
509 * We hence have to disallow thread creation in that case.
510 */
511 flags = 0;
512 }
513 workq_schedule_creator(p, wq, flags);
514 }
515}
516
517/*
518 * Sets/resets the cpu percent limits on the current thread. We can't set
519 * these limits from outside of the current thread, so this function needs
520 * to be called while we're executing on the intended thread.
521 */
522static void
523workq_thread_reset_cpupercent(workq_threadreq_t req, struct uthread *uth)
524{
525 assert(uth == current_uthread());
526 workq_threadreq_param_t trp = { };
527
528 if (req && (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS)) {
529 trp = kqueue_threadreq_workloop_param(req);
530 }
531
532 if (uth->uu_workq_flags & UT_WORKQ_CPUPERCENT) {
533 /*
534 * Going through disable when we have an existing CPU percent limit
535 * set will force the ledger to refill the token bucket of the current
536 * thread. Removing any penalty applied by previous thread use.
537 */
538 thread_set_cpulimit(THREAD_CPULIMIT_DISABLE, 0, 0);
539 uth->uu_workq_flags &= ~UT_WORKQ_CPUPERCENT;
540 }
541
542 if (trp.trp_flags & TRP_CPUPERCENT) {
543 thread_set_cpulimit(THREAD_CPULIMIT_BLOCK, trp.trp_cpupercent,
544 (uint64_t)trp.trp_refillms * NSEC_PER_SEC);
545 uth->uu_workq_flags |= UT_WORKQ_CPUPERCENT;
546 }
547}
548
549static void
550workq_thread_reset_pri(struct workqueue *wq, struct uthread *uth,
551 workq_threadreq_t req, bool unpark)
552{
553 thread_t th = uth->uu_thread;
554 thread_qos_t qos = req ? req->tr_qos : WORKQ_THREAD_QOS_CLEANUP;
555 workq_threadreq_param_t trp = { };
556 int priority = 31;
557 int policy = POLICY_TIMESHARE;
558
559 if (req && (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS)) {
560 trp = kqueue_threadreq_workloop_param(req);
561 }
562
563 uth->uu_workq_pri = WORKQ_POLICY_INIT(qos);
564 uth->uu_workq_flags &= ~UT_WORKQ_OUTSIDE_QOS;
565
566 if (unpark) {
567 uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value;
568 // qos sent out to userspace (may differ from uu_workq_pri on param threads)
569 uth->uu_save.uus_workq_park_data.qos = qos;
570 }
571
572 if (qos == WORKQ_THREAD_QOS_MANAGER) {
573 uint32_t mgr_pri = wq->wq_event_manager_priority;
574 assert(trp.trp_value == 0); // manager qos and thread policy don't mix
575
576 if (mgr_pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) {
577 mgr_pri &= _PTHREAD_PRIORITY_SCHED_PRI_MASK;
578 thread_set_workq_pri(th, THREAD_QOS_UNSPECIFIED, mgr_pri,
579 POLICY_TIMESHARE);
580 return;
581 }
582
583 qos = _pthread_priority_thread_qos(mgr_pri);
584 } else {
585 if (trp.trp_flags & TRP_PRIORITY) {
586 qos = THREAD_QOS_UNSPECIFIED;
587 priority = trp.trp_pri;
588 uth->uu_workq_flags |= UT_WORKQ_OUTSIDE_QOS;
589 }
590
591 if (trp.trp_flags & TRP_POLICY) {
592 policy = trp.trp_pol;
593 }
594 }
595
596 thread_set_workq_pri(th, qos, priority, policy);
597}
598
599/*
600 * Called by kevent with the NOTE_WL_THREAD_REQUEST knote lock held,
601 * every time a servicer is being told about a new max QoS.
602 */
603void
604workq_thread_set_max_qos(struct proc *p, workq_threadreq_t kqr)
605{
606 struct uu_workq_policy old_pri, new_pri;
607 struct uthread *uth = current_uthread();
608 struct workqueue *wq = proc_get_wqptr_fast(p);
609 thread_qos_t qos = kqr->tr_kq_qos_index;
610
611 if (uth->uu_workq_pri.qos_max == qos) {
612 return;
613 }
614
615 workq_lock_spin(wq);
616 old_pri = new_pri = uth->uu_workq_pri;
617 new_pri.qos_max = qos;
618 workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false);
619 workq_unlock(wq);
620}
621
622#pragma mark idle threads accounting and handling
623
624static inline struct uthread *
625workq_oldest_killable_idle_thread(struct workqueue *wq)
626{
627 struct uthread *uth = TAILQ_LAST(&wq->wq_thidlelist, workq_uthread_head);
628
629 if (uth && !uth->uu_save.uus_workq_park_data.has_stack) {
630 uth = TAILQ_PREV(uth, workq_uthread_head, uu_workq_entry);
631 if (uth) {
632 assert(uth->uu_save.uus_workq_park_data.has_stack);
633 }
634 }
635 return uth;
636}
637
638static inline uint64_t
639workq_kill_delay_for_idle_thread(struct workqueue *wq)
640{
641 uint64_t delay = wq_reduce_pool_window.abstime;
642 uint16_t idle = wq->wq_thidlecount;
643
644 /*
645 * If we have fewer than wq_death_max_load threads, use a 5s timer.
646 *
647 * For the next wq_max_constrained_threads ones, decay linearly
648 * from 5s to 50ms.
649 */
650 if (idle <= wq_death_max_load) {
651 return delay;
652 }
653
654 if (wq_max_constrained_threads > idle - wq_death_max_load) {
655 delay *= (wq_max_constrained_threads - (idle - wq_death_max_load));
656 }
657 return delay / wq_max_constrained_threads;
658}
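/*
 * Worked example, not part of the original source: with the default
 * wq_reduce_pool_window of 5s, wq_death_max_load = L and
 * wq_max_constrained_threads = N, an idle count of L + k (for 0 < k < N)
 * gives
 *
 *   delay = 5s * (N - k) / N
 *
 * so the first thread past the load threshold still waits close to 5s,
 * while the last ones are reclaimed after roughly 5s / N (the "50ms" in the
 * comment above corresponds to N on the order of 100).
 */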
659
660static inline bool
661workq_should_kill_idle_thread(struct workqueue *wq, struct uthread *uth,
662 uint64_t now)
663{
664 uint64_t delay = workq_kill_delay_for_idle_thread(wq);
665 return now - uth->uu_save.uus_workq_park_data.idle_stamp > delay;
666}
667
668static void
669workq_death_call_schedule(struct workqueue *wq, uint64_t deadline)
670{
671 uint32_t wq_flags = os_atomic_load(&wq->wq_flags, relaxed);
672
673 if (wq_flags & (WQ_EXITING | WQ_DEATH_CALL_SCHEDULED)) {
674 return;
675 }
676 os_atomic_or(&wq->wq_flags, WQ_DEATH_CALL_SCHEDULED, relaxed);
677
678 WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_NONE, wq, 1, 0, 0, 0);
679
680 /*
681 * <rdar://problem/13139182> Due to how long term timers work, the leeway
682 * can't be too short, so use 500ms which is long enough that we will not
683 * wake up the CPU for killing threads, but short enough that it doesn't
684 * fall into long-term timer list shenanigans.
685 */
686 thread_call_enter_delayed_with_leeway(wq->wq_death_call, NULL, deadline,
687 wq_reduce_pool_window.abstime / 10,
688 THREAD_CALL_DELAY_LEEWAY | THREAD_CALL_DELAY_USER_BACKGROUND);
689}
690
691/*
692 * `decrement` is set to the number of threads that are no longer dying:
693 * - because they have been resuscitated just in time (workq_pop_idle_thread)
694 * - or have been killed (workq_thread_terminate).
695 */
696static void
697workq_death_policy_evaluate(struct workqueue *wq, uint16_t decrement)
698{
699 struct uthread *uth;
700
701 assert(wq->wq_thdying_count >= decrement);
702 if ((wq->wq_thdying_count -= decrement) > 0) {
703 return;
704 }
705
706 if (wq->wq_thidlecount <= 1) {
707 return;
708 }
709
710 if ((uth = workq_oldest_killable_idle_thread(wq)) == NULL) {
711 return;
712 }
713
714 uint64_t now = mach_absolute_time();
715 uint64_t delay = workq_kill_delay_for_idle_thread(wq);
716
717 if (now - uth->uu_save.uus_workq_park_data.idle_stamp > delay) {
718 WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_START,
719 wq, wq->wq_thidlecount, 0, 0, 0);
720 wq->wq_thdying_count++;
721 uth->uu_workq_flags |= UT_WORKQ_DYING;
722 if ((uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) == 0) {
723 workq_thread_wakeup(uth);
724 }
725 return;
726 }
727
728 workq_death_call_schedule(wq,
729 uth->uu_save.uus_workq_park_data.idle_stamp + delay);
730}
731
732void
733workq_thread_terminate(struct proc *p, struct uthread *uth)
734{
735 struct workqueue *wq = proc_get_wqptr_fast(p);
736
737 workq_lock_spin(wq);
738 TAILQ_REMOVE(&wq->wq_thrunlist, uth, uu_workq_entry);
739 if (uth->uu_workq_flags & UT_WORKQ_DYING) {
740 WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_END,
741 wq, wq->wq_thidlecount, 0, 0, 0);
742 workq_death_policy_evaluate(wq, 1);
743 }
744 if (wq->wq_nthreads-- == wq_max_threads) {
745 /*
746 * We got under the thread limit again, which may have prevented
747 * thread creation from happening, redrive if there are pending requests
748 */
749 if (wq->wq_reqcount) {
750 workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
751 }
752 }
753 workq_unlock(wq);
754
755 thread_deallocate(uth->uu_thread);
756}
757
758static void
759workq_kill_old_threads_call(void *param0, void *param1 __unused)
760{
761 struct workqueue *wq = param0;
762
763 workq_lock_spin(wq);
764 WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_START, wq, 0, 0, 0, 0);
765 os_atomic_andnot(&wq->wq_flags, WQ_DEATH_CALL_SCHEDULED, relaxed);
766 workq_death_policy_evaluate(wq, 0);
767 WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_END, wq, 0, 0, 0, 0);
768 workq_unlock(wq);
769}
770
771static struct uthread *
772workq_pop_idle_thread(struct workqueue *wq, uint8_t uu_flags,
773 bool *needs_wakeup)
774{
775 struct uthread *uth;
776
777 if ((uth = TAILQ_FIRST(&wq->wq_thidlelist))) {
778 TAILQ_REMOVE(&wq->wq_thidlelist, uth, uu_workq_entry);
779 } else {
780 uth = TAILQ_FIRST(&wq->wq_thnewlist);
781 TAILQ_REMOVE(&wq->wq_thnewlist, uth, uu_workq_entry);
782 }
783 TAILQ_INSERT_TAIL(&wq->wq_thrunlist, uth, uu_workq_entry);
784
785 assert((uth->uu_workq_flags & UT_WORKQ_RUNNING) == 0);
786 uth->uu_workq_flags |= UT_WORKQ_RUNNING | uu_flags;
787 if ((uu_flags & UT_WORKQ_OVERCOMMIT) == 0) {
788 wq->wq_constrained_threads_scheduled++;
789 }
790 wq->wq_threads_scheduled++;
791 wq->wq_thidlecount--;
792
793 if (__improbable(uth->uu_workq_flags & UT_WORKQ_DYING)) {
794 uth->uu_workq_flags ^= UT_WORKQ_DYING;
795 workq_death_policy_evaluate(wq, 1);
796 *needs_wakeup = false;
797 } else if (uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) {
798 *needs_wakeup = false;
799 } else {
800 *needs_wakeup = true;
801 }
802 return uth;
803}
804
805/*
806 * Called by thread_create_workq_waiting() during thread initialization, before
807 * assert_wait, before the thread has been started.
808 */
809event_t
810workq_thread_init_and_wq_lock(task_t task, thread_t th)
811{
812 struct uthread *uth = get_bsdthread_info(th);
813
814 uth->uu_workq_flags = UT_WORKQ_NEW;
815 uth->uu_workq_pri = WORKQ_POLICY_INIT(THREAD_QOS_LEGACY);
816 uth->uu_workq_thport = MACH_PORT_NULL;
817 uth->uu_workq_stackaddr = 0;
818 uth->uu_workq_pthread_kill_allowed = 0;
819
820 thread_set_tag(th, THREAD_TAG_PTHREAD | THREAD_TAG_WORKQUEUE);
821 thread_reset_workq_qos(th, THREAD_QOS_LEGACY);
822
823 workq_lock_spin(proc_get_wqptr_fast(get_bsdtask_info(task)));
824 return workq_parked_wait_event(uth);
825}
826
827/**
828 * Try to add a new workqueue thread.
829 *
830 * - called with workq lock held
831 * - dropped and retaken around thread creation
832 * - return with workq lock held
833 */
834static bool
835workq_add_new_idle_thread(proc_t p, struct workqueue *wq)
836{
837 mach_vm_offset_t th_stackaddr;
838 kern_return_t kret;
839 thread_t th;
840
841 wq->wq_nthreads++;
842
843 workq_unlock(wq);
844
845 vm_map_t vmap = get_task_map(p->task);
846
847 kret = pthread_functions->workq_create_threadstack(p, vmap, &th_stackaddr);
848 if (kret != KERN_SUCCESS) {
849 WQ_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq,
850 kret, 1, 0, 0);
851 goto out;
852 }
853
854 kret = thread_create_workq_waiting(p->task, workq_unpark_continue, &th);
855 if (kret != KERN_SUCCESS) {
856 WQ_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq,
857 kret, 0, 0, 0);
858 pthread_functions->workq_destroy_threadstack(p, vmap, th_stackaddr);
859 goto out;
860 }
861
862 // thread_create_workq_waiting() will return with the wq lock held
863 // on success, because it calls workq_thread_init_and_wq_lock() above
864
865 struct uthread *uth = get_bsdthread_info(th);
866
867 wq->wq_creations++;
868 wq->wq_thidlecount++;
869 uth->uu_workq_stackaddr = th_stackaddr;
870 TAILQ_INSERT_TAIL(&wq->wq_thnewlist, uth, uu_workq_entry);
871
872 WQ_TRACE_WQ(TRACE_wq_thread_create | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
873 return true;
874
875out:
876 workq_lock_spin(wq);
877 /*
878 * Do not redrive here if we went under wq_max_threads again,
879 * it is the responsibility of the callers of this function
880 * to do so when it fails.
881 */
882 wq->wq_nthreads--;
883 return false;
884}
885
886#define WORKQ_UNPARK_FOR_DEATH_WAS_IDLE 0x1
887
888__attribute__((noreturn, noinline))
889static void
890workq_unpark_for_death_and_unlock(proc_t p, struct workqueue *wq,
891 struct uthread *uth, uint32_t death_flags, uint32_t setup_flags)
892{
893 thread_qos_t qos = workq_pri_override(uth->uu_workq_pri);
894 bool first_use = uth->uu_workq_flags & UT_WORKQ_NEW;
895
896 if (qos > WORKQ_THREAD_QOS_CLEANUP) {
897 workq_thread_reset_pri(wq, uth, NULL, /*unpark*/ true);
898 qos = WORKQ_THREAD_QOS_CLEANUP;
899 }
900
901 workq_thread_reset_cpupercent(NULL, uth);
902
903 if (death_flags & WORKQ_UNPARK_FOR_DEATH_WAS_IDLE) {
904 wq->wq_thidlecount--;
905 if (first_use) {
906 TAILQ_REMOVE(&wq->wq_thnewlist, uth, uu_workq_entry);
907 } else {
908 TAILQ_REMOVE(&wq->wq_thidlelist, uth, uu_workq_entry);
909 }
910 }
911 TAILQ_INSERT_TAIL(&wq->wq_thrunlist, uth, uu_workq_entry);
912
913 workq_unlock(wq);
914
915 if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) {
916 __assert_only kern_return_t kr;
917 kr = thread_set_voucher_name(MACH_PORT_NULL);
918 assert(kr == KERN_SUCCESS);
919 }
920
921 uint32_t flags = WQ_FLAG_THREAD_NEWSPI | qos | WQ_FLAG_THREAD_PRIO_QOS;
922 thread_t th = uth->uu_thread;
923 vm_map_t vmap = get_task_map(p->task);
924
925 if (!first_use) {
926 flags |= WQ_FLAG_THREAD_REUSE;
927 }
928
929 pthread_functions->workq_setup_thread(p, th, vmap, uth->uu_workq_stackaddr,
930 uth->uu_workq_thport, 0, WQ_SETUP_EXIT_THREAD, flags);
931 __builtin_unreachable();
932}
933
934bool
935workq_is_current_thread_updating_turnstile(struct workqueue *wq)
936{
937 return wq->wq_turnstile_updater == current_thread();
938}
939
940__attribute__((always_inline))
941static inline void
942workq_perform_turnstile_operation_locked(struct workqueue *wq,
943 void (^operation)(void))
944{
945 workq_lock_held(wq);
946 wq->wq_turnstile_updater = current_thread();
947 operation();
948 wq->wq_turnstile_updater = THREAD_NULL;
949}
950
951static void
952workq_turnstile_update_inheritor(struct workqueue *wq,
953 turnstile_inheritor_t inheritor,
954 turnstile_update_flags_t flags)
955{
956 if (wq->wq_inheritor == inheritor) {
957 return;
958 }
959 wq->wq_inheritor = inheritor;
960 workq_perform_turnstile_operation_locked(wq, ^{
961 turnstile_update_inheritor(wq->wq_turnstile, inheritor,
962 flags | TURNSTILE_IMMEDIATE_UPDATE);
963 turnstile_update_inheritor_complete(wq->wq_turnstile,
964 TURNSTILE_INTERLOCK_HELD);
965 });
966}
967
968static void
969workq_push_idle_thread(proc_t p, struct workqueue *wq, struct uthread *uth,
970 uint32_t setup_flags)
971{
972 uint64_t now = mach_absolute_time();
973 bool is_creator = (uth == wq->wq_creator);
974
975 if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) {
976 wq->wq_constrained_threads_scheduled--;
977 }
978 uth->uu_workq_flags &= ~(UT_WORKQ_RUNNING | UT_WORKQ_OVERCOMMIT);
979 TAILQ_REMOVE(&wq->wq_thrunlist, uth, uu_workq_entry);
980 wq->wq_threads_scheduled--;
981
982 if (is_creator) {
983 wq->wq_creator = NULL;
984 WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 3, 0,
985 uth->uu_save.uus_workq_park_data.yields, 0);
986 }
987
988 if (wq->wq_inheritor == uth->uu_thread) {
989 assert(wq->wq_creator == NULL);
990 if (wq->wq_reqcount) {
991 workq_turnstile_update_inheritor(wq, wq, TURNSTILE_INHERITOR_WORKQ);
992 } else {
993 workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
994 }
995 }
996
997 if (uth->uu_workq_flags & UT_WORKQ_NEW) {
998 assert(is_creator || (_wq_flags(wq) & WQ_EXITING));
999 TAILQ_INSERT_TAIL(&wq->wq_thnewlist, uth, uu_workq_entry);
1000 wq->wq_thidlecount++;
1001 return;
1002 }
1003
1004 if (!is_creator) {
1005 _wq_thactive_dec(wq, uth->uu_workq_pri.qos_bucket);
1006 wq->wq_thscheduled_count[_wq_bucket(uth->uu_workq_pri.qos_bucket)]--;
1007 uth->uu_workq_flags |= UT_WORKQ_IDLE_CLEANUP;
1008 }
1009
1010 uth->uu_save.uus_workq_park_data.idle_stamp = now;
1011
1012 struct uthread *oldest = workq_oldest_killable_idle_thread(wq);
1013 uint16_t cur_idle = wq->wq_thidlecount;
1014
1015 if (cur_idle >= wq_max_constrained_threads ||
1016 (wq->wq_thdying_count == 0 && oldest &&
1017 workq_should_kill_idle_thread(wq, oldest, now))) {
1018 /*
1019 * Immediately kill threads if we have too many of them.
1020 *
1021 * And swap "place" with the oldest one we'd have woken up.
1022 * This is a relatively desperate situation where we really
1023 * need to kill threads quickly, and it's better to kill
1024 * the one that's currently on core than to context switch.
1025 */
1026 if (oldest) {
1027 oldest->uu_save.uus_workq_park_data.idle_stamp = now;
1028 TAILQ_REMOVE(&wq->wq_thidlelist, oldest, uu_workq_entry);
1029 TAILQ_INSERT_HEAD(&wq->wq_thidlelist, oldest, uu_workq_entry);
1030 }
1031
1032 WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_START,
1033 wq, cur_idle, 0, 0, 0);
1034 wq->wq_thdying_count++;
1035 uth->uu_workq_flags |= UT_WORKQ_DYING;
1036 uth->uu_workq_flags &= ~UT_WORKQ_IDLE_CLEANUP;
1037 workq_unpark_for_death_and_unlock(p, wq, uth, 0, setup_flags);
1038 __builtin_unreachable();
1039 }
1040
1041 struct uthread *tail = TAILQ_LAST(&wq->wq_thidlelist, workq_uthread_head);
1042
1043 cur_idle += 1;
1044 wq->wq_thidlecount = cur_idle;
1045
1046 if (cur_idle >= wq_death_max_load && tail &&
1047 tail->uu_save.uus_workq_park_data.has_stack) {
1048 uth->uu_save.uus_workq_park_data.has_stack = false;
1049 TAILQ_INSERT_TAIL(&wq->wq_thidlelist, uth, uu_workq_entry);
1050 } else {
1051 uth->uu_save.uus_workq_park_data.has_stack = true;
1052 TAILQ_INSERT_HEAD(&wq->wq_thidlelist, uth, uu_workq_entry);
1053 }
1054
1055 if (!tail) {
1056 uint64_t delay = workq_kill_delay_for_idle_thread(wq);
1057 workq_death_call_schedule(wq, now + delay);
1058 }
1059}
1060
1061#pragma mark thread requests
1062
1063static inline int
1064workq_priority_for_req(workq_threadreq_t req)
1065{
1066 thread_qos_t qos = req->tr_qos;
1067
1068 if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
1069 workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(req);
1070 assert(trp.trp_flags & TRP_PRIORITY);
1071 return trp.trp_pri;
1072 }
1073 return thread_workq_pri_for_qos(qos);
1074}
1075
1076static inline struct priority_queue *
1077workq_priority_queue_for_req(struct workqueue *wq, workq_threadreq_t req)
1078{
1079 if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
1080 return &wq->wq_special_queue;
1081 } else if (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
1082 return &wq->wq_overcommit_queue;
1083 } else {
1084 return &wq->wq_constrained_queue;
1085 }
1086}
1087
1088/*
1089 * returns true if the enqueued request is the highest priority item
1090 * in its priority queue.
1091 */
1092static bool
1093workq_threadreq_enqueue(struct workqueue *wq, workq_threadreq_t req)
1094{
1095 assert(req->tr_state == WORKQ_TR_STATE_NEW);
1096
1097 req->tr_state = WORKQ_TR_STATE_QUEUED;
1098 wq->wq_reqcount += req->tr_count;
1099
1100 if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
1101 assert(wq->wq_event_manager_threadreq == NULL);
1102 assert(req->tr_flags & WORKQ_TR_FLAG_KEVENT);
1103 assert(req->tr_count == 1);
1104 wq->wq_event_manager_threadreq = req;
1105 return true;
1106 }
1107 if (priority_queue_insert(workq_priority_queue_for_req(wq, req),
1108 &req->tr_entry, workq_priority_for_req(req),
1109 PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
1110 if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
1111 _wq_thactive_refresh_best_constrained_req_qos(wq);
1112 }
1113 return true;
1114 }
1115 return false;
1116}
1117
1118/*
1119 * returns true if the dequeued request was the highest priority item
1120 * in its priority queue.
1121 */
1122static bool
1123workq_threadreq_dequeue(struct workqueue *wq, workq_threadreq_t req)
1124{
1125 wq->wq_reqcount--;
1126
1127 if (--req->tr_count == 0) {
1128 if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
1129 assert(wq->wq_event_manager_threadreq == req);
1130 assert(req->tr_count == 0);
1131 wq->wq_event_manager_threadreq = NULL;
1132 return true;
1133 }
1134 if (priority_queue_remove(workq_priority_queue_for_req(wq, req),
1135 &req->tr_entry, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
1136 if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
1137 _wq_thactive_refresh_best_constrained_req_qos(wq);
1138 }
1139 return true;
1140 }
1141 }
1142 return false;
1143}
1144
1145static void
1146workq_threadreq_destroy(proc_t p, workq_threadreq_t req)
1147{
1148 req->tr_state = WORKQ_TR_STATE_CANCELED;
1149 if (req->tr_flags & (WORKQ_TR_FLAG_WORKLOOP | WORKQ_TR_FLAG_KEVENT)) {
1150 kqueue_threadreq_cancel(p, req);
1151 } else {
1152 zfree(workq_zone_threadreq, req);
1153 }
1154}
1155
1156#pragma mark workqueue thread creation thread calls
1157
1158static inline bool
1159workq_thread_call_prepost(struct workqueue *wq, uint32_t sched, uint32_t pend,
1160 uint32_t fail_mask)
1161{
1162 uint32_t old_flags, new_flags;
1163
1164 os_atomic_rmw_loop(&wq->wq_flags, old_flags, new_flags, acquire, {
1165 if (__improbable(old_flags & (WQ_EXITING | sched | pend | fail_mask))) {
1166 os_atomic_rmw_loop_give_up(return false);
1167 }
1168 if (__improbable(old_flags & WQ_PROC_SUSPENDED)) {
1169 new_flags = old_flags | pend;
1170 } else {
1171 new_flags = old_flags | sched;
1172 }
1173 });
1174
1175 return (old_flags & WQ_PROC_SUSPENDED) == 0;
1176}
1177
1178#define WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART 0x1
1179
1180static bool
1181workq_schedule_delayed_thread_creation(struct workqueue *wq, int flags)
1182{
1183 assert(!preemption_enabled());
1184
1185 if (!workq_thread_call_prepost(wq, WQ_DELAYED_CALL_SCHEDULED,
1186 WQ_DELAYED_CALL_PENDED, WQ_IMMEDIATE_CALL_PENDED |
1187 WQ_IMMEDIATE_CALL_SCHEDULED)) {
1188 return false;
1189 }
1190
1191 uint64_t now = mach_absolute_time();
1192
1193 if (flags & WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART) {
1194 /* do not change the window */
1195 } else if (now - wq->wq_thread_call_last_run <= wq->wq_timer_interval) {
1196 wq->wq_timer_interval *= 2;
1197 if (wq->wq_timer_interval > wq_max_timer_interval.abstime) {
1198 wq->wq_timer_interval = wq_max_timer_interval.abstime;
1199 }
1200 } else if (now - wq->wq_thread_call_last_run > 2 * wq->wq_timer_interval) {
1201 wq->wq_timer_interval /= 2;
1202 if (wq->wq_timer_interval < wq_stalled_window.abstime) {
1203 wq->wq_timer_interval = wq_stalled_window.abstime;
1204 }
1205 }
1206
1207 WQ_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1208 _wq_flags(wq), wq->wq_timer_interval, 0);
1209
1210 thread_call_t call = wq->wq_delayed_call;
1211 uintptr_t arg = WQ_DELAYED_CALL_SCHEDULED;
1212 uint64_t deadline = now + wq->wq_timer_interval;
1213 if (thread_call_enter1_delayed(call, (void *)arg, deadline)) {
1214 panic("delayed_call was already enqueued");
1215 }
1216 return true;
1217}
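/*
 * Summary of the adaptive window above (illustrative, not part of the
 * original source):
 *
 *  - if the delayed call ran again within wq_timer_interval of its previous
 *    run, the window doubles, capped at wq_max_timer_interval.abstime, which
 *    throttles repeated thread-creation attempts under sustained load;
 *  - if it has been quiet for more than twice the window, the window is
 *    halved, floored at wq_stalled_window.abstime, so an idle workqueue
 *    becomes responsive again;
 *  - WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART (used from
 *    workq_proc_resumed() below) leaves the window untouched.
 */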
1218
1219static void
1220workq_schedule_immediate_thread_creation(struct workqueue *wq)
1221{
1222 assert(!preemption_enabled());
1223
1224 if (workq_thread_call_prepost(wq, WQ_IMMEDIATE_CALL_SCHEDULED,
1225 WQ_IMMEDIATE_CALL_PENDED, 0)) {
1226 WQ_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1227 _wq_flags(wq), 0, 0);
1228
1229 uintptr_t arg = WQ_IMMEDIATE_CALL_SCHEDULED;
1230 if (thread_call_enter1(wq->wq_immediate_call, (void *)arg)) {
1231 panic("immediate_call was already enqueued");
1232 }
1233 }
1234}
1235
1236void
1237workq_proc_suspended(struct proc *p)
1238{
1239 struct workqueue *wq = proc_get_wqptr(p);
1240
1241 if (wq) {
1242 os_atomic_or(&wq->wq_flags, WQ_PROC_SUSPENDED, relaxed);
1243 }
1244}
1245
1246void
1247workq_proc_resumed(struct proc *p)
1248{
1249 struct workqueue *wq = proc_get_wqptr(p);
1250 uint32_t wq_flags;
1251
1252 if (!wq) {
1253 return;
1254 }
1255
1256 wq_flags = os_atomic_andnot_orig(&wq->wq_flags, WQ_PROC_SUSPENDED |
1257 WQ_DELAYED_CALL_PENDED | WQ_IMMEDIATE_CALL_PENDED, relaxed);
1258 if ((wq_flags & WQ_EXITING) == 0) {
1259 disable_preemption();
1260 if (wq_flags & WQ_IMMEDIATE_CALL_PENDED) {
1261 workq_schedule_immediate_thread_creation(wq);
1262 } else if (wq_flags & WQ_DELAYED_CALL_PENDED) {
1263 workq_schedule_delayed_thread_creation(wq,
1264 WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART);
1265 }
1266 enable_preemption();
1267 }
1268}
1269
1270/**
1271 * returns whether lastblocked_tsp is within wq_stalled_window usecs of now
1272 */
1273static bool
1274workq_thread_is_busy(uint64_t now, _Atomic uint64_t *lastblocked_tsp)
1275{
1276 uint64_t lastblocked_ts = os_atomic_load_wide(lastblocked_tsp, relaxed);
1277 if (now <= lastblocked_ts) {
1278 /*
1279 * Because the update of the timestamp when a thread blocks
1280 * isn't serialized against us looking at it (i.e. we don't hold
1281 * the workq lock), it's possible to have a timestamp that matches
1282 * the current time or that even looks to be in the future relative
1283 * to when we grabbed the current time...
1284 *
1285 * Just treat this as a busy thread since it must have just blocked.
1286 */
1287 return true;
1288 }
1289 return (now - lastblocked_ts) < wq_stalled_window.abstime;
1290}
1291
1292static void
1293workq_add_new_threads_call(void *_p, void *flags)
1294{
1295 proc_t p = _p;
1296 struct workqueue *wq = proc_get_wqptr(p);
1297 uint32_t my_flag = (uint32_t)(uintptr_t)flags;
1298
1299 /*
1300 * workq_exit() will set the workqueue to NULL before
1301 * it cancels thread calls.
1302 */
1303 if (!wq) {
1304 return;
1305 }
1306
1307 assert((my_flag == WQ_DELAYED_CALL_SCHEDULED) ||
1308 (my_flag == WQ_IMMEDIATE_CALL_SCHEDULED));
1309
1310 WQ_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_START, wq, _wq_flags(wq),
1311 wq->wq_nthreads, wq->wq_thidlecount, 0);
1312
1313 workq_lock_spin(wq);
1314
1315 wq->wq_thread_call_last_run = mach_absolute_time();
1316 os_atomic_andnot(&wq->wq_flags, my_flag, release);
1317
1318 /* This can drop the workqueue lock, and take it again */
1319 workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
1320
1321 workq_unlock(wq);
1322
1323 WQ_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_END, wq, 0,
1324 wq->wq_nthreads, wq->wq_thidlecount, 0);
1325}
1326
1327#pragma mark thread state tracking
1328
1329static void
1330workq_sched_callback(int type, thread_t thread)
1331{
1332 struct uthread *uth = get_bsdthread_info(thread);
1333 proc_t proc = get_bsdtask_info(get_threadtask(thread));
1334 struct workqueue *wq = proc_get_wqptr(proc);
1335 thread_qos_t req_qos, qos = uth->uu_workq_pri.qos_bucket;
1336 wq_thactive_t old_thactive;
1337 bool start_timer = false;
1338
1339 if (qos == WORKQ_THREAD_QOS_MANAGER) {
1340 return;
1341 }
1342
1343 switch (type) {
1344 case SCHED_CALL_BLOCK:
1345 old_thactive = _wq_thactive_dec(wq, qos);
1346 req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
1347
1348 /*
1349 * Remember the timestamp of the last thread that blocked in this
1350 * bucket; it is used by admission checks to ignore one thread
1351 * being inactive if this timestamp is recent enough.
1352 *
1353 * If we collide with another thread trying to update the
1354 * last_blocked (really unlikely since another thread would have to
1355 * get scheduled and then block after we start down this path), it's
1356 * not a problem. Either timestamp is adequate, so no need to retry
1357 */
1358 os_atomic_store_wide(&wq->wq_lastblocked_ts[_wq_bucket(qos)],
1359 thread_last_run_time(thread), relaxed);
1360
1361 if (req_qos == THREAD_QOS_UNSPECIFIED) {
1362 /*
1363 * No pending request at the moment we could unblock, move on.
1364 */
1365 } else if (qos < req_qos) {
1366 /*
1367 * The blocking thread is at a lower QoS than the highest currently
1368 * pending constrained request, nothing has to be redriven
1369 */
1370 } else {
1371 uint32_t max_busycount, old_req_count;
1372 old_req_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
1373 req_qos, NULL, &max_busycount);
1374 /*
1375 * If it is possible that may_start_constrained_thread had refused
1376 * admission due to being over the max concurrency, we may need to
1377 * spin up a new thread.
1378 *
1379 * We take into account the maximum number of busy threads
1380 * that can affect may_start_constrained_thread as looking at the
1381 * actual number may_start_constrained_thread will see is racy.
1382 *
1383 * IOW at NCPU = 4, for IN (req_qos = 1), if the old req count is
1384 * between NCPU (4) and NCPU - 2 (2) we need to redrive.
1385 */
1386 uint32_t conc = wq_max_parallelism[_wq_bucket(qos)];
1387 if (old_req_count <= conc && conc <= old_req_count + max_busycount) {
1388 start_timer = workq_schedule_delayed_thread_creation(wq, 0);
1389 }
1390 }
1391 if (__improbable(kdebug_enable)) {
1392 __unused uint32_t old = _wq_thactive_aggregate_downto_qos(wq,
1393 old_thactive, qos, NULL, NULL);
1394 WQ_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_START, wq,
1395 old - 1, qos | (req_qos << 8),
1396 wq->wq_reqcount << 1 | start_timer, 0);
1397 }
1398 break;
1399
1400 case SCHED_CALL_UNBLOCK:
1401 /*
1402 * we cannot take the workqueue_lock here...
1403 * an UNBLOCK can occur from a timer event which
1404 * is run from an interrupt context... if the workqueue_lock
1405 * is already held by this processor, we'll deadlock...
1406 * the thread lock for the thread being UNBLOCKED
1407 * is also held
1408 */
1409 old_thactive = _wq_thactive_inc(wq, qos);
1410 if (__improbable(kdebug_enable)) {
1411 __unused uint32_t old = _wq_thactive_aggregate_downto_qos(wq,
1412 old_thactive, qos, NULL, NULL);
1413 req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
1414 WQ_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_END, wq,
1415 old + 1, qos | (req_qos << 8),
1416 wq->wq_threads_scheduled, 0);
1417 }
1418 break;
1419 }
1420}
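/*
 * Worked example for the redrive check in the SCHED_CALL_BLOCK case above
 * (illustrative, not part of the original source): with NCPU = 4 and a
 * constrained request pending at IN, conc = wq_max_parallelism[_wq_bucket(IN)]
 * is 4. If max_busycount is 2, a delayed thread-creation call is scheduled
 * whenever the pre-decrement aggregate old_req_count lies in
 * [conc - max_busycount, conc] = [2, 4]: in that range the blocking thread
 * may well be the one that was keeping may_start_constrained_thread() from
 * admitting the request, so the workqueue has to be redriven.
 */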
1421
1422#pragma mark workq lifecycle
1423
1424void
1425workq_reference(struct workqueue *wq)
1426{
1427 os_ref_retain(&wq->wq_refcnt);
1428}
1429
1429
1430static void
1431workq_deallocate_queue_invoke(mpsc_queue_chain_t e,
1432 __assert_only mpsc_daemon_queue_t dq)
1433{
1434 struct workqueue *wq;
1435 struct turnstile *ts;
1436
1437 wq = mpsc_queue_element(e, struct workqueue, wq_destroy_link);
1438 assert(dq == &workq_deallocate_queue);
1439
1440 turnstile_complete((uintptr_t)wq, &wq->wq_turnstile, &ts, TURNSTILE_WORKQS);
1441 assert(ts);
1442 turnstile_cleanup();
1443 turnstile_deallocate(ts);
1444
1445 lck_spin_destroy(&wq->wq_lock, workq_lck_grp);
1446 zfree(workq_zone_workqueue, wq);
1447}
1448
1449static void
1450workq_deallocate(struct workqueue *wq)
1451{
1452 if (os_ref_release_relaxed(&wq->wq_refcnt) == 0) {
1453 workq_deallocate_queue_invoke(&wq->wq_destroy_link,
1454 &workq_deallocate_queue);
1455 }
1456}
1457
1458void
1459workq_deallocate_safe(struct workqueue *wq)
1460{
1461 if (__improbable(os_ref_release_relaxed(&wq->wq_refcnt) == 0)) {
1462 mpsc_daemon_enqueue(&workq_deallocate_queue, &wq->wq_destroy_link,
1463 MPSC_QUEUE_DISABLE_PREEMPTION);
1464 }
1465}
1466
1467/**
1468 * Setup per-process state for the workqueue.
1469 */
1470int
1471workq_open(struct proc *p, __unused struct workq_open_args *uap,
1472 __unused int32_t *retval)
1473{
1474 struct workqueue *wq;
1475 int error = 0;
1476
1477 if ((p->p_lflag & P_LREGISTER) == 0) {
1478 return EINVAL;
1479 }
1480
1481 if (wq_init_constrained_limit) {
1482 uint32_t limit, num_cpus = ml_get_max_cpus();
1483
1484 /*
1485 * Set up the limit for the constrained pool;
1486 * this is a virtual pool in that we don't
1487 * maintain it on a separate idle and run list.
1488 */
1489 limit = num_cpus * WORKQUEUE_CONSTRAINED_FACTOR;
1490
1491 if (limit > wq_max_constrained_threads) {
1492 wq_max_constrained_threads = limit;
1493 }
1494
1495 if (wq_max_threads > WQ_THACTIVE_BUCKET_HALF) {
1496 wq_max_threads = WQ_THACTIVE_BUCKET_HALF;
1497 }
1498 if (wq_max_threads > CONFIG_THREAD_MAX - 20) {
1499 wq_max_threads = CONFIG_THREAD_MAX - 20;
1500 }
1501
1502 wq_death_max_load = (uint16_t)fls(num_cpus) + 1;
1503
1504 for (thread_qos_t qos = WORKQ_THREAD_QOS_MIN; qos <= WORKQ_THREAD_QOS_MAX; qos++) {
1505 wq_max_parallelism[_wq_bucket(qos)] =
1506 qos_max_parallelism(qos, QOS_PARALLELISM_COUNT_LOGICAL);
1507 }
1508
1509 wq_init_constrained_limit = 0;
1510 }
1511
1512 if (proc_get_wqptr(p) == NULL) {
1513 if (proc_init_wqptr_or_wait(p) == FALSE) {
1514 assert(proc_get_wqptr(p) != NULL);
1515 goto out;
1516 }
1517
1518 wq = (struct workqueue *)zalloc(workq_zone_workqueue);
1519 bzero(wq, sizeof(struct workqueue));
1520
1521 os_ref_init_count(&wq->wq_refcnt, &workq_refgrp, 1);
1522
1523 // Start the event manager at the priority hinted at by the policy engine
1524 thread_qos_t mgr_priority_hint = task_get_default_manager_qos(current_task());
1525 pthread_priority_t pp = _pthread_priority_make_from_thread_qos(mgr_priority_hint, 0, 0);
1526 wq->wq_event_manager_priority = (uint32_t)pp;
1527 wq->wq_timer_interval = wq_stalled_window.abstime;
1528 wq->wq_proc = p;
1529 turnstile_prepare((uintptr_t)wq, &wq->wq_turnstile, turnstile_alloc(),
1530 TURNSTILE_WORKQS);
1531
1532 TAILQ_INIT(&wq->wq_thrunlist);
1533 TAILQ_INIT(&wq->wq_thnewlist);
1534 TAILQ_INIT(&wq->wq_thidlelist);
1535 priority_queue_init(&wq->wq_overcommit_queue,
1536 PRIORITY_QUEUE_BUILTIN_MAX_HEAP);
1537 priority_queue_init(&wq->wq_constrained_queue,
1538 PRIORITY_QUEUE_BUILTIN_MAX_HEAP);
1539 priority_queue_init(&wq->wq_special_queue,
1540 PRIORITY_QUEUE_BUILTIN_MAX_HEAP);
1541
1542 wq->wq_delayed_call = thread_call_allocate_with_options(
1543 workq_add_new_threads_call, p, THREAD_CALL_PRIORITY_KERNEL,
1544 THREAD_CALL_OPTIONS_ONCE);
1545 wq->wq_immediate_call = thread_call_allocate_with_options(
1546 workq_add_new_threads_call, p, THREAD_CALL_PRIORITY_KERNEL,
1547 THREAD_CALL_OPTIONS_ONCE);
1548 wq->wq_death_call = thread_call_allocate_with_options(
1549 workq_kill_old_threads_call, wq,
1550 THREAD_CALL_PRIORITY_USER, THREAD_CALL_OPTIONS_ONCE);
1551
1552 lck_spin_init(&wq->wq_lock, workq_lck_grp, workq_lck_attr);
1553
1554 WQ_TRACE_WQ(TRACE_wq_create | DBG_FUNC_NONE, wq,
1555 VM_KERNEL_ADDRHIDE(wq), 0, 0, 0);
1556 proc_set_wqptr(p, wq);
1557 }
1558out:
1559
1560 return error;
1561}
1562
1563/*
1564 * Routine: workq_mark_exiting
1565 *
1566 * Function: Mark the work queue such that new threads will not be added to the
1567 * work queue after we return.
1568 *
1569 * Conditions: Called against the current process.
1570 */
1571void
1572workq_mark_exiting(struct proc *p)
1573{
1574 struct workqueue *wq = proc_get_wqptr(p);
1575 uint32_t wq_flags;
1576 workq_threadreq_t mgr_req;
1577
1578 if (!wq) {
1579 return;
1580 }
1581
1582 WQ_TRACE_WQ(TRACE_wq_pthread_exit | DBG_FUNC_START, wq, 0, 0, 0, 0);
1583
1584 workq_lock_spin(wq);
1585
1586 wq_flags = os_atomic_or_orig(&wq->wq_flags, WQ_EXITING, relaxed);
1587 if (__improbable(wq_flags & WQ_EXITING)) {
1588 panic("workq_mark_exiting called twice");
1589 }
1590
1591 /*
1592 * Opportunistically try to cancel thread calls that are likely in flight.
1593 * workq_exit() will do the proper cleanup.
1594 */
1595 if (wq_flags & WQ_IMMEDIATE_CALL_SCHEDULED) {
1596 thread_call_cancel(wq->wq_immediate_call);
1597 }
1598 if (wq_flags & WQ_DELAYED_CALL_SCHEDULED) {
1599 thread_call_cancel(wq->wq_delayed_call);
1600 }
1601 if (wq_flags & WQ_DEATH_CALL_SCHEDULED) {
1602 thread_call_cancel(wq->wq_death_call);
1603 }
1604
1605 mgr_req = wq->wq_event_manager_threadreq;
1606 wq->wq_event_manager_threadreq = NULL;
1607 wq->wq_reqcount = 0; /* workq_schedule_creator must not look at queues */
1608 wq->wq_creator = NULL;
1609 workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
1610
1611 workq_unlock(wq);
1612
1613 if (mgr_req) {
1614 kqueue_threadreq_cancel(p, mgr_req);
1615 }
1616 /*
1617 * No one touches the priority queues once WQ_EXITING is set.
1618 * It is hence safe to do the tear down without holding any lock.
1619 */
1620 priority_queue_destroy(&wq->wq_overcommit_queue,
1621 struct workq_threadreq_s, tr_entry, ^(void *e){
1622 workq_threadreq_destroy(p, e);
1623 });
1624 priority_queue_destroy(&wq->wq_constrained_queue,
1625 struct workq_threadreq_s, tr_entry, ^(void *e){
1626 workq_threadreq_destroy(p, e);
1627 });
1628 priority_queue_destroy(&wq->wq_special_queue,
1629 struct workq_threadreq_s, tr_entry, ^(void *e){
1630 workq_threadreq_destroy(p, e);
1631 });
1632
1633 WQ_TRACE(TRACE_wq_pthread_exit | DBG_FUNC_END, 0, 0, 0, 0, 0);
1634}
1635
1636/*
1637 * Routine: workq_exit
1638 *
1639 * Function: clean up the work queue structure(s) now that there are no threads
1640 * left running inside the work queue (except possibly current_thread).
1641 *
1642 * Conditions: Called by the last thread in the process.
1643 * Called against current process.
1644 */
1645void
1646workq_exit(struct proc *p)
1647{
1648 struct workqueue *wq;
1649 struct uthread *uth, *tmp;
1650
1651 wq = os_atomic_xchg(&p->p_wqptr, NULL, relaxed);
1652 if (wq != NULL) {
1653 thread_t th = current_thread();
1654
1655 WQ_TRACE_WQ(TRACE_wq_workqueue_exit | DBG_FUNC_START, wq, 0, 0, 0, 0);
1656
1657 if (thread_get_tag(th) & THREAD_TAG_WORKQUEUE) {
1658 /*
1659 * <rdar://problem/40111515> Make sure we will no longer call the
1660 * sched call, if we ever block this thread, which the cancel_wait
1661 * below can do.
1662 */
1663 thread_sched_call(th, NULL);
1664 }
1665
1666 /*
1667 * Thread calls are always scheduled by the proc itself or under the
1668 * workqueue spinlock if WQ_EXITING is not yet set.
1669 *
1670 * Either way, when this runs, the proc has no threads left beside
1671 * the one running this very code, so we know no thread call can be
1672 * dispatched anymore.
1673 */
1674 thread_call_cancel_wait(wq->wq_delayed_call);
1675 thread_call_cancel_wait(wq->wq_immediate_call);
1676 thread_call_cancel_wait(wq->wq_death_call);
1677 thread_call_free(wq->wq_delayed_call);
1678 thread_call_free(wq->wq_immediate_call);
1679 thread_call_free(wq->wq_death_call);
1680
1681 /*
1682 * Clean up workqueue data structures for threads that exited and
1683 * didn't get a chance to clean up after themselves.
1684 *
1685 * idle/new threads should have been interrupted and died on their own
1686 */
1687 TAILQ_FOREACH_SAFE(uth, &wq->wq_thrunlist, uu_workq_entry, tmp) {
1688 thread_sched_call(uth->uu_thread, NULL);
1689 thread_deallocate(uth->uu_thread);
1690 }
1691 assert(TAILQ_EMPTY(&wq->wq_thnewlist));
1692 assert(TAILQ_EMPTY(&wq->wq_thidlelist));
1693
1694 WQ_TRACE_WQ(TRACE_wq_destroy | DBG_FUNC_END, wq,
1695 VM_KERNEL_ADDRHIDE(wq), 0, 0, 0);
1696
1697 workq_deallocate(wq);
1698
1699 WQ_TRACE(TRACE_wq_workqueue_exit | DBG_FUNC_END, 0, 0, 0, 0, 0);
1700 }
1701}
1702
1703
1704#pragma mark bsd thread control
1705
1706static bool
1707_pthread_priority_to_policy(pthread_priority_t priority,
1708 thread_qos_policy_data_t *data)
1709{
1710 data->qos_tier = _pthread_priority_thread_qos(priority);
1711 data->tier_importance = _pthread_priority_relpri(priority);
1712 if (data->qos_tier == THREAD_QOS_UNSPECIFIED || data->tier_importance > 0 ||
0a7de745 1713 data->tier_importance < THREAD_QOS_MIN_TIER_IMPORTANCE) {
d9a64523
A
1714 return false;
1715 }
1716 return true;
1717}
1718
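/*
 * Illustrative sketch (not part of this file): the validation done by
 * _pthread_priority_to_policy() above, expressed against the public userspace
 * constants from <sys/qos.h>.  A request is accepted only when the QoS tier is
 * specified and the relative priority lies in [QOS_MIN_RELATIVE_PRIORITY, 0].
 */
#if 0 /* illustrative userspace sketch */
#include <stdbool.h>
#include <sys/qos.h>

static bool
example_qos_request_is_valid(qos_class_t qos, int relpri)
{
	if (qos == QOS_CLASS_UNSPECIFIED) {
		return false;
	}
	return relpri <= 0 && relpri >= QOS_MIN_RELATIVE_PRIORITY;
}
#endif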
1719static int
1720bsdthread_set_self(proc_t p, thread_t th, pthread_priority_t priority,
0a7de745 1721 mach_port_name_t voucher, enum workq_set_self_flags flags)
d9a64523
A
1722{
1723 struct uthread *uth = get_bsdthread_info(th);
1724 struct workqueue *wq = proc_get_wqptr(p);
1725
1726 kern_return_t kr;
1727 int unbind_rv = 0, qos_rv = 0, voucher_rv = 0, fixedpri_rv = 0;
1728 bool is_wq_thread = (thread_get_tag(th) & THREAD_TAG_WORKQUEUE);
1729
1730 if (flags & WORKQ_SET_SELF_WQ_KEVENT_UNBIND) {
1731 if (!is_wq_thread) {
1732 unbind_rv = EINVAL;
1733 goto qos;
1734 }
1735
1736 if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
1737 unbind_rv = EINVAL;
1738 goto qos;
1739 }
1740
cb323159 1741 workq_threadreq_t kqr = uth->uu_kqr_bound;
d9a64523
A
1742 if (kqr == NULL) {
1743 unbind_rv = EALREADY;
1744 goto qos;
1745 }
1746
cb323159 1747 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
d9a64523
A
1748 unbind_rv = EINVAL;
1749 goto qos;
1750 }
1751
cb323159 1752 kqueue_threadreq_unbind(p, kqr);
d9a64523
A
1753 }
1754
1755qos:
1756 if (flags & WORKQ_SET_SELF_QOS_FLAG) {
1757 thread_qos_policy_data_t new_policy;
1758
1759 if (!_pthread_priority_to_policy(priority, &new_policy)) {
1760 qos_rv = EINVAL;
1761 goto voucher;
1762 }
1763
1764 if (!is_wq_thread) {
1765 /*
1766 * Threads opted out of QoS can't change QoS
1767 */
1768 if (!thread_has_qos_policy(th)) {
1769 qos_rv = EPERM;
1770 goto voucher;
1771 }
cb323159
A
1772 } else if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER ||
1773 uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_ABOVEUI) {
d9a64523 1774 /*
cb323159 1775 * Workqueue manager threads or threads above UI can't change QoS
d9a64523
A
1776 */
1777 qos_rv = EINVAL;
1778 goto voucher;
1779 } else {
1780 /*
1781 * For workqueue threads, possibly adjust buckets and redrive thread
1782 * requests.
1783 */
1784 bool old_overcommit = uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT;
1785 bool new_overcommit = priority & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
1786 struct uu_workq_policy old_pri, new_pri;
1787 bool force_run = false;
1788
1789 workq_lock_spin(wq);
1790
1791 if (old_overcommit != new_overcommit) {
1792 uth->uu_workq_flags ^= UT_WORKQ_OVERCOMMIT;
1793 if (old_overcommit) {
1794 wq->wq_constrained_threads_scheduled++;
1795 } else if (wq->wq_constrained_threads_scheduled-- ==
0a7de745 1796 wq_max_constrained_threads) {
d9a64523
A
1797 force_run = true;
1798 }
1799 }
1800
1801 old_pri = new_pri = uth->uu_workq_pri;
1802 new_pri.qos_req = new_policy.qos_tier;
1803 workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, force_run);
1804 workq_unlock(wq);
1805 }
1806
1807 kr = thread_policy_set_internal(th, THREAD_QOS_POLICY,
0a7de745 1808 (thread_policy_t)&new_policy, THREAD_QOS_POLICY_COUNT);
d9a64523
A
1809 if (kr != KERN_SUCCESS) {
1810 qos_rv = EINVAL;
1811 }
1812 }
1813
1814voucher:
1815 if (flags & WORKQ_SET_SELF_VOUCHER_FLAG) {
1816 kr = thread_set_voucher_name(voucher);
1817 if (kr != KERN_SUCCESS) {
1818 voucher_rv = ENOENT;
1819 goto fixedpri;
1820 }
1821 }
1822
1823fixedpri:
0a7de745
A
1824 if (qos_rv) {
1825 goto done;
1826 }
d9a64523
A
1827 if (flags & WORKQ_SET_SELF_FIXEDPRIORITY_FLAG) {
1828 thread_extended_policy_data_t extpol = {.timeshare = 0};
1829
1830 if (is_wq_thread) {
1831 /* Not allowed on workqueue threads */
1832 fixedpri_rv = ENOTSUP;
1833 goto done;
1834 }
1835
1836 kr = thread_policy_set_internal(th, THREAD_EXTENDED_POLICY,
0a7de745 1837 (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
d9a64523
A
1838 if (kr != KERN_SUCCESS) {
1839 fixedpri_rv = EINVAL;
1840 goto done;
1841 }
1842 } else if (flags & WORKQ_SET_SELF_TIMESHARE_FLAG) {
1843 thread_extended_policy_data_t extpol = {.timeshare = 1};
1844
1845 if (is_wq_thread) {
1846 /* Not allowed on workqueue threads */
1847 fixedpri_rv = ENOTSUP;
1848 goto done;
1849 }
1850
1851 kr = thread_policy_set_internal(th, THREAD_EXTENDED_POLICY,
0a7de745 1852 (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
d9a64523
A
1853 if (kr != KERN_SUCCESS) {
1854 fixedpri_rv = EINVAL;
1855 goto done;
1856 }
1857 }
1858
1859done:
1860 if (qos_rv && voucher_rv) {
1861 /* Both failed, give that a unique error. */
1862 return EBADMSG;
1863 }
1864
1865 if (unbind_rv) {
1866 return unbind_rv;
1867 }
1868
1869 if (qos_rv) {
1870 return qos_rv;
1871 }
1872
1873 if (voucher_rv) {
1874 return voucher_rv;
1875 }
1876
1877 if (fixedpri_rv) {
1878 return fixedpri_rv;
1879 }
1880
1881 return 0;
1882}
1883
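/*
 * Illustrative userspace sketch (an assumption about the calling side, not
 * part of this file): pthread_set_qos_class_self_np() is the public API that
 * is normally routed through BSDTHREAD_CTL_SET_SELF / bsdthread_set_self()
 * above for the WORKQ_SET_SELF_QOS_FLAG case.
 */
#if 0 /* illustrative userspace sketch */
#include <errno.h>
#include <pthread.h>
#include <pthread/qos.h>

static void
example_lower_own_qos(void)
{
	/* QoS tier must be specified; relative priority must be in -15..0. */
	int rc = pthread_set_qos_class_self_np(QOS_CLASS_UTILITY, -2);
	if (rc == EPERM) {
		/* matches the "opted out of QoS can't change QoS" check above */
	}
}
#endif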
1884static int
1885bsdthread_add_explicit_override(proc_t p, mach_port_name_t kport,
0a7de745 1886 pthread_priority_t pp, user_addr_t resource)
d9a64523
A
1887{
1888 thread_qos_t qos = _pthread_priority_thread_qos(pp);
1889 if (qos == THREAD_QOS_UNSPECIFIED) {
1890 return EINVAL;
1891 }
1892
cb323159
A
1893 thread_t th = port_name_to_thread(kport,
1894 PORT_TO_THREAD_IN_CURRENT_TASK);
d9a64523
A
1895 if (th == THREAD_NULL) {
1896 return ESRCH;
1897 }
1898
1899 int rv = proc_thread_qos_add_override(p->task, th, 0, qos, TRUE,
0a7de745 1900 resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);
d9a64523
A
1901
1902 thread_deallocate(th);
1903 return rv;
1904}
1905
1906static int
1907bsdthread_remove_explicit_override(proc_t p, mach_port_name_t kport,
0a7de745 1908 user_addr_t resource)
d9a64523 1909{
cb323159
A
1910 thread_t th = port_name_to_thread(kport,
1911 PORT_TO_THREAD_IN_CURRENT_TASK);
d9a64523
A
1912 if (th == THREAD_NULL) {
1913 return ESRCH;
1914 }
1915
1916 int rv = proc_thread_qos_remove_override(p->task, th, 0, resource,
0a7de745 1917 THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);
d9a64523
A
1918
1919 thread_deallocate(th);
1920 return rv;
1921}
1922
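/*
 * Illustrative userspace sketch (an assumption about the calling side): the
 * add/remove pair above typically backs pthread_override_qos_class_start_np()
 * and pthread_override_qos_class_end_np(), which boost another thread until
 * the override is explicitly ended.
 */
#if 0 /* illustrative userspace sketch */
#include <pthread.h>
#include <pthread/qos.h>

static void
example_boost_worker(pthread_t worker)
{
	pthread_override_t o = pthread_override_qos_class_start_np(worker,
	    QOS_CLASS_USER_INITIATED, 0);
	if (o != NULL) {
		/* ... wait until the worker passes the contended section ... */
		pthread_override_qos_class_end_np(o);
	}
}
#endif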
1923static int
1924workq_thread_add_dispatch_override(proc_t p, mach_port_name_t kport,
0a7de745 1925 pthread_priority_t pp, user_addr_t ulock_addr)
d9a64523
A
1926{
1927 struct uu_workq_policy old_pri, new_pri;
1928 struct workqueue *wq = proc_get_wqptr(p);
1929
1930 thread_qos_t qos_override = _pthread_priority_thread_qos(pp);
1931 if (qos_override == THREAD_QOS_UNSPECIFIED) {
1932 return EINVAL;
1933 }
1934
cb323159
A
1935 thread_t thread = port_name_to_thread(kport,
1936 PORT_TO_THREAD_IN_CURRENT_TASK);
d9a64523
A
1937 if (thread == THREAD_NULL) {
1938 return ESRCH;
1939 }
1940
1941 struct uthread *uth = get_bsdthread_info(thread);
1942 if ((thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) == 0) {
1943 thread_deallocate(thread);
1944 return EPERM;
1945 }
1946
1947 WQ_TRACE_WQ(TRACE_wq_override_dispatch | DBG_FUNC_NONE,
0a7de745 1948 wq, thread_tid(thread), 1, pp, 0);
d9a64523
A
1949
1950 thread_mtx_lock(thread);
1951
1952 if (ulock_addr) {
cb323159 1953 uint32_t val;
d9a64523
A
1954 int rc;
1955 /*
1956 * Workaround lack of explicit support for 'no-fault copyin'
1957 * <rdar://problem/24999882>, as disabling preemption prevents paging in
1958 */
1959 disable_preemption();
cb323159 1960 rc = copyin_atomic32(ulock_addr, &val);
d9a64523 1961 enable_preemption();
cb323159 1962 if (rc == 0 && ulock_owner_value_to_port_name(val) != kport) {
d9a64523
A
1963 goto out;
1964 }
1965 }
1966
1967 workq_lock_spin(wq);
1968
1969 old_pri = uth->uu_workq_pri;
1970 if (old_pri.qos_override >= qos_override) {
1971 /* Nothing to do */
1972 } else if (thread == current_thread()) {
1973 new_pri = old_pri;
1974 new_pri.qos_override = qos_override;
1975 workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false);
1976 } else {
1977 uth->uu_workq_pri.qos_override = qos_override;
1978 if (qos_override > workq_pri_override(old_pri)) {
1979 thread_set_workq_override(thread, qos_override);
1980 }
1981 }
1982
1983 workq_unlock(wq);
1984
1985out:
1986 thread_mtx_unlock(thread);
1987 thread_deallocate(thread);
1988 return 0;
1989}
1990
1991static int
1992workq_thread_reset_dispatch_override(proc_t p, thread_t thread)
1993{
1994 struct uu_workq_policy old_pri, new_pri;
1995 struct workqueue *wq = proc_get_wqptr(p);
1996 struct uthread *uth = get_bsdthread_info(thread);
1997
1998 if ((thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) == 0) {
1999 return EPERM;
2000 }
2001
2002 WQ_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
2003
2004 workq_lock_spin(wq);
2005 old_pri = new_pri = uth->uu_workq_pri;
2006 new_pri.qos_override = THREAD_QOS_UNSPECIFIED;
2007 workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false);
2008 workq_unlock(wq);
2009 return 0;
2010}
2011
cb323159
A
2012static int
2013workq_thread_allow_kill(__unused proc_t p, thread_t thread, bool enable)
2014{
2015 if (!(thread_get_tag(thread) & THREAD_TAG_WORKQUEUE)) {
2016 // If the thread isn't a workqueue thread, don't set the
2017 // kill_allowed bit; however, we still need to return 0
2018 // instead of an error code since this code is executed
2019 // on the abort path which needs to not depend on the
2020 // pthread_t (returning an error depends on pthread_t via
2021 // cerror_nocancel)
2022 return 0;
2023 }
2024 struct uthread *uth = get_bsdthread_info(thread);
2025 uth->uu_workq_pthread_kill_allowed = enable;
2026 return 0;
2027}
2028
d9a64523
A
2029static int
2030bsdthread_get_max_parallelism(thread_qos_t qos, unsigned long flags,
0a7de745 2031 int *retval)
d9a64523
A
2032{
2033 static_assert(QOS_PARALLELISM_COUNT_LOGICAL ==
0a7de745 2034 _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL, "logical");
d9a64523 2035 static_assert(QOS_PARALLELISM_REALTIME ==
0a7de745 2036 _PTHREAD_QOS_PARALLELISM_REALTIME, "realtime");
d9a64523
A
2037
2038 if (flags & ~(QOS_PARALLELISM_REALTIME | QOS_PARALLELISM_COUNT_LOGICAL)) {
2039 return EINVAL;
2040 }
2041
2042 if (flags & QOS_PARALLELISM_REALTIME) {
2043 if (qos) {
2044 return EINVAL;
2045 }
2046 } else if (qos == THREAD_QOS_UNSPECIFIED || qos >= THREAD_QOS_LAST) {
2047 return EINVAL;
2048 }
2049
2050 *retval = qos_max_parallelism(qos, flags);
2051 return 0;
2052}
2053
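/*
 * Illustrative userspace sketch (the SPI name and header below are
 * assumptions about the libpthread side; the flag/QoS checking is what
 * bsdthread_get_max_parallelism() above enforces): ask how many threads can
 * usefully run in parallel at a given QoS.
 */
#if 0 /* illustrative userspace sketch */
#include <sys/qos.h>
#include <pthread/qos_private.h>   /* assumed header for this SPI */

static int
example_useful_width(void)
{
	/*
	 * pthread_qos_max_parallelism() is an assumption about the userspace
	 * SPI name; flags == 0 asks for the default CPU counting and passes
	 * the validation done above.
	 */
	return pthread_qos_max_parallelism(QOS_CLASS_USER_INITIATED, 0);
}
#endif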
2054#define ENSURE_UNUSED(arg) \
0a7de745 2055 ({ if ((arg) != 0) { return EINVAL; } })
d9a64523
A
2056
2057int
2058bsdthread_ctl(struct proc *p, struct bsdthread_ctl_args *uap, int *retval)
2059{
2060 switch (uap->cmd) {
2061 case BSDTHREAD_CTL_QOS_OVERRIDE_START:
2062 return bsdthread_add_explicit_override(p, (mach_port_name_t)uap->arg1,
0a7de745 2063 (pthread_priority_t)uap->arg2, uap->arg3);
d9a64523
A
2064 case BSDTHREAD_CTL_QOS_OVERRIDE_END:
2065 ENSURE_UNUSED(uap->arg3);
2066 return bsdthread_remove_explicit_override(p, (mach_port_name_t)uap->arg1,
0a7de745 2067 (user_addr_t)uap->arg2);
d9a64523
A
2068
2069 case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH:
2070 return workq_thread_add_dispatch_override(p, (mach_port_name_t)uap->arg1,
0a7de745 2071 (pthread_priority_t)uap->arg2, uap->arg3);
d9a64523
A
2072 case BSDTHREAD_CTL_QOS_OVERRIDE_RESET:
2073 return workq_thread_reset_dispatch_override(p, current_thread());
2074
2075 case BSDTHREAD_CTL_SET_SELF:
2076 return bsdthread_set_self(p, current_thread(),
0a7de745
A
2077 (pthread_priority_t)uap->arg1, (mach_port_name_t)uap->arg2,
2078 (enum workq_set_self_flags)uap->arg3);
d9a64523
A
2079
2080 case BSDTHREAD_CTL_QOS_MAX_PARALLELISM:
2081 ENSURE_UNUSED(uap->arg3);
2082 return bsdthread_get_max_parallelism((thread_qos_t)uap->arg1,
0a7de745 2083 (unsigned long)uap->arg2, retval);
cb323159
A
2084 case BSDTHREAD_CTL_WORKQ_ALLOW_KILL:
2085 ENSURE_UNUSED(uap->arg2);
2086 ENSURE_UNUSED(uap->arg3);
2087 return workq_thread_allow_kill(p, current_thread(), (bool)uap->arg1);
d9a64523
A
2088
2089 case BSDTHREAD_CTL_SET_QOS:
2090 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD:
2091 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET:
2092 /* no longer supported */
2093 return ENOTSUP;
2094
2095 default:
2096 return EINVAL;
2097 }
2098}
2099
2100#pragma mark workqueue thread manipulation
2101
cb323159
A
2102static void __dead2
2103workq_unpark_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
2104 struct uthread *uth, uint32_t setup_flags);
2105
d9a64523
A
2106static void __dead2
2107workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
cb323159 2108 struct uthread *uth, uint32_t setup_flags);
d9a64523
A
2109
2110static void workq_setup_and_run(proc_t p, struct uthread *uth, int flags) __dead2;
2111
2112#if KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD
2113static inline uint64_t
2114workq_trace_req_id(workq_threadreq_t req)
2115{
2116 struct kqworkloop *kqwl;
cb323159
A
2117 if (req->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
2118 kqwl = __container_of(req, struct kqworkloop, kqwl_request);
d9a64523
A
2119 return kqwl->kqwl_dynamicid;
2120 }
2121
2122 return VM_KERNEL_ADDRHIDE(req);
2123}
2124#endif
2125
2126/**
2127 * Entry point for libdispatch to ask for threads
2128 */
2129static int
2130workq_reqthreads(struct proc *p, uint32_t reqcount, pthread_priority_t pp)
2131{
2132 thread_qos_t qos = _pthread_priority_thread_qos(pp);
2133 struct workqueue *wq = proc_get_wqptr(p);
2134 uint32_t unpaced, upcall_flags = WQ_FLAG_THREAD_NEWSPI;
2135
2136 if (wq == NULL || reqcount <= 0 || reqcount > UINT16_MAX ||
0a7de745 2137 qos == THREAD_QOS_UNSPECIFIED) {
d9a64523
A
2138 return EINVAL;
2139 }
2140
2141 WQ_TRACE_WQ(TRACE_wq_wqops_reqthreads | DBG_FUNC_NONE,
0a7de745 2142 wq, reqcount, pp, 0, 0);
d9a64523
A
2143
2144 workq_threadreq_t req = zalloc(workq_zone_threadreq);
2145 priority_queue_entry_init(&req->tr_entry);
cb323159 2146 req->tr_state = WORKQ_TR_STATE_NEW;
d9a64523
A
2147 req->tr_flags = 0;
2148 req->tr_qos = qos;
2149
2150 if (pp & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) {
cb323159 2151 req->tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
d9a64523
A
2152 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
2153 }
2154
2155 WQ_TRACE_WQ(TRACE_wq_thread_request_initiate | DBG_FUNC_NONE,
0a7de745 2156 wq, workq_trace_req_id(req), req->tr_qos, reqcount, 0);
d9a64523
A
2157
2158 workq_lock_spin(wq);
2159 do {
2160 if (_wq_exiting(wq)) {
2161 goto exiting;
2162 }
2163
2164 /*
2165 * When userspace is asking for parallelism, wake up to (reqcount - 1)
2166 * threads without pacing, to inform the scheduler of that workload.
2167 *
2168 * The last requests, or the ones that failed the admission checks, are
2169 * enqueued and go through the regular creator codepath.
2170 *
2171 * If there aren't enough threads, add one, but re-evaluate everything
2172 * as conditions may now have changed.
2173 */
cb323159 2174 if (reqcount > 1 && (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
d9a64523
A
2175 unpaced = workq_constrained_allowance(wq, qos, NULL, false);
2176 if (unpaced >= reqcount - 1) {
2177 unpaced = reqcount - 1;
2178 }
2179 } else {
2180 unpaced = reqcount - 1;
2181 }
2182
2183 /*
2184 * This path does not currently handle custom workloop parameters
2185 * when creating threads for parallelism.
2186 */
cb323159 2187 assert(!(req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS));
d9a64523
A
2188
2189 /*
2190 * This is a trimmed down version of workq_threadreq_bind_and_unlock()
2191 */
2192 while (unpaced > 0 && wq->wq_thidlecount) {
cb323159
A
2193 struct uthread *uth;
2194 bool needs_wakeup;
2195 uint8_t uu_flags = UT_WORKQ_EARLY_BOUND;
2196
2197 if (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
2198 uu_flags |= UT_WORKQ_OVERCOMMIT;
2199 }
2200
2201 uth = workq_pop_idle_thread(wq, uu_flags, &needs_wakeup);
d9a64523
A
2202
2203 _wq_thactive_inc(wq, qos);
2204 wq->wq_thscheduled_count[_wq_bucket(qos)]++;
cb323159 2205 workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
d9a64523
A
2206 wq->wq_fulfilled++;
2207
d9a64523
A
2208 uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags;
2209 uth->uu_save.uus_workq_park_data.thread_request = req;
cb323159
A
2210 if (needs_wakeup) {
2211 workq_thread_wakeup(uth);
2212 }
d9a64523
A
2213 unpaced--;
2214 reqcount--;
2215 }
2216 } while (unpaced && wq->wq_nthreads < wq_max_threads &&
0a7de745 2217 workq_add_new_idle_thread(p, wq));
d9a64523
A
2218
2219 if (_wq_exiting(wq)) {
2220 goto exiting;
2221 }
2222
2223 req->tr_count = reqcount;
2224 if (workq_threadreq_enqueue(wq, req)) {
2225 /* This can drop the workqueue lock, and take it again */
2226 workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
2227 }
2228 workq_unlock(wq);
2229 return 0;
2230
2231exiting:
2232 workq_unlock(wq);
2233 zfree(workq_zone_threadreq, req);
2234 return ECANCELED;
2235}
2236
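/*
 * Illustrative userspace sketch (the "__workq_kernreturn" stub name and
 * prototype are assumptions about the libpthread side; the argument meaning
 * comes from the WQOPS_QUEUE_REQTHREADS case in workq_kernreturn() below):
 * this is roughly how libdispatch asks for `n` threads at an encoded
 * pthread priority.
 */
#if 0 /* illustrative userspace sketch */
extern int __workq_kernreturn(int options, void *item, int affinity, int prio);

static int
example_request_threads(int n, int encoded_pthread_priority)
{
	/* arg2 = number of threads to start, arg3 = pthread_priority_t */
	return __workq_kernreturn(WQOPS_QUEUE_REQTHREADS, NULL, n,
	    encoded_pthread_priority);
}
#endif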
2237bool
cb323159
A
2238workq_kern_threadreq_initiate(struct proc *p, workq_threadreq_t req,
2239 struct turnstile *workloop_ts, thread_qos_t qos,
2240 workq_kern_threadreq_flags_t flags)
d9a64523
A
2241{
2242 struct workqueue *wq = proc_get_wqptr_fast(p);
d9a64523 2243 struct uthread *uth = NULL;
d9a64523 2244
cb323159 2245 assert(req->tr_flags & (WORKQ_TR_FLAG_WORKLOOP | WORKQ_TR_FLAG_KEVENT));
d9a64523 2246
cb323159 2247 if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
d9a64523 2248 workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(req);
cb323159
A
2249 qos = thread_workq_qos_for_pri(trp.trp_pri);
2250 if (qos == THREAD_QOS_UNSPECIFIED) {
2251 qos = WORKQ_THREAD_QOS_ABOVEUI;
d9a64523 2252 }
d9a64523
A
2253 }
2254
cb323159 2255 assert(req->tr_state == WORKQ_TR_STATE_IDLE);
d9a64523
A
2256 priority_queue_entry_init(&req->tr_entry);
2257 req->tr_count = 1;
cb323159 2258 req->tr_state = WORKQ_TR_STATE_NEW;
d9a64523
A
2259 req->tr_qos = qos;
2260
2261 WQ_TRACE_WQ(TRACE_wq_thread_request_initiate | DBG_FUNC_NONE, wq,
0a7de745 2262 workq_trace_req_id(req), qos, 1, 0);
d9a64523
A
2263
2264 if (flags & WORKQ_THREADREQ_ATTEMPT_REBIND) {
2265 /*
2266 * we're called back synchronously from the context of
2267 * kqueue_threadreq_unbind from within workq_thread_return(),
2268 * so we can try to match up this thread with this request!
2269 */
2270 uth = current_uthread();
2271 assert(uth->uu_kqr_bound == NULL);
2272 }
2273
2274 workq_lock_spin(wq);
2275 if (_wq_exiting(wq)) {
cb323159 2276 req->tr_state = WORKQ_TR_STATE_IDLE;
d9a64523
A
2277 workq_unlock(wq);
2278 return false;
2279 }
2280
2281 if (uth && workq_threadreq_admissible(wq, uth, req)) {
2282 assert(uth != wq->wq_creator);
cb323159
A
2283 if (uth->uu_workq_pri.qos_bucket != req->tr_qos) {
2284 _wq_thactive_move(wq, uth->uu_workq_pri.qos_bucket, req->tr_qos);
2285 workq_thread_reset_pri(wq, uth, req, /*unpark*/ false);
2286 }
2287 /*
2288 * We're called from workq_kern_threadreq_initiate()
2289 * due to an unbind, with the kq req held.
2290 */
2291 WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
2292 workq_trace_req_id(req), 0, 0, 0);
2293 wq->wq_fulfilled++;
2294 kqueue_threadreq_bind(p, req, uth->uu_thread, 0);
d9a64523
A
2295 } else {
2296 if (workloop_ts) {
2297 workq_perform_turnstile_operation_locked(wq, ^{
2298 turnstile_update_inheritor(workloop_ts, wq->wq_turnstile,
0a7de745 2299 TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
d9a64523 2300 turnstile_update_inheritor_complete(workloop_ts,
0a7de745 2301 TURNSTILE_INTERLOCK_HELD);
d9a64523
A
2302 });
2303 }
2304 if (workq_threadreq_enqueue(wq, req)) {
2305 workq_schedule_creator(p, wq, flags);
2306 }
d9a64523
A
2307 }
2308
cb323159
A
2309 workq_unlock(wq);
2310
d9a64523
A
2311 return true;
2312}
2313
2314void
cb323159
A
2315workq_kern_threadreq_modify(struct proc *p, workq_threadreq_t req,
2316 thread_qos_t qos, workq_kern_threadreq_flags_t flags)
d9a64523
A
2317{
2318 struct workqueue *wq = proc_get_wqptr_fast(p);
cb323159 2319 bool make_overcommit = false;
d9a64523 2320
cb323159 2321 if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
d9a64523
A
2322 /* Requests outside-of-QoS shouldn't accept modify operations */
2323 return;
2324 }
2325
2326 workq_lock_spin(wq);
2327
2328 assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER);
cb323159 2329 assert(req->tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP));
d9a64523 2330
cb323159
A
2331 if (req->tr_state == WORKQ_TR_STATE_BINDING) {
2332 kqueue_threadreq_bind(p, req, req->tr_thread, 0);
d9a64523
A
2333 workq_unlock(wq);
2334 return;
2335 }
2336
cb323159
A
2337 if (flags & WORKQ_THREADREQ_MAKE_OVERCOMMIT) {
2338 make_overcommit = (req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0;
2339 }
d9a64523 2340
cb323159 2341 if (_wq_exiting(wq) || (req->tr_qos == qos && !make_overcommit)) {
d9a64523
A
2342 workq_unlock(wq);
2343 return;
2344 }
2345
2346 assert(req->tr_count == 1);
cb323159 2347 if (req->tr_state != WORKQ_TR_STATE_QUEUED) {
d9a64523
A
2348 panic("Invalid thread request (%p) state %d", req, req->tr_state);
2349 }
2350
2351 WQ_TRACE_WQ(TRACE_wq_thread_request_modify | DBG_FUNC_NONE, wq,
0a7de745 2352 workq_trace_req_id(req), qos, 0, 0);
d9a64523
A
2353
2354 struct priority_queue *pq = workq_priority_queue_for_req(wq, req);
2355 workq_threadreq_t req_max;
2356
2357 /*
2358 * Stage 1: Dequeue the request from its priority queue.
2359 *
2360 * If we dequeue the root item of the constrained priority queue,
2361 * maintain the best constrained request qos invariant.
2362 */
2363 if (priority_queue_remove(pq, &req->tr_entry,
0a7de745 2364 PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
cb323159 2365 if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
d9a64523
A
2366 _wq_thactive_refresh_best_constrained_req_qos(wq);
2367 }
2368 }
2369
2370 /*
2371 * Stage 2: Apply changes to the thread request
2372 *
2373 * If the item will not become the root of the priority queue it belongs to,
2374 * then we need to wait in line, just enqueue and return quickly.
2375 */
cb323159
A
2376 if (__improbable(make_overcommit)) {
2377 req->tr_flags ^= WORKQ_TR_FLAG_OVERCOMMIT;
d9a64523
A
2378 pq = workq_priority_queue_for_req(wq, req);
2379 }
2380 req->tr_qos = qos;
2381
2382 req_max = priority_queue_max(pq, struct workq_threadreq_s, tr_entry);
2383 if (req_max && req_max->tr_qos >= qos) {
2384 priority_queue_insert(pq, &req->tr_entry, workq_priority_for_req(req),
0a7de745 2385 PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
d9a64523
A
2386 workq_unlock(wq);
2387 return;
2388 }
2389
2390 /*
2391 * Stage 3: Reevaluate whether we should run the thread request.
2392 *
2393 * Pretend the thread request is new again:
2394 * - adjust wq_reqcount to not count it anymore.
cb323159 2395 * - make its state WORKQ_TR_STATE_NEW (so that workq_threadreq_bind_and_unlock
d9a64523
A
2396 * properly attempts a synchronous bind)
2397 */
2398 wq->wq_reqcount--;
cb323159 2399 req->tr_state = WORKQ_TR_STATE_NEW;
d9a64523
A
2400 if (workq_threadreq_enqueue(wq, req)) {
2401 workq_schedule_creator(p, wq, flags);
2402 }
2403 workq_unlock(wq);
2404}
2405
2406void
2407workq_kern_threadreq_lock(struct proc *p)
2408{
2409 workq_lock_spin(proc_get_wqptr_fast(p));
2410}
2411
2412void
2413workq_kern_threadreq_unlock(struct proc *p)
2414{
2415 workq_unlock(proc_get_wqptr_fast(p));
2416}
2417
2418void
cb323159 2419workq_kern_threadreq_update_inheritor(struct proc *p, workq_threadreq_t req,
0a7de745
A
2420 thread_t owner, struct turnstile *wl_ts,
2421 turnstile_update_flags_t flags)
d9a64523
A
2422{
2423 struct workqueue *wq = proc_get_wqptr_fast(p);
d9a64523
A
2424 turnstile_inheritor_t inheritor;
2425
2426 assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER);
cb323159 2427 assert(req->tr_flags & WORKQ_TR_FLAG_WORKLOOP);
d9a64523
A
2428 workq_lock_held(wq);
2429
cb323159
A
2430 if (req->tr_state == WORKQ_TR_STATE_BINDING) {
2431 kqueue_threadreq_bind(p, req, req->tr_thread,
0a7de745 2432 KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE);
d9a64523
A
2433 return;
2434 }
2435
2436 if (_wq_exiting(wq)) {
2437 inheritor = TURNSTILE_INHERITOR_NULL;
2438 } else {
cb323159 2439 if (req->tr_state != WORKQ_TR_STATE_QUEUED) {
d9a64523
A
2440 panic("Invalid thread request (%p) state %d", req, req->tr_state);
2441 }
2442
2443 if (owner) {
2444 inheritor = owner;
2445 flags |= TURNSTILE_INHERITOR_THREAD;
2446 } else {
2447 inheritor = wq->wq_turnstile;
2448 flags |= TURNSTILE_INHERITOR_TURNSTILE;
2449 }
2450 }
2451
2452 workq_perform_turnstile_operation_locked(wq, ^{
2453 turnstile_update_inheritor(wl_ts, inheritor, flags);
2454 });
2455}
2456
2457void
cb323159 2458workq_kern_threadreq_redrive(struct proc *p, workq_kern_threadreq_flags_t flags)
d9a64523
A
2459{
2460 struct workqueue *wq = proc_get_wqptr_fast(p);
2461
2462 workq_lock_spin(wq);
2463 workq_schedule_creator(p, wq, flags);
2464 workq_unlock(wq);
2465}
2466
2467void
2468workq_schedule_creator_turnstile_redrive(struct workqueue *wq, bool locked)
2469{
cb323159
A
2470 if (locked) {
2471 workq_schedule_creator(NULL, wq, WORKQ_THREADREQ_NONE);
2472 } else {
2473 workq_schedule_immediate_thread_creation(wq);
0a7de745 2474 }
d9a64523
A
2475}
2476
2477static int
2478workq_thread_return(struct proc *p, struct workq_kernreturn_args *uap,
0a7de745 2479 struct workqueue *wq)
d9a64523
A
2480{
2481 thread_t th = current_thread();
2482 struct uthread *uth = get_bsdthread_info(th);
cb323159 2483 workq_threadreq_t kqr = uth->uu_kqr_bound;
d9a64523
A
2484 workq_threadreq_param_t trp = { };
2485 int nevents = uap->affinity, error;
2486 user_addr_t eventlist = uap->item;
2487
2488 if (((thread_get_tag(th) & THREAD_TAG_WORKQUEUE) == 0) ||
0a7de745 2489 (uth->uu_workq_flags & UT_WORKQ_DYING)) {
d9a64523
A
2490 return EINVAL;
2491 }
2492
2493 if (eventlist && nevents && kqr == NULL) {
2494 return EINVAL;
2495 }
2496
2497 /* reset signal mask on the workqueue thread to default state */
2498 if (uth->uu_sigmask != (sigset_t)(~workq_threadmask)) {
2499 proc_lock(p);
2500 uth->uu_sigmask = ~workq_threadmask;
2501 proc_unlock(p);
2502 }
2503
cb323159 2504 if (kqr && kqr->tr_flags & WORKQ_TR_FLAG_WL_PARAMS) {
d9a64523
A
2505 /*
2506 * Ensure we store the threadreq param before unbinding
2507 * the kqr from this thread.
2508 */
cb323159 2509 trp = kqueue_threadreq_workloop_param(kqr);
d9a64523
A
2510 }
2511
cb323159
A
2512 /*
2513 * Freeze the base pri while we decide the fate of this thread.
2514 *
2515 * Either:
2516 * - we return to user and kevent_cleanup will have unfrozen the base pri,
2517 * - or we proceed to workq_select_threadreq_or_park_and_unlock() who will.
2518 */
2519 thread_freeze_base_pri(th);
2520
d9a64523
A
2521 if (kqr) {
2522 uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI | WQ_FLAG_THREAD_REUSE;
cb323159 2523 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
d9a64523
A
2524 upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
2525 } else {
2526 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
2527 }
2528 if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
2529 upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
2530 } else {
2531 if (uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) {
2532 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
2533 }
2534 if (uth->uu_workq_flags & UT_WORKQ_OUTSIDE_QOS) {
2535 upcall_flags |= WQ_FLAG_THREAD_OUTSIDEQOS;
2536 } else {
2537 upcall_flags |= uth->uu_workq_pri.qos_req |
0a7de745 2538 WQ_FLAG_THREAD_PRIO_QOS;
d9a64523
A
2539 }
2540 }
2541
2542 error = pthread_functions->workq_handle_stack_events(p, th,
0a7de745
A
2543 get_task_map(p->task), uth->uu_workq_stackaddr,
2544 uth->uu_workq_thport, eventlist, nevents, upcall_flags);
2545 if (error) {
cb323159 2546 assert(uth->uu_kqr_bound == kqr);
0a7de745
A
2547 return error;
2548 }
d9a64523
A
2549
2550 // pthread is supposed to pass KEVENT_FLAG_PARKING here
2551 // which should cause the above call to either:
2552 // - not return
2553 // - return an error
2554 // - return 0 and have unbound properly
2555 assert(uth->uu_kqr_bound == NULL);
2556 }
2557
2558 WQ_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_END, wq, uap->options, 0, 0, 0);
2559
2560 thread_sched_call(th, NULL);
2561 thread_will_park_or_terminate(th);
2562#if CONFIG_WORKLOOP_DEBUG
2563 UU_KEVENT_HISTORY_WRITE_ENTRY(uth, { .uu_error = -1, });
2564#endif
2565
2566 workq_lock_spin(wq);
2567 WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0, 0);
2568 uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value;
cb323159
A
2569 workq_select_threadreq_or_park_and_unlock(p, wq, uth,
2570 WQ_SETUP_CLEAR_VOUCHER);
d9a64523
A
2571 __builtin_unreachable();
2572}
2573
2574/**
2575 * Multiplexed call to interact with the workqueue mechanism
2576 */
2577int
2578workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, int32_t *retval)
2579{
2580 int options = uap->options;
2581 int arg2 = uap->affinity;
2582 int arg3 = uap->prio;
2583 struct workqueue *wq = proc_get_wqptr(p);
2584 int error = 0;
2585
2586 if ((p->p_lflag & P_LREGISTER) == 0) {
2587 return EINVAL;
2588 }
2589
2590 switch (options) {
2591 case WQOPS_QUEUE_NEWSPISUPP: {
2592 /*
2593 * arg2 = offset of serialno into dispatch queue
2594 * arg3 = kevent support
2595 */
2596 int offset = arg2;
0a7de745 2597 if (arg3 & 0x01) {
d9a64523
A
2598 // If we get here, then userspace has indicated support for kevent delivery.
2599 }
2600
2601 p->p_dispatchqueue_serialno_offset = (uint64_t)offset;
2602 break;
2603 }
2604 case WQOPS_QUEUE_REQTHREADS: {
2605 /*
2606 * arg2 = number of threads to start
2607 * arg3 = priority
2608 */
2609 error = workq_reqthreads(p, arg2, arg3);
2610 break;
2611 }
2612 case WQOPS_SET_EVENT_MANAGER_PRIORITY: {
2613 /*
2614 * arg2 = priority for the manager thread
2615 *
2616 * if _PTHREAD_PRIORITY_SCHED_PRI_FLAG is set,
2617 * the low bits of the value contain a scheduling priority
2618 * instead of a QOS value
2619 */
2620 pthread_priority_t pri = arg2;
2621
2622 if (wq == NULL) {
2623 error = EINVAL;
2624 break;
2625 }
2626
2627 /*
2628 * Normalize the incoming priority so that it is ordered numerically.
2629 */
2630 if (pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) {
2631 pri &= (_PTHREAD_PRIORITY_SCHED_PRI_MASK |
0a7de745 2632 _PTHREAD_PRIORITY_SCHED_PRI_FLAG);
d9a64523
A
2633 } else {
2634 thread_qos_t qos = _pthread_priority_thread_qos(pri);
2635 int relpri = _pthread_priority_relpri(pri);
2636 if (relpri > 0 || relpri < THREAD_QOS_MIN_TIER_IMPORTANCE ||
0a7de745 2637 qos == THREAD_QOS_UNSPECIFIED) {
d9a64523
A
2638 error = EINVAL;
2639 break;
2640 }
2641 pri &= ~_PTHREAD_PRIORITY_FLAGS_MASK;
2642 }
2643
2644 /*
2645 * If userspace passes a scheduling priority, that wins over any QoS.
2646 * Userspace should take care not to lower the priority this way.
2647 */
2648 workq_lock_spin(wq);
2649 if (wq->wq_event_manager_priority < (uint32_t)pri) {
2650 wq->wq_event_manager_priority = (uint32_t)pri;
2651 }
2652 workq_unlock(wq);
2653 break;
2654 }
2655 case WQOPS_THREAD_KEVENT_RETURN:
2656 case WQOPS_THREAD_WORKLOOP_RETURN:
2657 case WQOPS_THREAD_RETURN: {
2658 error = workq_thread_return(p, uap, wq);
2659 break;
2660 }
2661
2662 case WQOPS_SHOULD_NARROW: {
2663 /*
2664 * arg2 = priority to test
2665 * arg3 = unused
2666 */
2667 thread_t th = current_thread();
2668 struct uthread *uth = get_bsdthread_info(th);
2669 if (((thread_get_tag(th) & THREAD_TAG_WORKQUEUE) == 0) ||
0a7de745 2670 (uth->uu_workq_flags & (UT_WORKQ_DYING | UT_WORKQ_OVERCOMMIT))) {
d9a64523
A
2671 error = EINVAL;
2672 break;
2673 }
2674
2675 thread_qos_t qos = _pthread_priority_thread_qos(arg2);
2676 if (qos == THREAD_QOS_UNSPECIFIED) {
2677 error = EINVAL;
2678 break;
2679 }
2680 workq_lock_spin(wq);
2681 bool should_narrow = !workq_constrained_allowance(wq, qos, uth, false);
2682 workq_unlock(wq);
2683
2684 *retval = should_narrow;
2685 break;
2686 }
cb323159
A
2687 case WQOPS_SETUP_DISPATCH: {
2688 /*
2689 * item = pointer to workq_dispatch_config structure
2690 * arg2 = sizeof(item)
2691 */
2692 struct workq_dispatch_config cfg;
2693 bzero(&cfg, sizeof(cfg));
2694
2695 error = copyin(uap->item, &cfg, MIN(sizeof(cfg), (unsigned long) arg2));
2696 if (error) {
2697 break;
2698 }
2699
2700 if (cfg.wdc_flags & ~WORKQ_DISPATCH_SUPPORTED_FLAGS ||
2701 cfg.wdc_version < WORKQ_DISPATCH_MIN_SUPPORTED_VERSION) {
2702 error = ENOTSUP;
2703 break;
2704 }
2705
2706 /* Load fields from version 1 */
2707 p->p_dispatchqueue_serialno_offset = cfg.wdc_queue_serialno_offs;
2708
2709 /* Load fields from version 2 */
2710 if (cfg.wdc_version >= 2) {
2711 p->p_dispatchqueue_label_offset = cfg.wdc_queue_label_offs;
2712 }
2713
2714 break;
2715 }
d9a64523
A
2716 default:
2717 error = EINVAL;
2718 break;
2719 }
2720
0a7de745 2721 return error;
d9a64523
A
2722}
2723
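/*
 * Illustrative userspace sketch of the WQOPS_SETUP_DISPATCH payload handled
 * above.  The field names come from the copied-in struct workq_dispatch_config;
 * the offsets and the "__workq_kernreturn" stub prototype are hypothetical.
 */
#if 0 /* illustrative userspace sketch */
extern int __workq_kernreturn(int options, void *item, int affinity, int prio);

static int
example_setup_dispatch(void)
{
	struct workq_dispatch_config cfg = {
		.wdc_version = 2,                /* >= min supported version */
		.wdc_flags = 0,                  /* must be a supported subset */
		.wdc_queue_serialno_offs = 16,   /* hypothetical offset */
		.wdc_queue_label_offs = 24,      /* hypothetical offset, v2+ */
	};
	/* arg2 carries sizeof(cfg) so shorter/longer structs can be handled */
	return __workq_kernreturn(WQOPS_SETUP_DISPATCH, &cfg, (int)sizeof(cfg), 0);
}
#endif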
2724/*
2725 * We have no work to do, park ourselves on the idle list.
2726 *
2727 * Consumes the workqueue lock and does not return.
2728 */
2729__attribute__((noreturn, noinline))
2730static void
cb323159
A
2731workq_park_and_unlock(proc_t p, struct workqueue *wq, struct uthread *uth,
2732 uint32_t setup_flags)
d9a64523
A
2733{
2734 assert(uth == current_uthread());
2735 assert(uth->uu_kqr_bound == NULL);
cb323159 2736 workq_push_idle_thread(p, wq, uth, setup_flags); // may not return
d9a64523
A
2737
2738 workq_thread_reset_cpupercent(NULL, uth);
2739
cb323159
A
2740 if ((uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) &&
2741 !(uth->uu_workq_flags & UT_WORKQ_DYING)) {
d9a64523
A
2742 workq_unlock(wq);
2743
2744 /*
2745 * workq_push_idle_thread() will unset `has_stack`
2746 * if it wants us to free the stack before parking.
2747 */
2748 if (!uth->uu_save.uus_workq_park_data.has_stack) {
2749 pthread_functions->workq_markfree_threadstack(p, uth->uu_thread,
0a7de745 2750 get_task_map(p->task), uth->uu_workq_stackaddr);
d9a64523
A
2751 }
2752
2753 /*
2754 * When we remove the voucher from the thread, we may lose our importance
2755 * causing us to get preempted, so we do this after putting the thread on
2756 * the idle list. Then, when we get our importance back we'll be able to
2757 * use this thread from e.g. the kevent call out to deliver a boosting
2758 * message.
2759 */
2760 __assert_only kern_return_t kr;
2761 kr = thread_set_voucher_name(MACH_PORT_NULL);
2762 assert(kr == KERN_SUCCESS);
2763
2764 workq_lock_spin(wq);
2765 uth->uu_workq_flags &= ~UT_WORKQ_IDLE_CLEANUP;
cb323159 2766 setup_flags &= ~WQ_SETUP_CLEAR_VOUCHER;
d9a64523
A
2767 }
2768
2769 if (uth->uu_workq_flags & UT_WORKQ_RUNNING) {
2770 /*
2771 * While we'd dropped the lock to unset our voucher, someone came
2772 * around and made us runnable. But because we weren't waiting on the
2773 * event their thread_wakeup() was ineffectual. To correct for that,
2774 * we just run the continuation ourselves.
2775 */
2776 WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0, 0);
cb323159 2777 workq_unpark_select_threadreq_or_park_and_unlock(p, wq, uth, setup_flags);
d9a64523
A
2778 __builtin_unreachable();
2779 }
2780
2781 if (uth->uu_workq_flags & UT_WORKQ_DYING) {
2782 workq_unpark_for_death_and_unlock(p, wq, uth,
cb323159 2783 WORKQ_UNPARK_FOR_DEATH_WAS_IDLE, setup_flags);
d9a64523
A
2784 __builtin_unreachable();
2785 }
2786
2787 thread_set_pending_block_hint(uth->uu_thread, kThreadWaitParkedWorkQueue);
2788 assert_wait(workq_parked_wait_event(uth), THREAD_INTERRUPTIBLE);
2789 workq_unlock(wq);
2790 WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0, 0);
2791 thread_block(workq_unpark_continue);
2792 __builtin_unreachable();
2793}
2794
2795static inline bool
2796workq_may_start_event_mgr_thread(struct workqueue *wq, struct uthread *uth)
2797{
2798 /*
2799 * There's an event manager request and either:
2800 * - no event manager currently running
2801 * - we are re-using the event manager
2802 */
2803 return wq->wq_thscheduled_count[_wq_bucket(WORKQ_THREAD_QOS_MANAGER)] == 0 ||
0a7de745 2804 (uth && uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER);
d9a64523
A
2805}
2806
2807static uint32_t
2808workq_constrained_allowance(struct workqueue *wq, thread_qos_t at_qos,
0a7de745 2809 struct uthread *uth, bool may_start_timer)
d9a64523
A
2810{
2811 assert(at_qos != WORKQ_THREAD_QOS_MANAGER);
2812 uint32_t count = 0;
2813
2814 uint32_t max_count = wq->wq_constrained_threads_scheduled;
2815 if (uth && (uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) {
2816 /*
2817 * don't count the current thread as scheduled
2818 */
2819 assert(max_count > 0);
2820 max_count--;
2821 }
2822 if (max_count >= wq_max_constrained_threads) {
2823 WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 1,
0a7de745
A
2824 wq->wq_constrained_threads_scheduled,
2825 wq_max_constrained_threads, 0);
d9a64523
A
2826 /*
2827 * we need 1 or more constrained threads to return to the kernel before
2828 * we can dispatch additional work
2829 */
2830 return 0;
2831 }
2832 max_count -= wq_max_constrained_threads;
2833
2834 /*
2835 * Compute a metric for how many threads are active. We find the
2836 * highest priority request outstanding and then add up the number of
2837 * active threads in that and all higher-priority buckets. We'll also add
2838 * any "busy" threads which are not active but blocked recently enough that
2839 * we can't be sure they've gone idle yet. We'll then compare this metric
2840 * to our max concurrency to decide whether to add a new thread.
2841 */
2842
2843 uint32_t busycount, thactive_count;
2844
2845 thactive_count = _wq_thactive_aggregate_downto_qos(wq, _wq_thactive(wq),
0a7de745 2846 at_qos, &busycount, NULL);
d9a64523
A
2847
2848 if (uth && uth->uu_workq_pri.qos_bucket != WORKQ_THREAD_QOS_MANAGER &&
0a7de745 2849 at_qos <= uth->uu_workq_pri.qos_bucket) {
d9a64523
A
2850 /*
2851 * Don't count this thread as currently active, but only if it's not
2852 * a manager thread, as _wq_thactive_aggregate_downto_qos ignores active
2853 * managers.
2854 */
2855 assert(thactive_count > 0);
2856 thactive_count--;
2857 }
2858
2859 count = wq_max_parallelism[_wq_bucket(at_qos)];
2860 if (count > thactive_count + busycount) {
2861 count -= thactive_count + busycount;
2862 WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 2,
0a7de745 2863 thactive_count, busycount, 0);
d9a64523
A
2864 return MIN(count, max_count);
2865 } else {
2866 WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 3,
0a7de745 2867 thactive_count, busycount, 0);
d9a64523
A
2868 }
2869
2870 if (busycount && may_start_timer) {
2871 /*
2872 * If this is called from the add timer, we won't have another timer
2873 * fire when the thread exits the "busy" state, so rearm the timer.
2874 */
2875 workq_schedule_delayed_thread_creation(wq, 0);
2876 }
2877
2878 return 0;
2879}
2880
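/*
 * Minimal sketch of the core admission arithmetic above, under stated
 * simplifications (no "don't count myself" adjustment, no busy-timer rearm):
 * a constrained request at some QoS is only admitted while the active plus
 * recently-busy threads at that QoS and above leave headroom against the
 * allowed parallelism for that bucket.
 */
#if 0 /* illustrative sketch */
static uint32_t
example_admission_headroom(uint32_t thactive_count, uint32_t busycount,
    uint32_t max_parallelism)
{
	if (max_parallelism > thactive_count + busycount) {
		return max_parallelism - (thactive_count + busycount);
	}
	return 0;
}
#endif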
2881static bool
2882workq_threadreq_admissible(struct workqueue *wq, struct uthread *uth,
0a7de745 2883 workq_threadreq_t req)
d9a64523
A
2884{
2885 if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
2886 return workq_may_start_event_mgr_thread(wq, uth);
2887 }
cb323159 2888 if ((req->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) == 0) {
d9a64523
A
2889 return workq_constrained_allowance(wq, req->tr_qos, uth, true);
2890 }
2891 return true;
2892}
2893
2894static workq_threadreq_t
2895workq_threadreq_select_for_creator(struct workqueue *wq)
2896{
2897 workq_threadreq_t req_qos, req_pri, req_tmp;
2898 thread_qos_t qos = THREAD_QOS_UNSPECIFIED;
2899 uint8_t pri = 0;
2900
2901 req_tmp = wq->wq_event_manager_threadreq;
2902 if (req_tmp && workq_may_start_event_mgr_thread(wq, NULL)) {
2903 return req_tmp;
2904 }
2905
2906 /*
2907 * Compute the best priority request, and ignore the turnstile for now
2908 */
2909
2910 req_pri = priority_queue_max(&wq->wq_special_queue,
0a7de745 2911 struct workq_threadreq_s, tr_entry);
d9a64523
A
2912 if (req_pri) {
2913 pri = priority_queue_entry_key(&wq->wq_special_queue, &req_pri->tr_entry);
2914 }
2915
2916 /*
2917 * Compute the best QoS Request, and check whether it beats the "pri" one
2918 */
2919
2920 req_qos = priority_queue_max(&wq->wq_overcommit_queue,
0a7de745 2921 struct workq_threadreq_s, tr_entry);
d9a64523
A
2922 if (req_qos) {
2923 qos = req_qos->tr_qos;
2924 }
2925
2926 req_tmp = priority_queue_max(&wq->wq_constrained_queue,
0a7de745 2927 struct workq_threadreq_s, tr_entry);
d9a64523
A
2928
2929 if (req_tmp && qos < req_tmp->tr_qos) {
2930 if (pri && pri >= thread_workq_pri_for_qos(req_tmp->tr_qos)) {
2931 return req_pri;
2932 }
2933
2934 if (workq_constrained_allowance(wq, req_tmp->tr_qos, NULL, true)) {
2935 /*
2936 * If the constrained thread request is the best one and passes
2937 * the admission check, pick it.
2938 */
2939 return req_tmp;
2940 }
2941 }
2942
2943 if (pri && (!qos || pri >= thread_workq_pri_for_qos(qos))) {
2944 return req_pri;
2945 }
2946
2947 if (req_qos) {
2948 return req_qos;
2949 }
2950
2951 /*
2952 * If we had no eligible request but we have a turnstile push,
2953 * it must be a non overcommit thread request that failed
2954 * the admission check.
2955 *
2956 * Just fake a BG thread request so that if the push stops, the creator
2957 * priority just drops to 4.
2958 */
2959 if (turnstile_workq_proprietor_of_max_turnstile(wq->wq_turnstile, NULL)) {
2960 static struct workq_threadreq_s workq_sync_push_fake_req = {
2961 .tr_qos = THREAD_QOS_BACKGROUND,
2962 };
2963
2964 return &workq_sync_push_fake_req;
2965 }
2966
2967 return NULL;
2968}
2969
2970static workq_threadreq_t
2971workq_threadreq_select(struct workqueue *wq, struct uthread *uth)
2972{
2973 workq_threadreq_t req_qos, req_pri, req_tmp;
2974 uintptr_t proprietor;
2975 thread_qos_t qos = THREAD_QOS_UNSPECIFIED;
2976 uint8_t pri = 0;
2977
0a7de745
A
2978 if (uth == wq->wq_creator) {
2979 uth = NULL;
2980 }
d9a64523
A
2981
2982 req_tmp = wq->wq_event_manager_threadreq;
2983 if (req_tmp && workq_may_start_event_mgr_thread(wq, uth)) {
2984 return req_tmp;
2985 }
2986
2987 /*
2988 * Compute the best priority request (special or turnstile)
2989 */
2990
2991 pri = turnstile_workq_proprietor_of_max_turnstile(wq->wq_turnstile,
0a7de745 2992 &proprietor);
d9a64523
A
2993 if (pri) {
2994 struct kqworkloop *kqwl = (struct kqworkloop *)proprietor;
cb323159
A
2995 req_pri = &kqwl->kqwl_request;
2996 if (req_pri->tr_state != WORKQ_TR_STATE_QUEUED) {
d9a64523 2997 panic("Invalid thread request (%p) state %d",
0a7de745 2998 req_pri, req_pri->tr_state);
d9a64523
A
2999 }
3000 } else {
3001 req_pri = NULL;
3002 }
3003
3004 req_tmp = priority_queue_max(&wq->wq_special_queue,
0a7de745 3005 struct workq_threadreq_s, tr_entry);
d9a64523 3006 if (req_tmp && pri < priority_queue_entry_key(&wq->wq_special_queue,
0a7de745 3007 &req_tmp->tr_entry)) {
d9a64523
A
3008 req_pri = req_tmp;
3009 pri = priority_queue_entry_key(&wq->wq_special_queue, &req_tmp->tr_entry);
3010 }
3011
3012 /*
3013 * Compute the best QoS Request, and check whether it beats the "pri" one
3014 */
3015
3016 req_qos = priority_queue_max(&wq->wq_overcommit_queue,
0a7de745 3017 struct workq_threadreq_s, tr_entry);
d9a64523
A
3018 if (req_qos) {
3019 qos = req_qos->tr_qos;
3020 }
3021
3022 req_tmp = priority_queue_max(&wq->wq_constrained_queue,
0a7de745 3023 struct workq_threadreq_s, tr_entry);
d9a64523
A
3024
3025 if (req_tmp && qos < req_tmp->tr_qos) {
3026 if (pri && pri >= thread_workq_pri_for_qos(req_tmp->tr_qos)) {
3027 return req_pri;
3028 }
3029
3030 if (workq_constrained_allowance(wq, req_tmp->tr_qos, uth, true)) {
3031 /*
3032 * If the constrained thread request is the best one and passes
3033 * the admission check, pick it.
3034 */
3035 return req_tmp;
3036 }
3037 }
3038
3039 if (req_pri && (!qos || pri >= thread_workq_pri_for_qos(qos))) {
3040 return req_pri;
3041 }
3042
3043 return req_qos;
3044}
3045
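/*
 * Sketch of the comparison rule shared by the two selection functions above:
 * a raw scheduler-priority request only beats a QoS request when its priority
 * is at least the priority that QoS class maps to.
 */
#if 0 /* illustrative sketch */
static bool
example_pri_request_wins(uint8_t pri, thread_qos_t qos)
{
	if (pri == 0) {
		return false;   /* no raw-priority request in play */
	}
	if (qos == THREAD_QOS_UNSPECIFIED) {
		return true;    /* nothing on the QoS side to compete with */
	}
	return pri >= thread_workq_pri_for_qos(qos);
}
#endif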
3046/*
3047 * The creator is an anonymous thread that is counted as scheduled but
3048 * otherwise has no scheduler callback set and is not tracked as active;
3049 * it is used to make other threads.
3050 *
3051 * When more requests are added or an existing one is hurried along,
3052 * a creator is elected and setup, or the existing one overridden accordingly.
3053 *
3054 * While this creator is in flight, because no request has been dequeued,
3055 * already running threads have a chance at stealing thread requests, avoiding
3056 * useless context switches, and the creator once scheduled may not find any
3057 * work to do and will then just park again.
3058 *
3059 * The creator serves the dual purpose of informing the scheduler of work that
3060 * hasn't been materialized as threads yet, and also as a natural pacing mechanism
3061 * for thread creation.
3062 *
3063 * By being anonymous (and not bound to anything) it means that thread requests
3064 * can be stolen from this creator by threads already on core yielding more
3065 * efficient scheduling and reduced context switches.
3066 */
3067static void
cb323159
A
3068workq_schedule_creator(proc_t p, struct workqueue *wq,
3069 workq_kern_threadreq_flags_t flags)
d9a64523
A
3070{
3071 workq_threadreq_t req;
3072 struct uthread *uth;
cb323159 3073 bool needs_wakeup;
d9a64523
A
3074
3075 workq_lock_held(wq);
3076 assert(p || (flags & WORKQ_THREADREQ_CAN_CREATE_THREADS) == 0);
3077
3078again:
3079 uth = wq->wq_creator;
3080
3081 if (!wq->wq_reqcount) {
cb323159
A
3082 /*
3083 * There is no thread request left.
3084 *
3085 * If there is a creator, leave everything in place, so that it cleans
3086 * up itself in workq_push_idle_thread().
3087 *
3088 * Else, make sure the turnstile state is reset to no inheritor.
3089 */
d9a64523
A
3090 if (uth == NULL) {
3091 workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
3092 }
3093 return;
3094 }
3095
3096 req = workq_threadreq_select_for_creator(wq);
3097 if (req == NULL) {
cb323159
A
3098 /*
3099 * There isn't a thread request that passes the admission check.
3100 *
3101 * If there is a creator, do not touch anything, the creator will sort
3102 * it out when it runs.
3103 *
3104 * Else, set the inheritor to "WORKQ" so that the turnstile propagation
3105 * code calls us if anything changes.
3106 */
3107 if (uth == NULL) {
d9a64523
A
3108 workq_turnstile_update_inheritor(wq, wq, TURNSTILE_INHERITOR_WORKQ);
3109 }
3110 return;
3111 }
3112
3113 if (uth) {
3114 /*
3115 * We need to maybe override the creator we already have
3116 */
3117 if (workq_thread_needs_priority_change(req, uth)) {
3118 WQ_TRACE_WQ(TRACE_wq_creator_select | DBG_FUNC_NONE,
0a7de745 3119 wq, 1, thread_tid(uth->uu_thread), req->tr_qos, 0);
cb323159 3120 workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
d9a64523 3121 }
cb323159 3122 assert(wq->wq_inheritor == uth->uu_thread);
d9a64523
A
3123 } else if (wq->wq_thidlecount) {
3124 /*
3125 * We need to unpark a creator thread
3126 */
cb323159
A
3127 wq->wq_creator = uth = workq_pop_idle_thread(wq, UT_WORKQ_OVERCOMMIT,
3128 &needs_wakeup);
d9a64523 3129 if (workq_thread_needs_priority_change(req, uth)) {
cb323159 3130 workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
d9a64523
A
3131 }
3132 workq_turnstile_update_inheritor(wq, uth->uu_thread,
0a7de745 3133 TURNSTILE_INHERITOR_THREAD);
d9a64523 3134 WQ_TRACE_WQ(TRACE_wq_creator_select | DBG_FUNC_NONE,
0a7de745 3135 wq, 2, thread_tid(uth->uu_thread), req->tr_qos, 0);
d9a64523
A
3136 uth->uu_save.uus_workq_park_data.fulfilled_snapshot = wq->wq_fulfilled;
3137 uth->uu_save.uus_workq_park_data.yields = 0;
cb323159
A
3138 if (needs_wakeup) {
3139 workq_thread_wakeup(uth);
3140 }
d9a64523
A
3141 } else {
3142 /*
3143 * We need to allocate a thread...
3144 */
3145 if (__improbable(wq->wq_nthreads >= wq_max_threads)) {
3146 /* out of threads, just go away */
cb323159 3147 flags = WORKQ_THREADREQ_NONE;
d9a64523
A
3148 } else if (flags & WORKQ_THREADREQ_SET_AST_ON_FAILURE) {
3149 act_set_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ);
3150 } else if (!(flags & WORKQ_THREADREQ_CAN_CREATE_THREADS)) {
3151 /* This can drop the workqueue lock, and take it again */
3152 workq_schedule_immediate_thread_creation(wq);
3153 } else if (workq_add_new_idle_thread(p, wq)) {
3154 goto again;
3155 } else {
3156 workq_schedule_delayed_thread_creation(wq, 0);
3157 }
3158
cb323159
A
3159 /*
3160 * If the current thread is the inheritor:
3161 *
3162 * If we set the AST, then the thread will stay the inheritor until
3163 * either the AST calls workq_kern_threadreq_redrive(), or it parks
3164 * and calls workq_push_idle_thread().
3165 *
3166 * Else, the responsibility of the thread creation is with a thread-call
3167 * and we need to clear the inheritor.
3168 */
3169 if ((flags & WORKQ_THREADREQ_SET_AST_ON_FAILURE) == 0 &&
3170 wq->wq_inheritor == current_thread()) {
3171 workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
3172 }
3173 }
3174}
3175
3176/**
3177 * Same as workq_unpark_select_threadreq_or_park_and_unlock,
3178 * but do not allow early binds.
3179 *
3180 * Called with the base pri frozen, will unfreeze it.
3181 */
3182__attribute__((noreturn, noinline))
3183static void
3184workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
3185 struct uthread *uth, uint32_t setup_flags)
3186{
3187 workq_threadreq_t req = NULL;
3188 bool is_creator = (wq->wq_creator == uth);
3189 bool schedule_creator = false;
3190
3191 if (__improbable(_wq_exiting(wq))) {
3192 WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
3193 goto park;
3194 }
3195
3196 if (wq->wq_reqcount == 0) {
3197 WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 1, 0, 0, 0);
3198 goto park;
3199 }
3200
3201 req = workq_threadreq_select(wq, uth);
3202 if (__improbable(req == NULL)) {
3203 WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 2, 0, 0, 0);
3204 goto park;
3205 }
3206
3207 uint8_t tr_flags = req->tr_flags;
3208 struct turnstile *req_ts = kqueue_threadreq_get_turnstile(req);
3209
3210 /*
3211 * Attempt to setup ourselves as the new thing to run, moving all priority
3212 * pushes to ourselves.
3213 *
3214 * If the current thread is the creator, then the fact that we are presently
3215 * running is proof that we'll do something useful, so keep going.
3216 *
3217 * For other cases, peek at the AST to know whether the scheduler wants
3218 * to preempt us; if so, park instead and move the thread request
3219 * turnstile back to the workqueue.
3220 */
3221 if (req_ts) {
3222 workq_perform_turnstile_operation_locked(wq, ^{
3223 turnstile_update_inheritor(req_ts, uth->uu_thread,
3224 TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD);
3225 turnstile_update_inheritor_complete(req_ts,
3226 TURNSTILE_INTERLOCK_HELD);
3227 });
3228 }
3229
3230 if (is_creator) {
3231 WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 4, 0,
3232 uth->uu_save.uus_workq_park_data.yields, 0);
3233 wq->wq_creator = NULL;
3234 _wq_thactive_inc(wq, req->tr_qos);
3235 wq->wq_thscheduled_count[_wq_bucket(req->tr_qos)]++;
3236 } else if (uth->uu_workq_pri.qos_bucket != req->tr_qos) {
3237 _wq_thactive_move(wq, uth->uu_workq_pri.qos_bucket, req->tr_qos);
3238 }
3239
3240 workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
3241
3242 if (__improbable(thread_unfreeze_base_pri(uth->uu_thread) && !is_creator)) {
3243 if (req_ts) {
3244 workq_perform_turnstile_operation_locked(wq, ^{
3245 turnstile_update_inheritor(req_ts, wq->wq_turnstile,
3246 TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
3247 turnstile_update_inheritor_complete(req_ts,
3248 TURNSTILE_INTERLOCK_HELD);
3249 });
d9a64523 3250 }
cb323159
A
3251 WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 3, 0, 0, 0);
3252 goto park_thawed;
3253 }
3254
3255 /*
3256 * We passed all checks, dequeue the request, bind to it, and set it up
3257 * to return to user.
3258 */
3259 WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
3260 workq_trace_req_id(req), 0, 0, 0);
3261 wq->wq_fulfilled++;
3262 schedule_creator = workq_threadreq_dequeue(wq, req);
3263
3264 if (tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP)) {
3265 kqueue_threadreq_bind_prepost(p, req, uth);
3266 req = NULL;
3267 } else if (req->tr_count > 0) {
3268 req = NULL;
3269 }
3270
3271 workq_thread_reset_cpupercent(req, uth);
3272 if (uth->uu_workq_flags & UT_WORKQ_NEW) {
3273 uth->uu_workq_flags ^= UT_WORKQ_NEW;
3274 setup_flags |= WQ_SETUP_FIRST_USE;
3275 }
3276 if (tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
3277 if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) {
3278 uth->uu_workq_flags |= UT_WORKQ_OVERCOMMIT;
3279 wq->wq_constrained_threads_scheduled--;
3280 }
3281 } else {
3282 if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) != 0) {
3283 uth->uu_workq_flags &= ~UT_WORKQ_OVERCOMMIT;
3284 wq->wq_constrained_threads_scheduled++;
3285 }
3286 }
3287
3288 if (is_creator || schedule_creator) {
3289 /* This can drop the workqueue lock, and take it again */
3290 workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
3291 }
3292
3293 workq_unlock(wq);
3294
3295 if (req) {
3296 zfree(workq_zone_threadreq, req);
3297 }
3298
3299 /*
3300 * Run Thread, Run!
3301 */
3302 uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI;
3303 if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
3304 upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
3305 } else if (tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
3306 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
d9a64523 3307 }
cb323159
A
3308 if (tr_flags & WORKQ_TR_FLAG_KEVENT) {
3309 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
3310 }
3311 if (tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
3312 upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
3313 }
3314 uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags;
3315
3316 if (tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP)) {
3317 kqueue_threadreq_bind_commit(p, uth->uu_thread);
3318 }
3319 workq_setup_and_run(p, uth, setup_flags);
3320 __builtin_unreachable();
3321
3322park:
3323 thread_unfreeze_base_pri(uth->uu_thread);
3324park_thawed:
3325 workq_park_and_unlock(p, wq, uth, setup_flags);
d9a64523
A
3326}
3327
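/*
 * Sketch of the upcall flag composition done at the end of the bind path
 * above, pulled out as a standalone helper (flag names from the code, logic
 * unchanged): this is the contract reported back to the userspace trampoline.
 */
#if 0 /* illustrative sketch */
static uint32_t
example_upcall_flags(bool is_manager, bool overcommit, bool kevent, bool workloop)
{
	uint32_t flags = WQ_FLAG_THREAD_NEWSPI;
	if (is_manager) {
		flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
	} else if (overcommit) {
		flags |= WQ_FLAG_THREAD_OVERCOMMIT;
	}
	if (kevent) {
		flags |= WQ_FLAG_THREAD_KEVENT;
	}
	if (workloop) {
		flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
	}
	return flags;
}
#endif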
3328/**
3329 * Runs a thread request on a thread
3330 *
3331 * - if thread is THREAD_NULL, will find a thread and run the request there.
3332 * Otherwise, the thread must be the current thread.
3333 *
3334 * - if req is NULL, will find the highest priority request and run that. If
3335 * it is not NULL, it must be a threadreq object in state NEW. If it can not
3336 * be run immediately, it will be enqueued and moved to state QUEUED.
3337 *
3338 * Either way, the thread request object serviced will be moved to state
3339 * BINDING and attached to the uthread.
3340 *
cb323159
A
3341 * Should be called with the workqueue lock held. Will drop it.
3342 * Should be called with the base pri not frozen.
d9a64523
A
3343 */
3344__attribute__((noreturn, noinline))
3345static void
cb323159
A
3346workq_unpark_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
3347 struct uthread *uth, uint32_t setup_flags)
d9a64523 3348{
d9a64523
A
3349 if (uth->uu_workq_flags & UT_WORKQ_EARLY_BOUND) {
3350 if (uth->uu_workq_flags & UT_WORKQ_NEW) {
3351 setup_flags |= WQ_SETUP_FIRST_USE;
3352 }
3353 uth->uu_workq_flags &= ~(UT_WORKQ_NEW | UT_WORKQ_EARLY_BOUND);
3354 /*
3355 * This pointer is possibly freed and only used for tracing purposes.
3356 */
cb323159 3357 workq_threadreq_t req = uth->uu_save.uus_workq_park_data.thread_request;
d9a64523
A
3358 workq_unlock(wq);
3359 WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
0a7de745 3360 VM_KERNEL_ADDRHIDE(req), 0, 0, 0);
cb323159 3361 (void)req;
d9a64523
A
3362 workq_setup_and_run(p, uth, setup_flags);
3363 __builtin_unreachable();
3364 }
3365
cb323159
A
3366 thread_freeze_base_pri(uth->uu_thread);
3367 workq_select_threadreq_or_park_and_unlock(p, wq, uth, setup_flags);
d9a64523
A
3368}
3369
3370static bool
3371workq_creator_should_yield(struct workqueue *wq, struct uthread *uth)
3372{
3373 thread_qos_t qos = workq_pri_override(uth->uu_workq_pri);
3374
3375 if (qos >= THREAD_QOS_USER_INTERACTIVE) {
3376 return false;
3377 }
3378
3379 uint32_t snapshot = uth->uu_save.uus_workq_park_data.fulfilled_snapshot;
3380 if (wq->wq_fulfilled == snapshot) {
3381 return false;
3382 }
3383
3384 uint32_t cnt = 0, conc = wq_max_parallelism[_wq_bucket(qos)];
3385 if (wq->wq_fulfilled - snapshot > conc) {
3386 /* we fulfilled more than NCPU requests since being dispatched */
3387 WQ_TRACE_WQ(TRACE_wq_creator_yield, wq, 1,
0a7de745 3388 wq->wq_fulfilled, snapshot, 0);
d9a64523
A
3389 return true;
3390 }
3391
3392 for (int i = _wq_bucket(qos); i < WORKQ_NUM_QOS_BUCKETS; i++) {
3393 cnt += wq->wq_thscheduled_count[i];
3394 }
3395 if (conc <= cnt) {
3396 /* We fulfilled requests and have more than NCPU scheduled threads */
3397 WQ_TRACE_WQ(TRACE_wq_creator_yield, wq, 2,
0a7de745 3398 wq->wq_fulfilled, snapshot, 0);
d9a64523
A
3399 return true;
3400 }
3401
3402 return false;
3403}
3404
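/*
 * Minimal sketch of the yield heuristic above: `conc` is the allowed
 * parallelism at the creator's QoS, `fulfilled_since` the number of requests
 * fulfilled since this creator was dispatched, and `scheduled` the workqueue
 * threads scheduled at that QoS bucket and the ones after it.
 */
#if 0 /* illustrative sketch */
static bool
example_creator_should_yield(uint32_t fulfilled_since, uint32_t scheduled,
    uint32_t conc)
{
	if (fulfilled_since == 0) {
		return false;   /* nothing fulfilled yet, keep the creator going */
	}
	return fulfilled_since > conc || scheduled >= conc;
}
#endif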
3405/**
3406 * parked thread wakes up
3407 */
3408__attribute__((noreturn, noinline))
3409static void
3410workq_unpark_continue(void *parameter __unused, wait_result_t wr __unused)
3411{
cb323159
A
3412 thread_t th = current_thread();
3413 struct uthread *uth = get_bsdthread_info(th);
d9a64523
A
3414 proc_t p = current_proc();
3415 struct workqueue *wq = proc_get_wqptr_fast(p);
3416
3417 workq_lock_spin(wq);
3418
3419 if (wq->wq_creator == uth && workq_creator_should_yield(wq, uth)) {
3420 /*
3421 * If the number of threads we have out is enough to keep up with the
3422 * demand, then we should avoid sending this creator thread to
3423 * userspace.
3424 */
3425 uth->uu_save.uus_workq_park_data.fulfilled_snapshot = wq->wq_fulfilled;
3426 uth->uu_save.uus_workq_park_data.yields++;
3427 workq_unlock(wq);
3428 thread_yield_with_continuation(workq_unpark_continue, NULL);
3429 __builtin_unreachable();
3430 }
3431
3432 if (__probable(uth->uu_workq_flags & UT_WORKQ_RUNNING)) {
cb323159 3433 workq_unpark_select_threadreq_or_park_and_unlock(p, wq, uth, WQ_SETUP_NONE);
d9a64523
A
3434 __builtin_unreachable();
3435 }
3436
3437 if (__probable(wr == THREAD_AWAKENED)) {
3438 /*
3439 * We were set running, but only for the purpose of dying.
3440 */
3441 assert(uth->uu_workq_flags & UT_WORKQ_DYING);
3442 assert((uth->uu_workq_flags & UT_WORKQ_NEW) == 0);
3443 } else {
3444 /*
3445 * Workaround for <rdar://problem/38647347>:
3446 * in case we do hit userspace, make sure that calling
3447 * workq_thread_terminate() does the right thing here,
3448 * and that if we never call it, workq_exit() will do the right thing
3449 * too, because it sees this thread on the runlist.
3450 */
3451 assert(wr == THREAD_INTERRUPTED);
3452 wq->wq_thdying_count++;
3453 uth->uu_workq_flags |= UT_WORKQ_DYING;
3454 }
3455
3456 workq_unpark_for_death_and_unlock(p, wq, uth,
cb323159 3457 WORKQ_UNPARK_FOR_DEATH_WAS_IDLE, WQ_SETUP_NONE);
d9a64523
A
3458 __builtin_unreachable();
3459}
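/*
 * Editor's illustrative sketch (not part of the original source): the
 * decision tree the continuation above walks when a parked thread wakes up,
 * written as plain userspace C. The enum values and helpers are hypothetical
 * stand-ins for the kernel's wait results and uthread flags; only the shape
 * of the control flow mirrors the real code.
 */
#include <stdbool.h>
#include <stdio.h>

typedef enum { DEMO_AWAKENED, DEMO_INTERRUPTED } demo_wait_result_t;

struct demo_worker {                    /* hypothetical, not struct uthread */
	bool is_creator;
	bool should_yield;              /* outcome of the yield heuristic */
	bool running;                   /* still wanted by the workqueue */
	bool dying;
};

static void
demo_unpark(struct demo_worker *w, demo_wait_result_t wr)
{
	if (w->is_creator && w->should_yield) {
		/* enough threads are out: yield and try again later */
		printf("yield\n");
		return;
	}
	if (w->running) {
		/* common case: pick up a thread request or park again */
		printf("select work or park\n");
		return;
	}
	if (wr == DEMO_AWAKENED) {
		/* woken up explicitly, but only so that the thread can die */
		printf("die (was asked to)\n");
	} else {
		/* interrupted wait: mark the thread dying before tearing down */
		w->dying = true;
		printf("die (interrupted)\n");
	}
}

int
main(void)
{
	struct demo_worker w = { .running = true };

	demo_unpark(&w, DEMO_AWAKENED);         /* prints "select work or park" */
	return 0;
}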
3460
3461__attribute__((noreturn, noinline))
3462static void
3463workq_setup_and_run(proc_t p, struct uthread *uth, int setup_flags)
3464{
3465 thread_t th = uth->uu_thread;
3466 vm_map_t vmap = get_task_map(p->task);
3467
3468 if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) {
3469 /*
3470 * For preemption reasons, we want to reset the voucher as late as
3471 * possible, so we do it in two places:
3472 * - Just before parking (i.e. in workq_park_and_unlock())
3473 * - Prior to doing the setup for the next workitem (i.e. here)
3474 *
3475 * Those two places are sufficient to ensure we always reset it before
3476 * it goes back out to user space, but be careful not to break that
3477 * guarantee.
3478 */
3479 __assert_only kern_return_t kr;
3480 kr = thread_set_voucher_name(MACH_PORT_NULL);
3481 assert(kr == KERN_SUCCESS);
3482 }
3483
3484 uint32_t upcall_flags = uth->uu_save.uus_workq_park_data.upcall_flags;
3485 if (!(setup_flags & WQ_SETUP_FIRST_USE)) {
3486 upcall_flags |= WQ_FLAG_THREAD_REUSE;
3487 }
3488
3489 if (uth->uu_workq_flags & UT_WORKQ_OUTSIDE_QOS) {
3490 /*
3491 * For threads that have an outside-of-QoS thread priority, indicate
3492 * to userspace that setting QoS should only affect the TSD and not
3493 * change the QoS in the kernel.
3494 */
3495 upcall_flags |= WQ_FLAG_THREAD_OUTSIDEQOS;
3496 } else {
3497 /*
3498 * Put the QoS class value into the lower bits of the reuse_thread
3499 * register; this is where the thread priority used to be stored
3500 * anyway.
3501 */
3502 upcall_flags |= uth->uu_save.uus_workq_park_data.qos |
0a7de745 3503 WQ_FLAG_THREAD_PRIO_QOS;
d9a64523
A
3504 }
3505
3506 if (uth->uu_workq_thport == MACH_PORT_NULL) {
3507 /* convert_thread_to_port() consumes a reference */
3508 thread_reference(th);
3509 ipc_port_t port = convert_thread_to_port(th);
3510 uth->uu_workq_thport = ipc_port_copyout_send(port, get_task_ipcspace(p->task));
3511 }
3512
3513 /*
3514 * Call out to pthread; this sets up the thread, pulls the kevent structs
3515 * onto the stack, sets up the thread state, and then returns to userspace.
3516 */
3517 WQ_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START,
0a7de745 3518 proc_get_wqptr_fast(p), 0, 0, 0, 0);
d9a64523
A
3519 thread_sched_call(th, workq_sched_callback);
3520 pthread_functions->workq_setup_thread(p, th, vmap, uth->uu_workq_stackaddr,
0a7de745 3521 uth->uu_workq_thport, 0, setup_flags, upcall_flags);
d9a64523
A
3522
3523 __builtin_unreachable();
3524}
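/*
 * Editor's illustrative sketch (not part of the original source): how the
 * upcall flags described above can carry both boolean flags and a small QoS
 * value in one 32-bit word -- flags in the high bits, the QoS class in the
 * low bits. The DEMO_* constants are hypothetical and do not match the real
 * WQ_FLAG_* values.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_QOS_MASK           0x000000ffu  /* low bits carry the QoS class */
#define DEMO_FLAG_THREAD_REUSE  0x00010000u  /* thread is being reused */
#define DEMO_FLAG_PRIO_QOS      0x00020000u  /* low bits hold a QoS value */

static uint32_t
demo_make_upcall_flags(uint8_t qos, bool first_use)
{
	uint32_t flags = 0;

	if (!first_use) {
		flags |= DEMO_FLAG_THREAD_REUSE;
	}
	/* pack the QoS class value into the lower bits */
	flags |= DEMO_FLAG_PRIO_QOS | (qos & DEMO_QOS_MASK);
	return flags;
}

int
main(void)
{
	uint32_t flags = demo_make_upcall_flags(4 /* hypothetical QoS */, false);

	printf("flags=0x%08x qos=%u reuse=%d\n", flags,
	    flags & DEMO_QOS_MASK,
	    (flags & DEMO_FLAG_THREAD_REUSE) != 0);
	return 0;
}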
3525
3526#pragma mark misc
3527
3528int
3529fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
3530{
3531 struct workqueue *wq = proc_get_wqptr(p);
3532 int error = 0;
0a7de745 3533 int activecount;
d9a64523
A
3534
3535 if (wq == NULL) {
3536 return EINVAL;
3537 }
3538
3539 /*
3540 * This is sometimes called from interrupt context by the kperf sampler.
3541 * In that case, it's not safe to spin trying to take the lock since we
3542 * might already hold it. So, we just try-lock it and error out if it's
3543 * already held. Since this is just a debugging aid, and all our callers
3544 * are able to handle an error, that's fine.
3545 */
3546 bool locked = workq_lock_try(wq);
3547 if (!locked) {
3548 return EBUSY;
3549 }
3550
3551 wq_thactive_t act = _wq_thactive(wq);
3552 activecount = _wq_thactive_aggregate_downto_qos(wq, act,
0a7de745 3553 WORKQ_THREAD_QOS_MIN, NULL, NULL);
d9a64523
A
3554 if (act & _wq_thactive_offset_for_qos(WORKQ_THREAD_QOS_MANAGER)) {
3555 activecount++;
3556 }
3557 pwqinfo->pwq_nthreads = wq->wq_nthreads;
3558 pwqinfo->pwq_runthreads = activecount;
3559 pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
3560 pwqinfo->pwq_state = 0;
3561
3562 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
3563 pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
3564 }
3565
3566 if (wq->wq_nthreads >= wq_max_threads) {
3567 pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
3568 }
3569
3570 workq_unlock(wq);
3571 return error;
3572}
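/*
 * Editor's illustrative sketch (not part of the original source): the
 * try-lock pattern explained in the comment above, shown with a userspace
 * pthread mutex. When the caller may already hold the lock (or must not
 * block), attempt the lock and report EBUSY instead of spinning. The demo_*
 * names are hypothetical.
 */
#include <errno.h>
#include <pthread.h>

struct demo_stats {
	pthread_mutex_t lock;
	unsigned nthreads;
};

/* Fill *out with a snapshot, or return EBUSY if the lock is contended. */
static int
demo_fill_stats(struct demo_stats *s, unsigned *out)
{
	if (pthread_mutex_trylock(&s->lock) != 0) {
		/* already held (possibly by us): do not spin, just bail out */
		return EBUSY;
	}
	*out = s->nthreads;
	pthread_mutex_unlock(&s->lock);
	return 0;
}

int
main(void)
{
	struct demo_stats s = { .lock = PTHREAD_MUTEX_INITIALIZER, .nthreads = 4 };
	unsigned n = 0;

	return demo_fill_stats(&s, &n);         /* returns 0 and sets n to 4 */
}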
3573
3574boolean_t
3575workqueue_get_pwq_exceeded(void *v, boolean_t *exceeded_total,
0a7de745 3576 boolean_t *exceeded_constrained)
d9a64523
A
3577{
3578 proc_t p = v;
3579 struct proc_workqueueinfo pwqinfo;
3580 int err;
3581
3582 assert(p != NULL);
3583 assert(exceeded_total != NULL);
3584 assert(exceeded_constrained != NULL);
3585
3586 err = fill_procworkqueue(p, &pwqinfo);
3587 if (err) {
3588 return FALSE;
3589 }
3590 if (!(pwqinfo.pwq_state & WQ_FLAGS_AVAILABLE)) {
3591 return FALSE;
3592 }
3593
3594 *exceeded_total = (pwqinfo.pwq_state & WQ_EXCEEDED_TOTAL_THREAD_LIMIT);
3595 *exceeded_constrained = (pwqinfo.pwq_state & WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT);
3596
3597 return TRUE;
3598}
3599
3600uint32_t
3601workqueue_get_pwq_state_kdp(void * v)
3602{
3603 static_assert((WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT << 17) ==
0a7de745 3604 kTaskWqExceededConstrainedThreadLimit);
d9a64523 3605 static_assert((WQ_EXCEEDED_TOTAL_THREAD_LIMIT << 17) ==
0a7de745 3606 kTaskWqExceededTotalThreadLimit);
d9a64523
A
3607 static_assert((WQ_FLAGS_AVAILABLE << 17) == kTaskWqFlagsAvailable);
3608 static_assert((WQ_FLAGS_AVAILABLE | WQ_EXCEEDED_TOTAL_THREAD_LIMIT |
0a7de745 3609 WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT) == 0x7);
d9a64523
A
3610
3611 if (v == NULL) {
3612 return 0;
3613 }
3614
3615 proc_t p = v;
3616 struct workqueue *wq = proc_get_wqptr(p);
3617
3618 if (wq == NULL || workq_lock_spin_is_acquired_kdp(wq)) {
3619 return 0;
3620 }
3621
3622 uint32_t pwq_state = WQ_FLAGS_AVAILABLE;
3623
3624 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
3625 pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
3626 }
3627
3628 if (wq->wq_nthreads >= wq_max_threads) {
3629 pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
3630 }
3631
3632 return pwq_state;
3633}
3634
3635void
3636workq_init(void)
3637{
3638 workq_lck_grp_attr = lck_grp_attr_alloc_init();
3639 workq_lck_attr = lck_attr_alloc_init();
3640 workq_lck_grp = lck_grp_alloc_init("workq", workq_lck_grp_attr);
3641
3642 workq_zone_workqueue = zinit(sizeof(struct workqueue),
0a7de745 3643 1024 * sizeof(struct workqueue), 8192, "workq.wq");
d9a64523 3644 workq_zone_threadreq = zinit(sizeof(struct workq_threadreq_s),
0a7de745 3645 1024 * sizeof(struct workq_threadreq_s), 8192, "workq.threadreq");
d9a64523
A
3646
3647 clock_interval_to_absolutetime_interval(wq_stalled_window.usecs,
0a7de745 3648 NSEC_PER_USEC, &wq_stalled_window.abstime);
d9a64523 3649 clock_interval_to_absolutetime_interval(wq_reduce_pool_window.usecs,
0a7de745 3650 NSEC_PER_USEC, &wq_reduce_pool_window.abstime);
d9a64523 3651 clock_interval_to_absolutetime_interval(wq_max_timer_interval.usecs,
0a7de745 3652 NSEC_PER_USEC, &wq_max_timer_interval.abstime);
cb323159
A
3653
3654 thread_deallocate_daemon_register_queue(&workq_deallocate_queue,
3655 workq_deallocate_queue_invoke);
d9a64523 3656}