2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
28 /* Copyright (c) 1995-2005 Apple Computer, Inc. All Rights Reserved */
33 #pragma mark - Front Matter
35 #define _PTHREAD_CONDATTR_T
36 #define _PTHREAD_COND_T
37 #define _PTHREAD_MUTEXATTR_T
38 #define _PTHREAD_MUTEX_T
39 #define _PTHREAD_RWLOCKATTR_T
40 #define _PTHREAD_RWLOCK_T
42 #undef pthread_mutexattr_t
43 #undef pthread_mutex_t
44 #undef pthread_condattr_t
46 #undef pthread_rwlockattr_t
47 #undef pthread_rwlock_t
49 #include <sys/cdefs.h>
52 // <rdar://problem/26158937> panic() should be marked noreturn
53 extern void panic(const char *string, ...) __printflike(1,2) __dead2;
55 #include <sys/param.h>
56 #include <sys/queue.h>
57 #include <sys/resourcevar.h>
58 //#include <sys/proc_internal.h>
59 #include <sys/kauth.h>
60 #include <sys/systm.h>
61 #include <sys/timeb.h>
62 #include <sys/times.h>
64 #include <sys/kernel.h>
66 #include <sys/signalvar.h>
67 #include <sys/sysctl.h>
68 #include <sys/syslog.h>
71 #include <sys/kdebug.h>
72 //#include <sys/sysproto.h>
74 #include <sys/user.h> /* for coredump */
75 #include <sys/proc_info.h> /* for fill_procworkqueue */
77 #include <mach/mach_port.h>
78 #include <mach/mach_types.h>
79 #include <mach/semaphore.h>
80 #include <mach/sync_policy.h>
81 #include <mach/task.h>
82 #include <mach/vm_prot.h>
83 #include <kern/kern_types.h>
84 #include <kern/task.h>
85 #include <kern/clock.h>
86 #include <mach/kern_return.h>
87 #include <kern/thread.h>
88 #include <kern/zalloc.h>
89 #include <kern/sched_prim.h> /* for thread_exception_return */
90 #include <kern/processor.h>
91 #include <kern/assert.h>
92 #include <mach/mach_vm.h>
93 #include <mach/mach_param.h>
94 #include <mach/thread_status.h>
95 #include <mach/thread_policy.h>
96 #include <mach/message.h>
97 #include <mach/port.h>
98 //#include <vm/vm_protos.h>
99 #include <vm/vm_fault.h>
100 #include <vm/vm_map.h>
101 #include <mach/thread_act.h> /* for thread_resume */
102 #include <machine/machine_routines.h>
103 #include <mach/shared_region.h>
105 #include <libkern/OSAtomic.h>
106 #include <libkern/libkern.h>
108 #include <sys/pthread_shims.h>
109 #include "kern_internal.h"
111 // XXX: Dirty import for sys/signalvar.h that's wrapped in BSD_KERNEL_PRIVATE
112 #define sigcantmask (sigmask(SIGKILL) | sigmask(SIGSTOP))
114 // XXX: Ditto for thread tags from kern/thread.h
115 #define THREAD_TAG_MAINTHREAD 0x1
116 #define THREAD_TAG_PTHREAD 0x10
117 #define THREAD_TAG_WORKQUEUE 0x20
119 lck_grp_attr_t *pthread_lck_grp_attr;
120 lck_grp_t *pthread_lck_grp;
121 lck_attr_t *pthread_lck_attr;
123 zone_t pthread_zone_workqueue;
124 zone_t pthread_zone_threadlist;
125 zone_t pthread_zone_threadreq;
127 extern void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64);
128 extern void workqueue_thread_yielded(void);
130 #define WQ_SETUP_FIRST_USE 1
131 #define WQ_SETUP_CLEAR_VOUCHER 2
132 static void _setup_wqthread(proc_t p, thread_t th, struct workqueue *wq,
133 		struct threadlist *tl, int flags);
135 static void reset_priority(struct threadlist *tl, pthread_priority_t pri);
136 static pthread_priority_t pthread_priority_from_wq_class_index(struct workqueue *wq, int index);
138 static void wq_unpark_continue(void* ptr, wait_result_t wait_result) __dead2;
140 static bool workqueue_addnewthread(proc_t p, struct workqueue *wq);
141 static void workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use);
142 static void workqueue_lock_spin(struct workqueue *);
143 static void workqueue_unlock(struct workqueue *);
145 #define WQ_RUN_TR_THROTTLED 0
146 #define WQ_RUN_TR_THREAD_NEEDED 1
147 #define WQ_RUN_TR_THREAD_STARTED 2
148 #define WQ_RUN_TR_EXITING 3
149 static int workqueue_run_threadreq_and_unlock(proc_t p, struct workqueue *wq,
150 		struct threadlist *tl, struct threadreq *req, bool may_add_new_thread);
152 static bool may_start_constrained_thread(struct workqueue *wq,
153 		uint32_t at_priclass, struct threadlist *tl, bool may_start_timer);
155 static mach_vm_offset_t stack_addr_hint(proc_t p, vm_map_t vmap);
156 static boolean_t wq_thread_is_busy(uint64_t cur_ts,
157 		_Atomic uint64_t *lastblocked_tsp);
159 int proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc);
160 int proc_setalltargetconc(pid_t pid, int32_t * targetconcp);
162 #define WQ_MAXPRI_MIN 0 /* low prio queue num */
163 #define WQ_MAXPRI_MAX 2 /* max prio queuenum */
164 #define WQ_PRI_NUM 3 /* number of prio work queues */
166 #define C_32_STK_ALIGN 16
167 #define C_64_STK_ALIGN 16
168 #define C_64_REDZONE_LEN 128
170 #define PTHREAD_T_OFFSET 0
173 * Flags field passed to bsdthread_create and back in pthread_start
174 31 <---------------------------------> 0
175 _________________________________________
176 | flags(8) | policy(8) | importance(16) |
177 -----------------------------------------
180 #define PTHREAD_START_CUSTOM 0x01000000
181 #define PTHREAD_START_SETSCHED 0x02000000
182 #define PTHREAD_START_DETACHED 0x04000000
183 #define PTHREAD_START_QOSCLASS 0x08000000
184 #define PTHREAD_START_TSD_BASE_SET 0x10000000
185 #define PTHREAD_START_QOSCLASS_MASK 0x00ffffff
186 #define PTHREAD_START_POLICY_BITSHIFT 16
187 #define PTHREAD_START_POLICY_MASK 0xff
188 #define PTHREAD_START_IMPORTANCE_MASK 0xffff
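/*
 * Illustrative sketch (not part of the original file): how the 32-bit flags
 * word described above decomposes using the masks and shifts defined here.
 * The helper names are hypothetical and exist only to show the bit layout;
 * the real packing is done by userspace before calling bsdthread_create, and
 * the real decoding lives in _bsdthread_create below. Note that when
 * PTHREAD_START_QOSCLASS is set, the low 24 bits carry a QoS class
 * (PTHREAD_START_QOSCLASS_MASK) instead of policy/importance.
 */
static inline uint32_t
_example_pack_start_flags(uint32_t start_flags, uint32_t policy, uint32_t importance)
{
	// start_flags is a combination of PTHREAD_START_* bits (top 8 bits);
	// policy and importance fill the middle 8 and low 16 bits respectively.
	return start_flags |
			((policy & PTHREAD_START_POLICY_MASK) << PTHREAD_START_POLICY_BITSHIFT) |
			(importance & PTHREAD_START_IMPORTANCE_MASK);
}

static inline void
_example_unpack_start_flags(uint32_t word, uint32_t *policy, uint32_t *importance)
{
	// mirrors the decoding done for PTHREAD_START_SETSCHED in _bsdthread_create
	*policy = (word >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;
	*importance = word & PTHREAD_START_IMPORTANCE_MASK;
}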
190 #define SCHED_OTHER POLICY_TIMESHARE
191 #define SCHED_FIFO POLICY_FIFO
192 #define SCHED_RR POLICY_RR
194 #define BASEPRI_DEFAULT 31
198 static uint32_t wq_stalled_window_usecs = WQ_STALLED_WINDOW_USECS;
199 static uint32_t wq_reduce_pool_window_usecs = WQ_REDUCE_POOL_WINDOW_USECS;
200 static uint32_t wq_max_timer_interval_usecs = WQ_MAX_TIMER_INTERVAL_USECS;
201 static uint32_t wq_max_threads = WORKQUEUE_MAXTHREADS;
202 static uint32_t wq_max_constrained_threads = WORKQUEUE_MAXTHREADS / 8;
203 static uint32_t wq_max_concurrency[WORKQUEUE_NUM_BUCKETS + 1]; // set to ncpus on load
205 SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
206 		&wq_stalled_window_usecs, 0, "");
208 SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
209 		&wq_reduce_pool_window_usecs, 0, "");
211 SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
212 		&wq_max_timer_interval_usecs, 0, "");
214 SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
215 		&wq_max_threads, 0, "");
217 SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
218 		&wq_max_constrained_threads, 0, "");
221 static int wq_kevent_test SYSCTL_HANDLER_ARGS;
222 SYSCTL_PROC(_debug, OID_AUTO, wq_kevent_test, CTLFLAG_MASKED | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLTYPE_OPAQUE, NULL, 0, wq_kevent_test, 0, "-");
225 static uint32_t wq_init_constrained_limit = 1;
227 uint32_t pthread_debug_tracing = 1;
229 SYSCTL_INT(_kern, OID_AUTO, pthread_debug_tracing, CTLFLAG_RW | CTLFLAG_LOCKED,
230 		&pthread_debug_tracing, 0, "")
233 * +-----+-----+-----+-----+-----+-----+-----+
234 * | MT | BG | UT | DE | IN | UN | mgr |
235 * +-----+-----+-----+-----+-----+-----+-----+-----+
236 * | pri | 5 | 4 | 3 | 2 | 1 | 0 | 6 |
237 * | qos | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
238 * +-----+-----+-----+-----+-----+-----+-----+-----+
240 static inline uint32_t
241 _wq_bucket_to_thread_qos(int pri)
243 	if (pri == WORKQUEUE_EVENT_MANAGER_BUCKET) {
244 		return WORKQUEUE_EVENT_MANAGER_BUCKET + 1;
246 	return WORKQUEUE_EVENT_MANAGER_BUCKET - pri;
249 #pragma mark wq_thactive
251 #if defined(__LP64__)
253 // 7 * 16 bits for each QoS bucket request count (including manager)
254 // 3 bits of best QoS among all pending constrained requests
256 #define WQ_THACTIVE_BUCKET_WIDTH 16
257 #define WQ_THACTIVE_QOS_SHIFT (7 * WQ_THACTIVE_BUCKET_WIDTH)
260 // 6 * 10 bits for each QoS bucket request count (except manager)
261 // 1 bit for the manager bucket
262 // 3 bits of best QoS among all pending constrained requests
263 #define WQ_THACTIVE_BUCKET_WIDTH 10
264 #define WQ_THACTIVE_QOS_SHIFT (6 * WQ_THACTIVE_BUCKET_WIDTH + 1)
266 #define WQ_THACTIVE_BUCKET_MASK ((1U << WQ_THACTIVE_BUCKET_WIDTH) - 1)
267 #define WQ_THACTIVE_BUCKET_HALF (1U << (WQ_THACTIVE_BUCKET_WIDTH - 1))
268 #define WQ_THACTIVE_NO_PENDING_REQUEST 6
270 _Static_assert(sizeof(wq_thactive_t) * CHAR_BIT - WQ_THACTIVE_QOS_SHIFT >= 3,
271 		"Make sure we have space to encode a QoS");
273 static inline wq_thactive_t
274 _wq_thactive_fetch_and_add(struct workqueue *wq, wq_thactive_t offset)
276 #if PTHREAD_INLINE_RMW_ATOMICS || !defined(__LP64__)
277 	return atomic_fetch_add_explicit(&wq->wq_thactive, offset,
278 			memory_order_relaxed);
280 	return pthread_kern->atomic_fetch_add_128_relaxed(&wq->wq_thactive, offset);
284 static inline wq_thactive_t
285 _wq_thactive(struct workqueue *wq)
287 #if PTHREAD_INLINE_RMW_ATOMICS || !defined(__LP64__)
288 	return atomic_load_explicit(&wq->wq_thactive, memory_order_relaxed);
290 	return pthread_kern->atomic_load_128_relaxed(&wq->wq_thactive);
294 #define WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(tha) \
295 ((tha) >> WQ_THACTIVE_QOS_SHIFT)
297 static inline uint32_t
298 _wq_thactive_best_constrained_req_qos(struct workqueue
*wq
)
300 // Avoid expensive atomic operations: the three bits we're loading are in
301 // a single byte, and always updated under the workqueue lock
302 wq_thactive_t v
= *(wq_thactive_t
*)&wq
->wq_thactive
;
303 return WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(v
);
306 static inline wq_thactive_t
307 _wq_thactive_set_best_constrained_req_qos(struct workqueue
*wq
,
308 uint32_t orig_qos
, uint32_t new_qos
)
311 v
= (wq_thactive_t
)(new_qos
- orig_qos
) << WQ_THACTIVE_QOS_SHIFT
;
313 * We can do an atomic add relative to the initial load because updates
314 * to this qos are always serialized under the workqueue lock.
316 return _wq_thactive_fetch_and_add(wq
, v
) + v
;
319 static inline wq_thactive_t
320 _wq_thactive_offset_for_qos(int qos
)
322 return (wq_thactive_t
)1 << (qos
* WQ_THACTIVE_BUCKET_WIDTH
);
325 static inline wq_thactive_t
326 _wq_thactive_inc(struct workqueue
*wq
, int qos
)
328 return _wq_thactive_fetch_and_add(wq
, _wq_thactive_offset_for_qos(qos
));
331 static inline wq_thactive_t
332 _wq_thactive_dec(struct workqueue
*wq
, int qos
)
334 return _wq_thactive_fetch_and_add(wq
, -_wq_thactive_offset_for_qos(qos
));
337 static inline wq_thactive_t
338 _wq_thactive_move(struct workqueue
*wq
, int oldqos
, int newqos
)
340 return _wq_thactive_fetch_and_add(wq
, _wq_thactive_offset_for_qos(newqos
) -
341 _wq_thactive_offset_for_qos(oldqos
));
344 static inline uint32_t
345 _wq_thactive_aggregate_downto_qos(struct workqueue
*wq
, wq_thactive_t v
,
346 int qos
, uint32_t *busycount
, uint32_t *max_busycount
)
348 uint32_t count
= 0, active
;
353 * On 32-bit platforms, the manager bucket is a single bit and the 3 bits of
354 * best constrained request QoS sit where the 10 bits of a regular QoS
355 * bucket count would be. Mask them out.
357 v
&= ~(~0ull << WQ_THACTIVE_QOS_SHIFT
);
360 curtime
= mach_absolute_time();
364 *max_busycount
= qos
+ 1;
366 for (int i
= 0; i
<= qos
; i
++, v
>>= WQ_THACTIVE_BUCKET_WIDTH
) {
367 active
= v
& WQ_THACTIVE_BUCKET_MASK
;
369 if (busycount
&& wq
->wq_thscheduled_count
[i
] > active
) {
370 if (wq_thread_is_busy(curtime
, &wq
->wq_lastblocked_ts
[i
])) {
372 * We only consider the last blocked thread for a given bucket
373 * as busy because we don't want to take the list lock in each
374 * sched callback. However this is an approximation that could
375 * contribute to thread creation storms.
384 #pragma mark - Process/Thread Setup/Teardown syscalls
386 static mach_vm_offset_t
387 stack_addr_hint(proc_t p
, vm_map_t vmap
)
389 mach_vm_offset_t stackaddr
;
390 mach_vm_offset_t aslr_offset
;
391 bool proc64bit
= proc_is64bit(p
);
393 // We can't safely take random values % something unless it's a power of two
394 _Static_assert(powerof2(PTH_DEFAULT_STACKSIZE), "PTH_DEFAULT_STACKSIZE is a power-of-two");
396 #if defined(__i386__) || defined(__x86_64__)
398 // Matches vm_map_get_max_aslr_slide_pages's image shift in xnu
399 aslr_offset
= random() % (1 << 28); // about 512 stacks
401 // Actually bigger than the image shift, we've got ~256MB to work with
402 aslr_offset
= random() % (16 * PTH_DEFAULT_STACKSIZE
);
404 aslr_offset
= vm_map_trunc_page_mask(aslr_offset
, vm_map_page_mask(vmap
));
406 // Above nanomalloc range (see NANOZONE_SIGNATURE)
407 stackaddr
= 0x700000000000 + aslr_offset
;
409 stackaddr
= SHARED_REGION_BASE_I386
+ SHARED_REGION_SIZE_I386
+ aslr_offset
;
411 #elif defined(__arm__) || defined(__arm64__)
412 user_addr_t main_thread_stack_top
= 0;
413 if (pthread_kern
->proc_get_user_stack
) {
414 main_thread_stack_top
= pthread_kern
->proc_get_user_stack(p
);
416 if (proc64bit
&& main_thread_stack_top
) {
417 // The main thread stack position is randomly slid by xnu (c.f.
418 // load_main() in mach_loader.c), so basing pthread stack allocations
419 // where the main thread stack ends is already ASLRd and doing so
420 // avoids creating a gap in the process address space that may cause
421 // extra PTE memory usage. rdar://problem/33328206
422 stackaddr
= vm_map_trunc_page_mask((vm_map_offset_t
)main_thread_stack_top
,
423 vm_map_page_mask(vmap
));
425 // vm_map_get_max_aslr_slide_pages ensures 1MB of slide, we do better
426 aslr_offset
= random() % ((proc64bit
? 4 : 2) * PTH_DEFAULT_STACKSIZE
);
427 aslr_offset
= vm_map_trunc_page_mask((vm_map_offset_t
)aslr_offset
,
428 vm_map_page_mask(vmap
));
430 // 64 stacks below shared region
431 stackaddr
= SHARED_REGION_BASE_ARM64
- 64 * PTH_DEFAULT_STACKSIZE
- aslr_offset
;
433 // If you try to slide down from this point, you risk ending up in memory consumed by malloc
434 stackaddr
= SHARED_REGION_BASE_ARM
- 32 * PTH_DEFAULT_STACKSIZE
+ aslr_offset
;
438 #error Need to define a stack address hint for this architecture
444 * bsdthread_create system call. Used by pthread_create.
447 _bsdthread_create(struct proc
*p
, user_addr_t user_func
, user_addr_t user_funcarg
, user_addr_t user_stack
, user_addr_t user_pthread
, uint32_t flags
, user_addr_t
*retval
)
453 mach_vm_offset_t stackaddr
;
454 mach_vm_size_t th_allocsize
= 0;
455 mach_vm_size_t th_guardsize
;
456 mach_vm_offset_t th_stack
;
457 mach_vm_offset_t th_pthread
;
458 mach_vm_offset_t th_tsd_base
;
459 mach_port_name_t th_thport
;
461 vm_map_t vmap
= pthread_kern
->current_map();
462 task_t ctask
= current_task();
463 unsigned int policy
, importance
;
468 if (pthread_kern
->proc_get_register(p
) == 0) {
472 PTHREAD_TRACE(TRACE_pthread_thread_create
| DBG_FUNC_START
, flags
, 0, 0, 0, 0);
474 isLP64
= proc_is64bit(p
);
475 th_guardsize
= vm_map_page_size(vmap
);
477 stackaddr
= pthread_kern
->proc_get_stack_addr_hint(p
);
478 kret
= pthread_kern
->thread_create(ctask
, &th
);
479 if (kret
!= KERN_SUCCESS
)
481 thread_reference(th
);
483 pthread_kern
->thread_set_tag(th
, THREAD_TAG_PTHREAD
);
485 sright
= (void *)pthread_kern
->convert_thread_to_port(th
);
486 th_thport
= pthread_kern
->ipc_port_copyout_send(sright
, pthread_kern
->task_get_ipcspace(ctask
));
487 if (!MACH_PORT_VALID(th_thport
)) {
488 error
= EMFILE
; // userland will convert this into a crash
492 if ((flags
& PTHREAD_START_CUSTOM
) == 0) {
493 mach_vm_size_t pthread_size
=
494 vm_map_round_page_mask(pthread_kern
->proc_get_pthsize(p
) + PTHREAD_T_OFFSET
, vm_map_page_mask(vmap
));
495 th_allocsize
= th_guardsize
+ user_stack
+ pthread_size
;
496 user_stack
+= PTHREAD_T_OFFSET
;
498 kret
= mach_vm_map(vmap
, &stackaddr
,
501 VM_MAKE_TAG(VM_MEMORY_STACK
)| VM_FLAGS_ANYWHERE
, NULL
,
502 0, FALSE
, VM_PROT_DEFAULT
, VM_PROT_ALL
,
504 if (kret
!= KERN_SUCCESS
){
505 kret
= mach_vm_allocate(vmap
,
506 &stackaddr
, th_allocsize
,
507 VM_MAKE_TAG(VM_MEMORY_STACK
)| VM_FLAGS_ANYWHERE
);
509 if (kret
!= KERN_SUCCESS
) {
514 PTHREAD_TRACE(TRACE_pthread_thread_create
|DBG_FUNC_NONE
, th_allocsize
, stackaddr
, 0, 2, 0);
518 * The guard page is at the lowest address
519 * The stack base is the highest address
521 kret
= mach_vm_protect(vmap
, stackaddr
, th_guardsize
, FALSE
, VM_PROT_NONE
);
523 if (kret
!= KERN_SUCCESS
) {
528 th_pthread
= stackaddr
+ th_guardsize
+ user_stack
;
529 th_stack
= th_pthread
;
532 * Pre-fault the first page of the new thread's stack and the page that will
533 * contain the pthread_t structure.
535 if (vm_map_trunc_page_mask((vm_map_offset_t
)(th_stack
- C_64_REDZONE_LEN
), vm_map_page_mask(vmap
)) !=
536 vm_map_trunc_page_mask((vm_map_offset_t
)th_pthread
, vm_map_page_mask(vmap
))){
538 vm_map_trunc_page_mask((vm_map_offset_t
)(th_stack
- C_64_REDZONE_LEN
), vm_map_page_mask(vmap
)),
539 VM_PROT_READ
| VM_PROT_WRITE
,
541 THREAD_UNINT
, NULL
, 0);
545 vm_map_trunc_page_mask((vm_map_offset_t
)th_pthread
, vm_map_page_mask(vmap
)),
546 VM_PROT_READ
| VM_PROT_WRITE
,
548 THREAD_UNINT
, NULL
, 0);
551 th_stack
= user_stack
;
552 th_pthread
= user_pthread
;
554 PTHREAD_TRACE(TRACE_pthread_thread_create
|DBG_FUNC_NONE
, 0, 0, 0, 3, 0);
557 tsd_offset
= pthread_kern
->proc_get_pthread_tsd_offset(p
);
559 th_tsd_base
= th_pthread
+ tsd_offset
;
560 kret
= pthread_kern
->thread_set_tsd_base(th
, th_tsd_base
);
561 if (kret
== KERN_SUCCESS
) {
562 flags
|= PTHREAD_START_TSD_BASE_SET
;
566 #if defined(__i386__) || defined(__x86_64__)
568 * Set up i386 registers & function call.
571 x86_thread_state32_t state
= {
572 .eip
= (unsigned int)pthread_kern
->proc_get_threadstart(p
),
573 .eax
= (unsigned int)th_pthread
,
574 .ebx
= (unsigned int)th_thport
,
575 .ecx
= (unsigned int)user_func
,
576 .edx
= (unsigned int)user_funcarg
,
577 .edi
= (unsigned int)user_stack
,
578 .esi
= (unsigned int)flags
,
582 .esp
= (int)((vm_offset_t
)(th_stack
-C_32_STK_ALIGN
))
585 error
= pthread_kern
->thread_set_wq_state32(th
, (thread_state_t
)&state
);
586 if (error
!= KERN_SUCCESS
) {
591 x86_thread_state64_t state64
= {
592 .rip
= (uint64_t)pthread_kern
->proc_get_threadstart(p
),
593 .rdi
= (uint64_t)th_pthread
,
594 .rsi
= (uint64_t)(th_thport
),
595 .rdx
= (uint64_t)user_func
,
596 .rcx
= (uint64_t)user_funcarg
,
597 .r8
= (uint64_t)user_stack
,
598 .r9
= (uint64_t)flags
,
600 * set stack pointer aligned to 16 byte boundary
602 .rsp
= (uint64_t)(th_stack
- C_64_REDZONE_LEN
)
605 error
= pthread_kern
->thread_set_wq_state64(th
, (thread_state_t
)&state64
);
606 if (error
!= KERN_SUCCESS
) {
612 #elif defined(__arm__)
613 arm_thread_state_t state
= {
614 .pc
= (int)pthread_kern
->proc_get_threadstart(p
),
615 .r
[0] = (unsigned int)th_pthread
,
616 .r
[1] = (unsigned int)th_thport
,
617 .r
[2] = (unsigned int)user_func
,
618 .r
[3] = (unsigned int)user_funcarg
,
619 .r
[4] = (unsigned int)user_stack
,
620 .r
[5] = (unsigned int)flags
,
622 /* Set r7 & lr to 0 for better back tracing */
629 .sp
= (int)((vm_offset_t
)(th_stack
-C_32_STK_ALIGN
))
632 (void) pthread_kern
->thread_set_wq_state32(th
, (thread_state_t
)&state
);
635 #error bsdthread_create not defined for this architecture
638 if ((flags
& PTHREAD_START_SETSCHED
) != 0) {
639 /* Set scheduling parameters if needed */
640 thread_extended_policy_data_t extinfo
;
641 thread_precedence_policy_data_t precedinfo
;
643 importance
= (flags
& PTHREAD_START_IMPORTANCE_MASK
);
644 policy
= (flags
>> PTHREAD_START_POLICY_BITSHIFT
) & PTHREAD_START_POLICY_MASK
;
646 if (policy
== SCHED_OTHER
) {
647 extinfo
.timeshare
= 1;
649 extinfo
.timeshare
= 0;
652 thread_policy_set(th
, THREAD_EXTENDED_POLICY
, (thread_policy_t
)&extinfo
, THREAD_EXTENDED_POLICY_COUNT
);
654 precedinfo
.importance
= (importance
- BASEPRI_DEFAULT
);
655 thread_policy_set(th
, THREAD_PRECEDENCE_POLICY
, (thread_policy_t
)&precedinfo
, THREAD_PRECEDENCE_POLICY_COUNT
);
656 } else if ((flags
& PTHREAD_START_QOSCLASS
) != 0) {
657 /* Set thread QoS class if requested. */
658 pthread_priority_t priority
= (pthread_priority_t
)(flags
& PTHREAD_START_QOSCLASS_MASK
);
660 thread_qos_policy_data_t qos
;
661 qos
.qos_tier
= pthread_priority_get_thread_qos(priority
);
662 qos
.tier_importance
= (qos
.qos_tier
== QOS_CLASS_UNSPECIFIED
) ? 0 :
663 _pthread_priority_get_relpri(priority
);
665 pthread_kern
->thread_policy_set_internal(th
, THREAD_QOS_POLICY
, (thread_policy_t
)&qos
, THREAD_QOS_POLICY_COUNT
);
668 if (pthread_kern
->proc_get_mach_thread_self_tsd_offset
) {
669 uint64_t mach_thread_self_offset
=
670 pthread_kern
->proc_get_mach_thread_self_tsd_offset(p
);
671 if (mach_thread_self_offset
&& tsd_offset
) {
672 bool proc64bit
= proc_is64bit(p
);
674 uint64_t th_thport_tsd
= (uint64_t)th_thport
;
675 error
= copyout(&th_thport_tsd
, th_pthread
+ tsd_offset
+
676 mach_thread_self_offset
, sizeof(th_thport_tsd
));
678 uint32_t th_thport_tsd
= (uint32_t)th_thport
;
679 error
= copyout(&th_thport_tsd
, th_pthread
+ tsd_offset
+
680 mach_thread_self_offset
, sizeof(th_thport_tsd
));
688 kret
= pthread_kern
->thread_resume(th
);
689 if (kret
!= KERN_SUCCESS
) {
693 thread_deallocate(th
); /* drop the creator reference */
695 PTHREAD_TRACE(TRACE_pthread_thread_create
|DBG_FUNC_END
, error
, th_pthread
, 0, 0, 0);
697 // cast required as mach_vm_offset_t is always 64 bits even on 32-bit platforms
698 *retval
= (user_addr_t
)th_pthread
;
703 if (allocated
!= 0) {
704 (void)mach_vm_deallocate(vmap
, stackaddr
, th_allocsize
);
707 (void)pthread_kern
->mach_port_deallocate(pthread_kern
->task_get_ipcspace(ctask
), th_thport
);
708 if (pthread_kern
->thread_will_park_or_terminate
) {
709 pthread_kern
->thread_will_park_or_terminate(th
);
711 (void)thread_terminate(th
);
712 (void)thread_deallocate(th
);
717 * bsdthread_terminate system call. Used by pthread_terminate
720 _bsdthread_terminate(__unused
struct proc
*p
,
721 user_addr_t stackaddr
,
725 __unused
int32_t *retval
)
727 mach_vm_offset_t freeaddr
;
728 mach_vm_size_t freesize
;
730 thread_t th
= current_thread();
732 freeaddr
= (mach_vm_offset_t
)stackaddr
;
735 PTHREAD_TRACE(TRACE_pthread_thread_terminate
|DBG_FUNC_START
, freeaddr
, freesize
, kthport
, 0xff, 0);
737 if ((freesize
!= (mach_vm_size_t
)0) && (freeaddr
!= (mach_vm_offset_t
)0)) {
738 if (pthread_kern
->thread_get_tag(th
) & THREAD_TAG_MAINTHREAD
){
739 vm_map_t user_map
= pthread_kern
->current_map();
740 freesize
= vm_map_trunc_page_mask((vm_map_offset_t
)freesize
- 1, vm_map_page_mask(user_map
));
741 kret
= mach_vm_behavior_set(user_map
, freeaddr
, freesize
, VM_BEHAVIOR_REUSABLE
);
742 assert(kret
== KERN_SUCCESS
|| kret
== KERN_INVALID_ADDRESS
);
743 kret
= kret
? kret
: mach_vm_protect(user_map
, freeaddr
, freesize
, FALSE
, VM_PROT_NONE
);
744 assert(kret
== KERN_SUCCESS
|| kret
== KERN_INVALID_ADDRESS
);
746 kret
= mach_vm_deallocate(pthread_kern
->current_map(), freeaddr
, freesize
);
747 if (kret
!= KERN_SUCCESS
) {
748 PTHREAD_TRACE(TRACE_pthread_thread_terminate
|DBG_FUNC_END
, kret
, 0, 0, 0, 0);
754 if (pthread_kern
->thread_will_park_or_terminate
) {
755 pthread_kern
->thread_will_park_or_terminate(th
);
757 (void)thread_terminate(th
);
758 if (sem
!= MACH_PORT_NULL
) {
759 kret
= pthread_kern
->semaphore_signal_internal_trap(sem
);
760 if (kret
!= KERN_SUCCESS
) {
761 PTHREAD_TRACE(TRACE_pthread_thread_terminate
|DBG_FUNC_END
, kret
, 0, 0, 0, 0);
766 if (kthport
!= MACH_PORT_NULL
) {
767 pthread_kern
->mach_port_deallocate(pthread_kern
->task_get_ipcspace(current_task()), kthport
);
770 PTHREAD_TRACE(TRACE_pthread_thread_terminate
|DBG_FUNC_END
, 0, 0, 0, 0, 0);
772 pthread_kern
->thread_exception_return();
773 panic("bsdthread_terminate: still running\n");
775 PTHREAD_TRACE(TRACE_pthread_thread_terminate
|DBG_FUNC_END
, 0, 0xff, 0, 0, 0);
781 * bsdthread_register system call. Performs per-process setup. Responsible for
782 * returning capability bits to userspace and receiving userspace function addresses.
785 _bsdthread_register(struct proc
*p
,
786 user_addr_t threadstart
,
787 user_addr_t wqthread
,
789 user_addr_t pthread_init_data
,
790 user_addr_t pthread_init_data_size
,
791 uint64_t dispatchqueue_offset
,
794 struct _pthread_registration_data data
= {};
795 uint32_t max_tsd_offset
;
797 size_t pthread_init_sz
= 0;
799 /* syscall randomizer test can pass bogus values */
800 if (pthsize
< 0 || pthsize
> MAX_PTHREAD_SIZE
) {
804 * if we have pthread_init_data, then we use that and target_concptr
805 * (which is an offset) to get data.
807 if (pthread_init_data
!= 0) {
808 if (pthread_init_data_size
< sizeof(data
.version
)) {
811 pthread_init_sz
= MIN(sizeof(data
), (size_t)pthread_init_data_size
);
812 int ret
= copyin(pthread_init_data
, &data
, pthread_init_sz
);
816 if (data
.version
!= (size_t)pthread_init_data_size
) {
820 data
.dispatch_queue_offset
= dispatchqueue_offset
;
823 /* We have to do this before proc_get_register so that it resets after fork */
824 mach_vm_offset_t stackaddr
= stack_addr_hint(p
, pthread_kern
->current_map());
825 pthread_kern
->proc_set_stack_addr_hint(p
, (user_addr_t
)stackaddr
);
827 /* prevent multiple registrations */
828 if (pthread_kern
->proc_get_register(p
) != 0) {
832 pthread_kern
->proc_set_threadstart(p
, threadstart
);
833 pthread_kern
->proc_set_wqthread(p
, wqthread
);
834 pthread_kern
->proc_set_pthsize(p
, pthsize
);
835 pthread_kern
->proc_set_register(p
);
837 uint32_t tsd_slot_sz
= proc_is64bit(p
) ? sizeof(uint64_t) : sizeof(uint32_t);
838 if ((uint32_t)pthsize
>= tsd_slot_sz
&&
839 data
.tsd_offset
<= (uint32_t)(pthsize
- tsd_slot_sz
)) {
840 max_tsd_offset
= ((uint32_t)pthsize
- data
.tsd_offset
- tsd_slot_sz
);
845 pthread_kern
->proc_set_pthread_tsd_offset(p
, data
.tsd_offset
);
847 if (data
.dispatch_queue_offset
> max_tsd_offset
) {
848 data
.dispatch_queue_offset
= 0;
850 pthread_kern
->proc_set_dispatchqueue_offset(p
, data
.dispatch_queue_offset
);
852 if (pthread_kern
->proc_set_return_to_kernel_offset
) {
853 if (data
.return_to_kernel_offset
> max_tsd_offset
) {
854 data
.return_to_kernel_offset
= 0;
856 pthread_kern
->proc_set_return_to_kernel_offset(p
,
857 data
.return_to_kernel_offset
);
860 if (pthread_kern
->proc_set_mach_thread_self_tsd_offset
) {
861 if (data
.mach_thread_self_offset
> max_tsd_offset
) {
862 data
.mach_thread_self_offset
= 0;
864 pthread_kern
->proc_set_mach_thread_self_tsd_offset(p
,
865 data
.mach_thread_self_offset
);
868 if (pthread_init_data
!= 0) {
869 /* Outgoing data that userspace expects as a reply */
870 data
.version
= sizeof(struct _pthread_registration_data
);
871 if (pthread_kern
->qos_main_thread_active()) {
872 mach_msg_type_number_t nqos
= THREAD_QOS_POLICY_COUNT
;
873 thread_qos_policy_data_t qos
;
874 boolean_t gd
= FALSE
;
876 kr
= pthread_kern
->thread_policy_get(current_thread(), THREAD_QOS_POLICY
, (thread_policy_t
)&qos
, &nqos
, &gd
);
877 if (kr
!= KERN_SUCCESS
|| qos
.qos_tier
== THREAD_QOS_UNSPECIFIED
) {
878 /* An unspecified QoS means the kernel wants us to impose the legacy QoS class on the thread. */
879 qos
.qos_tier
= THREAD_QOS_LEGACY
;
880 qos
.tier_importance
= 0;
882 kr
= pthread_kern
->thread_policy_set_internal(current_thread(), THREAD_QOS_POLICY
, (thread_policy_t
)&qos
, THREAD_QOS_POLICY_COUNT
);
885 if (kr
== KERN_SUCCESS
) {
886 data
.main_qos
= thread_qos_get_pthread_priority(qos
.qos_tier
);
888 data
.main_qos
= _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED
, 0, 0);
891 data
.main_qos
= _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED
, 0, 0);
894 kr
= copyout(&data
, pthread_init_data
, pthread_init_sz
);
895 if (kr
!= KERN_SUCCESS
) {
900 /* return the supported feature set as the return value. */
901 *retval
= PTHREAD_FEATURE_SUPPORTED
;
906 #pragma mark - QoS Manipulation
909 _bsdthread_ctl_set_qos(struct proc
*p
, user_addr_t __unused cmd
, mach_port_name_t kport
, user_addr_t tsd_priority_addr
, user_addr_t arg3
, int *retval
)
914 pthread_priority_t priority
;
916 /* Unused parameters must be zero. */
921 /* QoS is stored in a given slot in the pthread TSD. We need to copy that in and set our QoS based on it. */
922 if (proc_is64bit(p
)) {
924 rv
= copyin(tsd_priority_addr
, &v
, sizeof(v
));
926 priority
= (int)(v
& 0xffffffff);
929 rv
= copyin(tsd_priority_addr
, &v
, sizeof(v
));
934 if ((th
= port_name_to_thread(kport
)) == THREAD_NULL
) {
938 /* <rdar://problem/16211829> Disable pthread_set_qos_class_np() on threads other than pthread_self */
939 if (th
!= current_thread()) {
940 thread_deallocate(th
);
944 rv
= _bsdthread_ctl_set_self(p
, 0, priority
, 0, _PTHREAD_SET_SELF_QOS_FLAG
, retval
);
946 /* Static param the thread; we just set QoS on it, so it's stuck in QoS land now. */
947 /* pthread_kern->thread_static_param(th, TRUE); */ // see <rdar://problem/16433744>, for details
949 thread_deallocate(th
);
955 static inline struct threadlist
*
956 util_get_thread_threadlist_entry(thread_t th
)
958 struct uthread
*uth
= pthread_kern
->get_bsdthread_info(th
);
960 struct threadlist
*tl
= pthread_kern
->uthread_get_threadlist(uth
);
967 _workq_thread_has_been_unbound(thread_t th
, int qos_class
)
969 struct threadlist
*tl
= util_get_thread_threadlist_entry(th
);
974 struct workqueue
*wq
= tl
->th_workq
;
975 workqueue_lock_spin(wq
);
977 if (tl
->th_priority
== WORKQUEUE_EVENT_MANAGER_BUCKET
) {
979 } else if (qos_class
!= class_index_get_thread_qos(tl
->th_priority
)) {
983 if ((tl
->th_flags
& TH_LIST_KEVENT_BOUND
)){
986 tl
->th_flags
&= ~TH_LIST_KEVENT_BOUND
;
988 workqueue_unlock(wq
);
992 workqueue_unlock(wq
);
997 _bsdthread_ctl_set_self(struct proc
*p
, user_addr_t __unused cmd
, pthread_priority_t priority
, mach_port_name_t voucher
, _pthread_set_flags_t flags
, int __unused
*retval
)
999 thread_qos_policy_data_t qos
;
1000 mach_msg_type_number_t nqos
= THREAD_QOS_POLICY_COUNT
;
1001 boolean_t gd
= FALSE
;
1002 thread_t th
= current_thread();
1003 struct workqueue
*wq
= NULL
;
1004 struct threadlist
*tl
= NULL
;
1007 int qos_rv
= 0, voucher_rv
= 0, fixedpri_rv
= 0;
1009 if ((flags
& _PTHREAD_SET_SELF_WQ_KEVENT_UNBIND
) != 0) {
1010 tl
= util_get_thread_threadlist_entry(th
);
1017 workqueue_lock_spin(wq
);
1018 if (tl
->th_flags
& TH_LIST_KEVENT_BOUND
) {
1019 tl
->th_flags
&= ~TH_LIST_KEVENT_BOUND
;
1020 unsigned int kevent_flags
= KEVENT_FLAG_WORKQ
| KEVENT_FLAG_UNBIND_CHECK_FLAGS
;
1021 if (tl
->th_priority
== WORKQUEUE_EVENT_MANAGER_BUCKET
) {
1022 kevent_flags
|= KEVENT_FLAG_WORKQ_MANAGER
;
1025 workqueue_unlock(wq
);
1026 __assert_only
int ret
= kevent_qos_internal_unbind(p
, class_index_get_thread_qos(tl
->th_priority
), th
, kevent_flags
);
1029 workqueue_unlock(wq
);
1034 if ((flags
& _PTHREAD_SET_SELF_QOS_FLAG
) != 0) {
1035 kr
= pthread_kern
->thread_policy_get(th
, THREAD_QOS_POLICY
, (thread_policy_t
)&qos
, &nqos
, &gd
);
1036 if (kr
!= KERN_SUCCESS
) {
1042 * If we have main-thread QoS then we don't allow a thread to come out
1043 * of QOS_CLASS_UNSPECIFIED.
1045 if (pthread_kern
->qos_main_thread_active() && qos
.qos_tier
==
1046 THREAD_QOS_UNSPECIFIED
) {
1052 tl
= util_get_thread_threadlist_entry(th
);
1053 if (tl
) wq
= tl
->th_workq
;
1056 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self
| DBG_FUNC_START
, wq
, qos
.qos_tier
, qos
.tier_importance
, 0, 0);
1058 qos
.qos_tier
= pthread_priority_get_thread_qos(priority
);
1059 qos
.tier_importance
= (qos
.qos_tier
== QOS_CLASS_UNSPECIFIED
) ? 0 : _pthread_priority_get_relpri(priority
);
1061 if (qos
.qos_tier
== QOS_CLASS_UNSPECIFIED
||
1062 qos
.tier_importance
> 0 || qos
.tier_importance
< THREAD_QOS_MIN_TIER_IMPORTANCE
) {
1068 * If we're a workqueue thread, the threadlist item priority needs adjusting,
1069 * along with the bucket we were running in.
1072 bool try_run_threadreq
= false;
1074 workqueue_lock_spin(wq
);
1075 kr
= pthread_kern
->thread_set_workq_qos(th
, qos
.qos_tier
, qos
.tier_importance
);
1076 assert(kr
== KERN_SUCCESS
|| kr
== KERN_TERMINATED
);
1078 /* Fix up counters. */
1079 uint8_t old_bucket
= tl
->th_priority
;
1080 uint8_t new_bucket
= pthread_priority_get_class_index(priority
);
1082 if (old_bucket
!= new_bucket
) {
1083 _wq_thactive_move(wq
, old_bucket
, new_bucket
);
1084 wq
->wq_thscheduled_count
[old_bucket
]--;
1085 wq
->wq_thscheduled_count
[new_bucket
]++;
1086 if (old_bucket
== WORKQUEUE_EVENT_MANAGER_BUCKET
||
1087 old_bucket
< new_bucket
) {
1089 * if the QoS of the thread was lowered, then this could
1090 * allow for a higher QoS thread request to run, so we need
1093 try_run_threadreq
= true;
1095 tl
->th_priority
= new_bucket
;
1098 bool old_overcommit
= !(tl
->th_flags
& TH_LIST_CONSTRAINED
);
1099 bool new_overcommit
= priority
& _PTHREAD_PRIORITY_OVERCOMMIT_FLAG
;
1100 if (!old_overcommit
&& new_overcommit
) {
1101 if (wq
->wq_constrained_threads_scheduled
-- ==
1102 wq_max_constrained_threads
) {
1103 try_run_threadreq
= true;
1105 tl
->th_flags
&= ~TH_LIST_CONSTRAINED
;
1106 } else if (old_overcommit
&& !new_overcommit
) {
1107 wq
->wq_constrained_threads_scheduled
++;
1108 tl
->th_flags
|= TH_LIST_CONSTRAINED
;
1111 if (try_run_threadreq
) {
1112 workqueue_run_threadreq_and_unlock(p
, wq
, NULL
, NULL
, true);
1114 workqueue_unlock(wq
);
1117 kr
= pthread_kern
->thread_policy_set_internal(th
, THREAD_QOS_POLICY
, (thread_policy_t
)&qos
, THREAD_QOS_POLICY_COUNT
);
1118 if (kr
!= KERN_SUCCESS
) {
1123 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self
| DBG_FUNC_END
, wq
, qos
.qos_tier
, qos
.tier_importance
, 0, 0);
1127 if ((flags
& _PTHREAD_SET_SELF_VOUCHER_FLAG
) != 0) {
1128 kr
= pthread_kern
->thread_set_voucher_name(voucher
);
1129 if (kr
!= KERN_SUCCESS
) {
1130 voucher_rv
= ENOENT
;
1136 if (qos_rv
) goto done
;
1137 if ((flags
& _PTHREAD_SET_SELF_FIXEDPRIORITY_FLAG
) != 0) {
1138 thread_extended_policy_data_t extpol
= {.timeshare
= 0};
1140 if (!tl
) tl
= util_get_thread_threadlist_entry(th
);
1142 /* Not allowed on workqueue threads */
1143 fixedpri_rv
= ENOTSUP
;
1147 kr
= pthread_kern
->thread_policy_set_internal(th
, THREAD_EXTENDED_POLICY
, (thread_policy_t
)&extpol
, THREAD_EXTENDED_POLICY_COUNT
);
1148 if (kr
!= KERN_SUCCESS
) {
1149 fixedpri_rv
= EINVAL
;
1152 } else if ((flags
& _PTHREAD_SET_SELF_TIMESHARE_FLAG
) != 0) {
1153 thread_extended_policy_data_t extpol
= {.timeshare
= 1};
1155 if (!tl
) tl
= util_get_thread_threadlist_entry(th
);
1157 /* Not allowed on workqueue threads */
1158 fixedpri_rv
= ENOTSUP
;
1162 kr
= pthread_kern
->thread_policy_set_internal(th
, THREAD_EXTENDED_POLICY
, (thread_policy_t
)&extpol
, THREAD_EXTENDED_POLICY_COUNT
);
1163 if (kr
!= KERN_SUCCESS
) {
1164 fixedpri_rv
= EINVAL
;
1170 if (qos_rv
&& voucher_rv
) {
1171 /* Both failed, give that a unique error. */
1191 _bsdthread_ctl_qos_override_start(struct proc __unused
*p
, user_addr_t __unused cmd
, mach_port_name_t kport
, pthread_priority_t priority
, user_addr_t resource
, int __unused
*retval
)
1196 if ((th
= port_name_to_thread(kport
)) == THREAD_NULL
) {
1200 int override_qos
= pthread_priority_get_thread_qos(priority
);
1202 struct threadlist
*tl
= util_get_thread_threadlist_entry(th
);
1204 PTHREAD_TRACE_WQ(TRACE_wq_override_start
| DBG_FUNC_NONE
, tl
->th_workq
, thread_tid(th
), 1, priority
, 0);
1207 /* The only failure case here is if we pass a tid and have it look up the thread; since we pass the uthread, this always succeeds. */
1208 pthread_kern
->proc_usynch_thread_qos_add_override_for_resource_check_owner(th
, override_qos
, TRUE
,
1209 resource
, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE
, USER_ADDR_NULL
, MACH_PORT_NULL
);
1210 thread_deallocate(th
);
1215 _bsdthread_ctl_qos_override_end(struct proc __unused
*p
, user_addr_t __unused cmd
, mach_port_name_t kport
, user_addr_t resource
, user_addr_t arg3
, int __unused
*retval
)
1224 if ((th
= port_name_to_thread(kport
)) == THREAD_NULL
) {
1228 struct uthread
*uth
= pthread_kern
->get_bsdthread_info(th
);
1230 struct threadlist
*tl
= util_get_thread_threadlist_entry(th
);
1232 PTHREAD_TRACE_WQ(TRACE_wq_override_end
| DBG_FUNC_NONE
, tl
->th_workq
, thread_tid(th
), 0, 0, 0);
1235 pthread_kern
->proc_usynch_thread_qos_remove_override_for_resource(current_task(), uth
, 0, resource
, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE
);
1237 thread_deallocate(th
);
1242 _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(mach_port_name_t kport
, pthread_priority_t priority
, user_addr_t resource
, user_addr_t ulock_addr
)
1247 if ((th
= port_name_to_thread(kport
)) == THREAD_NULL
) {
1251 int override_qos
= pthread_priority_get_thread_qos(priority
);
1253 struct threadlist
*tl
= util_get_thread_threadlist_entry(th
);
1255 thread_deallocate(th
);
1259 PTHREAD_TRACE_WQ(TRACE_wq_override_dispatch
| DBG_FUNC_NONE
, tl
->th_workq
, thread_tid(th
), 1, priority
, 0);
1261 rv
= pthread_kern
->proc_usynch_thread_qos_add_override_for_resource_check_owner(th
, override_qos
, TRUE
,
1262 resource
, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE
, ulock_addr
, kport
);
1264 thread_deallocate(th
);
1268 int _bsdthread_ctl_qos_dispatch_asynchronous_override_add(struct proc __unused
*p
, user_addr_t __unused cmd
,
1269 mach_port_name_t kport
, pthread_priority_t priority
, user_addr_t resource
, int __unused
*retval
)
1271 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport
, priority
, resource
, USER_ADDR_NULL
);
1275 _bsdthread_ctl_qos_override_dispatch(struct proc
*p __unused
, user_addr_t cmd __unused
, mach_port_name_t kport
, pthread_priority_t priority
, user_addr_t ulock_addr
, int __unused
*retval
)
1277 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport
, priority
, USER_ADDR_NULL
, ulock_addr
);
1281 _bsdthread_ctl_qos_override_reset(struct proc
*p
, user_addr_t cmd
, user_addr_t arg1
, user_addr_t arg2
, user_addr_t arg3
, int *retval
)
1283 if (arg1
!= 0 || arg2
!= 0 || arg3
!= 0) {
1287 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p
, cmd
, 1 /* reset_all */, 0, 0, retval
);
1291 _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(struct proc __unused
*p
, user_addr_t __unused cmd
, int reset_all
, user_addr_t resource
, user_addr_t arg3
, int __unused
*retval
)
1293 if ((reset_all
&& (resource
!= 0)) || arg3
!= 0) {
1297 thread_t th
= current_thread();
1298 struct uthread
*uth
= pthread_kern
->get_bsdthread_info(th
);
1299 struct threadlist
*tl
= pthread_kern
->uthread_get_threadlist(uth
);
1305 PTHREAD_TRACE_WQ(TRACE_wq_override_reset
| DBG_FUNC_NONE
, tl
->th_workq
, 0, 0, 0, 0);
1307 resource
= reset_all
? THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD
: resource
;
1308 pthread_kern
->proc_usynch_thread_qos_reset_override_for_resource(current_task(), uth
, 0, resource
, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE
);
1314 _bsdthread_ctl_max_parallelism(struct proc __unused
*p
, user_addr_t __unused cmd
,
1315 int qos
, unsigned long flags
, int *retval
)
1317 _Static_assert(QOS_PARALLELISM_COUNT_LOGICAL
==
1318 _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL
, "logical");
1319 _Static_assert(QOS_PARALLELISM_REALTIME
==
1320 _PTHREAD_QOS_PARALLELISM_REALTIME
, "realtime");
1322 if (flags
& ~(QOS_PARALLELISM_REALTIME
| QOS_PARALLELISM_COUNT_LOGICAL
)) {
1326 if (flags
& QOS_PARALLELISM_REALTIME
) {
1330 } else if (qos
== THREAD_QOS_UNSPECIFIED
|| qos
>= THREAD_QOS_LAST
) {
1334 *retval
= pthread_kern
->qos_max_parallelism(qos
, flags
);
1339 _bsdthread_ctl(struct proc
*p
, user_addr_t cmd
, user_addr_t arg1
, user_addr_t arg2
, user_addr_t arg3
, int *retval
)
1342 case BSDTHREAD_CTL_SET_QOS
:
1343 return _bsdthread_ctl_set_qos(p
, cmd
, (mach_port_name_t
)arg1
, arg2
, arg3
, retval
);
1344 case BSDTHREAD_CTL_QOS_OVERRIDE_START
:
1345 return _bsdthread_ctl_qos_override_start(p
, cmd
, (mach_port_name_t
)arg1
, (pthread_priority_t
)arg2
, arg3
, retval
);
1346 case BSDTHREAD_CTL_QOS_OVERRIDE_END
:
1347 return _bsdthread_ctl_qos_override_end(p
, cmd
, (mach_port_name_t
)arg1
, arg2
, arg3
, retval
);
1348 case BSDTHREAD_CTL_QOS_OVERRIDE_RESET
:
1349 return _bsdthread_ctl_qos_override_reset(p
, cmd
, arg1
, arg2
, arg3
, retval
);
1350 case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH
:
1351 return _bsdthread_ctl_qos_override_dispatch(p
, cmd
, (mach_port_name_t
)arg1
, (pthread_priority_t
)arg2
, arg3
, retval
);
1352 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD
:
1353 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add(p
, cmd
, (mach_port_name_t
)arg1
, (pthread_priority_t
)arg2
, arg3
, retval
);
1354 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET
:
1355 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p
, cmd
, (int)arg1
, arg2
, arg3
, retval
);
1356 case BSDTHREAD_CTL_SET_SELF
:
1357 return _bsdthread_ctl_set_self(p
, cmd
, (pthread_priority_t
)arg1
, (mach_port_name_t
)arg2
, (_pthread_set_flags_t
)arg3
, retval
);
1358 case BSDTHREAD_CTL_QOS_MAX_PARALLELISM
:
1359 return _bsdthread_ctl_max_parallelism(p
, cmd
, (int)arg1
, (unsigned long)arg2
, retval
);
1365 #pragma mark - Workqueue Implementation
1367 #pragma mark wq_flags
1369 static inline uint32_t
1370 _wq_flags(struct workqueue
*wq
)
1372 return atomic_load_explicit(&wq
->wq_flags
, memory_order_relaxed
);
1376 _wq_exiting(struct workqueue
*wq
)
1378 return _wq_flags(wq
) & WQ_EXITING
;
1381 static inline uint32_t
1382 _wq_flags_or_orig(struct workqueue
*wq
, uint32_t v
)
1384 #if PTHREAD_INLINE_RMW_ATOMICS
1387 state
= _wq_flags(wq
);
1388 } while (!OSCompareAndSwap(state
, state
| v
, &wq
->wq_flags
));
1391 return atomic_fetch_or_explicit(&wq
->wq_flags
, v
, memory_order_relaxed
);
1395 static inline uint32_t
1396 _wq_flags_and_orig(struct workqueue
*wq
, uint32_t v
)
1398 #if PTHREAD_INLINE_RMW_ATOMICS
1401 state
= _wq_flags(wq
);
1402 } while (!OSCompareAndSwap(state
, state
& v
, &wq
->wq_flags
));
1405 return atomic_fetch_and_explicit(&wq
->wq_flags
, v
, memory_order_relaxed
);
1410 WQ_TIMER_DELAYED_NEEDED(struct workqueue
*wq
)
1412 uint32_t oldflags
, newflags
;
1414 oldflags
= _wq_flags(wq
);
1415 if (oldflags
& (WQ_EXITING
| WQ_ATIMER_DELAYED_RUNNING
)) {
1418 newflags
= oldflags
| WQ_ATIMER_DELAYED_RUNNING
;
1419 } while (!OSCompareAndSwap(oldflags
, newflags
, &wq
->wq_flags
));
1424 WQ_TIMER_IMMEDIATE_NEEDED(struct workqueue
*wq
)
1426 uint32_t oldflags
, newflags
;
1428 oldflags
= _wq_flags(wq
);
1429 if (oldflags
& (WQ_EXITING
| WQ_ATIMER_IMMEDIATE_RUNNING
)) {
1432 newflags
= oldflags
| WQ_ATIMER_IMMEDIATE_RUNNING
;
1433 } while (!OSCompareAndSwap(oldflags
, newflags
, &wq
->wq_flags
));
1437 #pragma mark thread requests pacing
1439 static inline uint32_t
1440 _wq_pacing_shift_for_pri(int pri
)
1442 return _wq_bucket_to_thread_qos(pri
) - 1;
1446 _wq_highest_paced_priority(struct workqueue
*wq
)
1448 uint8_t paced
= wq
->wq_paced
;
1449 int msb
= paced
? 32 - __builtin_clz(paced
) : 0; // fls(paced) == bit + 1
1450 return WORKQUEUE_EVENT_MANAGER_BUCKET
- msb
;
1453 static inline uint8_t
1454 _wq_pacing_bit_for_pri(int pri
)
1456 return 1u << _wq_pacing_shift_for_pri(pri
);
1460 _wq_should_pace_priority(struct workqueue
*wq
, int pri
)
1462 return wq
->wq_paced
>= _wq_pacing_bit_for_pri(pri
);
1466 _wq_pacing_start(struct workqueue
*wq
, struct threadlist
*tl
)
1468 uint8_t bit
= _wq_pacing_bit_for_pri(tl
->th_priority
);
1469 assert((tl
->th_flags
& TH_LIST_PACING
) == 0);
1470 assert((wq
->wq_paced
& bit
) == 0);
1471 wq
->wq_paced
|= bit
;
1472 tl
->th_flags
|= TH_LIST_PACING
;
1476 _wq_pacing_end(struct workqueue
*wq
, struct threadlist
*tl
)
1478 if (tl
->th_flags
& TH_LIST_PACING
) {
1479 uint8_t bit
= _wq_pacing_bit_for_pri(tl
->th_priority
);
1480 assert((wq
->wq_paced
& bit
) != 0);
1481 wq
->wq_paced
^= bit
;
1482 tl
->th_flags
&= ~TH_LIST_PACING
;
1483 return wq
->wq_paced
< bit
; // !_wq_should_pace_priority
1488 #pragma mark thread requests
1491 _threadreq_init_alloced(struct threadreq
*req
, int priority
, int flags
)
1493 assert((flags
& TR_FLAG_ONSTACK
) == 0);
1494 req
->tr_state
= TR_STATE_NEW
;
1495 req
->tr_priority
= priority
;
1496 req
->tr_flags
= flags
;
1500 _threadreq_init_stack(struct threadreq
*req
, int priority
, int flags
)
1502 req
->tr_state
= TR_STATE_NEW
;
1503 req
->tr_priority
= priority
;
1504 req
->tr_flags
= flags
| TR_FLAG_ONSTACK
;
1508 _threadreq_copy_prepare(struct workqueue
*wq
)
1511 if (wq
->wq_cached_threadreq
) {
1515 workqueue_unlock(wq
);
1516 struct threadreq
*req
= zalloc(pthread_zone_threadreq
);
1517 workqueue_lock_spin(wq
);
1519 if (wq
->wq_cached_threadreq
) {
1521 * We lost the race and someone left behind an extra threadreq for us
1522 * to use. Throw away our request and retry.
1524 workqueue_unlock(wq
);
1525 zfree(pthread_zone_threadreq
, req
);
1526 workqueue_lock_spin(wq
);
1529 wq
->wq_cached_threadreq
= req
;
1532 assert(wq
->wq_cached_threadreq
);
1536 _threadreq_copy_prepare_noblock(struct workqueue
*wq
)
1538 if (wq
->wq_cached_threadreq
) {
1542 wq
->wq_cached_threadreq
= zalloc_noblock(pthread_zone_threadreq
);
1544 return wq
->wq_cached_threadreq
!= NULL
;
1547 static inline struct threadreq_head
*
1548 _threadreq_list_for_req(struct workqueue
*wq
, const struct threadreq
*req
)
1550 if (req
->tr_flags
& TR_FLAG_OVERCOMMIT
) {
1551 return &wq
->wq_overcommit_reqlist
[req
->tr_priority
];
1553 return &wq
->wq_reqlist
[req
->tr_priority
];
1558 _threadreq_enqueue(struct workqueue
*wq
, struct threadreq
*req
)
1560 assert(req
&& req
->tr_state
== TR_STATE_NEW
);
1561 if (req
->tr_priority
== WORKQUEUE_EVENT_MANAGER_BUCKET
) {
1562 assert(wq
->wq_event_manager_threadreq
.tr_state
!= TR_STATE_WAITING
);
1563 memcpy(&wq
->wq_event_manager_threadreq
, req
, sizeof(struct threadreq
));
1564 req
= &wq
->wq_event_manager_threadreq
;
1565 req
->tr_flags
&= ~(TR_FLAG_ONSTACK
| TR_FLAG_NO_PACING
);
1567 if (req
->tr_flags
& TR_FLAG_ONSTACK
) {
1568 assert(wq
->wq_cached_threadreq
);
1569 struct threadreq
*newreq
= wq
->wq_cached_threadreq
;
1570 wq
->wq_cached_threadreq
= NULL
;
1572 memcpy(newreq
, req
, sizeof(struct threadreq
));
1573 newreq
->tr_flags
&= ~(TR_FLAG_ONSTACK
| TR_FLAG_NO_PACING
);
1574 req
->tr_state
= TR_STATE_DEAD
;
1577 TAILQ_INSERT_TAIL(_threadreq_list_for_req(wq
, req
), req
, tr_entry
);
1579 req
->tr_state
= TR_STATE_WAITING
;
1584 _threadreq_dequeue(struct workqueue
*wq
, struct threadreq
*req
)
1586 if (req
->tr_priority
!= WORKQUEUE_EVENT_MANAGER_BUCKET
) {
1587 struct threadreq_head
*req_list
= _threadreq_list_for_req(wq
, req
);
1589 struct threadreq
*cursor
= NULL
;
1590 TAILQ_FOREACH(cursor
, req_list
, tr_entry
) {
1591 if (cursor
== req
) break;
1593 assert(cursor
== req
);
1595 TAILQ_REMOVE(req_list
, req
, tr_entry
);
1601 * Mark a thread request as complete. At this point, it is treated as owned by
1602 * the submitting subsystem and you should assume it could be freed.
1604 * Called with the workqueue lock held.
1607 _threadreq_complete_and_unlock(proc_t p
, struct workqueue
*wq
,
1608 struct threadreq
*req
, struct threadlist
*tl
)
1610 struct threadreq
*req_tofree
= NULL
;
1611 bool sync
= (req
->tr_state
== TR_STATE_NEW
);
1612 bool workloop
= req
->tr_flags
& TR_FLAG_WORKLOOP
;
1613 bool onstack
= req
->tr_flags
& TR_FLAG_ONSTACK
;
1614 bool kevent
= req
->tr_flags
& TR_FLAG_KEVENT
;
1615 bool unbinding
= tl
->th_flags
& TH_LIST_UNBINDING
;
1617 bool waking_parked_thread
= (tl
->th_flags
& TH_LIST_BUSY
);
1620 req
->tr_state
= TR_STATE_COMPLETE
;
1622 if (!workloop
&& !onstack
&& req
!= &wq
->wq_event_manager_threadreq
) {
1623 if (wq
->wq_cached_threadreq
) {
1626 wq
->wq_cached_threadreq
= req
;
1630 if (tl
->th_flags
& TH_LIST_UNBINDING
) {
1631 tl
->th_flags
&= ~TH_LIST_UNBINDING
;
1632 assert((tl
->th_flags
& TH_LIST_KEVENT_BOUND
));
1633 } else if (workloop
|| kevent
) {
1634 assert((tl
->th_flags
& TH_LIST_KEVENT_BOUND
) == 0);
1635 tl
->th_flags
|= TH_LIST_KEVENT_BOUND
;
1639 workqueue_unlock(wq
);
1640 ret
= pthread_kern
->workloop_fulfill_threadreq(wq
->wq_proc
, (void*)req
,
1641 tl
->th_thread
, sync
? WORKLOOP_FULFILL_THREADREQ_SYNC
: 0);
1644 } else if (kevent
) {
1645 unsigned int kevent_flags
= KEVENT_FLAG_WORKQ
;
1647 kevent_flags
|= KEVENT_FLAG_SYNCHRONOUS_BIND
;
1649 if (tl
->th_priority
== WORKQUEUE_EVENT_MANAGER_BUCKET
) {
1650 kevent_flags
|= KEVENT_FLAG_WORKQ_MANAGER
;
1652 workqueue_unlock(wq
);
1653 ret
= kevent_qos_internal_bind(wq
->wq_proc
,
1654 class_index_get_thread_qos(tl
->th_priority
), tl
->th_thread
,
1657 workqueue_lock_spin(wq
);
1658 tl
->th_flags
&= ~TH_LIST_KEVENT_BOUND
;
1668 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq
| DBG_FUNC_END
, wq
, 0, 0, 0, 0);
1669 PTHREAD_TRACE_WQ_REQ(TRACE_wq_runitem
| DBG_FUNC_START
, wq
, req
, tl
->th_priority
,
1670 thread_tid(current_thread()), thread_tid(tl
->th_thread
));
1672 if (waking_parked_thread
) {
1674 workqueue_lock_spin(wq
);
1676 tl
->th_flags
&= ~(TH_LIST_BUSY
);
1677 if ((tl
->th_flags
& TH_LIST_REMOVING_VOUCHER
) == 0) {
1679 * If the thread is in the process of removing its voucher, then it
1680 * isn't actually in the wait event yet and we don't need to wake
1681 * it up. Save the trouble (and potential lock-ordering issues
1684 thread_wakeup_thread(tl
, tl
->th_thread
);
1686 workqueue_unlock(wq
);
1688 if (req_tofree
) zfree(pthread_zone_threadreq
, req_tofree
);
1689 return WQ_RUN_TR_THREAD_STARTED
;
1692 assert ((tl
->th_flags
& TH_LIST_PACING
) == 0);
1694 workqueue_unlock(wq
);
1696 if (req_tofree
) zfree(pthread_zone_threadreq
, req_tofree
);
1698 return WQ_RUN_TR_THREAD_STARTED
;
1700 _setup_wqthread(p
, tl
->th_thread
, wq
, tl
, WQ_SETUP_CLEAR_VOUCHER
);
1701 pthread_kern
->unix_syscall_return(EJUSTRETURN
);
1702 __builtin_unreachable();
1706 * Mark a thread request as cancelled. Has similar ownership semantics to the
1707 * complete call above.
1710 _threadreq_cancel(struct workqueue
*wq
, struct threadreq
*req
)
1712 assert(req
->tr_state
== TR_STATE_WAITING
);
1713 req
->tr_state
= TR_STATE_DEAD
;
1715 assert((req
->tr_flags
& TR_FLAG_ONSTACK
) == 0);
1716 if (req
->tr_flags
& TR_FLAG_WORKLOOP
) {
1717 __assert_only
int ret
;
1718 ret
= pthread_kern
->workloop_fulfill_threadreq(wq
->wq_proc
, (void*)req
,
1719 THREAD_NULL
, WORKLOOP_FULFILL_THREADREQ_CANCEL
);
1720 assert(ret
== 0 || ret
== ECANCELED
);
1721 } else if (req
!= &wq
->wq_event_manager_threadreq
) {
1722 zfree(pthread_zone_threadreq
, req
);
1726 #pragma mark workqueue lock
1728 static boolean_t
workqueue_lock_spin_is_acquired_kdp(struct workqueue
*wq
) {
1729 return kdp_lck_spin_is_acquired(&wq
->wq_lock
);
1733 workqueue_lock_spin(struct workqueue
*wq
)
1735 assert(ml_get_interrupts_enabled() == TRUE
);
1736 lck_spin_lock(&wq
->wq_lock
);
1740 workqueue_lock_try(struct workqueue
*wq
)
1742 return lck_spin_try_lock(&wq
->wq_lock
);
1746 workqueue_unlock(struct workqueue
*wq
)
1748 lck_spin_unlock(&wq
->wq_lock
);
1751 #pragma mark workqueue add timer
1754 * Sets up the timer which will call out to workqueue_add_timer
1757 workqueue_interval_timer_start(struct workqueue
*wq
)
1761 /* n.b. wq_timer_interval is reset to 0 in workqueue_add_timer if the
1762 ATIMER_RUNNING flag is not present. The net effect here is that if a
1763 sequence of threads is required, we'll double the time before we give out
1765 if (wq
->wq_timer_interval
== 0) {
1766 wq
->wq_timer_interval
= wq_stalled_window_usecs
;
1769 wq
->wq_timer_interval
= wq
->wq_timer_interval
* 2;
1771 if (wq
->wq_timer_interval
> wq_max_timer_interval_usecs
) {
1772 wq
->wq_timer_interval
= wq_max_timer_interval_usecs
;
1775 clock_interval_to_deadline(wq
->wq_timer_interval
, 1000, &deadline
);
1777 PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer
, wq
, wq
->wq_reqcount
,
1778 _wq_flags(wq
), wq
->wq_timer_interval
, 0);
1780 thread_call_t call
= wq
->wq_atimer_delayed_call
;
1781 if (thread_call_enter1_delayed(call
, call
, deadline
)) {
1782 panic("delayed_call was already enqueued");
1787 * Immediately trigger the workqueue_add_timer
1790 workqueue_interval_timer_trigger(struct workqueue
*wq
)
1792 PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer
, wq
, wq
->wq_reqcount
,
1793 _wq_flags(wq
), 0, 0);
1795 thread_call_t call
= wq
->wq_atimer_immediate_call
;
1796 if (thread_call_enter1(call
, call
)) {
1797 panic("immediate_call was already enqueued");
1802 * returns whether lastblocked_tsp is within wq_stalled_window_usecs of cur_ts
1805 wq_thread_is_busy(uint64_t cur_ts
, _Atomic
uint64_t *lastblocked_tsp
)
1809 uint64_t lastblocked_ts
;
1812 lastblocked_ts
= atomic_load_explicit(lastblocked_tsp
, memory_order_relaxed
);
1813 if (lastblocked_ts
>= cur_ts
) {
1815 * because the update of the timestamp when a thread blocks isn't
1816 * serialized against us looking at it (i.e. we don't hold the workq lock)
1817 * it's possible to have a timestamp that matches the current time or
1818 * that even looks to be in the future relative to when we grabbed the current
1819 * time... just treat this as a busy thread since it must have just blocked.
1823 elapsed
= cur_ts
- lastblocked_ts
;
1825 pthread_kern
->absolutetime_to_microtime(elapsed
, &secs
, &usecs
);
1827 return (secs
== 0 && usecs
< wq_stalled_window_usecs
);
1831 * handler function for the timer
1834 workqueue_add_timer(struct workqueue
*wq
, thread_call_t thread_call_self
)
1836 proc_t p
= wq
->wq_proc
;
1838 workqueue_lock_spin(wq
);
1840 PTHREAD_TRACE_WQ(TRACE_wq_add_timer
| DBG_FUNC_START
, wq
,
1841 _wq_flags(wq
), wq
->wq_nthreads
, wq
->wq_thidlecount
, 0);
1844 * There are two tricky issues here.
1846 * First issue: we start the thread_call's that invoke this routine without
1847 * the workqueue lock held. The scheduler callback needs to trigger
1848 * reevaluation of the number of running threads but shouldn't take that
1849 * lock, so we can't use it to synchronize state around the thread_call.
1850 * As a result, it might re-enter the thread_call while this routine is
1851 * already running. This could cause it to fire a second time and we'll
1852 * have two add_timers running at once. Obviously, we don't want that to
1853 * keep stacking, so we need to keep it at two timers.
1855 * Solution: use wq_flags (accessed via atomic CAS) to synchronize the
1856 * enqueue of the thread_call itself. When a thread needs to trigger the
1857 * add_timer, it checks for ATIMER_DELAYED_RUNNING and, when not set, sets
1858 * the flag then does a thread_call_enter. We'll then remove that flag
1859 * only once we've got the lock and it's safe for the thread_call to be
1862 * Second issue: we need to make sure that the two timers don't execute this
1863 * routine concurrently. We can't use the workqueue lock for this because
1864 * we'll need to drop it during our execution.
1866 * Solution: use WQL_ATIMER_BUSY as a condition variable to indicate that
1867 * we are currently executing the routine and the next thread should wait.
1869 * After all that, we arrive at the following four possible states:
1870 * !WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY no pending timer, no active timer
1871 * !WQ_ATIMER_DELAYED_RUNNING && WQL_ATIMER_BUSY no pending timer, 1 active timer
1872 * WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY 1 pending timer, no active timer
1873 * WQ_ATIMER_DELAYED_RUNNING && WQL_ATIMER_BUSY 1 pending timer, 1 active timer
1875 * A further complication: sometimes we need to trigger this function to run
1876 * without delay. Because we aren't under a lock between setting
1877 * WQ_ATIMER_DELAYED_RUNNING and calling thread_call_enter, we can't simply
1878 * re-enter the thread call: if thread_call_enter() returned false, we
1879 * wouldn't be able to distinguish the case where the thread_call had
1880 * already fired from the case where it hadn't been entered yet from the
1881 * other thread. So, we use a separate thread_call for immediate
1882 * invocations, and a separate RUNNING flag, WQ_ATIMER_IMMEDIATE_RUNNING.
1885 while (wq
->wq_lflags
& WQL_ATIMER_BUSY
) {
1886 wq
->wq_lflags
|= WQL_ATIMER_WAITING
;
1888 assert_wait((caddr_t
)wq
, (THREAD_UNINT
));
1889 workqueue_unlock(wq
);
1891 thread_block(THREAD_CONTINUE_NULL
);
1893 workqueue_lock_spin(wq
);
1896 * Prevent _workqueue_mark_exiting() from going away
1898 wq
->wq_lflags
|= WQL_ATIMER_BUSY
;
1901 * Decide which timer we are and remove the RUNNING flag.
1903 if (thread_call_self
== wq
->wq_atimer_delayed_call
) {
1904 uint64_t wq_flags
= _wq_flags_and_orig(wq
, ~WQ_ATIMER_DELAYED_RUNNING
);
1905 if ((wq_flags
& WQ_ATIMER_DELAYED_RUNNING
) == 0) {
1906 panic("workqueue_add_timer(delayed) w/o WQ_ATIMER_DELAYED_RUNNING");
1908 } else if (thread_call_self
== wq
->wq_atimer_immediate_call
) {
1909 uint64_t wq_flags
= _wq_flags_and_orig(wq
, ~WQ_ATIMER_IMMEDIATE_RUNNING
);
1910 if ((wq_flags
& WQ_ATIMER_IMMEDIATE_RUNNING
) == 0) {
1911 panic("workqueue_add_timer(immediate) w/o WQ_ATIMER_IMMEDIATE_RUNNING");
1914 panic("workqueue_add_timer can't figure out which timer it is");
1917 int ret
= WQ_RUN_TR_THREAD_STARTED
;
1918 while (ret
== WQ_RUN_TR_THREAD_STARTED
&& wq
->wq_reqcount
) {
1919 ret
= workqueue_run_threadreq_and_unlock(p
, wq
, NULL
, NULL
, true);
1921 workqueue_lock_spin(wq
);
1923 _threadreq_copy_prepare(wq
);
1926 * If we called WQ_TIMER_NEEDED above, then this flag will be set if that
1927 * call marked the timer running. If so, we let the timer interval grow.
1928 * Otherwise, we reset it back to 0.
1930 uint32_t wq_flags
= _wq_flags(wq
);
1931 if (!(wq_flags
& WQ_ATIMER_DELAYED_RUNNING
)) {
1932 wq
->wq_timer_interval
= 0;
1935 wq
->wq_lflags
&= ~WQL_ATIMER_BUSY
;
1937 if ((wq_flags
& WQ_EXITING
) || (wq
->wq_lflags
& WQL_ATIMER_WAITING
)) {
1939 * wakeup the thread hung up in _workqueue_mark_exiting or
1940 * workqueue_add_timer waiting for this timer to finish getting out of
1943 wq
->wq_lflags
&= ~WQL_ATIMER_WAITING
;
1947 PTHREAD_TRACE_WQ(TRACE_wq_add_timer
| DBG_FUNC_END
, wq
, 0, wq
->wq_nthreads
, wq
->wq_thidlecount
, 0);
1949 workqueue_unlock(wq
);
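/*
 * Sketch of the arming side of the protocol described in the comment inside
 * workqueue_add_timer() above; this is the pattern the callers elsewhere in
 * this file use:
 *
 *	if (WQ_TIMER_DELAYED_NEEDED(wq)) {
 *		// only the caller that actually set the RUNNING flag proceeds
 *		workqueue_interval_timer_start(wq);
 *	}
 *
 * and, for the no-delay variant:
 *
 *	if (WQ_TIMER_IMMEDIATE_NEEDED(wq)) {
 *		workqueue_interval_timer_trigger(wq);
 *	}
 *
 * Per the comment above, the NEEDED checks are what set
 * WQ_ATIMER_{DELAYED,IMMEDIATE}_RUNNING in wq_flags, so at most one delayed
 * and one immediate thread_call invocation can be pending at a time.
 */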
1952 #pragma mark thread state tracking
1954 // called by spinlock code when trying to yield to lock owner
1956 _workqueue_thread_yielded(void)
1961 workqueue_callback(int type
, thread_t thread
)
1963 struct uthread
*uth
= pthread_kern
->get_bsdthread_info(thread
);
1964 struct threadlist
*tl
= pthread_kern
->uthread_get_threadlist(uth
);
1965 struct workqueue
*wq
= tl
->th_workq
;
1966 uint32_t old_count
, req_qos
, qos
= tl
->th_priority
;
1967 wq_thactive_t old_thactive
;
1970 case SCHED_CALL_BLOCK
: {
1971 bool start_timer
= false;
1973 old_thactive
= _wq_thactive_dec(wq
, tl
->th_priority
);
1974 req_qos
= WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive
);
1975 old_count
= _wq_thactive_aggregate_downto_qos(wq
, old_thactive
,
1978 if (old_count
== wq_max_concurrency
[tl
->th_priority
]) {
1980 * The number of active threads at this priority has fallen below
1981 * the maximum number of concurrent threads that are allowed to run
1983 * if we collide with another thread trying to update the
1984 * last_blocked (really unlikely since another thread would have to
1985 * get scheduled and then block after we start down this path), it's
1986 * not a problem. Either timestamp is adequate, so no need to retry
1988 atomic_store_explicit(&wq
->wq_lastblocked_ts
[qos
],
1989 mach_absolute_time(), memory_order_relaxed
);
1992 if (req_qos
== WORKQUEUE_EVENT_MANAGER_BUCKET
|| qos
> req_qos
) {
1994 * The blocking thread is at a lower QoS than the highest currently
1995 * pending constrained request, nothing has to be redriven
1998 uint32_t max_busycount
, old_req_count
;
1999 old_req_count
= _wq_thactive_aggregate_downto_qos(wq
, old_thactive
,
2000 req_qos
, NULL
, &max_busycount
);
2002 * If it is possible that may_start_constrained_thread had refused
2003 * admission due to being over the max concurrency, we may need to
2004 * spin up a new thread.
2006 * We take into account the maximum number of busy threads
2007 * that can affect may_start_constrained_thread as looking at the
2008 * actual number may_start_constrained_thread will see is racy.
2010 * IOW at NCPU = 4, for IN (req_qos = 1), if the old req count is
2011 * between NCPU (4) and NCPU - 2 (2) we need to redrive.
2013 if (wq_max_concurrency
[req_qos
] <= old_req_count
+ max_busycount
&&
2014 old_req_count
<= wq_max_concurrency
[req_qos
]) {
2015 if (WQ_TIMER_DELAYED_NEEDED(wq
)) {
2017 workqueue_interval_timer_start(wq
);
2022 PTHREAD_TRACE_WQ(TRACE_wq_thread_block
| DBG_FUNC_START
, wq
,
2023 old_count
- 1, qos
| (req_qos
<< 8),
2024 wq
->wq_reqcount
<< 1 | start_timer
, 0);
2027 case SCHED_CALL_UNBLOCK
: {
2029 * we cannot take the workqueue_lock here...
2030 * an UNBLOCK can occur from a timer event which
2031 * is run from an interrupt context... if the workqueue_lock
2032 * is already held by this processor, we'll deadlock...
2033 * the thread lock for the thread being UNBLOCKED
2036 old_thactive
= _wq_thactive_inc(wq
, qos
);
2037 if (pthread_debug_tracing
) {
2038 req_qos
= WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive
);
2039 old_count
= _wq_thactive_aggregate_downto_qos(wq
, old_thactive
,
2041 PTHREAD_TRACE_WQ(TRACE_wq_thread_block
| DBG_FUNC_END
, wq
,
2042 old_count
+ 1, qos
| (req_qos
<< 8),
2043 wq
->wq_threads_scheduled
, 0);
sched_call_t
_workqueue_get_sched_callback(void)
{
	return workqueue_callback;
}

#pragma mark thread addition/removal
static mach_vm_size_t
_workqueue_allocsize(struct workqueue *wq)
{
	proc_t p = wq->wq_proc;
	mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);
	mach_vm_size_t pthread_size =
		vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(wq->wq_map));
	return guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
}
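/*
 * For illustration only (the actual numbers depend on the page size of
 * wq_map and on proc_get_pthsize()): with 16K pages, a 512K default stack,
 * and a pthread_t that rounds up to a single page, this works out to
 *
 *	guardsize (16K) + PTH_DEFAULT_STACKSIZE (512K) + pthread_size (16K) = 544K
 *
 * per workqueue thread.  The same quantity is what workqueue_addnewthread()
 * maps and what workqueue_removethread() deallocates for never-used threads.
 */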
/*
 * pop goes the thread
 *
 * If fromexit is set, the call is from workqueue_exit(),
 * so some cleanups are to be avoided.
 */
static void
workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use)
{
	struct uthread * uth;
	struct workqueue * wq = tl->th_workq;

	if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET){
		TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
	} else {
		TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
	}

	if (fromexit == 0) {
		assert(wq->wq_nthreads && wq->wq_thidlecount);
		wq->wq_nthreads--;
		wq->wq_thidlecount--;
	}

	/*
	 * Clear the threadlist pointer in uthread so
	 * blocked thread on wakeup for termination will
	 * not access the thread list as it is going to be
	 * freed.
	 */
	pthread_kern->thread_sched_call(tl->th_thread, NULL);

	uth = pthread_kern->get_bsdthread_info(tl->th_thread);
	if (uth != (struct uthread *)0) {
		pthread_kern->uthread_set_threadlist(uth, NULL);
	}
	if (fromexit == 0) {
		/* during exit the lock is not held */
		workqueue_unlock(wq);
	}

	if ( (tl->th_flags & TH_LIST_NEW) || first_use) {
		/*
		 * thread was created, but never used...
		 * need to clean up the stack and port ourselves
		 * since we're not going to spin up through the
		 * normal exit path triggered from Libc
		 */
		if (fromexit == 0) {
			/* vm map is already deallocated when this is called from exit */
			(void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, _workqueue_allocsize(wq));
		}
		(void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(wq->wq_task), tl->th_thport);
	}
	/*
	 * drop our ref on the thread
	 */
	thread_deallocate(tl->th_thread);

	zfree(pthread_zone_threadlist, tl);
}
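/*
 * Call-site note: both callers of workqueue_removethread() appear later in
 * this file.  wq_unpark_continue() passes fromexit = 0 when a parked thread
 * is woken only in order to exit, and _workqueue_exit() passes fromexit =
 * true while draining the idle lists, since at that point the workq lock is
 * not held and the process VM map is already gone (hence the skipped unlock
 * and mach_vm_deallocate above).
 */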
2132 * Try to add a new workqueue thread.
2134 * - called with workq lock held
2135 * - dropped and retaken around thread creation
2136 * - return with workq lock held
2139 workqueue_addnewthread(proc_t p
, struct workqueue
*wq
)
2145 workqueue_unlock(wq
);
2147 struct threadlist
*tl
= zalloc(pthread_zone_threadlist
);
2148 bzero(tl
, sizeof(struct threadlist
));
2151 kret
= pthread_kern
->thread_create_workq_waiting(wq
->wq_task
, wq_unpark_continue
, tl
, &th
);
2152 if (kret
!= KERN_SUCCESS
) {
2153 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed
| DBG_FUNC_NONE
, wq
, kret
, 0, 0, 0);
2157 mach_vm_offset_t stackaddr
= pthread_kern
->proc_get_stack_addr_hint(p
);
2159 mach_vm_size_t guardsize
= vm_map_page_size(wq
->wq_map
);
2160 mach_vm_size_t pthread_size
=
2161 vm_map_round_page_mask(pthread_kern
->proc_get_pthsize(p
) + PTHREAD_T_OFFSET
, vm_map_page_mask(wq
->wq_map
));
2162 mach_vm_size_t th_allocsize
= guardsize
+ PTH_DEFAULT_STACKSIZE
+ pthread_size
;
2164 kret
= mach_vm_map(wq
->wq_map
, &stackaddr
,
2165 th_allocsize
, page_size
-1,
2166 VM_MAKE_TAG(VM_MEMORY_STACK
)| VM_FLAGS_ANYWHERE
, NULL
,
2167 0, FALSE
, VM_PROT_DEFAULT
, VM_PROT_ALL
,
2168 VM_INHERIT_DEFAULT
);
2170 if (kret
!= KERN_SUCCESS
) {
2171 kret
= mach_vm_allocate(wq
->wq_map
,
2172 &stackaddr
, th_allocsize
,
2173 VM_MAKE_TAG(VM_MEMORY_STACK
) | VM_FLAGS_ANYWHERE
);
2176 if (kret
!= KERN_SUCCESS
) {
2177 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed
| DBG_FUNC_NONE
, wq
, kret
, 1, 0, 0);
2178 goto fail_terminate
;
2182 * The guard page is at the lowest address
2183 * The stack base is the highest address
2185 kret
= mach_vm_protect(wq
->wq_map
, stackaddr
, guardsize
, FALSE
, VM_PROT_NONE
);
2186 if (kret
!= KERN_SUCCESS
) {
2187 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed
| DBG_FUNC_NONE
, wq
, kret
, 2, 0, 0);
2188 goto fail_vm_deallocate
;
2192 pthread_kern
->thread_set_tag(th
, THREAD_TAG_PTHREAD
| THREAD_TAG_WORKQUEUE
);
2193 pthread_kern
->thread_static_param(th
, TRUE
);
2196 * convert_thread_to_port() consumes a reference
2198 thread_reference(th
);
2199 void *sright
= (void *)pthread_kern
->convert_thread_to_port(th
);
2200 tl
->th_thport
= pthread_kern
->ipc_port_copyout_send(sright
,
2201 pthread_kern
->task_get_ipcspace(wq
->wq_task
));
2203 tl
->th_flags
= TH_LIST_INITED
| TH_LIST_NEW
;
2206 tl
->th_stackaddr
= stackaddr
;
2207 tl
->th_priority
= WORKQUEUE_NUM_BUCKETS
;
2209 struct uthread
*uth
;
2210 uth
= pthread_kern
->get_bsdthread_info(tl
->th_thread
);
2212 workqueue_lock_spin(wq
);
2214 void *current_tl
= pthread_kern
->uthread_get_threadlist(uth
);
2215 if (current_tl
== NULL
) {
2216 pthread_kern
->uthread_set_threadlist(uth
, tl
);
2217 TAILQ_INSERT_TAIL(&wq
->wq_thidlelist
, tl
, th_entry
);
2218 wq
->wq_thidlecount
++;
2219 } else if (current_tl
== WQ_THREADLIST_EXITING_POISON
) {
2221 * Failed thread creation race: The thread already woke up and has exited.
2223 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed
| DBG_FUNC_NONE
, wq
, kret
, 3, 0, 0);
2226 panic("Unexpected initial threadlist value");
2229 PTHREAD_TRACE_WQ(TRACE_wq_thread_create
| DBG_FUNC_NONE
, wq
, 0, 0, 0, 0);
2234 workqueue_unlock(wq
);
2235 (void)pthread_kern
->mach_port_deallocate(pthread_kern
->task_get_ipcspace(wq
->wq_task
),
2239 (void) mach_vm_deallocate(wq
->wq_map
, stackaddr
, th_allocsize
);
2242 if (pthread_kern
->thread_will_park_or_terminate
) {
2243 pthread_kern
->thread_will_park_or_terminate(th
);
2245 (void)thread_terminate(th
);
2246 thread_deallocate(th
);
2249 zfree(pthread_zone_threadlist
, tl
);
2251 workqueue_lock_spin(wq
);
2258 * Setup per-process state for the workqueue.
2261 _workq_open(struct proc
*p
, __unused
int32_t *retval
)
2263 struct workqueue
* wq
;
2268 if (pthread_kern
->proc_get_register(p
) == 0) {
2272 num_cpus
= pthread_kern
->ml_get_max_cpus();
2274 if (wq_init_constrained_limit
) {
2277 * set up the limit for the constrained pool
2278 * this is a virtual pool in that we don't
2279 * maintain it on a separate idle and run list
2281 limit
= num_cpus
* WORKQUEUE_CONSTRAINED_FACTOR
;
2283 if (limit
> wq_max_constrained_threads
)
2284 wq_max_constrained_threads
= limit
;
2286 wq_init_constrained_limit
= 0;
2288 if (wq_max_threads
> WQ_THACTIVE_BUCKET_HALF
) {
2289 wq_max_threads
= WQ_THACTIVE_BUCKET_HALF
;
2291 if (wq_max_threads
> pthread_kern
->config_thread_max
- 20) {
2292 wq_max_threads
= pthread_kern
->config_thread_max
- 20;
2296 if (pthread_kern
->proc_get_wqptr(p
) == NULL
) {
2297 if (pthread_kern
->proc_init_wqptr_or_wait(p
) == FALSE
) {
2298 assert(pthread_kern
->proc_get_wqptr(p
) != NULL
);
2302 ptr
= (char *)zalloc(pthread_zone_workqueue
);
2303 bzero(ptr
, sizeof(struct workqueue
));
2305 wq
= (struct workqueue
*)ptr
;
2307 wq
->wq_task
= current_task();
2308 wq
->wq_map
= pthread_kern
->current_map();
2310 // Start the event manager at the priority hinted at by the policy engine
2311 int mgr_priority_hint
= pthread_kern
->task_get_default_manager_qos(current_task());
2312 wq
->wq_event_manager_priority
= (uint32_t)thread_qos_get_pthread_priority(mgr_priority_hint
) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG
;
2314 TAILQ_INIT(&wq
->wq_thrunlist
);
2315 TAILQ_INIT(&wq
->wq_thidlelist
);
2316 for (int i
= 0; i
< WORKQUEUE_EVENT_MANAGER_BUCKET
; i
++) {
2317 TAILQ_INIT(&wq
->wq_overcommit_reqlist
[i
]);
2318 TAILQ_INIT(&wq
->wq_reqlist
[i
]);
2321 wq
->wq_atimer_delayed_call
=
2322 thread_call_allocate_with_priority((thread_call_func_t
)workqueue_add_timer
,
2323 (thread_call_param_t
)wq
, THREAD_CALL_PRIORITY_KERNEL
);
2324 wq
->wq_atimer_immediate_call
=
2325 thread_call_allocate_with_priority((thread_call_func_t
)workqueue_add_timer
,
2326 (thread_call_param_t
)wq
, THREAD_CALL_PRIORITY_KERNEL
);
2328 lck_spin_init(&wq
->wq_lock
, pthread_lck_grp
, pthread_lck_attr
);
2330 wq
->wq_cached_threadreq
= zalloc(pthread_zone_threadreq
);
2331 *(wq_thactive_t
*)&wq
->wq_thactive
=
2332 (wq_thactive_t
)WQ_THACTIVE_NO_PENDING_REQUEST
<<
2333 WQ_THACTIVE_QOS_SHIFT
;
2335 pthread_kern
->proc_set_wqptr(p
, wq
);
/*
 * Routine:	workqueue_mark_exiting
 *
 * Function:	Mark the work queue such that new threads will not be added to the
 *		work queue after we return.
 *
 * Conditions:	Called against the current process.
 */
void
_workqueue_mark_exiting(struct proc *p)
{
	struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
	if (!wq) return;

	PTHREAD_TRACE_WQ(TRACE_wq_pthread_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);

	workqueue_lock_spin(wq);

	/*
	 * We arm the add timer without holding the workqueue lock so we need
	 * to synchronize with any running or soon to be running timers.
	 *
	 * Threads that intend to arm the timer atomically OR
	 * WQ_ATIMER_{DELAYED,IMMEDIATE}_RUNNING into the wq_flags, only if
	 * WQ_EXITING is not present.  So, once we have set WQ_EXITING, we can
	 * be sure that no new RUNNING flags will be set, but still need to
	 * wait for the already running timers to complete.
	 *
	 * We always hold the workq lock when dropping WQ_ATIMER_RUNNING, so
	 * the check for and sleep until clear is protected.
	 */
	uint64_t wq_flags = _wq_flags_or_orig(wq, WQ_EXITING);

	if (wq_flags & WQ_ATIMER_DELAYED_RUNNING) {
		if (thread_call_cancel(wq->wq_atimer_delayed_call) == TRUE) {
			wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_DELAYED_RUNNING);
		}
	}
	if (wq_flags & WQ_ATIMER_IMMEDIATE_RUNNING) {
		if (thread_call_cancel(wq->wq_atimer_immediate_call) == TRUE) {
			wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_IMMEDIATE_RUNNING);
		}
	}
	while ((_wq_flags(wq) & (WQ_ATIMER_DELAYED_RUNNING | WQ_ATIMER_IMMEDIATE_RUNNING)) ||
			(wq->wq_lflags & WQL_ATIMER_BUSY)) {
		assert_wait((caddr_t)wq, (THREAD_UNINT));
		workqueue_unlock(wq);

		thread_block(THREAD_CONTINUE_NULL);

		workqueue_lock_spin(wq);
	}

	/*
	 * Save off pending requests, will complete/free them below after unlocking
	 */
	TAILQ_HEAD(, threadreq) local_list = TAILQ_HEAD_INITIALIZER(local_list);

	for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
		TAILQ_CONCAT(&local_list, &wq->wq_overcommit_reqlist[i], tr_entry);
		TAILQ_CONCAT(&local_list, &wq->wq_reqlist[i], tr_entry);
	}

	/*
	 * XXX: Can't deferred cancel the event manager request, so just smash it.
	 */
	assert((wq->wq_event_manager_threadreq.tr_flags & TR_FLAG_WORKLOOP) == 0);
	wq->wq_event_manager_threadreq.tr_state = TR_STATE_DEAD;

	workqueue_unlock(wq);

	struct threadreq *tr, *tr_temp;
	TAILQ_FOREACH_SAFE(tr, &local_list, tr_entry, tr_temp) {
		_threadreq_cancel(wq, tr);
	}
	PTHREAD_TRACE(TRACE_wq_pthread_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
}
/*
 * Routine:	workqueue_exit
 *
 * Function:	clean up the work queue structure(s) now that there are no threads
 *		left running inside the work queue (except possibly current_thread).
 *
 * Conditions:	Called by the last thread in the process.
 *		Called against current process.
 */
void
_workqueue_exit(struct proc *p)
{
	struct workqueue *wq;
	struct threadlist *tl, *tlist;
	struct uthread *uth;

	wq = pthread_kern->proc_get_wqptr(p);
	if (wq != NULL) {

		PTHREAD_TRACE_WQ(TRACE_wq_workqueue_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);

		pthread_kern->proc_set_wqptr(p, NULL);

		/*
		 * Clean up workqueue data structures for threads that exited and
		 * didn't get a chance to clean up after themselves.
		 */
		TAILQ_FOREACH_SAFE(tl, &wq->wq_thrunlist, th_entry, tlist) {
			assert((tl->th_flags & TH_LIST_RUNNING) != 0);

			pthread_kern->thread_sched_call(tl->th_thread, NULL);

			uth = pthread_kern->get_bsdthread_info(tl->th_thread);
			if (uth != (struct uthread *)0) {
				pthread_kern->uthread_set_threadlist(uth, NULL);
			}
			TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);

			/*
			 * drop our last ref on the thread
			 */
			thread_deallocate(tl->th_thread);

			zfree(pthread_zone_threadlist, tl);
		}
		TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlelist, th_entry, tlist) {
			assert((tl->th_flags & TH_LIST_RUNNING) == 0);
			assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
			workqueue_removethread(tl, true, false);
		}
		TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlemgrlist, th_entry, tlist) {
			assert((tl->th_flags & TH_LIST_RUNNING) == 0);
			assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
			workqueue_removethread(tl, true, false);
		}
		if (wq->wq_cached_threadreq) {
			zfree(pthread_zone_threadreq, wq->wq_cached_threadreq);
		}
		thread_call_free(wq->wq_atimer_delayed_call);
		thread_call_free(wq->wq_atimer_immediate_call);
		lck_spin_destroy(&wq->wq_lock, pthread_lck_grp);

		for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
			assert(TAILQ_EMPTY(&wq->wq_overcommit_reqlist[i]));
			assert(TAILQ_EMPTY(&wq->wq_reqlist[i]));
		}

		zfree(pthread_zone_workqueue, wq);

		PTHREAD_TRACE(TRACE_wq_workqueue_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
	}
}

#pragma mark workqueue thread manipulation
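/*
 * Overview of the request paths in this section (simplified; see each
 * routine for the real locking and error handling):
 *
 *	libdispatch:	workq_kernreturn(WQOPS_QUEUE_REQTHREADS, ...)
 *			  -> _workq_kernreturn() -> wqops_queue_reqthreads()
 *	kevent/kqueue:	_workq_reqthreads() / workq_kern_threadreq()
 *			  -> _workq_kevent_reqthreads()
 *
 * All of these build a struct threadreq and funnel into
 * workqueue_run_threadreq_and_unlock(), which performs the admission checks
 * and either runs the request on an idle or parking thread, creates a new
 * thread, or leaves the request queued and arms the add timer if needed.
 */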
2499 * Entry point for libdispatch to ask for threads
2502 wqops_queue_reqthreads(struct proc
*p
, int reqcount
,
2503 pthread_priority_t priority
)
2505 bool overcommit
= _pthread_priority_get_flags(priority
) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG
;
2506 bool event_manager
= _pthread_priority_get_flags(priority
) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG
;
2507 int class = event_manager
? WORKQUEUE_EVENT_MANAGER_BUCKET
:
2508 pthread_priority_get_class_index(priority
);
2510 if ((reqcount
<= 0) || (class < 0) || (class >= WORKQUEUE_NUM_BUCKETS
) ||
2511 (overcommit
&& event_manager
)) {
2515 struct workqueue
*wq
;
2516 if ((wq
= (struct workqueue
*)pthread_kern
->proc_get_wqptr(p
)) == NULL
) {
2520 workqueue_lock_spin(wq
);
2521 _threadreq_copy_prepare(wq
);
2523 PTHREAD_TRACE_WQ(TRACE_wq_wqops_reqthreads
| DBG_FUNC_NONE
, wq
, reqcount
, priority
, 0, 0);
2526 if (overcommit
) tr_flags
|= TR_FLAG_OVERCOMMIT
;
2529 * when libdispatch asks for more than one thread, it wants to achieve
2530 * parallelism. Pacing would be detrimental to this ask, so treat
2531 * these specially to not do the pacing admission check
2533 tr_flags
|= TR_FLAG_NO_PACING
;
2536 while (reqcount
-- && !_wq_exiting(wq
)) {
2537 struct threadreq req
;
2538 _threadreq_init_stack(&req
, class, tr_flags
);
2540 workqueue_run_threadreq_and_unlock(p
, wq
, NULL
, &req
, true);
2542 workqueue_lock_spin(wq
); /* reacquire */
2543 _threadreq_copy_prepare(wq
);
2546 workqueue_unlock(wq
);
2552 * Used by the kevent system to request threads.
2554 * Currently count is ignored and we always return one thread per invocation.
2557 _workq_kevent_reqthreads(struct proc
*p
, pthread_priority_t priority
,
2560 int wq_run_tr
= WQ_RUN_TR_THROTTLED
;
2561 bool emergency_thread
= false;
2562 struct threadreq req
;
2565 struct workqueue
*wq
;
2566 if ((wq
= (struct workqueue
*)pthread_kern
->proc_get_wqptr(p
)) == NULL
) {
2570 int class = pthread_priority_get_class_index(priority
);
2572 workqueue_lock_spin(wq
);
2573 bool has_threadreq
= _threadreq_copy_prepare_noblock(wq
);
2575 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads
| DBG_FUNC_NONE
, wq
, NULL
, priority
, 0, 0);
2578 * Skip straight to event manager if that's what was requested
2580 if ((_pthread_priority_get_qos_newest(priority
) == QOS_CLASS_UNSPECIFIED
) ||
2581 (_pthread_priority_get_flags(priority
) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG
)){
2585 bool will_pace
= _wq_should_pace_priority(wq
, class);
2586 if ((wq
->wq_thidlecount
== 0 || will_pace
) && has_threadreq
== false) {
2588 * We'll need to persist the request and can't, so return the emergency
2589 * thread instead, which has a persistent request object.
2591 emergency_thread
= true;
2596 * Handle overcommit requests
2598 if ((_pthread_priority_get_flags(priority
) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG
) != 0){
2599 _threadreq_init_stack(&req
, class, TR_FLAG_KEVENT
| TR_FLAG_OVERCOMMIT
);
2600 wq_run_tr
= workqueue_run_threadreq_and_unlock(p
, wq
, NULL
, &req
, false);
2605 * Handle constrained requests
2607 boolean_t may_start
= may_start_constrained_thread(wq
, class, NULL
, false);
2608 if (may_start
|| no_emergency
) {
2609 _threadreq_init_stack(&req
, class, TR_FLAG_KEVENT
);
2610 wq_run_tr
= workqueue_run_threadreq_and_unlock(p
, wq
, NULL
, &req
, false);
2613 emergency_thread
= true;
2618 _threadreq_init_stack(&req
, WORKQUEUE_EVENT_MANAGER_BUCKET
, TR_FLAG_KEVENT
);
2619 wq_run_tr
= workqueue_run_threadreq_and_unlock(p
, wq
, NULL
, &req
, false);
2622 if (wq_run_tr
== WQ_RUN_TR_THREAD_NEEDED
&& WQ_TIMER_IMMEDIATE_NEEDED(wq
)) {
2623 workqueue_interval_timer_trigger(wq
);
2625 return emergency_thread
? (void*)-1 : 0;
thread_t
_workq_reqthreads(struct proc *p, __assert_only int requests_count,
		workq_reqthreads_req_t request)
{
	assert(requests_count == 1);

	pthread_priority_t priority = request->priority;
	bool no_emergency = request->count & WORKQ_REQTHREADS_NOEMERGENCY;

	return _workq_kevent_reqthreads(p, priority, no_emergency);
}
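/*
 * Note: request->count is not used as a thread count here; only the
 * WORKQ_REQTHREADS_NOEMERGENCY bit is consulted.  This matches the comment
 * above _workq_kevent_reqthreads(): one thread is returned per invocation
 * regardless of the requested count.
 */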
2642 workq_kern_threadreq(struct proc
*p
, workq_threadreq_t _req
,
2643 enum workq_threadreq_type type
, unsigned long priority
, int flags
)
2645 struct workqueue
*wq
;
2648 if ((wq
= (struct workqueue
*)pthread_kern
->proc_get_wqptr(p
)) == NULL
) {
2653 case WORKQ_THREADREQ_KEVENT
: {
2654 bool no_emergency
= flags
& WORKQ_THREADREQ_FLAG_NOEMERGENCY
;
2655 (void)_workq_kevent_reqthreads(p
, priority
, no_emergency
);
2658 case WORKQ_THREADREQ_WORKLOOP
:
2659 case WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL
: {
2660 struct threadreq
*req
= (struct threadreq
*)_req
;
2661 int req_class
= pthread_priority_get_class_index(priority
);
2662 int req_flags
= TR_FLAG_WORKLOOP
;
2663 if ((_pthread_priority_get_flags(priority
) &
2664 _PTHREAD_PRIORITY_OVERCOMMIT_FLAG
) != 0){
2665 req_flags
|= TR_FLAG_OVERCOMMIT
;
2668 thread_t thread
= current_thread();
2669 struct threadlist
*tl
= util_get_thread_threadlist_entry(thread
);
2671 if (tl
&& tl
!= WQ_THREADLIST_EXITING_POISON
&&
2672 (tl
->th_flags
& TH_LIST_UNBINDING
)) {
2674 * we're called back synchronously from the context of
2675 * kevent_qos_internal_unbind from within wqops_thread_return()
2676 * we can try to match up this thread with this request !
2682 _threadreq_init_alloced(req
, req_class
, req_flags
);
2683 workqueue_lock_spin(wq
);
2684 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads
| DBG_FUNC_NONE
, wq
, req
, priority
, 1, 0);
2685 ret
= workqueue_run_threadreq_and_unlock(p
, wq
, tl
, req
, false);
2686 if (ret
== WQ_RUN_TR_EXITING
) {
2689 if (ret
== WQ_RUN_TR_THREAD_NEEDED
) {
2690 if (type
== WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL
) {
2693 if (WQ_TIMER_IMMEDIATE_NEEDED(wq
)) {
2694 workqueue_interval_timer_trigger(wq
);
2699 case WORKQ_THREADREQ_REDRIVE
:
2700 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads
| DBG_FUNC_NONE
, wq
, 0, 0, 4, 0);
2701 workqueue_lock_spin(wq
);
2702 ret
= workqueue_run_threadreq_and_unlock(p
, wq
, NULL
, NULL
, true);
2703 if (ret
== WQ_RUN_TR_EXITING
) {
2713 workq_kern_threadreq_modify(struct proc
*p
, workq_threadreq_t _req
,
2714 enum workq_threadreq_op operation
, unsigned long arg1
,
2715 unsigned long __unused arg2
)
2717 struct threadreq
*req
= (struct threadreq
*)_req
;
2718 struct workqueue
*wq
;
2719 int priclass
, ret
= 0, wq_tr_rc
= WQ_RUN_TR_THROTTLED
;
2721 if (req
== NULL
|| (wq
= pthread_kern
->proc_get_wqptr(p
)) == NULL
) {
2725 workqueue_lock_spin(wq
);
2727 if (_wq_exiting(wq
)) {
2733 * Find/validate the referenced request structure
2735 if (req
->tr_state
!= TR_STATE_WAITING
) {
2739 assert(req
->tr_priority
< WORKQUEUE_EVENT_MANAGER_BUCKET
);
2740 assert(req
->tr_flags
& TR_FLAG_WORKLOOP
);
2742 switch (operation
) {
2743 case WORKQ_THREADREQ_CHANGE_PRI
:
2744 case WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL
:
2745 priclass
= pthread_priority_get_class_index(arg1
);
2746 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads
| DBG_FUNC_NONE
, wq
, req
, arg1
, 2, 0);
2747 if (req
->tr_priority
== priclass
) {
2750 _threadreq_dequeue(wq
, req
);
2751 req
->tr_priority
= priclass
;
2752 req
->tr_state
= TR_STATE_NEW
; // what was old is new again
2753 wq_tr_rc
= workqueue_run_threadreq_and_unlock(p
, wq
, NULL
, req
, false);
2756 case WORKQ_THREADREQ_CANCEL
:
2757 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads
| DBG_FUNC_NONE
, wq
, req
, 0, 3, 0);
2758 _threadreq_dequeue(wq
, req
);
2759 req
->tr_state
= TR_STATE_DEAD
;
2768 workqueue_unlock(wq
);
2770 if (wq_tr_rc
== WQ_RUN_TR_THREAD_NEEDED
) {
2771 if (operation
== WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL
) {
2773 } else if (WQ_TIMER_IMMEDIATE_NEEDED(wq
)) {
2774 workqueue_interval_timer_trigger(wq
);
2782 wqops_thread_return(struct proc
*p
, struct workqueue
*wq
)
2784 thread_t th
= current_thread();
2785 struct uthread
*uth
= pthread_kern
->get_bsdthread_info(th
);
2786 struct threadlist
*tl
= pthread_kern
->uthread_get_threadlist(uth
);
2788 /* reset signal mask on the workqueue thread to default state */
2789 if (pthread_kern
->uthread_get_sigmask(uth
) != (sigset_t
)(~workq_threadmask
)) {
2790 pthread_kern
->proc_lock(p
);
2791 pthread_kern
->uthread_set_sigmask(uth
, ~workq_threadmask
);
2792 pthread_kern
->proc_unlock(p
);
2795 if (wq
== NULL
|| !tl
) {
2799 PTHREAD_TRACE_WQ(TRACE_wq_override_reset
| DBG_FUNC_START
, tl
->th_workq
, 0, 0, 0, 0);
2802 * This squash call has neat semantics: it removes the specified overrides,
2803 * replacing the current requested QoS with the previous effective QoS from
2804 * those overrides. This means we won't be preempted due to having our QoS
2805 * lowered. Of course, now our understanding of the thread's QoS is wrong,
2806 * so we'll adjust below.
2808 bool was_manager
= (tl
->th_priority
== WORKQUEUE_EVENT_MANAGER_BUCKET
);
2812 new_qos
= pthread_kern
->proc_usynch_thread_qos_squash_override_for_resource(th
,
2813 THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD
,
2814 THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE
);
2817 PTHREAD_TRACE_WQ(TRACE_wq_runitem
| DBG_FUNC_END
, wq
, tl
->th_priority
, 0, 0, 0);
2819 workqueue_lock_spin(wq
);
2821 if (tl
->th_flags
& TH_LIST_KEVENT_BOUND
) {
2822 unsigned int flags
= KEVENT_FLAG_WORKQ
;
2824 flags
|= KEVENT_FLAG_WORKQ_MANAGER
;
2827 tl
->th_flags
|= TH_LIST_UNBINDING
;
2828 workqueue_unlock(wq
);
2829 kevent_qos_internal_unbind(p
, class_index_get_thread_qos(tl
->th_priority
), th
, flags
);
2830 if (!(tl
->th_flags
& TH_LIST_UNBINDING
)) {
2831 _setup_wqthread(p
, th
, wq
, tl
, WQ_SETUP_CLEAR_VOUCHER
);
2832 pthread_kern
->unix_syscall_return(EJUSTRETURN
);
2833 __builtin_unreachable();
2835 workqueue_lock_spin(wq
);
2836 tl
->th_flags
&= ~(TH_LIST_KEVENT_BOUND
| TH_LIST_UNBINDING
);
2840 /* Fix up counters from the squash operation. */
2841 uint8_t old_bucket
= tl
->th_priority
;
2842 uint8_t new_bucket
= thread_qos_get_class_index(new_qos
);
2844 if (old_bucket
!= new_bucket
) {
2845 _wq_thactive_move(wq
, old_bucket
, new_bucket
);
2846 wq
->wq_thscheduled_count
[old_bucket
]--;
2847 wq
->wq_thscheduled_count
[new_bucket
]++;
2849 PTHREAD_TRACE_WQ(TRACE_wq_thread_squash
| DBG_FUNC_NONE
, wq
, tl
->th_priority
, new_bucket
, 0, 0);
2850 tl
->th_priority
= new_bucket
;
2851 PTHREAD_TRACE_WQ(TRACE_wq_override_reset
| DBG_FUNC_END
, tl
->th_workq
, new_qos
, 0, 0, 0);
2855 workqueue_run_threadreq_and_unlock(p
, wq
, tl
, NULL
, false);
2860 * Multiplexed call to interact with the workqueue mechanism
2863 _workq_kernreturn(struct proc
*p
,
2870 struct workqueue
*wq
;
2873 if (pthread_kern
->proc_get_register(p
) == 0) {
2878 case WQOPS_QUEUE_NEWSPISUPP
: {
2880 * arg2 = offset of serialno into dispatch queue
2881 * arg3 = kevent support
2885 // If we get here, then userspace has indicated support for kevent delivery.
2888 pthread_kern
->proc_set_dispatchqueue_serialno_offset(p
, (uint64_t)offset
);
2891 case WQOPS_QUEUE_REQTHREADS
: {
2893 * arg2 = number of threads to start
2896 error
= wqops_queue_reqthreads(p
, arg2
, arg3
);
2899 case WQOPS_SET_EVENT_MANAGER_PRIORITY
: {
2901 * arg2 = priority for the manager thread
2903 * if _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG is set, the
2904 * ~_PTHREAD_PRIORITY_FLAGS_MASK contains a scheduling priority instead
2907 pthread_priority_t pri
= arg2
;
2909 wq
= (struct workqueue
*)pthread_kern
->proc_get_wqptr(p
);
2914 workqueue_lock_spin(wq
);
2915 if (pri
& _PTHREAD_PRIORITY_SCHED_PRI_FLAG
){
2917 * If userspace passes a scheduling priority, that takes precidence
2918 * over any QoS. (So, userspace should take care not to accidenatally
2919 * lower the priority this way.)
2921 uint32_t sched_pri
= pri
& _PTHREAD_PRIORITY_SCHED_PRI_MASK
;
2922 if (wq
->wq_event_manager_priority
& _PTHREAD_PRIORITY_SCHED_PRI_FLAG
){
2923 wq
->wq_event_manager_priority
= MAX(sched_pri
, wq
->wq_event_manager_priority
& _PTHREAD_PRIORITY_SCHED_PRI_MASK
)
2924 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG
| _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG
;
2926 wq
->wq_event_manager_priority
= sched_pri
2927 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG
| _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG
;
2929 } else if ((wq
->wq_event_manager_priority
& _PTHREAD_PRIORITY_SCHED_PRI_FLAG
) == 0){
2930 int cur_qos
= pthread_priority_get_thread_qos(wq
->wq_event_manager_priority
);
2931 int new_qos
= pthread_priority_get_thread_qos(pri
);
2932 wq
->wq_event_manager_priority
= (uint32_t)thread_qos_get_pthread_priority(MAX(cur_qos
, new_qos
)) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG
;
2934 workqueue_unlock(wq
);
2937 case WQOPS_THREAD_KEVENT_RETURN
:
2938 case WQOPS_THREAD_WORKLOOP_RETURN
:
2939 wq
= (struct workqueue
*)pthread_kern
->proc_get_wqptr(p
);
2940 PTHREAD_TRACE_WQ(TRACE_wq_runthread
| DBG_FUNC_END
, wq
, options
, 0, 0, 0);
2941 if (item
!= 0 && arg2
!= 0) {
2942 int32_t kevent_retval
;
2944 if (options
== WQOPS_THREAD_KEVENT_RETURN
) {
2945 ret
= kevent_qos_internal(p
, -1, item
, arg2
, item
, arg2
, NULL
, NULL
,
2946 KEVENT_FLAG_WORKQ
| KEVENT_FLAG_IMMEDIATE
| KEVENT_FLAG_ERROR_EVENTS
,
2948 } else /* options == WQOPS_THREAD_WORKLOOP_RETURN */ {
2949 kqueue_id_t kevent_id
= -1;
2950 ret
= kevent_id_internal(p
, &kevent_id
, item
, arg2
, item
, arg2
,
2952 KEVENT_FLAG_WORKLOOP
| KEVENT_FLAG_IMMEDIATE
| KEVENT_FLAG_ERROR_EVENTS
,
2956 * We shouldn't be getting more errors out than events we put in, so
2957 * reusing the input buffer should always provide enough space. But,
2958 * the assert is commented out since we get errors in edge cases in the
2959 * process lifecycle.
2961 //assert(ret == KERN_SUCCESS && kevent_retval >= 0);
2962 if (ret
!= KERN_SUCCESS
){
2965 } else if (kevent_retval
> 0){
2966 assert(kevent_retval
<= arg2
);
2967 *retval
= kevent_retval
;
2974 case WQOPS_THREAD_RETURN
:
2975 wq
= (struct workqueue
*)pthread_kern
->proc_get_wqptr(p
);
2976 PTHREAD_TRACE_WQ(TRACE_wq_runthread
| DBG_FUNC_END
, wq
, options
, 0, 0, 0);
2978 error
= wqops_thread_return(p
, wq
);
2979 // NOT REACHED except in case of error
2983 case WQOPS_SHOULD_NARROW
: {
2985 * arg2 = priority to test
2988 pthread_priority_t priority
= arg2
;
2989 thread_t th
= current_thread();
2990 struct threadlist
*tl
= util_get_thread_threadlist_entry(th
);
2992 if (tl
== NULL
|| (tl
->th_flags
& TH_LIST_CONSTRAINED
) == 0) {
2997 int class = pthread_priority_get_class_index(priority
);
2999 workqueue_lock_spin(wq
);
3000 bool should_narrow
= !may_start_constrained_thread(wq
, class, tl
, false);
3001 workqueue_unlock(wq
);
3003 *retval
= should_narrow
;
3012 case WQOPS_THREAD_KEVENT_RETURN
:
3013 case WQOPS_THREAD_WORKLOOP_RETURN
:
3014 case WQOPS_THREAD_RETURN
:
3015 PTHREAD_TRACE_WQ(TRACE_wq_runthread
| DBG_FUNC_START
, wq
, options
, 0, 0, 0);
3022 * We have no work to do, park ourselves on the idle list.
3024 * Consumes the workqueue lock and does not return.
3027 parkit(struct workqueue
*wq
, struct threadlist
*tl
, thread_t thread
)
3029 assert(thread
== tl
->th_thread
);
3030 assert(thread
== current_thread());
3032 PTHREAD_TRACE_WQ(TRACE_wq_thread_park
| DBG_FUNC_START
, wq
, 0, 0, 0, 0);
3034 uint32_t us_to_wait
= 0;
3036 TAILQ_REMOVE(&wq
->wq_thrunlist
, tl
, th_entry
);
3038 tl
->th_flags
&= ~TH_LIST_RUNNING
;
3039 tl
->th_flags
&= ~TH_LIST_KEVENT
;
3040 assert((tl
->th_flags
& TH_LIST_KEVENT_BOUND
) == 0);
3042 if (tl
->th_flags
& TH_LIST_CONSTRAINED
) {
3043 wq
->wq_constrained_threads_scheduled
--;
3044 tl
->th_flags
&= ~TH_LIST_CONSTRAINED
;
3047 _wq_thactive_dec(wq
, tl
->th_priority
);
3048 wq
->wq_thscheduled_count
[tl
->th_priority
]--;
3049 wq
->wq_threads_scheduled
--;
3050 uint32_t thidlecount
= ++wq
->wq_thidlecount
;
3052 pthread_kern
->thread_sched_call(thread
, NULL
);
3055 * We'd like to always have one manager thread parked so that we can have
3056 * low latency when we need to bring a manager thread up. If that idle
3057 * thread list is empty, make this thread a manager thread.
3059 * XXX: This doesn't check that there's not a manager thread outstanding,
3060 * so it's based on the assumption that most manager callouts will change
3061 * their QoS before parking. If that stops being true, this may end up
3062 * costing us more than we gain.
3064 if (TAILQ_EMPTY(&wq
->wq_thidlemgrlist
) &&
3065 tl
->th_priority
!= WORKQUEUE_EVENT_MANAGER_BUCKET
){
3066 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority
| DBG_FUNC_NONE
,
3067 wq
, thread_tid(thread
),
3068 (tl
->th_priority
<< 16) | WORKQUEUE_EVENT_MANAGER_BUCKET
, 2, 0);
3069 reset_priority(tl
, pthread_priority_from_wq_class_index(wq
, WORKQUEUE_EVENT_MANAGER_BUCKET
));
3070 tl
->th_priority
= WORKQUEUE_EVENT_MANAGER_BUCKET
;
3073 if (tl
->th_priority
== WORKQUEUE_EVENT_MANAGER_BUCKET
){
3074 TAILQ_INSERT_HEAD(&wq
->wq_thidlemgrlist
, tl
, th_entry
);
3076 TAILQ_INSERT_HEAD(&wq
->wq_thidlelist
, tl
, th_entry
);
3080 * When we remove the voucher from the thread, we may lose our importance
3081 * causing us to get preempted, so we do this after putting the thread on
3082 * the idle list. That when, when we get our importance back we'll be able
3083 * to use this thread from e.g. the kevent call out to deliver a boosting
3086 tl
->th_flags
|= TH_LIST_REMOVING_VOUCHER
;
3087 workqueue_unlock(wq
);
3088 if (pthread_kern
->thread_will_park_or_terminate
) {
3089 pthread_kern
->thread_will_park_or_terminate(tl
->th_thread
);
3091 __assert_only kern_return_t kr
;
3092 kr
= pthread_kern
->thread_set_voucher_name(MACH_PORT_NULL
);
3093 assert(kr
== KERN_SUCCESS
);
3094 workqueue_lock_spin(wq
);
3095 tl
->th_flags
&= ~(TH_LIST_REMOVING_VOUCHER
);
3097 if ((tl
->th_flags
& TH_LIST_RUNNING
) == 0) {
3098 if (thidlecount
< 101) {
3099 us_to_wait
= wq_reduce_pool_window_usecs
- ((thidlecount
-2) * (wq_reduce_pool_window_usecs
/ 100));
3101 us_to_wait
= wq_reduce_pool_window_usecs
/ 100;
3104 thread_set_pending_block_hint(thread
, kThreadWaitParkedWorkQueue
);
3105 assert_wait_timeout_with_leeway((caddr_t
)tl
, (THREAD_INTERRUPTIBLE
),
3106 TIMEOUT_URGENCY_SYS_BACKGROUND
|TIMEOUT_URGENCY_LEEWAY
, us_to_wait
,
3107 wq_reduce_pool_window_usecs
/10, NSEC_PER_USEC
);
3109 workqueue_unlock(wq
);
3111 thread_block(wq_unpark_continue
);
3112 panic("thread_block(wq_unpark_continue) returned!");
3114 workqueue_unlock(wq
);
3117 * While we'd dropped the lock to unset our voucher, someone came
3118 * around and made us runnable. But because we weren't waiting on the
3119 * event their wakeup() was ineffectual. To correct for that, we just
3120 * run the continuation ourselves.
3122 wq_unpark_continue(NULL
, THREAD_AWAKENED
);
static boolean_t
may_start_constrained_thread(struct workqueue *wq, uint32_t at_priclass,
		struct threadlist *tl, bool may_start_timer)
{
	uint32_t req_qos = _wq_thactive_best_constrained_req_qos(wq);
	wq_thactive_t thactive;

	if (may_start_timer && at_priclass < req_qos) {
		/*
		 * When called from workqueue_run_threadreq_and_unlock() pre-post newest
		 * higher priorities into the thactive state so that
		 * workqueue_callback() takes the right decision.
		 *
		 * If the admission check passes, workqueue_run_threadreq_and_unlock
		 * will reset this value before running the request.
		 */
		thactive = _wq_thactive_set_best_constrained_req_qos(wq, req_qos,
				at_priclass);
#ifdef __LP64__
		PTHREAD_TRACE_WQ(TRACE_wq_thactive_update, 1, (uint64_t)thactive,
				(uint64_t)(thactive >> 64), 0, 0);
#endif
	} else {
		thactive = _wq_thactive(wq);
	}

	uint32_t constrained_threads = wq->wq_constrained_threads_scheduled;
	if (tl && (tl->th_flags & TH_LIST_CONSTRAINED)) {
		/*
		 * don't count the current thread as scheduled
		 */
		constrained_threads--;
	}
	if (constrained_threads >= wq_max_constrained_threads) {
		PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 1,
				wq->wq_constrained_threads_scheduled,
				wq_max_constrained_threads, 0);
		/*
		 * we need 1 or more constrained threads to return to the kernel before
		 * we can dispatch additional work
		 */
		return FALSE;
	}

	/*
	 * Compute a metric for how many threads are active.  We find the
	 * highest priority request outstanding and then add up the number of
	 * active threads in that and all higher-priority buckets.  We'll also add
	 * any "busy" threads which are not active but blocked recently enough that
	 * we can't be sure they've gone idle yet.  We'll then compare this metric
	 * to our max concurrency to decide whether to add a new thread.
	 */

	uint32_t busycount, thactive_count;

	thactive_count = _wq_thactive_aggregate_downto_qos(wq, thactive,
			at_priclass, &busycount, NULL);

	if (tl && tl->th_priority <= at_priclass) {
		/*
		 * don't count this thread as currently active
		 */
		assert(thactive_count > 0);
		thactive_count--;
	}

	if (thactive_count + busycount < wq_max_concurrency[at_priclass]) {
		PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 2,
				thactive_count, busycount, 0);
		return TRUE;
	} else {
		PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 3,
				thactive_count, busycount, 0);
	}

	if (busycount && may_start_timer) {
		/*
		 * If this is called from the add timer, we won't have another timer
		 * fire when the thread exits the "busy" state, so rearm the timer.
		 */
		if (WQ_TIMER_DELAYED_NEEDED(wq)) {
			workqueue_interval_timer_start(wq);
		}
	}

	return FALSE;
}
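/*
 * Worked example of the admission metric above (numbers for illustration
 * only; wq_max_concurrency[] reflects the machine's concurrency, cf. the
 * NCPU discussion in workqueue_callback()): with
 * wq_max_concurrency[at_priclass] == 4,
 *
 *	thactive_count == 2, busycount == 1  ->  3 < 4, admit (TRUE)
 *	thactive_count == 3, busycount == 1  ->  4 < 4 fails, refuse (FALSE)
 *
 * A refusal caused only by "busy" threads is why the delayed timer is
 * rearmed above: those threads may go idle without ever poking the
 * workqueue again on their own.
 */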
static struct threadlist *
pop_from_thidlelist(struct workqueue *wq, uint32_t priclass)
{
	assert(wq->wq_thidlecount);

	struct threadlist *tl = NULL;

	if (!TAILQ_EMPTY(&wq->wq_thidlemgrlist) &&
			(priclass == WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlelist))){
		tl = TAILQ_FIRST(&wq->wq_thidlemgrlist);
		TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
		assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
	} else if (!TAILQ_EMPTY(&wq->wq_thidlelist) &&
			(priclass != WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlemgrlist))){
		tl = TAILQ_FIRST(&wq->wq_thidlelist);
		TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
		assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
	} else {
		panic("pop_from_thidlelist called with no threads available");
	}
	assert((tl->th_flags & TH_LIST_RUNNING) == 0);

	assert(wq->wq_thidlecount);
	wq->wq_thidlecount--;

	TAILQ_INSERT_TAIL(&wq->wq_thrunlist, tl, th_entry);

	tl->th_flags |= TH_LIST_RUNNING | TH_LIST_BUSY;

	wq->wq_threads_scheduled++;
	wq->wq_thscheduled_count[priclass]++;
	_wq_thactive_inc(wq, priclass);

	return tl;
}
static pthread_priority_t
pthread_priority_from_wq_class_index(struct workqueue *wq, int index)
{
	if (index == WORKQUEUE_EVENT_MANAGER_BUCKET){
		return wq->wq_event_manager_priority;
	} else {
		return class_index_get_pthread_priority(index);
	}
}
static void
reset_priority(struct threadlist *tl, pthread_priority_t pri)
{
	kern_return_t ret;
	thread_t th = tl->th_thread;

	if ((pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){
		ret = pthread_kern->thread_set_workq_qos(th, pthread_priority_get_thread_qos(pri), 0);
		assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);

		if (tl->th_flags & TH_LIST_EVENT_MGR_SCHED_PRI) {

			/* Reset priority to default (masked by QoS) */

			ret = pthread_kern->thread_set_workq_pri(th, 31, POLICY_TIMESHARE);
			assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);

			tl->th_flags &= ~TH_LIST_EVENT_MGR_SCHED_PRI;
		}
	} else {
		ret = pthread_kern->thread_set_workq_qos(th, THREAD_QOS_UNSPECIFIED, 0);
		assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
		ret = pthread_kern->thread_set_workq_pri(th, (pri & (~_PTHREAD_PRIORITY_FLAGS_MASK)), POLICY_TIMESHARE);
		assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);

		tl->th_flags |= TH_LIST_EVENT_MGR_SCHED_PRI;
	}
}
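/*
 * reset_priority() has two modes: QoS-based priorities clear any leftover
 * kernel scheduling priority (tracked by TH_LIST_EVENT_MGR_SCHED_PRI), while
 * priorities carrying _PTHREAD_PRIORITY_SCHED_PRI_FLAG pin an explicit
 * scheduler priority and set that flag so the next QoS-based reset knows to
 * undo it.  In practice the sched-pri form comes from the event manager
 * bucket, whose wq_event_manager_priority can carry that flag (see
 * WQOPS_SET_EVENT_MANAGER_PRIORITY in _workq_kernreturn()).
 */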
3289 * Picks the best request to run, and returns the best overcommit fallback
3290 * if the best pick is non overcommit and risks failing its admission check.
3292 static struct threadreq
*
3293 workqueue_best_threadreqs(struct workqueue
*wq
, struct threadlist
*tl
,
3294 struct threadreq
**fallback
)
3296 struct threadreq
*req
, *best_req
= NULL
;
3297 int priclass
, prilimit
;
3299 if ((wq
->wq_event_manager_threadreq
.tr_state
== TR_STATE_WAITING
) &&
3300 ((wq
->wq_thscheduled_count
[WORKQUEUE_EVENT_MANAGER_BUCKET
] == 0) ||
3301 (tl
&& tl
->th_priority
== WORKQUEUE_EVENT_MANAGER_BUCKET
))) {
3303 * There's an event manager request and either:
3304 * - no event manager currently running
3305 * - we are re-using the event manager
3307 req
= &wq
->wq_event_manager_threadreq
;
3308 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select
| DBG_FUNC_NONE
, wq
, req
, 1, 0, 0);
3313 prilimit
= WORKQUEUE_EVENT_MANAGER_BUCKET
;
3315 prilimit
= _wq_highest_paced_priority(wq
);
3317 for (priclass
= 0; priclass
< prilimit
; priclass
++) {
3318 req
= TAILQ_FIRST(&wq
->wq_overcommit_reqlist
[priclass
]);
3320 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select
| DBG_FUNC_NONE
, wq
, req
, 2, 0, 0);
3329 best_req
= TAILQ_FIRST(&wq
->wq_reqlist
[priclass
]);
3331 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select
| DBG_FUNC_NONE
, wq
, best_req
, 3, 0, 0);
3339 * Runs a thread request on a thread
3341 * - if thread is THREAD_NULL, will find a thread and run the request there.
3342 * Otherwise, the thread must be the current thread.
3344 * - if req is NULL, will find the highest priority request and run that. If
3345 * it is not NULL, it must be a threadreq object in state NEW. If it can not
3346 * be run immediately, it will be enqueued and moved to state WAITING.
3348 * Either way, the thread request object serviced will be moved to state
3349 * PENDING and attached to the threadlist.
3351 * Should be called with the workqueue lock held. Will drop it.
3353 * WARNING: _workq_kevent_reqthreads needs to be able to preflight any
3354 * admission checks in this function. If you are changing this function,
3355 * keep that one up-to-date.
3357 * - if parking_tl is non NULL, then the current thread is parking. This will
3358 * try to reuse this thread for a request. If no match is found, it will be
3362 workqueue_run_threadreq_and_unlock(proc_t p
, struct workqueue
*wq
,
3363 struct threadlist
*parking_tl
, struct threadreq
*req
,
3364 bool may_add_new_thread
)
3366 struct threadreq
*incoming_req
= req
;
3368 struct threadlist
*tl
= parking_tl
;
3369 int rc
= WQ_RUN_TR_THROTTLED
;
3371 assert(tl
== NULL
|| tl
->th_thread
== current_thread());
3372 assert(req
== NULL
|| req
->tr_state
== TR_STATE_NEW
);
3373 assert(!may_add_new_thread
|| !tl
);
3375 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq
| DBG_FUNC_START
, wq
, req
,
3376 tl
? thread_tid(tl
->th_thread
) : 0,
3377 req
? (req
->tr_priority
<< 16 | req
->tr_flags
) : 0, 0);
3380 * Special cases when provided an event manager request
3382 if (req
&& req
->tr_priority
== WORKQUEUE_EVENT_MANAGER_BUCKET
) {
3383 // Clients must not rely on identity of event manager requests
3384 assert(req
->tr_flags
& TR_FLAG_ONSTACK
);
3385 // You can't be both overcommit and event manager
3386 assert((req
->tr_flags
& TR_FLAG_OVERCOMMIT
) == 0);
3389 * We can only ever have one event manager request, so coalesce them if
3390 * there's already one outstanding.
3392 if (wq
->wq_event_manager_threadreq
.tr_state
== TR_STATE_WAITING
) {
3393 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_mgr_merge
| DBG_FUNC_NONE
, wq
, req
, 0, 0, 0);
3395 struct threadreq
*existing_req
= &wq
->wq_event_manager_threadreq
;
3396 if (req
->tr_flags
& TR_FLAG_KEVENT
) {
3397 existing_req
->tr_flags
|= TR_FLAG_KEVENT
;
3401 incoming_req
= NULL
;
3404 if (wq
->wq_thscheduled_count
[WORKQUEUE_EVENT_MANAGER_BUCKET
] &&
3405 (!tl
|| tl
->th_priority
!= WORKQUEUE_EVENT_MANAGER_BUCKET
)){
3407 * There can only be one event manager running at a time.
3409 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq
| DBG_FUNC_END
, wq
, 1, 0, 0, 0);
3414 again
: // Start again after creating a thread
3416 if (_wq_exiting(wq
)) {
3417 rc
= WQ_RUN_TR_EXITING
;
3422 * Thread request selection and admission control
3424 struct threadreq
*fallback
= NULL
;
3426 if ((req
->tr_flags
& TR_FLAG_NO_PACING
) == 0 &&
3427 _wq_should_pace_priority(wq
, req
->tr_priority
)) {
3429 * If a request fails the pacing admission check, then thread
3430 * requests are redriven when the pacing thread is finally scheduled
3431 * when it calls _wq_pacing_end() in wq_unpark_continue().
3435 } else if (wq
->wq_reqcount
== 0) {
3436 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq
| DBG_FUNC_END
, wq
, 2, 0, 0, 0);
3438 } else if ((req
= workqueue_best_threadreqs(wq
, tl
, &fallback
)) == NULL
) {
3439 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq
| DBG_FUNC_END
, wq
, 3, 0, 0, 0);
3443 if ((req
->tr_flags
& TR_FLAG_OVERCOMMIT
) == 0 &&
3444 (req
->tr_priority
< WORKQUEUE_EVENT_MANAGER_BUCKET
)) {
3445 if (!may_start_constrained_thread(wq
, req
->tr_priority
, parking_tl
, true)) {
3447 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq
| DBG_FUNC_END
, wq
, 4, 0, 0, 0);
3450 assert(req
->tr_state
== TR_STATE_WAITING
);
3459 if (tl
->th_priority
!= req
->tr_priority
) {
3460 _wq_thactive_move(wq
, tl
->th_priority
, req
->tr_priority
);
3461 wq
->wq_thscheduled_count
[tl
->th_priority
]--;
3462 wq
->wq_thscheduled_count
[req
->tr_priority
]++;
3464 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select
| DBG_FUNC_NONE
,
3465 wq
, 1, thread_tid(tl
->th_thread
), 0, 0);
3466 } else if (wq
->wq_thidlecount
) {
3467 tl
= pop_from_thidlelist(wq
, req
->tr_priority
);
3469 * This call will update wq_thscheduled_count and wq_thactive_count for
3470 * the provided priority. It will not set the returned thread to that
3471 * priority. This matches the behavior of the parking_tl clause above.
3473 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select
| DBG_FUNC_NONE
,
3474 wq
, 2, thread_tid(tl
->th_thread
), 0, 0);
3475 } else /* no idle threads */ {
3476 if (!may_add_new_thread
|| wq
->wq_nthreads
>= wq_max_threads
) {
3477 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq
| DBG_FUNC_END
, wq
, 5,
3478 may_add_new_thread
, wq
->wq_nthreads
, 0);
3479 if (wq
->wq_nthreads
< wq_max_threads
) {
3480 rc
= WQ_RUN_TR_THREAD_NEEDED
;
3485 bool added_thread
= workqueue_addnewthread(p
, wq
);
3487 * workqueue_addnewthread will drop and re-take the lock, so we
3488 * need to ensure we still have a cached request.
3490 * It also means we have to pick a new request, since our old pick may
3491 * not be valid anymore.
3494 if (req
&& (req
->tr_flags
& TR_FLAG_ONSTACK
)) {
3495 _threadreq_copy_prepare(wq
);
3499 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select
| DBG_FUNC_NONE
,
3502 } else if (_wq_exiting(wq
)) {
3503 rc
= WQ_RUN_TR_EXITING
;
3506 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq
| DBG_FUNC_END
, wq
, 6, 0, 0, 0);
3508 * Something caused thread creation to fail. Kick off the timer in
3509 * the hope that it'll succeed next time.
3511 if (WQ_TIMER_DELAYED_NEEDED(wq
)) {
3512 workqueue_interval_timer_start(wq
);
3519 * Setup thread, mark request as complete and run with it.
3521 if (req
->tr_state
== TR_STATE_WAITING
) {
3522 _threadreq_dequeue(wq
, req
);
3524 if (tl
->th_priority
!= req
->tr_priority
) {
3525 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority
| DBG_FUNC_NONE
,
3526 wq
, thread_tid(tl
->th_thread
),
3527 (tl
->th_priority
<< 16) | req
->tr_priority
, 1, 0);
3528 reset_priority(tl
, pthread_priority_from_wq_class_index(wq
, req
->tr_priority
));
3529 tl
->th_priority
= (uint8_t)req
->tr_priority
;
3531 if (req
->tr_flags
& TR_FLAG_OVERCOMMIT
) {
3532 if ((tl
->th_flags
& TH_LIST_CONSTRAINED
) != 0) {
3533 tl
->th_flags
&= ~TH_LIST_CONSTRAINED
;
3534 wq
->wq_constrained_threads_scheduled
--;
3537 if ((tl
->th_flags
& TH_LIST_CONSTRAINED
) == 0) {
3538 tl
->th_flags
|= TH_LIST_CONSTRAINED
;
3539 wq
->wq_constrained_threads_scheduled
++;
3543 if (!parking_tl
&& !(req
->tr_flags
& TR_FLAG_NO_PACING
)) {
3544 _wq_pacing_start(wq
, tl
);
3546 if ((req
->tr_flags
& TR_FLAG_OVERCOMMIT
) == 0) {
3547 uint32_t old_qos
, new_qos
;
3550 * If we are scheduling a constrained thread request, we may need to
3551 * update the best constrained qos in the thactive atomic state.
3553 for (new_qos
= 0; new_qos
< WQ_THACTIVE_NO_PENDING_REQUEST
; new_qos
++) {
3554 if (TAILQ_FIRST(&wq
->wq_reqlist
[new_qos
]))
3557 old_qos
= _wq_thactive_best_constrained_req_qos(wq
);
3558 if (old_qos
!= new_qos
) {
3559 wq_thactive_t v
= _wq_thactive_set_best_constrained_req_qos(wq
,
3562 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update
, 2, (uint64_t)v
,
3563 (uint64_t)(v
>> 64), 0, 0);
3565 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update
, 2, v
, 0, 0, 0);
3570 uint32_t upcall_flags
= WQ_FLAG_THREAD_NEWSPI
;
3571 if (req
->tr_flags
& TR_FLAG_OVERCOMMIT
)
3572 upcall_flags
|= WQ_FLAG_THREAD_OVERCOMMIT
;
3573 if (req
->tr_flags
& TR_FLAG_KEVENT
)
3574 upcall_flags
|= WQ_FLAG_THREAD_KEVENT
;
3575 if (req
->tr_flags
& TR_FLAG_WORKLOOP
)
3576 upcall_flags
|= WQ_FLAG_THREAD_WORKLOOP
| WQ_FLAG_THREAD_KEVENT
;
3577 if (tl
->th_priority
== WORKQUEUE_EVENT_MANAGER_BUCKET
)
3578 upcall_flags
|= WQ_FLAG_THREAD_EVENT_MANAGER
;
3579 tl
->th_upcall_flags
= upcall_flags
>> WQ_FLAG_THREAD_PRIOSHIFT
;
3581 if (req
->tr_flags
& TR_FLAG_KEVENT
) {
3582 tl
->th_flags
|= TH_LIST_KEVENT
;
3584 tl
->th_flags
&= ~TH_LIST_KEVENT
;
3586 return _threadreq_complete_and_unlock(p
, wq
, req
, tl
);
3590 _threadreq_enqueue(wq
, incoming_req
);
3595 if (parking_tl
&& !(parking_tl
->th_flags
& TH_LIST_UNBINDING
)) {
3596 parkit(wq
, parking_tl
, parking_tl
->th_thread
);
3597 __builtin_unreachable();
3600 workqueue_unlock(wq
);
3606 * parked thread wakes up
3609 wq_unpark_continue(void* __unused ptr
, wait_result_t wait_result
)
3611 boolean_t first_use
= false;
3612 thread_t th
= current_thread();
3613 proc_t p
= current_proc();
3615 struct uthread
*uth
= pthread_kern
->get_bsdthread_info(th
);
3616 if (uth
== NULL
) goto done
;
3618 struct workqueue
*wq
= pthread_kern
->proc_get_wqptr(p
);
3619 if (wq
== NULL
) goto done
;
3621 workqueue_lock_spin(wq
);
3623 struct threadlist
*tl
= pthread_kern
->uthread_get_threadlist(uth
);
3624 assert(tl
!= WQ_THREADLIST_EXITING_POISON
);
3627 * We woke up before addnewthread() was finished setting us up. Go
3628 * ahead and exit, but before we do poison the threadlist variable so
3629 * that addnewthread() doesn't think we are valid still.
3631 pthread_kern
->uthread_set_threadlist(uth
, WQ_THREADLIST_EXITING_POISON
);
3632 workqueue_unlock(wq
);
3636 assert(tl
->th_flags
& TH_LIST_INITED
);
3638 if ((tl
->th_flags
& TH_LIST_NEW
)){
3639 tl
->th_flags
&= ~(TH_LIST_NEW
);
3643 if ((tl
->th_flags
& (TH_LIST_RUNNING
| TH_LIST_BUSY
)) == TH_LIST_RUNNING
) {
3645 * The normal wakeup path.
3647 goto return_to_user
;
3650 if ((tl
->th_flags
& TH_LIST_RUNNING
) == 0 &&
3651 wait_result
== THREAD_TIMED_OUT
&&
3652 tl
->th_priority
== WORKQUEUE_EVENT_MANAGER_BUCKET
&&
3653 TAILQ_FIRST(&wq
->wq_thidlemgrlist
) == tl
&&
3654 TAILQ_NEXT(tl
, th_entry
) == NULL
){
3656 * If we are the only idle manager and we pop'ed for self-destruction,
3657 * then don't actually exit. Instead, free our stack to save some
3658 * memory and re-park.
3661 workqueue_unlock(wq
);
3663 vm_map_t vmap
= wq
->wq_map
;
3665 // Keep this in sync with _setup_wqthread()
3666 const vm_size_t guardsize
= vm_map_page_size(vmap
);
3667 const user_addr_t freeaddr
= (user_addr_t
)tl
->th_stackaddr
+ guardsize
;
3668 const vm_map_offset_t freesize
= vm_map_trunc_page_mask((PTH_DEFAULT_STACKSIZE
+ guardsize
+ PTHREAD_T_OFFSET
) - 1, vm_map_page_mask(vmap
)) - guardsize
;
3670 __assert_only
int kr
= mach_vm_behavior_set(vmap
, freeaddr
, freesize
, VM_BEHAVIOR_REUSABLE
);
3672 if (kr
!= KERN_SUCCESS
&& kr
!= KERN_INVALID_ADDRESS
) {
3673 os_log_error(OS_LOG_DEFAULT
, "unable to make thread stack reusable (kr: %d)", kr
);
3677 workqueue_lock_spin(wq
);
3679 if ( !(tl
->th_flags
& TH_LIST_RUNNING
)) {
3680 thread_set_pending_block_hint(th
, kThreadWaitParkedWorkQueue
);
3681 assert_wait((caddr_t
)tl
, (THREAD_INTERRUPTIBLE
));
3683 workqueue_unlock(wq
);
3685 thread_block(wq_unpark_continue
);
3686 __builtin_unreachable();
3690 if ((tl
->th_flags
& TH_LIST_RUNNING
) == 0) {
3691 assert((tl
->th_flags
& TH_LIST_BUSY
) == 0);
3693 PTHREAD_TRACE_WQ(TRACE_wq_thread_park
| DBG_FUNC_END
, wq
, 0, 0, 0, 0);
3696 * We were set running, but not for the purposes of actually running.
3697 * This could be because the timer elapsed. Or it could be because the
3698 * thread aborted. Either way, we need to return to userspace to exit.
3700 * The call to workqueue_removethread will consume the lock.
3704 (tl
->th_priority
< qos_class_get_class_index(WQ_THREAD_CLEANUP_QOS
) ||
3705 (tl
->th_priority
== WORKQUEUE_EVENT_MANAGER_BUCKET
))) {
3706 // Reset the QoS to something low for the pthread cleanup
3707 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority
| DBG_FUNC_NONE
,
3709 (tl
->th_priority
<< 16) | qos_class_get_class_index(WQ_THREAD_CLEANUP_QOS
), 3, 0);
3710 pthread_priority_t cleanup_pri
= _pthread_priority_make_newest(WQ_THREAD_CLEANUP_QOS
, 0, 0);
3711 reset_priority(tl
, cleanup_pri
);
3714 workqueue_removethread(tl
, 0, first_use
);
3717 pthread_kern
->thread_bootstrap_return();
3719 pthread_kern
->unix_syscall_return(0);
3721 __builtin_unreachable();
3725 * The timer woke us up or the thread was aborted. However, we have
3726 * already started to make this a runnable thread. Wait for that to
3727 * finish, then continue to userspace.
3729 while ((tl
->th_flags
& TH_LIST_BUSY
)) {
3730 assert_wait((caddr_t
)tl
, (THREAD_UNINT
));
3732 workqueue_unlock(wq
);
3734 thread_block(THREAD_CONTINUE_NULL
);
3736 workqueue_lock_spin(wq
);
3741 PTHREAD_TRACE_WQ(TRACE_wq_thread_park
| DBG_FUNC_END
, wq
, 0, 0, 0, 0);
3743 if (_wq_pacing_end(wq
, tl
) && wq
->wq_reqcount
) {
3744 workqueue_run_threadreq_and_unlock(p
, wq
, NULL
, NULL
, true);
3746 workqueue_unlock(wq
);
3748 _setup_wqthread(p
, th
, wq
, tl
, first_use
? WQ_SETUP_FIRST_USE
: 0);
3749 pthread_kern
->thread_sched_call(th
, workqueue_callback
);
3752 pthread_kern
->thread_bootstrap_return();
3754 pthread_kern
->unix_syscall_return(EJUSTRETURN
);
3756 panic("Our attempt to return to userspace failed...");
/**
 * configures initial thread stack/registers to jump into:
 * _pthread_wqthread(pthread_t self, mach_port_t kport, void *stackaddr, void *keventlist, int upcall_flags, int nkevents);
 * to get there we jump through assembly stubs in pthread_asm.s.  Those
 * routines setup a stack frame, using the current stack pointer, and marshall
 * arguments from registers to the stack as required by the ABI.
 *
 * One odd thing we do here is to start the pthread_t 4k below what would be the
 * top of the stack otherwise.  This is because usually only the first 4k of the
 * pthread_t will be used and so we want to put it on the same 16k page as the
 * top of the stack to save memory.
 *
 * When we are done the stack will look like:
 * |-----------| th_stackaddr + th_allocsize
 * |pthread_t  | th_stackaddr + DEFAULT_STACKSIZE + guardsize + PTHREAD_STACK_OFFSET
 * |kevent list| optionally - at most WQ_KEVENT_LIST_LEN events
 * |kevent data| optionally - at most WQ_KEVENT_DATA_SIZE bytes
 * |stack gap  | bottom aligned to 16 bytes, and at least as big as stack_gap_min
 * |   STACK   |
 * |     ⇓     |
 * |           |
 * |guard page | guardsize
 * |-----------| th_stackaddr
 */
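/*
 * In terms of the variables computed below, the key addresses are roughly:
 *
 *   stack_bottom_addr = th_stackaddr + guardsize
 *   pthread_self_addr = th_stackaddr + guardsize + PTH_DEFAULT_STACKSIZE + PTHREAD_T_OFFSET
 *   stack_top_addr    = (pthread_self_addr - stack_gap_min) & -stack_align_min
 *
 * i.e. the pthread_t sits just above the usable stack, and the initial stack
 * pointer is dropped below it by at least stack_gap_min and aligned down.
 */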
void
_setup_wqthread(proc_t p, thread_t th, struct workqueue *wq,
		struct threadlist *tl, int setup_flags)
{
	int error;
	if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) {
		/*
		 * For preemption reasons, we want to reset the voucher as late as
		 * possible, so we do it in two places:
		 *   - Just before parking (i.e. in parkit())
		 *   - Prior to doing the setup for the next workitem (i.e. here)
		 *
		 * Those two places are sufficient to ensure we always reset it before
		 * it goes back out to user space, but be careful to not break that
		 * invariant.
		 */
		__assert_only kern_return_t kr;
		kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
		assert(kr == KERN_SUCCESS);
	}
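	/*
	 * Passing MACH_PORT_NULL to thread_set_voucher_name() above clears any
	 * voucher currently adopted by this thread, so no stale voucher (and
	 * whatever importance it may carry) rides along back into user space.
	 */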
	uint32_t upcall_flags = tl->th_upcall_flags << WQ_FLAG_THREAD_PRIOSHIFT;
	if (!(setup_flags & WQ_SETUP_FIRST_USE)) {
		upcall_flags |= WQ_FLAG_THREAD_REUSE;
	}

	/*
	 * Put the QoS class value into the lower bits of the reuse_thread register, this is where
	 * the thread priority used to be stored anyway.
	 */
	pthread_priority_t priority = pthread_priority_from_wq_class_index(wq, tl->th_priority);
	upcall_flags |= (_pthread_priority_get_qos_newest(priority) & WQ_FLAG_THREAD_PRIOMASK);
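	/*
	 * At this point upcall_flags carries the saved th_upcall_flags in the bits
	 * above WQ_FLAG_THREAD_PRIOSHIFT, the thread's QoS class in the low
	 * WQ_FLAG_THREAD_PRIOMASK bits, and WQ_FLAG_THREAD_REUSE when this is a
	 * recycled (non-first-use) thread.  The whole word is handed to userspace
	 * as the upcall_flags argument of _pthread_wqthread().
	 */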
	const vm_size_t guardsize = vm_map_page_size(tl->th_workq->wq_map);
	const vm_size_t stack_gap_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_REDZONE_LEN;
	const vm_size_t stack_align_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_STK_ALIGN;

	user_addr_t pthread_self_addr = (user_addr_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET);
	user_addr_t stack_top_addr = (user_addr_t)((pthread_self_addr - stack_gap_min) & -stack_align_min);
	user_addr_t stack_bottom_addr = (user_addr_t)(tl->th_stackaddr + guardsize);

	user_addr_t wqstart_fnptr = pthread_kern->proc_get_wqthread(p);
	if (!wqstart_fnptr) {
		panic("workqueue thread start function pointer is NULL");
	}
	if (setup_flags & WQ_SETUP_FIRST_USE) {
		uint32_t tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
		if (tsd_offset) {
			mach_vm_offset_t th_tsd_base = (mach_vm_offset_t)pthread_self_addr + tsd_offset;
			kern_return_t kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
			if (kret == KERN_SUCCESS) {
				upcall_flags |= WQ_FLAG_THREAD_TSD_BASE_SET;
			}
		}

		/*
		 * Pre-fault the first page of the new thread's stack and the page that will
		 * contain the pthread_t structure.
		 */
		vm_map_t vmap = pthread_kern->current_map();
		if (vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)) !=
				vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap))) {
			vm_fault(vmap,
					vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)),
					VM_PROT_READ | VM_PROT_WRITE,
					FALSE,
					THREAD_UNINT, NULL, 0);
		}
		vm_fault(vmap,
				vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap)),
				VM_PROT_READ | VM_PROT_WRITE,
				FALSE,
				THREAD_UNINT, NULL, 0);
	}
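	/*
	 * The faults above touch at most two pages: the page backing the first
	 * usable stack slot (stack_top_addr minus the red zone) and the page that
	 * will hold the pthread_t; the preceding page-mask comparison skips the
	 * first fault when both land on the same page.
	 */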
	user_addr_t kevent_list = NULL;
	int kevent_count = 0;
	if (upcall_flags & WQ_FLAG_THREAD_KEVENT) {
		bool workloop = upcall_flags & WQ_FLAG_THREAD_WORKLOOP;

		kevent_list = pthread_self_addr - WQ_KEVENT_LIST_LEN * sizeof(struct kevent_qos_s);
		kevent_count = WQ_KEVENT_LIST_LEN;

		user_addr_t kevent_id_addr = kevent_list;
		if (workloop) {
			/*
			 * The kevent ID goes just below the kevent list.  Sufficiently new
			 * userspace will know to look there.  Old userspace will just
			 * ignore it.
			 */
			kevent_id_addr -= sizeof(kqueue_id_t);
		}

		user_addr_t kevent_data_buf = kevent_id_addr - WQ_KEVENT_DATA_SIZE;
		user_size_t kevent_data_available = WQ_KEVENT_DATA_SIZE;

		int32_t events_out = 0;

		assert(tl->th_flags | TH_LIST_KEVENT_BOUND);
		unsigned int flags = KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE;
		if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
			flags |= KEVENT_FLAG_WORKQ_MANAGER;
		}
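		/*
		 * The scratch area carved out just below pthread_self_addr for the
		 * kevent upcall looks like (highest address first):
		 *
		 *   pthread_self_addr
		 *   kevent_list     - WQ_KEVENT_LIST_LEN * sizeof(struct kevent_qos_s)
		 *   kevent_id_addr  - sizeof(kqueue_id_t), workloop case only
		 *   kevent_data_buf - WQ_KEVENT_DATA_SIZE bytes of stacked event data
		 *
		 * The kevent call below fills kevent_list/kevent_data_buf in place.
		 */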
		int ret = 0;
		if (workloop) {
			flags |= KEVENT_FLAG_WORKLOOP;
			kqueue_id_t kevent_id = -1;
			ret = kevent_id_internal(p, &kevent_id,
					NULL, 0, kevent_list, kevent_count,
					kevent_data_buf, &kevent_data_available,
					flags, &events_out);
			copyout(&kevent_id, kevent_id_addr, sizeof(kevent_id));
		} else {
			flags |= KEVENT_FLAG_WORKQ;
			ret = kevent_qos_internal(p,
					class_index_get_thread_qos(tl->th_priority),
					NULL, 0, kevent_list, kevent_count,
					kevent_data_buf, &kevent_data_available,
					flags, &events_out);
		}
		// squash any errors into just empty output
		if (ret != KERN_SUCCESS || events_out == -1) {
			events_out = 0;
			kevent_data_available = WQ_KEVENT_DATA_SIZE;
		}

		// We shouldn't get data out if there aren't events available
		assert(events_out != 0 || kevent_data_available == WQ_KEVENT_DATA_SIZE);

		if (events_out > 0) {
			if (kevent_data_available == WQ_KEVENT_DATA_SIZE) {
				stack_top_addr = (kevent_id_addr - stack_gap_min) & -stack_align_min;
			} else {
				stack_top_addr = (kevent_data_buf + kevent_data_available - stack_gap_min) & -stack_align_min;
			}

			kevent_count = events_out;
		} else {
			kevent_list = NULL;
			kevent_count = 0;
		}
	}

	PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START, wq, 0, 0, 0, 0);
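	/*
	 * Hand the six _pthread_wqthread() arguments from the comment above to the
	 * userspace entry point: on i386 they go in eax/ebx/ecx/edx/edi/esi with
	 * esp at stack_top_addr, while on x86_64 they land directly in the ABI
	 * argument registers rdi/rsi/rdx/rcx/r8/r9 with rsp at stack_top_addr.
	 */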
#if defined(__i386__) || defined(__x86_64__)
	if (proc_is64bit(p) == 0) {
		x86_thread_state32_t state = {
			.eip = (unsigned int)wqstart_fnptr,
			.eax = /* arg0 */ (unsigned int)pthread_self_addr,
			.ebx = /* arg1 */ (unsigned int)tl->th_thport,
			.ecx = /* arg2 */ (unsigned int)stack_bottom_addr,
			.edx = /* arg3 */ (unsigned int)kevent_list,
			.edi = /* arg4 */ (unsigned int)upcall_flags,
			.esi = /* arg5 */ (unsigned int)kevent_count,

			.esp = (int)((vm_offset_t)stack_top_addr),
		};

		error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
		if (error != KERN_SUCCESS) {
			panic(__func__ ": thread_set_wq_state failed: %d", error);
		}
	} else {
		x86_thread_state64_t state64 = {
			// x86-64 already passes all the arguments in registers, so we just put them in their final place here
			.rip = (uint64_t)wqstart_fnptr,
			.rdi = (uint64_t)pthread_self_addr,
			.rsi = (uint64_t)tl->th_thport,
			.rdx = (uint64_t)stack_bottom_addr,
			.rcx = (uint64_t)kevent_list,
			.r8  = (uint64_t)upcall_flags,
			.r9  = (uint64_t)kevent_count,

			.rsp = (uint64_t)(stack_top_addr),
		};

		error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
		if (error != KERN_SUCCESS) {
			panic(__func__ ": thread_set_wq_state failed: %d", error);
		}
	}
#else
#error setup_wqthread not defined for this architecture
#endif
}
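/*
 * Debug sysctl handler: copies an array of workq_reqthreads_req_s requests in
 * from userspace and replays them through _workq_reqthreads(), presumably so
 * the thread-request path can be exercised from a test harness.  It is
 * registered further down via sysctl__debug_wq_kevent_test.
 */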
static int wq_kevent_test SYSCTL_HANDLER_ARGS
{
	//(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
#pragma unused(oidp, arg1, arg2)
	int error;
	struct workq_reqthreads_req_s requests[64] = {};

	if (req->newlen > sizeof(requests) || req->newlen < sizeof(struct workq_reqthreads_req_s))
		return EINVAL;

	error = copyin(req->newptr, requests, req->newlen);
	if (error) return error;

	_workq_reqthreads(req->p, (int)(req->newlen / sizeof(struct workq_reqthreads_req_s)), requests);

	return 0;
}
int
_fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
{
	struct workqueue * wq;
	int error = 0;
	int activecount;

	if ((wq = pthread_kern->proc_get_wqptr(p)) == NULL) {
		return EINVAL;
	}

	/*
	 * This is sometimes called from interrupt context by the kperf sampler.
	 * In that case, it's not safe to spin trying to take the lock since we
	 * might already hold it.  So, we just try-lock it and error out if it's
	 * already held.  Since this is just a debugging aid, and all our callers
	 * are able to handle an error, that's fine.
	 */
	bool locked = workqueue_lock_try(wq);
	if (!locked) {
		return EBUSY;
	}

	activecount = _wq_thactive_aggregate_downto_qos(wq, _wq_thactive(wq),
			WORKQUEUE_NUM_BUCKETS - 1, NULL, NULL);
	pwqinfo->pwq_nthreads = wq->wq_nthreads;
	pwqinfo->pwq_runthreads = activecount;
	pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
	pwqinfo->pwq_state = 0;

	if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
		pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
	}

	if (wq->wq_nthreads >= wq_max_threads) {
		pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
	}

	workqueue_unlock(wq);
	return error;
}
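/*
 * Lock-free variant of the state query for kdp (kernel debugger) context: it
 * only peeks at the workqueue and bails out with 0 when the workqueue lock is
 * already held, since taking or spinning on locks is not safe there.
 */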
uint32_t
_get_pwq_state_kdp(proc_t p)
{
	struct workqueue *wq = pthread_kern->proc_get_wqptr(p);

	if (wq == NULL || workqueue_lock_spin_is_acquired_kdp(wq)) {
		return 0;
	}

	uint32_t pwq_state = WQ_FLAGS_AVAILABLE;

	if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
		pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
	}

	if (wq->wq_nthreads >= wq_max_threads) {
		pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
	}

	return pwq_state;
}
int
_thread_selfid(__unused struct proc *p, uint64_t *retval)
{
	thread_t thread = current_thread();
	*retval = thread_tid(thread);
	return KERN_SUCCESS;
}
void
_pthread_init(void)
{
	pthread_lck_grp_attr = lck_grp_attr_alloc_init();
	pthread_lck_grp = lck_grp_alloc_init("pthread", pthread_lck_grp_attr);

	/*
	 * allocate the lock attribute for pthread synchronizers
	 */
	pthread_lck_attr = lck_attr_alloc_init();

	pthread_list_mlock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);

	pth_global_hashinit();
	psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL);

	pthread_zone_workqueue = zinit(sizeof(struct workqueue),
			1024 * sizeof(struct workqueue), 8192, "pthread.workqueue");
	pthread_zone_threadlist = zinit(sizeof(struct threadlist),
			1024 * sizeof(struct threadlist), 8192, "pthread.threadlist");
	pthread_zone_threadreq = zinit(sizeof(struct threadreq),
			1024 * sizeof(struct threadreq), 8192, "pthread.threadreq");
	sysctl_register_oid(&sysctl__kern_wq_stalled_window_usecs);
	sysctl_register_oid(&sysctl__kern_wq_reduce_pool_window_usecs);
	sysctl_register_oid(&sysctl__kern_wq_max_timer_interval_usecs);
	sysctl_register_oid(&sysctl__kern_wq_max_threads);
	sysctl_register_oid(&sysctl__kern_wq_max_constrained_threads);
	sysctl_register_oid(&sysctl__kern_pthread_debug_tracing);

	sysctl_register_oid(&sysctl__debug_wq_kevent_test);

	for (int i = 0; i < WORKQUEUE_NUM_BUCKETS; i++) {
		uint32_t thread_qos = _wq_bucket_to_thread_qos(i);
		wq_max_concurrency[i] = pthread_kern->qos_max_parallelism(thread_qos,
				QOS_PARALLELISM_COUNT_LOGICAL);
	}
	wq_max_concurrency[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1;