/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995-2005 Apple Computer, Inc. All Rights Reserved */
#pragma mark - Front Matter

#define _PTHREAD_CONDATTR_T
#define _PTHREAD_COND_T
#define _PTHREAD_MUTEXATTR_T
#define _PTHREAD_MUTEX_T
#define _PTHREAD_RWLOCKATTR_T
#define _PTHREAD_RWLOCK_T

#undef pthread_mutexattr_t
#undef pthread_mutex_t
#undef pthread_condattr_t
#undef pthread_cond_t
#undef pthread_rwlockattr_t
#undef pthread_rwlock_t
#include <sys/cdefs.h>

// <rdar://problem/26158937> panic() should be marked noreturn
extern void panic(const char *string, ...) __printflike(1,2) __dead2;
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
//#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/timeb.h>
#include <sys/times.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/kdebug.h>
//#include <sys/sysproto.h>
#include <sys/user.h>       /* for coredump */
#include <sys/proc_info.h>  /* for fill_procworkqueue */

#include <mach/mach_port.h>
#include <mach/mach_types.h>
#include <mach/semaphore.h>
#include <mach/sync_policy.h>
#include <mach/task.h>
#include <mach/vm_prot.h>
#include <kern/kern_types.h>
#include <kern/task.h>
#include <kern/clock.h>
#include <mach/kern_return.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/kalloc.h>
#include <kern/sched_prim.h>    /* for thread_exception_return */
#include <kern/processor.h>
#include <kern/assert.h>
#include <mach/mach_vm.h>
#include <mach/mach_param.h>
#include <mach/thread_status.h>
#include <mach/thread_policy.h>
#include <mach/message.h>
#include <mach/port.h>
//#include <vm/vm_protos.h>
#include <vm/vm_fault.h>
#include <vm/vm_map.h>
#include <mach/thread_act.h>    /* for thread_resume */
#include <machine/machine_routines.h>
#include <mach/shared_region.h>

#include <libkern/OSAtomic.h>
#include <libkern/libkern.h>

#include <sys/pthread_shims.h>
#include "kern_internal.h"
// XXX: Dirty import for sys/signalvar.h that's wrapped in BSD_KERNEL_PRIVATE
#define sigcantmask (sigmask(SIGKILL) | sigmask(SIGSTOP))

// XXX: Ditto for thread tags from kern/thread.h
#define THREAD_TAG_MAINTHREAD 0x1
#define THREAD_TAG_PTHREAD 0x10
#define THREAD_TAG_WORKQUEUE 0x20
lck_grp_attr_t *pthread_lck_grp_attr;
lck_grp_t *pthread_lck_grp;
lck_attr_t *pthread_lck_attr;

extern void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64);
extern void workqueue_thread_yielded(void);
enum run_nextreq_mode {
    RUN_NEXTREQ_DEFAULT,
    RUN_NEXTREQ_DEFAULT_KEVENT,
    RUN_NEXTREQ_OVERCOMMIT,
    RUN_NEXTREQ_OVERCOMMIT_KEVENT,
    RUN_NEXTREQ_DEFERRED_OVERCOMMIT,
    RUN_NEXTREQ_UNCONSTRAINED,
    RUN_NEXTREQ_EVENT_MANAGER,
    RUN_NEXTREQ_ADD_TIMER,
};
static thread_t workqueue_run_nextreq(proc_t p, struct workqueue *wq, thread_t th,
        enum run_nextreq_mode mode, pthread_priority_t prio,
        bool kevent_bind_via_return);
static boolean_t workqueue_run_one(proc_t p, struct workqueue *wq, boolean_t overcommit, pthread_priority_t priority);

static void wq_runreq(proc_t p, thread_t th, struct workqueue *wq,
        struct threadlist *tl, boolean_t return_directly, boolean_t deferred_kevent);

static void _setup_wqthread(proc_t p, thread_t th, struct workqueue *wq, struct threadlist *tl, bool first_use);

static void reset_priority(struct threadlist *tl, pthread_priority_t pri);
static pthread_priority_t pthread_priority_from_wq_class_index(struct workqueue *wq, int index);

static void wq_unpark_continue(void* ptr, wait_result_t wait_result) __dead2;

static boolean_t workqueue_addnewthread(struct workqueue *wq, boolean_t ignore_constrained_thread_limit);

static void workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use);
static void workqueue_lock_spin(struct workqueue *);
static void workqueue_unlock(struct workqueue *);

static boolean_t may_start_constrained_thread(struct workqueue *wq, uint32_t at_priclass, uint32_t my_priclass, boolean_t *start_timer);

static mach_vm_offset_t stack_addr_hint(proc_t p, vm_map_t vmap);

int proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc);
int proc_setalltargetconc(pid_t pid, int32_t * targetconcp);
#define WQ_MAXPRI_MIN       0   /* low prio queue num */
#define WQ_MAXPRI_MAX       2   /* max prio queue num */
#define WQ_PRI_NUM          3   /* number of prio work queues */

#define C_32_STK_ALIGN      16
#define C_64_STK_ALIGN      16
#define C_64_REDZONE_LEN    128

#define PTHREAD_T_OFFSET    0

/*
 * Flags field passed to bsdthread_create and back in pthread_start
 * 31  <---------------------------------> 0
 * _________________________________________
 * | flags(8) | policy(8) | importance(16) |
 * -----------------------------------------
 */
#define PTHREAD_START_CUSTOM            0x01000000
#define PTHREAD_START_SETSCHED          0x02000000
#define PTHREAD_START_DETACHED          0x04000000
#define PTHREAD_START_QOSCLASS          0x08000000
#define PTHREAD_START_TSD_BASE_SET      0x10000000
#define PTHREAD_START_QOSCLASS_MASK     0x00ffffff
#define PTHREAD_START_POLICY_BITSHIFT   16
#define PTHREAD_START_POLICY_MASK       0xff
#define PTHREAD_START_IMPORTANCE_MASK   0xffff
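
/*
 * Illustrative only (editorial note, not part of the original source): a minimal
 * sketch of how a caller could pack the 32-bit flags word described above when
 * requesting explicit scheduling, using the SCHED_RR value defined just below.
 *
 *   uint32_t flags = PTHREAD_START_SETSCHED
 *       | ((SCHED_RR & PTHREAD_START_POLICY_MASK) << PTHREAD_START_POLICY_BITSHIFT)
 *       | (37 & PTHREAD_START_IMPORTANCE_MASK);   // importance occupies the low 16 bits
 *
 * _bsdthread_create() recovers the fields with the matching masks and shift.
 */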
#define SCHED_OTHER     POLICY_TIMESHARE
#define SCHED_FIFO      POLICY_FIFO
#define SCHED_RR        POLICY_RR

#define BASEPRI_DEFAULT 31
uint32_t wq_yielded_threshold           = WQ_YIELDED_THRESHOLD;
uint32_t wq_yielded_window_usecs        = WQ_YIELDED_WINDOW_USECS;
uint32_t wq_stalled_window_usecs        = WQ_STALLED_WINDOW_USECS;
uint32_t wq_reduce_pool_window_usecs    = WQ_REDUCE_POOL_WINDOW_USECS;
uint32_t wq_max_timer_interval_usecs    = WQ_MAX_TIMER_INTERVAL_USECS;
uint32_t wq_max_threads                 = WORKQUEUE_MAXTHREADS;
uint32_t wq_max_constrained_threads     = WORKQUEUE_MAXTHREADS / 8;
uint32_t wq_max_concurrency             = 1; // set to ncpus on load
SYSCTL_INT(_kern, OID_AUTO, wq_yielded_threshold, CTLFLAG_RW | CTLFLAG_LOCKED,
           &wq_yielded_threshold, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_yielded_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
           &wq_yielded_window_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
           &wq_stalled_window_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
           &wq_reduce_pool_window_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
           &wq_max_timer_interval_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
           &wq_max_threads, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
           &wq_max_constrained_threads, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_max_concurrency, CTLFLAG_RW | CTLFLAG_LOCKED,
           &wq_max_concurrency, 0, "");

static int wq_kevent_test SYSCTL_HANDLER_ARGS;
SYSCTL_PROC(_debug, OID_AUTO, wq_kevent_test,
            CTLFLAG_MASKED | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLTYPE_OPAQUE,
            NULL, 0, wq_kevent_test, 0, "-");

static uint32_t wq_init_constrained_limit = 1;

uint32_t pthread_debug_tracing = 1;

SYSCTL_INT(_kern, OID_AUTO, pthread_debug_tracing, CTLFLAG_RW | CTLFLAG_LOCKED,
           &pthread_debug_tracing, 0, "");
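
/*
 * Editorial note (not in the original source): each SYSCTL_INT above publishes its
 * tunable under the kern MIB (the SYSCTL_PROC lives under debug), so on a running
 * system the values can be inspected or adjusted without rebuilding, e.g.:
 *
 *   sysctl kern.wq_max_threads
 *   sysctl -w kern.wq_stalled_window_usecs=300
 *
 * The sysctl names come from the variable identifiers passed to the macros.
 */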
#pragma mark - Process/Thread Setup/Teardown syscalls

static mach_vm_offset_t
stack_addr_hint(proc_t p, vm_map_t vmap)
{
    mach_vm_offset_t stackaddr;
    mach_vm_offset_t aslr_offset;
    bool proc64bit = proc_is64bit(p);

    // We can't safely take random values % something unless it's a power-of-two
    _Static_assert(powerof2(PTH_DEFAULT_STACKSIZE), "PTH_DEFAULT_STACKSIZE is a power-of-two");

#if defined(__i386__) || defined(__x86_64__)
    if (proc64bit) {
        // Matches vm_map_get_max_aslr_slide_pages's image shift in xnu
        aslr_offset = random() % (1 << 28); // about 512 stacks
    } else {
        // Actually bigger than the image shift, we've got ~256MB to work with
        aslr_offset = random() % (16 * PTH_DEFAULT_STACKSIZE);
    }
    aslr_offset = vm_map_trunc_page_mask(aslr_offset, vm_map_page_mask(vmap));
    if (proc64bit) {
        // Above nanomalloc range (see NANOZONE_SIGNATURE)
        stackaddr = 0x700000000000 + aslr_offset;
    } else {
        stackaddr = SHARED_REGION_BASE_I386 + SHARED_REGION_SIZE_I386 + aslr_offset;
    }
#elif defined(__arm__) || defined(__arm64__)
    // vm_map_get_max_aslr_slide_pages ensures 1MB of slide, we do better
    aslr_offset = random() % ((proc64bit ? 4 : 2) * PTH_DEFAULT_STACKSIZE);
    aslr_offset = vm_map_trunc_page_mask((vm_map_offset_t)aslr_offset, vm_map_page_mask(vmap));
    if (proc64bit) {
        // 64 stacks below nanomalloc (see NANOZONE_SIGNATURE)
        stackaddr = 0x170000000 - 64 * PTH_DEFAULT_STACKSIZE - aslr_offset;
    } else {
        // If you try to slide down from this point, you risk ending up in memory consumed by malloc
        stackaddr = SHARED_REGION_BASE_ARM - 32 * PTH_DEFAULT_STACKSIZE + aslr_offset;
    }
#else
#error Need to define a stack address hint for this architecture
#endif
    return stackaddr;
}
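
/*
 * Worked example (editorial, not in the original source; concrete numbers assume
 * PTH_DEFAULT_STACKSIZE is 512KB): for a 64-bit x86 process the hint is
 * 0x700000000000 plus a page-truncated random offset of up to 256MB, i.e. roughly
 * 512 possible default-sized stack positions sitting above the nanomalloc region;
 * a 64-bit ARM process instead slides downward from a point 64 default stacks
 * below 0x170000000.
 */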
292 * bsdthread_create system call. Used by pthread_create.
295 _bsdthread_create(struct proc
*p
, user_addr_t user_func
, user_addr_t user_funcarg
, user_addr_t user_stack
, user_addr_t user_pthread
, uint32_t flags
, user_addr_t
*retval
)
301 mach_vm_offset_t stackaddr
;
302 mach_vm_size_t th_allocsize
= 0;
303 mach_vm_size_t th_guardsize
;
304 mach_vm_offset_t th_stack
;
305 mach_vm_offset_t th_pthread
;
306 mach_vm_offset_t th_tsd_base
;
307 mach_port_name_t th_thport
;
309 vm_map_t vmap
= pthread_kern
->current_map();
310 task_t ctask
= current_task();
311 unsigned int policy
, importance
;
316 if (pthread_kern
->proc_get_register(p
) == 0) {
320 PTHREAD_TRACE(TRACE_pthread_thread_create
| DBG_FUNC_START
, flags
, 0, 0, 0, 0);
322 isLP64
= proc_is64bit(p
);
323 th_guardsize
= vm_map_page_size(vmap
);
325 stackaddr
= pthread_kern
->proc_get_stack_addr_hint(p
);
326 kret
= pthread_kern
->thread_create(ctask
, &th
);
327 if (kret
!= KERN_SUCCESS
)
329 thread_reference(th
);
331 pthread_kern
->thread_set_tag(th
, THREAD_TAG_PTHREAD
);
333 sright
= (void *)pthread_kern
->convert_thread_to_port(th
);
334 th_thport
= pthread_kern
->ipc_port_copyout_send(sright
, pthread_kern
->task_get_ipcspace(ctask
));
336 if ((flags
& PTHREAD_START_CUSTOM
) == 0) {
337 mach_vm_size_t pthread_size
=
338 vm_map_round_page_mask(pthread_kern
->proc_get_pthsize(p
) + PTHREAD_T_OFFSET
, vm_map_page_mask(vmap
));
339 th_allocsize
= th_guardsize
+ user_stack
+ pthread_size
;
340 user_stack
+= PTHREAD_T_OFFSET
;
342 kret
= mach_vm_map(vmap
, &stackaddr
,
345 VM_MAKE_TAG(VM_MEMORY_STACK
)| VM_FLAGS_ANYWHERE
, NULL
,
346 0, FALSE
, VM_PROT_DEFAULT
, VM_PROT_ALL
,
348 if (kret
!= KERN_SUCCESS
){
349 kret
= mach_vm_allocate(vmap
,
350 &stackaddr
, th_allocsize
,
351 VM_MAKE_TAG(VM_MEMORY_STACK
)| VM_FLAGS_ANYWHERE
);
353 if (kret
!= KERN_SUCCESS
) {
358 PTHREAD_TRACE(TRACE_pthread_thread_create
|DBG_FUNC_NONE
, th_allocsize
, stackaddr
, 0, 2, 0);
362 * The guard page is at the lowest address
363 * The stack base is the highest address
365 kret
= mach_vm_protect(vmap
, stackaddr
, th_guardsize
, FALSE
, VM_PROT_NONE
);
367 if (kret
!= KERN_SUCCESS
) {
372 th_pthread
= stackaddr
+ th_guardsize
+ user_stack
;
373 th_stack
= th_pthread
;
376 * Pre-fault the first page of the new thread's stack and the page that will
377 * contain the pthread_t structure.
379 if (vm_map_trunc_page_mask((vm_map_offset_t
)(th_stack
- C_64_REDZONE_LEN
), vm_map_page_mask(vmap
)) !=
380 vm_map_trunc_page_mask((vm_map_offset_t
)th_pthread
, vm_map_page_mask(vmap
))){
382 vm_map_trunc_page_mask((vm_map_offset_t
)(th_stack
- C_64_REDZONE_LEN
), vm_map_page_mask(vmap
)),
383 VM_PROT_READ
| VM_PROT_WRITE
,
385 THREAD_UNINT
, NULL
, 0);
389 vm_map_trunc_page_mask((vm_map_offset_t
)th_pthread
, vm_map_page_mask(vmap
)),
390 VM_PROT_READ
| VM_PROT_WRITE
,
392 THREAD_UNINT
, NULL
, 0);
395 th_stack
= user_stack
;
396 th_pthread
= user_pthread
;
398 PTHREAD_TRACE(TRACE_pthread_thread_create
|DBG_FUNC_NONE
, 0, 0, 0, 3, 0);
401 tsd_offset
= pthread_kern
->proc_get_pthread_tsd_offset(p
);
403 th_tsd_base
= th_pthread
+ tsd_offset
;
404 kret
= pthread_kern
->thread_set_tsd_base(th
, th_tsd_base
);
405 if (kret
== KERN_SUCCESS
) {
406 flags
|= PTHREAD_START_TSD_BASE_SET
;
410 #if defined(__i386__) || defined(__x86_64__)
412 * Set up i386 registers & function call.
415 x86_thread_state32_t state
= {
416 .eip
= (unsigned int)pthread_kern
->proc_get_threadstart(p
),
417 .eax
= (unsigned int)th_pthread
,
418 .ebx
= (unsigned int)th_thport
,
419 .ecx
= (unsigned int)user_func
,
420 .edx
= (unsigned int)user_funcarg
,
421 .edi
= (unsigned int)user_stack
,
422 .esi
= (unsigned int)flags
,
426 .esp
= (int)((vm_offset_t
)(th_stack
-C_32_STK_ALIGN
))
429 error
= pthread_kern
->thread_set_wq_state32(th
, (thread_state_t
)&state
);
430 if (error
!= KERN_SUCCESS
) {
435 x86_thread_state64_t state64
= {
436 .rip
= (uint64_t)pthread_kern
->proc_get_threadstart(p
),
437 .rdi
= (uint64_t)th_pthread
,
438 .rsi
= (uint64_t)(th_thport
),
439 .rdx
= (uint64_t)user_func
,
440 .rcx
= (uint64_t)user_funcarg
,
441 .r8
= (uint64_t)user_stack
,
442 .r9
= (uint64_t)flags
,
444 * set stack pointer aligned to 16 byte boundary
446 .rsp
= (uint64_t)(th_stack
- C_64_REDZONE_LEN
)
449 error
= pthread_kern
->thread_set_wq_state64(th
, (thread_state_t
)&state64
);
450 if (error
!= KERN_SUCCESS
) {
456 #elif defined(__arm__)
457 arm_thread_state_t state
= {
458 .pc
= (int)pthread_kern
->proc_get_threadstart(p
),
459 .r
[0] = (unsigned int)th_pthread
,
460 .r
[1] = (unsigned int)th_thport
,
461 .r
[2] = (unsigned int)user_func
,
462 .r
[3] = (unsigned int)user_funcarg
,
463 .r
[4] = (unsigned int)user_stack
,
464 .r
[5] = (unsigned int)flags
,
466 /* Set r7 & lr to 0 for better back tracing */
473 .sp
= (int)((vm_offset_t
)(th_stack
-C_32_STK_ALIGN
))
476 (void) pthread_kern
->thread_set_wq_state32(th
, (thread_state_t
)&state
);
479 #error bsdthread_create not defined for this architecture
482 if ((flags
& PTHREAD_START_SETSCHED
) != 0) {
483 /* Set scheduling parameters if needed */
484 thread_extended_policy_data_t extinfo
;
485 thread_precedence_policy_data_t precedinfo
;
487 importance
= (flags
& PTHREAD_START_IMPORTANCE_MASK
);
488 policy
= (flags
>> PTHREAD_START_POLICY_BITSHIFT
) & PTHREAD_START_POLICY_MASK
;
490 if (policy
== SCHED_OTHER
) {
491 extinfo
.timeshare
= 1;
493 extinfo
.timeshare
= 0;
496 thread_policy_set(th
, THREAD_EXTENDED_POLICY
, (thread_policy_t
)&extinfo
, THREAD_EXTENDED_POLICY_COUNT
);
498 precedinfo
.importance
= (importance
- BASEPRI_DEFAULT
);
499 thread_policy_set(th
, THREAD_PRECEDENCE_POLICY
, (thread_policy_t
)&precedinfo
, THREAD_PRECEDENCE_POLICY_COUNT
);
500 } else if ((flags
& PTHREAD_START_QOSCLASS
) != 0) {
501 /* Set thread QoS class if requested. */
502 pthread_priority_t priority
= (pthread_priority_t
)(flags
& PTHREAD_START_QOSCLASS_MASK
);
504 thread_qos_policy_data_t qos
;
505 qos
.qos_tier
= pthread_priority_get_thread_qos(priority
);
506 qos
.tier_importance
= (qos
.qos_tier
== QOS_CLASS_UNSPECIFIED
) ? 0 :
507 _pthread_priority_get_relpri(priority
);
509 pthread_kern
->thread_policy_set_internal(th
, THREAD_QOS_POLICY
, (thread_policy_t
)&qos
, THREAD_QOS_POLICY_COUNT
);
512 kret
= pthread_kern
->thread_resume(th
);
513 if (kret
!= KERN_SUCCESS
) {
517 thread_deallocate(th
); /* drop the creator reference */
519 PTHREAD_TRACE(TRACE_pthread_thread_create
|DBG_FUNC_END
, error
, th_pthread
, 0, 0, 0);
521 // cast required as mach_vm_offset_t is always 64 bits even on 32-bit platforms
522 *retval
= (user_addr_t
)th_pthread
;
527 if (allocated
!= 0) {
528 (void)mach_vm_deallocate(vmap
, stackaddr
, th_allocsize
);
531 (void)pthread_kern
->mach_port_deallocate(pthread_kern
->task_get_ipcspace(ctask
), th_thport
);
532 (void)thread_terminate(th
);
533 (void)thread_deallocate(th
);
/*
 * bsdthread_terminate system call.  Used by pthread_terminate
 */
int
_bsdthread_terminate(__unused struct proc *p,
                     user_addr_t stackaddr,
                     size_t size,
                     uint32_t kthport,
                     uint32_t sem,
                     __unused int32_t *retval)
{
    mach_vm_offset_t freeaddr;
    mach_vm_size_t freesize;
    kern_return_t kret;
    thread_t th = current_thread();

    freeaddr = (mach_vm_offset_t)stackaddr;
    freesize = size;

    PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_START, freeaddr, freesize, kthport, 0xff, 0);

    if ((freesize != (mach_vm_size_t)0) && (freeaddr != (mach_vm_offset_t)0)) {
        if (pthread_kern->thread_get_tag(th) & THREAD_TAG_MAINTHREAD){
            vm_map_t user_map = pthread_kern->current_map();
            freesize = vm_map_trunc_page_mask((vm_map_offset_t)freesize - 1, vm_map_page_mask(user_map));
            kret = mach_vm_behavior_set(user_map, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
            assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
            kret = kret ? kret : mach_vm_protect(user_map, freeaddr, freesize, FALSE, VM_PROT_NONE);
            assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
        } else {
            kret = mach_vm_deallocate(pthread_kern->current_map(), freeaddr, freesize);
            if (kret != KERN_SUCCESS) {
                PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);
                return(EINVAL);
            }
        }
    }

    (void) thread_terminate(th);
    if (sem != MACH_PORT_NULL) {
        kret = pthread_kern->semaphore_signal_internal_trap(sem);
        if (kret != KERN_SUCCESS) {
            PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);
            return(EINVAL);
        }
    }

    if (kthport != MACH_PORT_NULL) {
        pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(current_task()), kthport);
    }

    PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0, 0, 0, 0);

    pthread_kern->thread_exception_return();
    panic("bsdthread_terminate: still running\n");

    PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0xff, 0, 0, 0);

    return(0);
}
/*
 * bsdthread_register system call.  Performs per-process setup.  Responsible for
 * returning capability bits to userspace and receiving userspace function addresses.
 */
603 _bsdthread_register(struct proc
*p
,
604 user_addr_t threadstart
,
605 user_addr_t wqthread
,
607 user_addr_t pthread_init_data
,
608 user_addr_t pthread_init_data_size
,
609 uint64_t dispatchqueue_offset
,
612 /* We have to do this first so that it resets after fork */
613 pthread_kern
->proc_set_stack_addr_hint(p
, (user_addr_t
)stack_addr_hint(p
, pthread_kern
->current_map()));
615 /* prevent multiple registrations */
616 if (pthread_kern
->proc_get_register(p
) != 0) {
619 /* syscall randomizer test can pass bogus values */
620 if (pthsize
< 0 || pthsize
> MAX_PTHREAD_SIZE
) {
623 pthread_kern
->proc_set_threadstart(p
, threadstart
);
624 pthread_kern
->proc_set_wqthread(p
, wqthread
);
625 pthread_kern
->proc_set_pthsize(p
, pthsize
);
626 pthread_kern
->proc_set_register(p
);
	/* if we have pthread_init_data, then we use that and target_concptr (which is an offset) to get the data. */
629 if (pthread_init_data
!= 0) {
630 thread_qos_policy_data_t qos
;
632 struct _pthread_registration_data data
= {};
633 size_t pthread_init_sz
= MIN(sizeof(struct _pthread_registration_data
), (size_t)pthread_init_data_size
);
635 kern_return_t kr
= copyin(pthread_init_data
, &data
, pthread_init_sz
);
636 if (kr
!= KERN_SUCCESS
) {
640 /* Incoming data from the data structure */
641 pthread_kern
->proc_set_dispatchqueue_offset(p
, data
.dispatch_queue_offset
);
642 if (data
.version
> offsetof(struct _pthread_registration_data
, tsd_offset
)
643 && data
.tsd_offset
< (uint32_t)pthsize
) {
644 pthread_kern
->proc_set_pthread_tsd_offset(p
, data
.tsd_offset
);
647 /* Outgoing data that userspace expects as a reply */
648 data
.version
= sizeof(struct _pthread_registration_data
);
649 if (pthread_kern
->qos_main_thread_active()) {
650 mach_msg_type_number_t nqos
= THREAD_QOS_POLICY_COUNT
;
651 boolean_t gd
= FALSE
;
653 kr
= pthread_kern
->thread_policy_get(current_thread(), THREAD_QOS_POLICY
, (thread_policy_t
)&qos
, &nqos
, &gd
);
654 if (kr
!= KERN_SUCCESS
|| qos
.qos_tier
== THREAD_QOS_UNSPECIFIED
) {
655 /* Unspecified threads means the kernel wants us to impose legacy upon the thread. */
656 qos
.qos_tier
= THREAD_QOS_LEGACY
;
657 qos
.tier_importance
= 0;
659 kr
= pthread_kern
->thread_policy_set_internal(current_thread(), THREAD_QOS_POLICY
, (thread_policy_t
)&qos
, THREAD_QOS_POLICY_COUNT
);
662 if (kr
== KERN_SUCCESS
) {
663 data
.main_qos
= thread_qos_get_pthread_priority(qos
.qos_tier
);
665 data
.main_qos
= _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED
, 0, 0);
668 data
.main_qos
= _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED
, 0, 0);
671 kr
= copyout(&data
, pthread_init_data
, pthread_init_sz
);
672 if (kr
!= KERN_SUCCESS
) {
676 pthread_kern
->proc_set_dispatchqueue_offset(p
, dispatchqueue_offset
);
679 /* return the supported feature set as the return value. */
680 *retval
= PTHREAD_FEATURE_SUPPORTED
;
685 #pragma mark - QoS Manipulation
688 _bsdthread_ctl_set_qos(struct proc
*p
, user_addr_t __unused cmd
, mach_port_name_t kport
, user_addr_t tsd_priority_addr
, user_addr_t arg3
, int *retval
)
693 pthread_priority_t priority
;
695 /* Unused parameters must be zero. */
700 /* QoS is stored in a given slot in the pthread TSD. We need to copy that in and set our QoS based on it. */
701 if (proc_is64bit(p
)) {
703 kr
= copyin(tsd_priority_addr
, &v
, sizeof(v
));
704 if (kr
!= KERN_SUCCESS
) {
707 priority
= (int)(v
& 0xffffffff);
710 kr
= copyin(tsd_priority_addr
, &v
, sizeof(v
));
711 if (kr
!= KERN_SUCCESS
) {
717 if ((th
= port_name_to_thread(kport
)) == THREAD_NULL
) {
721 /* <rdar://problem/16211829> Disable pthread_set_qos_class_np() on threads other than pthread_self */
722 if (th
!= current_thread()) {
723 thread_deallocate(th
);
727 int rv
= _bsdthread_ctl_set_self(p
, 0, priority
, 0, _PTHREAD_SET_SELF_QOS_FLAG
, retval
);
	/* Static param the thread; we just set QoS on it, so it's stuck in QoS land now. */
730 /* pthread_kern->thread_static_param(th, TRUE); */ // see <rdar://problem/16433744>, for details
732 thread_deallocate(th
);
737 static inline struct threadlist
*
738 util_get_thread_threadlist_entry(thread_t th
)
740 struct uthread
*uth
= pthread_kern
->get_bsdthread_info(th
);
742 struct threadlist
*tl
= pthread_kern
->uthread_get_threadlist(uth
);
749 _bsdthread_ctl_set_self(struct proc
*p
, user_addr_t __unused cmd
, pthread_priority_t priority
, mach_port_name_t voucher
, _pthread_set_flags_t flags
, int __unused
*retval
)
751 thread_qos_policy_data_t qos
;
752 mach_msg_type_number_t nqos
= THREAD_QOS_POLICY_COUNT
;
753 boolean_t gd
= FALSE
;
754 bool was_manager_thread
= false;
755 thread_t th
= current_thread();
756 struct workqueue
*wq
= NULL
;
757 struct threadlist
*tl
= NULL
;
760 int qos_rv
= 0, voucher_rv
= 0, fixedpri_rv
= 0;
762 if ((flags
& _PTHREAD_SET_SELF_WQ_KEVENT_UNBIND
) != 0) {
763 tl
= util_get_thread_threadlist_entry(th
);
770 workqueue_lock_spin(wq
);
771 if (tl
->th_flags
& TH_LIST_KEVENT_BOUND
) {
772 tl
->th_flags
&= ~TH_LIST_KEVENT_BOUND
;
773 unsigned int kevent_flags
= KEVENT_FLAG_WORKQ
;
774 if (tl
->th_priority
== WORKQUEUE_EVENT_MANAGER_BUCKET
) {
775 kevent_flags
|= KEVENT_FLAG_WORKQ_MANAGER
;
778 workqueue_unlock(wq
);
779 kevent_qos_internal_unbind(p
, class_index_get_thread_qos(tl
->th_priority
), th
, kevent_flags
);
781 workqueue_unlock(wq
);
786 if ((flags
& _PTHREAD_SET_SELF_QOS_FLAG
) != 0) {
787 kr
= pthread_kern
->thread_policy_get(th
, THREAD_QOS_POLICY
, (thread_policy_t
)&qos
, &nqos
, &gd
);
788 if (kr
!= KERN_SUCCESS
) {
793 /* If we have main-thread QoS then we don't allow a thread to come out of QOS_CLASS_UNSPECIFIED. */
794 if (pthread_kern
->qos_main_thread_active() && qos
.qos_tier
== THREAD_QOS_UNSPECIFIED
) {
	/* Get the work queue for tracing, also the threadlist for bucket manipulation. */
801 tl
= util_get_thread_threadlist_entry(th
);
802 if (tl
) wq
= tl
->th_workq
;
805 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self
| DBG_FUNC_START
, wq
, qos
.qos_tier
, qos
.tier_importance
, 0, 0);
807 qos
.qos_tier
= pthread_priority_get_thread_qos(priority
);
808 qos
.tier_importance
= (qos
.qos_tier
== QOS_CLASS_UNSPECIFIED
) ? 0 : _pthread_priority_get_relpri(priority
);
810 if (qos
.qos_tier
== QOS_CLASS_UNSPECIFIED
) {
815 /* If we're a workqueue, the threadlist item priority needs adjusting, along with the bucket we were running in. */
817 workqueue_lock_spin(wq
);
818 bool now_under_constrained_limit
= false;
820 assert(!(tl
->th_flags
& TH_LIST_KEVENT_BOUND
));
822 kr
= pthread_kern
->thread_set_workq_qos(th
, qos
.qos_tier
, qos
.tier_importance
);
823 assert(kr
== KERN_SUCCESS
|| kr
== KERN_TERMINATED
);
825 /* Fix up counters. */
826 uint8_t old_bucket
= tl
->th_priority
;
827 uint8_t new_bucket
= pthread_priority_get_class_index(priority
);
828 if (old_bucket
== WORKQUEUE_EVENT_MANAGER_BUCKET
) {
829 was_manager_thread
= true;
832 uint32_t old_active
= OSAddAtomic(-1, &wq
->wq_thactive_count
[old_bucket
]);
833 OSAddAtomic(1, &wq
->wq_thactive_count
[new_bucket
]);
835 wq
->wq_thscheduled_count
[old_bucket
]--;
836 wq
->wq_thscheduled_count
[new_bucket
]++;
838 bool old_overcommit
= !(tl
->th_flags
& TH_LIST_CONSTRAINED
);
839 bool new_overcommit
= priority
& _PTHREAD_PRIORITY_OVERCOMMIT_FLAG
;
840 if (!old_overcommit
&& new_overcommit
) {
841 wq
->wq_constrained_threads_scheduled
--;
842 tl
->th_flags
&= ~TH_LIST_CONSTRAINED
;
843 if (wq
->wq_constrained_threads_scheduled
== wq_max_constrained_threads
- 1) {
844 now_under_constrained_limit
= true;
846 } else if (old_overcommit
&& !new_overcommit
) {
847 wq
->wq_constrained_threads_scheduled
++;
848 tl
->th_flags
|= TH_LIST_CONSTRAINED
;
851 tl
->th_priority
= new_bucket
;
853 /* If we were at the ceiling of threads for a given bucket, we have
854 * to reevaluate whether we should start more work.
856 if (old_active
== wq
->wq_reqconc
[old_bucket
] || now_under_constrained_limit
) {
857 /* workqueue_run_nextreq will drop the workqueue lock in all exit paths. */
858 (void)workqueue_run_nextreq(p
, wq
, THREAD_NULL
, RUN_NEXTREQ_DEFAULT
, 0, false);
860 workqueue_unlock(wq
);
863 kr
= pthread_kern
->thread_policy_set_internal(th
, THREAD_QOS_POLICY
, (thread_policy_t
)&qos
, THREAD_QOS_POLICY_COUNT
);
864 if (kr
!= KERN_SUCCESS
) {
869 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self
| DBG_FUNC_END
, wq
, qos
.qos_tier
, qos
.tier_importance
, 0, 0);
873 if ((flags
& _PTHREAD_SET_SELF_VOUCHER_FLAG
) != 0) {
874 kr
= pthread_kern
->thread_set_voucher_name(voucher
);
875 if (kr
!= KERN_SUCCESS
) {
882 if (qos_rv
) goto done
;
883 if ((flags
& _PTHREAD_SET_SELF_FIXEDPRIORITY_FLAG
) != 0) {
884 thread_extended_policy_data_t extpol
= {.timeshare
= 0};
886 if (!tl
) tl
= util_get_thread_threadlist_entry(th
);
888 /* Not allowed on workqueue threads */
889 fixedpri_rv
= ENOTSUP
;
893 kr
= pthread_kern
->thread_policy_set_internal(th
, THREAD_EXTENDED_POLICY
, (thread_policy_t
)&extpol
, THREAD_EXTENDED_POLICY_COUNT
);
894 if (kr
!= KERN_SUCCESS
) {
895 fixedpri_rv
= EINVAL
;
898 } else if ((flags
& _PTHREAD_SET_SELF_TIMESHARE_FLAG
) != 0) {
899 thread_extended_policy_data_t extpol
= {.timeshare
= 1};
901 if (!tl
) tl
= util_get_thread_threadlist_entry(th
);
903 /* Not allowed on workqueue threads */
904 fixedpri_rv
= ENOTSUP
;
908 kr
= pthread_kern
->thread_policy_set_internal(th
, THREAD_EXTENDED_POLICY
, (thread_policy_t
)&extpol
, THREAD_EXTENDED_POLICY_COUNT
);
909 if (kr
!= KERN_SUCCESS
) {
910 fixedpri_rv
= EINVAL
;
916 if (qos_rv
&& voucher_rv
) {
917 /* Both failed, give that a unique error. */
937 _bsdthread_ctl_qos_override_start(struct proc __unused
*p
, user_addr_t __unused cmd
, mach_port_name_t kport
, pthread_priority_t priority
, user_addr_t resource
, int __unused
*retval
)
942 if ((th
= port_name_to_thread(kport
)) == THREAD_NULL
) {
946 int override_qos
= pthread_priority_get_thread_qos(priority
);
948 struct threadlist
*tl
= util_get_thread_threadlist_entry(th
);
950 PTHREAD_TRACE_WQ(TRACE_wq_override_start
| DBG_FUNC_NONE
, tl
->th_workq
, thread_tid(th
), 1, priority
, 0);
	/* The only failure case would be passing a tid and having it look up the thread; since we pass the uthread, this always succeeds. */
954 pthread_kern
->proc_usynch_thread_qos_add_override_for_resource_check_owner(th
, override_qos
, TRUE
,
955 resource
, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE
, USER_ADDR_NULL
, MACH_PORT_NULL
);
956 thread_deallocate(th
);
961 _bsdthread_ctl_qos_override_end(struct proc __unused
*p
, user_addr_t __unused cmd
, mach_port_name_t kport
, user_addr_t resource
, user_addr_t arg3
, int __unused
*retval
)
970 if ((th
= port_name_to_thread(kport
)) == THREAD_NULL
) {
974 struct uthread
*uth
= pthread_kern
->get_bsdthread_info(th
);
976 struct threadlist
*tl
= util_get_thread_threadlist_entry(th
);
978 PTHREAD_TRACE_WQ(TRACE_wq_override_end
| DBG_FUNC_NONE
, tl
->th_workq
, thread_tid(th
), 0, 0, 0);
981 pthread_kern
->proc_usynch_thread_qos_remove_override_for_resource(current_task(), uth
, 0, resource
, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE
);
983 thread_deallocate(th
);
988 _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(mach_port_name_t kport
, pthread_priority_t priority
, user_addr_t resource
, user_addr_t ulock_addr
)
993 if ((th
= port_name_to_thread(kport
)) == THREAD_NULL
) {
997 int override_qos
= pthread_priority_get_thread_qos(priority
);
999 struct threadlist
*tl
= util_get_thread_threadlist_entry(th
);
1001 thread_deallocate(th
);
1005 PTHREAD_TRACE_WQ(TRACE_wq_override_dispatch
| DBG_FUNC_NONE
, tl
->th_workq
, thread_tid(th
), 1, priority
, 0);
1007 rv
= pthread_kern
->proc_usynch_thread_qos_add_override_for_resource_check_owner(th
, override_qos
, TRUE
,
1008 resource
, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE
, ulock_addr
, kport
);
1010 thread_deallocate(th
);
1014 int _bsdthread_ctl_qos_dispatch_asynchronous_override_add(struct proc __unused
*p
, user_addr_t __unused cmd
,
1015 mach_port_name_t kport
, pthread_priority_t priority
, user_addr_t resource
, int __unused
*retval
)
1017 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport
, priority
, resource
, USER_ADDR_NULL
);
1021 _bsdthread_ctl_qos_override_dispatch(struct proc
*p __unused
, user_addr_t cmd __unused
, mach_port_name_t kport
, pthread_priority_t priority
, user_addr_t ulock_addr
, int __unused
*retval
)
1023 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport
, priority
, USER_ADDR_NULL
, ulock_addr
);
1027 _bsdthread_ctl_qos_override_reset(struct proc
*p
, user_addr_t cmd
, user_addr_t arg1
, user_addr_t arg2
, user_addr_t arg3
, int *retval
)
1029 if (arg1
!= 0 || arg2
!= 0 || arg3
!= 0) {
1033 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p
, cmd
, 1 /* reset_all */, 0, 0, retval
);
1037 _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(struct proc __unused
*p
, user_addr_t __unused cmd
, int reset_all
, user_addr_t resource
, user_addr_t arg3
, int __unused
*retval
)
1039 if ((reset_all
&& (resource
!= 0)) || arg3
!= 0) {
1043 thread_t th
= current_thread();
1044 struct uthread
*uth
= pthread_kern
->get_bsdthread_info(th
);
1045 struct threadlist
*tl
= pthread_kern
->uthread_get_threadlist(uth
);
1051 PTHREAD_TRACE_WQ(TRACE_wq_override_reset
| DBG_FUNC_NONE
, tl
->th_workq
, 0, 0, 0, 0);
1053 resource
= reset_all
? THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD
: resource
;
1054 pthread_kern
->proc_usynch_thread_qos_reset_override_for_resource(current_task(), uth
, 0, resource
, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE
);
1060 _bsdthread_ctl(struct proc
*p
, user_addr_t cmd
, user_addr_t arg1
, user_addr_t arg2
, user_addr_t arg3
, int *retval
)
1063 case BSDTHREAD_CTL_SET_QOS
:
1064 return _bsdthread_ctl_set_qos(p
, cmd
, (mach_port_name_t
)arg1
, arg2
, arg3
, retval
);
1065 case BSDTHREAD_CTL_QOS_OVERRIDE_START
:
1066 return _bsdthread_ctl_qos_override_start(p
, cmd
, (mach_port_name_t
)arg1
, (pthread_priority_t
)arg2
, arg3
, retval
);
1067 case BSDTHREAD_CTL_QOS_OVERRIDE_END
:
1068 return _bsdthread_ctl_qos_override_end(p
, cmd
, (mach_port_name_t
)arg1
, arg2
, arg3
, retval
);
1069 case BSDTHREAD_CTL_QOS_OVERRIDE_RESET
:
1070 return _bsdthread_ctl_qos_override_reset(p
, cmd
, arg1
, arg2
, arg3
, retval
);
1071 case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH
:
1072 return _bsdthread_ctl_qos_override_dispatch(p
, cmd
, (mach_port_name_t
)arg1
, (pthread_priority_t
)arg2
, arg3
, retval
);
1073 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD
:
1074 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add(p
, cmd
, (mach_port_name_t
)arg1
, (pthread_priority_t
)arg2
, arg3
, retval
);
1075 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET
:
1076 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p
, cmd
, (int)arg1
, arg2
, arg3
, retval
);
1077 case BSDTHREAD_CTL_SET_SELF
:
1078 return _bsdthread_ctl_set_self(p
, cmd
, (pthread_priority_t
)arg1
, (mach_port_name_t
)arg2
, (_pthread_set_flags_t
)arg3
, retval
);
#pragma mark - Workqueue Implementation
#pragma mark workqueue lock

static boolean_t workqueue_lock_spin_is_acquired_kdp(struct workqueue *wq) {
    return kdp_lck_spin_is_acquired(&wq->wq_lock);
}

static void
workqueue_lock_spin(struct workqueue *wq)
{
    boolean_t interrupt_state = ml_set_interrupts_enabled(FALSE);
    lck_spin_lock(&wq->wq_lock);
    wq->wq_interrupt_state = interrupt_state;
}

static void
workqueue_unlock(struct workqueue *wq)
{
    boolean_t interrupt_state = wq->wq_interrupt_state;
    lck_spin_unlock(&wq->wq_lock);
    ml_set_interrupts_enabled(interrupt_state);
}
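
/*
 * Editorial note (not in the original source): the workqueue lock is a spinlock
 * held with interrupts disabled; workqueue_lock_spin() stashes the previous
 * interrupt state in the workqueue itself and workqueue_unlock() restores it,
 * so the two must always be used as a matched pair, e.g.:
 *
 *   workqueue_lock_spin(wq);
 *   ... examine or update wq_flags / wq_lflags ...
 *   workqueue_unlock(wq);
 *
 * Helpers such as workqueue_run_nextreq() note explicitly when they drop the
 * lock on the caller's behalf.
 */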
#pragma mark workqueue add timer

/*
 * Sets up the timer which will call out to workqueue_add_timer
 */
static void
workqueue_interval_timer_start(struct workqueue *wq)
{
    uint64_t deadline;

    /* n.b. wq_timer_interval is reset to 0 in workqueue_add_timer if the
       ATIMER_RUNNING flag is not present.  The net effect here is that if a
       sequence of threads is required, we'll double the time before we give out
       the next one. */
    if (wq->wq_timer_interval == 0) {
        wq->wq_timer_interval = wq_stalled_window_usecs;
    } else {
        wq->wq_timer_interval = wq->wq_timer_interval * 2;

        if (wq->wq_timer_interval > wq_max_timer_interval_usecs) {
            wq->wq_timer_interval = wq_max_timer_interval_usecs;
        }
    }
    clock_interval_to_deadline(wq->wq_timer_interval, 1000, &deadline);

    PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount, wq->wq_flags, wq->wq_timer_interval, 0);

    boolean_t ret = thread_call_enter1_delayed(wq->wq_atimer_delayed_call, wq->wq_atimer_delayed_call, deadline);
    if (ret) {
        panic("delayed_call was already enqueued");
    }
}
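
/*
 * Worked example (editorial, not in the original source): the first add-timer
 * arming waits wq_stalled_window_usecs, and each re-arming doubles the interval
 * until it is clamped at wq_max_timer_interval_usecs, so a stream of requests
 * that keeps needing new threads sees geometrically increasing delays rather
 * than a fixed poll rate.
 */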
/*
 * Immediately trigger the workqueue_add_timer
 */
static void
workqueue_interval_timer_trigger(struct workqueue *wq)
{
    PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount, wq->wq_flags, 0, 0);

    boolean_t ret = thread_call_enter1(wq->wq_atimer_immediate_call, wq->wq_atimer_immediate_call);
    if (ret) {
        panic("immediate_call was already enqueued");
    }
}
/*
 * returns whether lastblocked_tsp is within wq_stalled_window_usecs of cur_ts
 */
static boolean_t
wq_thread_is_busy(uint64_t cur_ts, uint64_t *lastblocked_tsp)
{
    clock_sec_t secs;
    clock_usec_t usecs;
    uint64_t lastblocked_ts;
    uint64_t elapsed;

    /*
     * the timestamp is updated atomically w/o holding the workqueue lock
     * so we need to do an atomic read of the 64 bits so that we don't see
     * a mismatched pair of 32 bit reads... we accomplish this in an architecturally
     * independent fashion by using OSCompareAndSwap64 to write back the
     * value we grabbed... if it succeeds, then we have a good timestamp to
     * evaluate... if it fails, we straddled grabbing the timestamp while it
     * was being updated... treat a failed update as a busy thread since
     * it implies we are about to see a really fresh timestamp anyway
     */
    lastblocked_ts = *lastblocked_tsp;

    if ( !OSCompareAndSwap64((UInt64)lastblocked_ts, (UInt64)lastblocked_ts, lastblocked_tsp))
        return (TRUE);

    if (lastblocked_ts >= cur_ts) {
        /*
         * because the update of the timestamp when a thread blocks isn't
         * serialized against us looking at it (i.e. we don't hold the workq lock)
         * it's possible to have a timestamp that matches the current time or
         * that even looks to be in the future relative to when we grabbed the current
         * time... just treat this as a busy thread since it must have just blocked.
         */
        return (TRUE);
    }
    elapsed = cur_ts - lastblocked_ts;

    pthread_kern->absolutetime_to_microtime(elapsed, &secs, &usecs);

    if (secs == 0 && usecs < wq_stalled_window_usecs)
        return (TRUE);
    return (FALSE);
}
static boolean_t
WQ_TIMER_DELAYED_NEEDED(struct workqueue *wq)
{
    int oldflags;
retry:
    oldflags = wq->wq_flags;
    if ( !(oldflags & (WQ_EXITING | WQ_ATIMER_DELAYED_RUNNING))) {
        if (OSCompareAndSwap(oldflags, oldflags | WQ_ATIMER_DELAYED_RUNNING, (UInt32 *)&wq->wq_flags)) {
            return TRUE;
        } else {
            goto retry;
        }
    }
    return FALSE;
}

static boolean_t
WQ_TIMER_IMMEDIATE_NEEDED(struct workqueue *wq)
{
    int oldflags;
retry:
    oldflags = wq->wq_flags;
    if ( !(oldflags & (WQ_EXITING | WQ_ATIMER_IMMEDIATE_RUNNING))) {
        if (OSCompareAndSwap(oldflags, oldflags | WQ_ATIMER_IMMEDIATE_RUNNING, (UInt32 *)&wq->wq_flags)) {
            return TRUE;
        } else {
            goto retry;
        }
    }
    return FALSE;
}
/*
 * handler function for the timer
 */
static void
workqueue_add_timer(struct workqueue *wq, thread_call_t thread_call_self)
{
    proc_t p = wq->wq_proc;
    boolean_t start_timer = FALSE;
    boolean_t retval;

    PTHREAD_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_START, wq, wq->wq_flags, wq->wq_nthreads, wq->wq_thidlecount, 0);

    workqueue_lock_spin(wq);

    /*
     * There are two tricky issues here.
     *
     * First issue: we start the thread_call's that invoke this routine without
     * the workqueue lock held.  The scheduler callback needs to trigger
     * reevaluation of the number of running threads but shouldn't take that
     * lock, so we can't use it to synchronize state around the thread_call.
     * As a result, it might re-enter the thread_call while this routine is
     * already running.  This could cause it to fire a second time and we'll
     * have two add_timers running at once.  Obviously, we don't want that to
     * keep stacking, so we need to keep it at two timers.
     *
     * Solution: use wq_flags (accessed via atomic CAS) to synchronize the
     * enqueue of the thread_call itself.  When a thread needs to trigger the
     * add_timer, it checks for ATIMER_DELAYED_RUNNING and, when not set, sets
     * the flag then does a thread_call_enter.  We'll then remove that flag
     * only once we've got the lock and it's safe for the thread_call to be
     * entered again.
     *
     * Second issue: we need to make sure that the two timers don't execute this
     * routine concurrently.  We can't use the workqueue lock for this because
     * we'll need to drop it during our execution.
     *
     * Solution: use WQL_ATIMER_BUSY as a condition variable to indicate that
     * we are currently executing the routine and the next thread should wait.
     *
     * After all that, we arrive at the following four possible states:
     * !WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY    no pending timer, no active timer
     * !WQ_ATIMER_DELAYED_RUNNING &&  WQL_ATIMER_BUSY    no pending timer,  1 active timer
     *  WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY     1 pending timer, no active timer
     *  WQ_ATIMER_DELAYED_RUNNING &&  WQL_ATIMER_BUSY     1 pending timer,  1 active timer
     *
     * A further complication: sometimes we need to trigger this function to run
     * without delay.  Because we aren't under a lock between setting
     * WQ_ATIMER_DELAYED_RUNNING and calling thread_call_enter, we can't simply
     * re-enter the thread call: if thread_call_enter() returned false, we
     * wouldn't be able to distinguish the case where the thread_call had
     * already fired from the case where it hadn't been entered yet from the
     * other thread.  So, we use a separate thread_call for immediate
     * invocations, and a separate RUNNING flag, WQ_ATIMER_IMMEDIATE_RUNNING.
     */

    while (wq->wq_lflags & WQL_ATIMER_BUSY) {
        wq->wq_lflags |= WQL_ATIMER_WAITING;

        assert_wait((caddr_t)wq, (THREAD_UNINT));
        workqueue_unlock(wq);

        thread_block(THREAD_CONTINUE_NULL);

        workqueue_lock_spin(wq);
    }
    wq->wq_lflags |= WQL_ATIMER_BUSY;

    /*
     * Decide which timer we are and remove the RUNNING flag.
     */
    if (thread_call_self == wq->wq_atimer_delayed_call) {
        if ((wq->wq_flags & WQ_ATIMER_DELAYED_RUNNING) == 0) {
            panic("workqueue_add_timer is the delayed timer but the delayed running flag isn't set");
        }
        WQ_UNSETFLAG(wq, WQ_ATIMER_DELAYED_RUNNING);
    } else if (thread_call_self == wq->wq_atimer_immediate_call) {
        if ((wq->wq_flags & WQ_ATIMER_IMMEDIATE_RUNNING) == 0) {
            panic("workqueue_add_timer is the immediate timer but the immediate running flag isn't set");
        }
        WQ_UNSETFLAG(wq, WQ_ATIMER_IMMEDIATE_RUNNING);
    } else {
        panic("workqueue_add_timer can't figure out which timer it is");
    }
1321 if ( !(wq
->wq_flags
& WQ_EXITING
)) {
1322 boolean_t add_thread
= FALSE
;
		/*
		 * check to see if the stall frequency was beyond our tolerance
		 * or we have work on the queue, but haven't scheduled any
		 * new work within our acceptable time interval because
		 * there were no idle threads left to schedule
		 */
1329 if (wq
->wq_reqcount
) {
1330 uint32_t priclass
= 0;
1331 uint32_t thactive_count
= 0;
1332 uint64_t curtime
= mach_absolute_time();
1333 uint64_t busycount
= 0;
1335 if (wq
->wq_requests
[WORKQUEUE_EVENT_MANAGER_BUCKET
] &&
1336 wq
->wq_thscheduled_count
[WORKQUEUE_EVENT_MANAGER_BUCKET
] == 0){
1337 priclass
= WORKQUEUE_EVENT_MANAGER_BUCKET
;
1339 for (priclass
= 0; priclass
< WORKQUEUE_NUM_BUCKETS
; priclass
++) {
1340 if (wq
->wq_requests
[priclass
])
1345 if (priclass
< WORKQUEUE_EVENT_MANAGER_BUCKET
){
			/*
			 * Compute a metric for how many threads are active.  We find the
			 * highest priority request outstanding and then add up the number
			 * of active threads in that and all higher-priority buckets.  We'll
			 * also add any "busy" threads which are not active but blocked
			 * recently enough that we can't be sure they've gone idle yet.
			 * We'll then compare this metric to our max concurrency to decide
			 * whether to add a new thread.
			 */
1355 for (uint32_t i
= 0; i
<= priclass
; i
++) {
1356 thactive_count
+= wq
->wq_thactive_count
[i
];
1358 if (wq
->wq_thscheduled_count
[i
] < wq
->wq_thactive_count
[i
]) {
1359 if (wq_thread_is_busy(curtime
, &wq
->wq_lastblocked_ts
[i
]))
1365 if (thactive_count
+ busycount
< wq
->wq_max_concurrency
||
1366 priclass
== WORKQUEUE_EVENT_MANAGER_BUCKET
) {
1368 if (wq
->wq_thidlecount
== 0) {
1370 * if we have no idle threads, try to add one
1372 retval
= workqueue_addnewthread(wq
, priclass
== WORKQUEUE_EVENT_MANAGER_BUCKET
);
1377 if (wq
->wq_reqcount
) {
1379 * as long as we have threads to schedule, and we successfully
1380 * scheduled new work, keep trying
1382 while (wq
->wq_thidlecount
&& !(wq
->wq_flags
& WQ_EXITING
)) {
1384 * workqueue_run_nextreq is responsible for
1385 * dropping the workqueue lock in all cases
1387 retval
= (workqueue_run_nextreq(p
, wq
, THREAD_NULL
, RUN_NEXTREQ_ADD_TIMER
, 0, false) != THREAD_NULL
);
1388 workqueue_lock_spin(wq
);
1390 if (retval
== FALSE
)
1393 if ( !(wq
->wq_flags
& WQ_EXITING
) && wq
->wq_reqcount
) {
1395 if (wq
->wq_thidlecount
== 0 && retval
== TRUE
&& add_thread
== TRUE
)
1398 if (wq
->wq_thidlecount
== 0 || busycount
) {
1399 start_timer
= WQ_TIMER_DELAYED_NEEDED(wq
);
1402 PTHREAD_TRACE_WQ(TRACE_wq_add_timer
| DBG_FUNC_NONE
, wq
, wq
->wq_reqcount
, wq
->wq_thidlecount
, busycount
, 0);
1409 * If we called WQ_TIMER_NEEDED above, then this flag will be set if that
1410 * call marked the timer running. If so, we let the timer interval grow.
1411 * Otherwise, we reset it back to 0.
1413 if (!(wq
->wq_flags
& WQ_ATIMER_DELAYED_RUNNING
)) {
1414 wq
->wq_timer_interval
= 0;
1417 wq
->wq_lflags
&= ~WQL_ATIMER_BUSY
;
1419 if ((wq
->wq_flags
& WQ_EXITING
) || (wq
->wq_lflags
& WQL_ATIMER_WAITING
)) {
1421 * wakeup the thread hung up in _workqueue_mark_exiting or workqueue_add_timer waiting for this timer
1422 * to finish getting out of the way
1424 wq
->wq_lflags
&= ~WQL_ATIMER_WAITING
;
1428 PTHREAD_TRACE_WQ(TRACE_wq_add_timer
| DBG_FUNC_END
, wq
, start_timer
, wq
->wq_nthreads
, wq
->wq_thidlecount
, 0);
1430 workqueue_unlock(wq
);
1432 if (start_timer
== TRUE
)
1433 workqueue_interval_timer_start(wq
);
1436 #pragma mark thread state tracking
1438 // called by spinlock code when trying to yield to lock owner
1440 _workqueue_thread_yielded(void)
1442 struct workqueue
*wq
;
1447 if ((wq
= pthread_kern
->proc_get_wqptr(p
)) == NULL
|| wq
->wq_reqcount
== 0)
1450 workqueue_lock_spin(wq
);
1452 if (wq
->wq_reqcount
) {
1458 if (wq
->wq_thread_yielded_count
++ == 0)
1459 wq
->wq_thread_yielded_timestamp
= mach_absolute_time();
1461 if (wq
->wq_thread_yielded_count
< wq_yielded_threshold
) {
1462 workqueue_unlock(wq
);
1466 PTHREAD_TRACE_WQ(TRACE_wq_thread_yielded
| DBG_FUNC_START
, wq
, wq
->wq_thread_yielded_count
, wq
->wq_reqcount
, 0, 0);
1468 wq
->wq_thread_yielded_count
= 0;
1470 curtime
= mach_absolute_time();
1471 elapsed
= curtime
- wq
->wq_thread_yielded_timestamp
;
1472 pthread_kern
->absolutetime_to_microtime(elapsed
, &secs
, &usecs
);
1474 if (secs
== 0 && usecs
< wq_yielded_window_usecs
) {
1476 if (wq
->wq_thidlecount
== 0) {
1477 workqueue_addnewthread(wq
, TRUE
);
1479 * 'workqueue_addnewthread' drops the workqueue lock
1480 * when creating the new thread and then retakes it before
1481 * returning... this window allows other threads to process
1482 * requests, so we need to recheck for available work
1483 * if none found, we just return... the newly created thread
1484 * will eventually get used (if it hasn't already)...
1486 if (wq
->wq_reqcount
== 0) {
1487 workqueue_unlock(wq
);
1491 if (wq
->wq_thidlecount
) {
1492 (void)workqueue_run_nextreq(p
, wq
, THREAD_NULL
, RUN_NEXTREQ_UNCONSTRAINED
, 0, false);
1494 * workqueue_run_nextreq is responsible for
1495 * dropping the workqueue lock in all cases
1497 PTHREAD_TRACE_WQ(TRACE_wq_thread_yielded
| DBG_FUNC_END
, wq
, wq
->wq_thread_yielded_count
, wq
->wq_reqcount
, 1, 0);
1502 PTHREAD_TRACE_WQ(TRACE_wq_thread_yielded
| DBG_FUNC_END
, wq
, wq
->wq_thread_yielded_count
, wq
->wq_reqcount
, 2, 0);
1504 workqueue_unlock(wq
);
1508 workqueue_callback(int type
, thread_t thread
)
1510 struct uthread
*uth
;
1511 struct threadlist
*tl
;
1512 struct workqueue
*wq
;
1514 uth
= pthread_kern
->get_bsdthread_info(thread
);
1515 tl
= pthread_kern
->uthread_get_threadlist(uth
);
1519 case SCHED_CALL_BLOCK
: {
1520 uint32_t old_activecount
;
1521 boolean_t start_timer
= FALSE
;
1523 old_activecount
= OSAddAtomic(-1, &wq
->wq_thactive_count
[tl
->th_priority
]);
1526 * If we blocked and were at the requested concurrency previously, we may
1527 * need to spin up a new thread. Of course, if it's the event manager
1528 * then that's moot, so ignore that case.
1530 if (old_activecount
== wq
->wq_reqconc
[tl
->th_priority
] &&
1531 tl
->th_priority
!= WORKQUEUE_EVENT_MANAGER_BUCKET
) {
1533 UInt64
*lastblocked_ptr
;
1536 * the number of active threads at this priority
1537 * has fallen below the maximum number of concurrent
1538 * threads that we're allowed to run
1540 lastblocked_ptr
= (UInt64
*)&wq
->wq_lastblocked_ts
[tl
->th_priority
];
1541 curtime
= mach_absolute_time();
1544 * if we collide with another thread trying to update the last_blocked (really unlikely
1545 * since another thread would have to get scheduled and then block after we start down
1546 * this path), it's not a problem. Either timestamp is adequate, so no need to retry
1549 OSCompareAndSwap64(*lastblocked_ptr
, (UInt64
)curtime
, lastblocked_ptr
);
1551 if (wq
->wq_reqcount
) {
1553 * We have work to do so start up the timer if it's not
1554 * running; it'll sort out whether we need to start another
1557 start_timer
= WQ_TIMER_DELAYED_NEEDED(wq
);
1560 if (start_timer
== TRUE
) {
1561 workqueue_interval_timer_start(wq
);
1564 PTHREAD_TRACE1_WQ(TRACE_wq_thread_block
| DBG_FUNC_START
, wq
, old_activecount
, tl
->th_priority
, start_timer
, thread_tid(thread
));
1567 case SCHED_CALL_UNBLOCK
:
1569 * we cannot take the workqueue_lock here...
1570 * an UNBLOCK can occur from a timer event which
1571 * is run from an interrupt context... if the workqueue_lock
1572 * is already held by this processor, we'll deadlock...
1573 * the thread lock for the thread being UNBLOCKED
1576 OSAddAtomic(1, &wq
->wq_thactive_count
[tl
->th_priority
]);
1578 PTHREAD_TRACE1_WQ(TRACE_wq_thread_block
| DBG_FUNC_END
, wq
, wq
->wq_threads_scheduled
, tl
->th_priority
, 0, thread_tid(thread
));
1585 _workqueue_get_sched_callback(void)
1587 return workqueue_callback
;
#pragma mark thread addition/removal

static mach_vm_size_t
_workqueue_allocsize(struct workqueue *wq)
{
    proc_t p = wq->wq_proc;
    mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);
    mach_vm_size_t pthread_size =
        vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(wq->wq_map));
    return guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
}
/*
 * pop goes the thread
 *
 * If fromexit is set, the call is from workqueue_exit(),
 * so some cleanups are to be avoided.
 */
static void
workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use)
{
    struct uthread * uth;
    struct workqueue * wq = tl->th_workq;

    if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET){
        TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
    } else {
        TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
    }

    if (fromexit == 0) {
        assert(wq->wq_nthreads && wq->wq_thidlecount);
        wq->wq_nthreads--;
        wq->wq_thidlecount--;
    }

    /*
     * Clear the threadlist pointer in uthread so
     * blocked thread on wakeup for termination will
     * not access the thread list as it is going to be
     * freed.
     */
    pthread_kern->thread_sched_call(tl->th_thread, NULL);

    uth = pthread_kern->get_bsdthread_info(tl->th_thread);
    if (uth != (struct uthread *)0) {
        pthread_kern->uthread_set_threadlist(uth, NULL);
    }
    if (fromexit == 0) {
        /* during exit the lock is not held */
        workqueue_unlock(wq);
    }

    if ( (tl->th_flags & TH_LIST_NEW) || first_use ) {
        /*
         * thread was created, but never used...
         * need to clean up the stack and port ourselves
         * since we're not going to spin up through the
         * normal exit path triggered from Libc
         */
        if (fromexit == 0) {
            /* vm map is already deallocated when this is called from exit */
            (void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, _workqueue_allocsize(wq));
        }
        (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(wq->wq_task), tl->th_thport);
    } else {
        PTHREAD_TRACE1_WQ(TRACE_wq_thread_park | DBG_FUNC_END, wq, (uintptr_t)thread_tid(current_thread()), wq->wq_nthreads, 0xdead, thread_tid(tl->th_thread));
    }
    /*
     * drop our ref on the thread
     */
    thread_deallocate(tl->th_thread);

    kfree(tl, sizeof(struct threadlist));
}
/*
 * Try to add a new workqueue thread.
 *
 * - called with workq lock held
 * - dropped and retaken around thread creation
 * - return with workq lock held
 */
1677 workqueue_addnewthread(struct workqueue
*wq
, boolean_t ignore_constrained_thread_limit
)
1679 struct threadlist
*tl
;
1680 struct uthread
*uth
;
1685 mach_vm_offset_t stackaddr
;
1687 if ((wq
->wq_flags
& WQ_EXITING
) == WQ_EXITING
) {
1688 PTHREAD_TRACE_WQ(TRACE_wq_thread_add_during_exit
| DBG_FUNC_NONE
, wq
, 0, 0, 0, 0);
1692 if (wq
->wq_nthreads
>= wq_max_threads
) {
1693 PTHREAD_TRACE_WQ(TRACE_wq_thread_limit_exceeded
| DBG_FUNC_NONE
, wq
, wq
->wq_nthreads
, wq_max_threads
, 0, 0);
1697 if (ignore_constrained_thread_limit
== FALSE
&&
1698 wq
->wq_constrained_threads_scheduled
>= wq_max_constrained_threads
) {
1700 * If we're not creating this thread to service an overcommit or
1701 * event manager request, then we check to see if we are over our
1702 * constrained thread limit, in which case we error out.
1704 PTHREAD_TRACE_WQ(TRACE_wq_thread_constrained_maxed
| DBG_FUNC_NONE
, wq
, wq
->wq_constrained_threads_scheduled
,
1705 wq_max_constrained_threads
, 0, 0);
1712 workqueue_unlock(wq
);
1714 tl
= kalloc(sizeof(struct threadlist
));
1715 bzero(tl
, sizeof(struct threadlist
));
1717 kret
= pthread_kern
->thread_create_workq_waiting(wq
->wq_task
, wq_unpark_continue
, tl
, &th
);
1718 if (kret
!= KERN_SUCCESS
) {
1719 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed
| DBG_FUNC_NONE
, wq
, kret
, 0, 0, 0);
1720 kfree(tl
, sizeof(struct threadlist
));
1724 stackaddr
= pthread_kern
->proc_get_stack_addr_hint(p
);
1726 mach_vm_size_t guardsize
= vm_map_page_size(wq
->wq_map
);
1727 mach_vm_size_t pthread_size
=
1728 vm_map_round_page_mask(pthread_kern
->proc_get_pthsize(p
) + PTHREAD_T_OFFSET
, vm_map_page_mask(wq
->wq_map
));
1729 mach_vm_size_t th_allocsize
= guardsize
+ PTH_DEFAULT_STACKSIZE
+ pthread_size
;
1731 kret
= mach_vm_map(wq
->wq_map
, &stackaddr
,
1732 th_allocsize
, page_size
-1,
1733 VM_MAKE_TAG(VM_MEMORY_STACK
)| VM_FLAGS_ANYWHERE
, NULL
,
1734 0, FALSE
, VM_PROT_DEFAULT
, VM_PROT_ALL
,
1735 VM_INHERIT_DEFAULT
);
1737 if (kret
!= KERN_SUCCESS
) {
1738 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed
| DBG_FUNC_NONE
, wq
, kret
, 1, 0, 0);
1740 kret
= mach_vm_allocate(wq
->wq_map
,
1741 &stackaddr
, th_allocsize
,
1742 VM_MAKE_TAG(VM_MEMORY_STACK
) | VM_FLAGS_ANYWHERE
);
1744 if (kret
== KERN_SUCCESS
) {
1746 * The guard page is at the lowest address
1747 * The stack base is the highest address
1749 kret
= mach_vm_protect(wq
->wq_map
, stackaddr
, guardsize
, FALSE
, VM_PROT_NONE
);
1751 if (kret
!= KERN_SUCCESS
) {
1752 (void) mach_vm_deallocate(wq
->wq_map
, stackaddr
, th_allocsize
);
1753 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed
| DBG_FUNC_NONE
, wq
, kret
, 2, 0, 0);
1756 if (kret
!= KERN_SUCCESS
) {
1757 (void) thread_terminate(th
);
1758 thread_deallocate(th
);
1760 kfree(tl
, sizeof(struct threadlist
));
1763 thread_reference(th
);
1765 pthread_kern
->thread_set_tag(th
, THREAD_TAG_PTHREAD
| THREAD_TAG_WORKQUEUE
);
1767 sright
= (void *)pthread_kern
->convert_thread_to_port(th
);
1768 tl
->th_thport
= pthread_kern
->ipc_port_copyout_send(sright
, pthread_kern
->task_get_ipcspace(wq
->wq_task
));
1770 pthread_kern
->thread_static_param(th
, TRUE
);
1772 tl
->th_flags
= TH_LIST_INITED
| TH_LIST_NEW
;
1776 tl
->th_stackaddr
= stackaddr
;
1777 tl
->th_priority
= WORKQUEUE_NUM_BUCKETS
;
1779 uth
= pthread_kern
->get_bsdthread_info(tl
->th_thread
);
1781 workqueue_lock_spin(wq
);
1783 pthread_kern
->uthread_set_threadlist(uth
, tl
);
1784 TAILQ_INSERT_TAIL(&wq
->wq_thidlelist
, tl
, th_entry
);
1786 wq
->wq_thidlecount
++;
1788 PTHREAD_TRACE_WQ(TRACE_wq_thread_create
| DBG_FUNC_NONE
, wq
, 0, 0, 0, 0);
1793 workqueue_lock_spin(wq
);
1800 * Setup per-process state for the workqueue.
1803 _workq_open(struct proc
*p
, __unused
int32_t *retval
)
1805 struct workqueue
* wq
;
	if (pthread_kern->proc_get_register(p) == 0) {
		return EINVAL;
	}

	num_cpus = pthread_kern->ml_get_max_cpus();

	if (wq_init_constrained_limit) {
		/*
		 * set up the limit for the constrained pool
		 * this is a virtual pool in that we don't
		 * maintain it on a separate idle and run list
		 */
		limit = num_cpus * WORKQUEUE_CONSTRAINED_FACTOR;

		if (limit > wq_max_constrained_threads)
			wq_max_constrained_threads = limit;

		wq_init_constrained_limit = 0;

		if (wq_max_threads > pthread_kern->config_thread_max - 20) {
			wq_max_threads = pthread_kern->config_thread_max - 20;
		}
	}
	if (pthread_kern->proc_get_wqptr(p) == NULL) {
		if (pthread_kern->proc_init_wqptr_or_wait(p) == FALSE) {
			assert(pthread_kern->proc_get_wqptr(p) != NULL);
		}
		wq_size = sizeof(struct workqueue);

		ptr = (char *)kalloc(wq_size);
		bzero(ptr, wq_size);

		wq = (struct workqueue *)ptr;
		wq->wq_flags = WQ_LIST_INITED;

		wq->wq_max_concurrency = wq_max_concurrency;
		wq->wq_task = current_task();
		wq->wq_map  = pthread_kern->current_map();

		for (i = 0; i < WORKQUEUE_NUM_BUCKETS; i++)
			wq->wq_reqconc[i] = (uint16_t)wq->wq_max_concurrency;

		// The event manager bucket is special, so it gets a concurrency of 1,
		// though we shouldn't ever read this value for that bucket
		wq->wq_reqconc[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1;

		// Start the event manager at the priority hinted at by the policy engine
		int mgr_priority_hint = pthread_kern->task_get_default_manager_qos(current_task());
		wq->wq_event_manager_priority = (uint32_t)thread_qos_get_pthread_priority(mgr_priority_hint) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;

		TAILQ_INIT(&wq->wq_thrunlist);
		TAILQ_INIT(&wq->wq_thidlelist);

		wq->wq_atimer_delayed_call = thread_call_allocate((thread_call_func_t)workqueue_add_timer, (thread_call_param_t)wq);
		wq->wq_atimer_immediate_call = thread_call_allocate((thread_call_func_t)workqueue_add_timer, (thread_call_param_t)wq);

		lck_spin_init(&wq->wq_lock, pthread_lck_grp, pthread_lck_attr);

		pthread_kern->proc_set_wqptr(p, wq);
	}

	return 0;
}
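/*
 * Illustrative sketch (not compiled into this file): the constrained-pool limit
 * computed in _workq_open() is just "CPUs times a factor", raised only if it
 * exceeds the current value.  The constants below (8 CPUs, factor of 5, initial
 * limit of 64) are hypothetical stand-ins for ml_get_max_cpus(),
 * WORKQUEUE_CONSTRAINED_FACTOR, and wq_max_constrained_threads.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t num_cpus = 8;                     /* hypothetical CPU count */
	uint32_t constrained_factor = 5;           /* hypothetical WORKQUEUE_CONSTRAINED_FACTOR */
	uint32_t max_constrained_threads = 64;     /* hypothetical current limit */

	uint32_t limit = num_cpus * constrained_factor;
	if (limit > max_constrained_threads)
		max_constrained_threads = limit;       /* only ever raises the limit */

	printf("constrained thread limit: %u\n", max_constrained_threads);
	return 0;
}
#endif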
/*
 * Routine:	workqueue_mark_exiting
 *
 * Function:	Mark the work queue such that new threads will not be added to the
 *		work queue after we return.
 *
 * Conditions:	Called against the current process.
 */
_workqueue_mark_exiting(struct proc *p)
{
	struct workqueue *wq = pthread_kern->proc_get_wqptr(p);

	PTHREAD_TRACE_WQ(TRACE_wq_pthread_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);
	workqueue_lock_spin(wq);

	/*
	 * We arm the add timer without holding the workqueue lock so we need
	 * to synchronize with any running or soon to be running timers.
	 *
	 * Threads that intend to arm the timer atomically OR
	 * WQ_ATIMER_{DELAYED,IMMEDIATE}_RUNNING into the wq_flags, only if
	 * WQ_EXITING is not present.  So, once we have set WQ_EXITING, we can
	 * be sure that no new RUNNING flags will be set, but still need to
	 * wait for the already running timers to complete.
	 *
	 * We always hold the workq lock when dropping WQ_ATIMER_RUNNING, so
	 * the check for and sleep until clear is protected.
	 */
	WQ_SETFLAG(wq, WQ_EXITING);

	if (wq->wq_flags & WQ_ATIMER_DELAYED_RUNNING) {
		if (thread_call_cancel(wq->wq_atimer_delayed_call) == TRUE) {
			WQ_UNSETFLAG(wq, WQ_ATIMER_DELAYED_RUNNING);
		}
	}
	if (wq->wq_flags & WQ_ATIMER_IMMEDIATE_RUNNING) {
		if (thread_call_cancel(wq->wq_atimer_immediate_call) == TRUE) {
			WQ_UNSETFLAG(wq, WQ_ATIMER_IMMEDIATE_RUNNING);
		}
	}
	while (wq->wq_flags & (WQ_ATIMER_DELAYED_RUNNING | WQ_ATIMER_IMMEDIATE_RUNNING) ||
			(wq->wq_lflags & WQL_ATIMER_BUSY)) {
		assert_wait((caddr_t)wq, (THREAD_UNINT));
		workqueue_unlock(wq);

		thread_block(THREAD_CONTINUE_NULL);

		workqueue_lock_spin(wq);
	}
	workqueue_unlock(wq);

	PTHREAD_TRACE(TRACE_wq_pthread_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
}
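/*
 * Illustrative sketch (not compiled into this file): the shutdown protocol above
 * can be modelled in user space with a flags word.  Arming code sets a RUNNING
 * bit only if EXITING is not already set; teardown sets EXITING first and then
 * waits for any RUNNING bit to clear.  Everything here (flag values, function
 * names) is hypothetical and only demonstrates the ordering, not the kext API.
 */
#if 0
#include <stdatomic.h>
#include <stdbool.h>

#define FLAG_EXITING  0x1u
#define FLAG_RUNNING  0x2u

static _Atomic unsigned int wq_flags;

/* timer-arming path: refuse to mark RUNNING once EXITING is visible */
static bool
try_mark_timer_running(void)
{
	unsigned int old = atomic_load(&wq_flags);
	do {
		if (old & FLAG_EXITING)
			return false;
	} while (!atomic_compare_exchange_weak(&wq_flags, &old, old | FLAG_RUNNING));
	return true;
}

/* teardown path: publish EXITING, then wait until no timer is RUNNING */
static void
mark_exiting_and_drain(void)
{
	atomic_fetch_or(&wq_flags, FLAG_EXITING);
	while (atomic_load(&wq_flags) & FLAG_RUNNING)
		;	/* the kext sleeps on the workqueue lock instead of spinning */
}
#endif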
/*
 * Routine:	workqueue_exit
 *
 * Function:	clean up the work queue structure(s) now that there are no threads
 *		left running inside the work queue (except possibly current_thread).
 *
 * Conditions:	Called by the last thread in the process.
 *		Called against current process.
 */
_workqueue_exit(struct proc *p)
{
	struct workqueue  * wq;
	struct threadlist  * tl, *tlist;
	struct uthread	*uth;
	size_t wq_size = sizeof(struct workqueue);

	wq = pthread_kern->proc_get_wqptr(p);
	PTHREAD_TRACE_WQ(TRACE_wq_workqueue_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);

	pthread_kern->proc_set_wqptr(p, NULL);

	/*
	 * Clean up workqueue data structures for threads that exited and
	 * didn't get a chance to clean up after themselves.
	 */
	TAILQ_FOREACH_SAFE(tl, &wq->wq_thrunlist, th_entry, tlist) {
		assert((tl->th_flags & TH_LIST_RUNNING) != 0);

		pthread_kern->thread_sched_call(tl->th_thread, NULL);

		uth = pthread_kern->get_bsdthread_info(tl->th_thread);
		if (uth != (struct uthread *)0) {
			pthread_kern->uthread_set_threadlist(uth, NULL);
		}
		TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);

		/*
		 * drop our last ref on the thread
		 */
		thread_deallocate(tl->th_thread);

		kfree(tl, sizeof(struct threadlist));
	}
	TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlelist, th_entry, tlist) {
		assert((tl->th_flags & TH_LIST_RUNNING) == 0);
		assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
		workqueue_removethread(tl, true, false);
	}
	TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlemgrlist, th_entry, tlist) {
		assert((tl->th_flags & TH_LIST_RUNNING) == 0);
		assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
		workqueue_removethread(tl, true, false);
	}
	thread_call_free(wq->wq_atimer_delayed_call);
	thread_call_free(wq->wq_atimer_immediate_call);
	lck_spin_destroy(&wq->wq_lock, pthread_lck_grp);

	PTHREAD_TRACE(TRACE_wq_workqueue_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
}
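/*
 * Illustrative sketch (not compiled into this file): the teardown loops above use
 * TAILQ_FOREACH_SAFE because each element is removed and freed while the list is
 * being walked; the _SAFE variant caches the next pointer in a spare variable
 * before the loop body runs.  The element type and list below are hypothetical.
 */
#if 0
#include <sys/queue.h>
#include <stdlib.h>

struct node {
	TAILQ_ENTRY(node) entries;
};
TAILQ_HEAD(node_list, node);

static void
free_all(struct node_list *head)
{
	struct node *n, *tmp;

	/* 'tmp' holds the successor, so freeing 'n' does not break the walk */
	TAILQ_FOREACH_SAFE(n, head, entries, tmp) {
		TAILQ_REMOVE(head, n, entries);
		free(n);
	}
}
#endif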
#pragma mark workqueue thread manipulation
/*
 * Entry point for libdispatch to ask for threads
 */
static int wqops_queue_reqthreads(struct proc *p, int reqcount, pthread_priority_t priority){
	struct workqueue *wq;
	boolean_t start_timer = FALSE;

	boolean_t overcommit = (_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0;
	int class = pthread_priority_get_class_index(priority);

	boolean_t event_manager = (_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) != 0;
	if (event_manager){
		class = WORKQUEUE_EVENT_MANAGER_BUCKET;
	}
	if ((reqcount <= 0) || (class < 0) || (class >= WORKQUEUE_NUM_BUCKETS) || (overcommit && event_manager)) {
		return EINVAL;
	}

	if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
		return EINVAL;
	}
	workqueue_lock_spin(wq);

	if (overcommit == 0 && event_manager == 0) {
		wq->wq_reqcount += reqcount;
		wq->wq_requests[class] += reqcount;

		PTHREAD_TRACE_WQ(TRACE_wq_req_threads | DBG_FUNC_NONE, wq, priority, wq->wq_requests[class], reqcount, 0);

		while (wq->wq_reqcount) {
			if (!workqueue_run_one(p, wq, overcommit, 0))
				break;
		}
	} else if (overcommit) {
		PTHREAD_TRACE_WQ(TRACE_wq_req_octhreads | DBG_FUNC_NONE, wq, priority, wq->wq_ocrequests[class], reqcount, 0);

		while (reqcount) {
			if (!workqueue_run_one(p, wq, overcommit, priority))
				break;
			reqcount--;
		}
		/*
		 * We need to delay starting some of the overcommit requests.
		 * We'll record the request here and as existing threads return to
		 * the kernel, we'll notice the ocrequests and spin them back to
		 * user space as the overcommit variety.
		 */
		wq->wq_reqcount += reqcount;
		wq->wq_requests[class] += reqcount;
		wq->wq_ocrequests[class] += reqcount;

		PTHREAD_TRACE_WQ(TRACE_wq_delay_octhreads | DBG_FUNC_NONE, wq, priority, wq->wq_ocrequests[class], reqcount, 0);

		/*
		 * If we delayed this thread coming up but we're not constrained
		 * or at max threads then we need to start the timer so we don't
		 * risk dropping this request on the floor.
		 */
		if ((wq->wq_constrained_threads_scheduled < wq_max_constrained_threads) &&
				(wq->wq_nthreads < wq_max_threads)){
			start_timer = WQ_TIMER_DELAYED_NEEDED(wq);
		}
	} else if (event_manager) {
		PTHREAD_TRACE_WQ(TRACE_wq_req_event_manager | DBG_FUNC_NONE, wq, wq->wq_event_manager_priority, wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET], wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET], 0);

		if (wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0){
			wq->wq_reqcount += 1;
			wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1;
		}

		// We've recorded the request for an event manager thread above.  We'll
		// let the timer pick it up as we would for a kernel callout.  We can
		// do a direct add/wakeup when that support is added for the kevent path.
		if (wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0){
			start_timer = WQ_TIMER_DELAYED_NEEDED(wq);
		}
	}

	if (start_timer) {
		workqueue_interval_timer_start(wq);
	}

	workqueue_unlock(wq);

	return 0;
}
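/*
 * Illustrative sketch (not compiled into this file): wqops_queue_reqthreads()
 * branches on two flag bits carried in the pthread_priority_t word (overcommit
 * and event-manager) plus a QoS class index.  The bit positions used below are
 * hypothetical -- the real layout is defined by the _PTHREAD_PRIORITY_* macros --
 * but the decode-and-dispatch shape is the same.
 */
#if 0
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define FAKE_OVERCOMMIT_FLAG     (1u << 31)   /* hypothetical */
#define FAKE_EVENT_MANAGER_FLAG  (1u << 30)   /* hypothetical */
#define FAKE_CLASS_MASK          0x7u         /* hypothetical */

int
main(void)
{
	uint32_t priority = FAKE_OVERCOMMIT_FLAG | 0x3;

	bool overcommit    = (priority & FAKE_OVERCOMMIT_FLAG) != 0;
	bool event_manager = (priority & FAKE_EVENT_MANAGER_FLAG) != 0;
	unsigned class_idx = priority & FAKE_CLASS_MASK;

	if (overcommit && event_manager) {
		printf("invalid request\n");          /* mirrors the EINVAL check */
	} else if (event_manager) {
		printf("event manager request\n");
	} else if (overcommit) {
		printf("overcommit request, class %u\n", class_idx);
	} else {
		printf("constrained request, class %u\n", class_idx);
	}
	return 0;
}
#endif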
/*
 * Used by the kevent system to request threads.
 *
 * Currently count is ignored and we always return one thread per invocation.
 */
thread_t _workq_reqthreads(struct proc *p, int requests_count, workq_reqthreads_req_t requests){
	thread_t th = THREAD_NULL;
	boolean_t do_thread_call = FALSE;
	boolean_t emergency_thread = FALSE;
	assert(requests_count > 0);
	// Make sure that the requests array is sorted, highest priority first
	if (requests_count > 1){
		__assert_only qos_class_t priority = _pthread_priority_get_qos_newest(requests[0].priority);
		__assert_only unsigned long flags = ((_pthread_priority_get_flags(requests[0].priority) & (_PTHREAD_PRIORITY_OVERCOMMIT_FLAG|_PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)) != 0);
		for (int i = 1; i < requests_count; i++){
			if (requests[i].count == 0) continue;
			__assert_only qos_class_t next_priority = _pthread_priority_get_qos_newest(requests[i].priority);
			__assert_only unsigned long next_flags = ((_pthread_priority_get_flags(requests[i].priority) & (_PTHREAD_PRIORITY_OVERCOMMIT_FLAG|_PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)) != 0);
			if (next_flags != flags){
				priority = next_priority;
			} else {
				assert(next_priority <= priority);
			}
		}
	}
	struct workqueue *wq;
	if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
		return THREAD_NULL;
	}

	workqueue_lock_spin(wq);

	PTHREAD_TRACE_WQ(TRACE_wq_kevent_req_threads | DBG_FUNC_START, wq, requests_count, 0, 0, 0);
	// Look for overcommit or event-manager-only requests.
	boolean_t have_overcommit = FALSE;
	pthread_priority_t priority = 0;
	for (int i = 0; i < requests_count; i++){
		if (requests[i].count == 0)
			continue;
		priority = requests[i].priority;
		if (_pthread_priority_get_qos_newest(priority) == QOS_CLASS_UNSPECIFIED){
			priority |= _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
		}
		if ((_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) != 0){
		}
		if ((_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0){
			have_overcommit = TRUE;
			break;
		}
	}
	if (have_overcommit){
		if (wq->wq_thidlecount){
			th = workqueue_run_nextreq(p, wq, THREAD_NULL, RUN_NEXTREQ_OVERCOMMIT_KEVENT, priority, true);
			if (th == THREAD_NULL){
				workqueue_lock_spin(wq); // reacquire lock
			}
		}

		int class = pthread_priority_get_class_index(priority);
		wq->wq_reqcount += 1;
		wq->wq_requests[class] += 1;
		wq->wq_kevent_ocrequests[class] += 1;

		do_thread_call = WQ_TIMER_IMMEDIATE_NEEDED(wq);
	}
	// Having no overcommit requests, try to find any request that can start
	// There's no TOCTTOU since we hold the workqueue lock
	for (int i = 0; i < requests_count; i++){
		workq_reqthreads_req_t req = requests + i;
		priority = req->priority;
		int class = pthread_priority_get_class_index(priority);

		if (req->count == 0)
			continue;

		if (!may_start_constrained_thread(wq, class, WORKQUEUE_NUM_BUCKETS, NULL))
			continue;

		wq->wq_reqcount += 1;
		wq->wq_requests[class] += 1;
		wq->wq_kevent_requests[class] += 1;

		PTHREAD_TRACE_WQ(TRACE_wq_req_kevent_threads | DBG_FUNC_NONE, wq, priority, wq->wq_kevent_requests[class], 1, 0);

		if (wq->wq_thidlecount){
			th = workqueue_run_nextreq(p, wq, THREAD_NULL, RUN_NEXTREQ_DEFAULT_KEVENT, priority, true);
		} else {
			do_thread_call = WQ_TIMER_IMMEDIATE_NEEDED(wq);
		}
	}

	// Okay, here's the fun case: we can't spin up any of the non-overcommit threads
	// that we've seen a request for, so we kick this over to the event manager thread
	emergency_thread = TRUE;
	if (wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0){
		wq->wq_reqcount += 1;
		wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1;
		PTHREAD_TRACE_WQ(TRACE_wq_req_event_manager | DBG_FUNC_NONE, wq, 0, wq->wq_kevent_requests[WORKQUEUE_EVENT_MANAGER_BUCKET], 1, 0);
	} else {
		PTHREAD_TRACE_WQ(TRACE_wq_req_event_manager | DBG_FUNC_NONE, wq, 0, wq->wq_kevent_requests[WORKQUEUE_EVENT_MANAGER_BUCKET], 0, 0);
	}
	wq->wq_kevent_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1;

	if (wq->wq_thidlecount && wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0){
		th = workqueue_run_nextreq(p, wq, THREAD_NULL, RUN_NEXTREQ_EVENT_MANAGER, 0, true);
		assert(th != THREAD_NULL);
	} else {
		do_thread_call = WQ_TIMER_IMMEDIATE_NEEDED(wq);
	}

	workqueue_unlock(wq);

	if (do_thread_call == TRUE){
		workqueue_interval_timer_trigger(wq);
	}

	PTHREAD_TRACE_WQ(TRACE_wq_kevent_req_threads | DBG_FUNC_END, wq, do_thread_call, 0, 0, 0);

	return emergency_thread ? (void*)-1 : th;
}
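/*
 * Illustrative sketch (not compiled into this file): the assertion loop near the
 * top of _workq_reqthreads() checks that the caller's request array is ordered
 * highest-priority-first, skipping empty slots.  The plain-integer version below
 * shows the same check; the request struct and field names are hypothetical.
 */
#if 0
#include <stdbool.h>
#include <stddef.h>

struct fake_req {
	int count;
	int priority;   /* larger value = higher priority (hypothetical) */
};

static bool
requests_sorted(const struct fake_req *reqs, size_t n)
{
	if (n == 0)
		return true;

	int prev = reqs[0].priority;
	for (size_t i = 1; i < n; i++) {
		if (reqs[i].count == 0)
			continue;               /* empty slots are ignored */
		if (reqs[i].priority > prev)
			return false;           /* must be non-increasing */
		prev = reqs[i].priority;
	}
	return true;
}
#endif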
static int wqops_thread_return(struct proc *p){
	thread_t th = current_thread();
	struct uthread *uth = pthread_kern->get_bsdthread_info(th);
	struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);

	/* reset signal mask on the workqueue thread to default state */
	if (pthread_kern->uthread_get_sigmask(uth) != (sigset_t)(~workq_threadmask)) {
		pthread_kern->proc_lock(p);
		pthread_kern->uthread_set_sigmask(uth, ~workq_threadmask);
		pthread_kern->proc_unlock(p);
	}
	struct workqueue *wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
	if (wq == NULL || !tl) {
		return EINVAL;
	}

	PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_START, tl->th_workq, 0, 0, 0, 0);

	/*
	 * This squash call has neat semantics: it removes the specified overrides,
	 * replacing the current requested QoS with the previous effective QoS from
	 * those overrides.  This means we won't be preempted due to having our QoS
	 * lowered.  Of course, now our understanding of the thread's QoS is wrong,
	 * so we'll adjust below.
	 */
	pthread_kern->proc_usynch_thread_qos_squash_override_for_resource(th,
			THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD,
			THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE);
	workqueue_lock_spin(wq);

	if (tl->th_flags & TH_LIST_KEVENT_BOUND) {
		unsigned int flags = KEVENT_FLAG_WORKQ;
		if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
			flags |= KEVENT_FLAG_WORKQ_MANAGER;
		}

		workqueue_unlock(wq);
		kevent_qos_internal_unbind(p, class_index_get_thread_qos(tl->th_priority), th, flags);
		workqueue_lock_spin(wq);

		tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
	}
	/* Fix up counters from the squash operation. */
	uint8_t old_bucket = tl->th_priority;
	uint8_t new_bucket = thread_qos_get_class_index(new_qos);

	if (old_bucket != new_bucket) {
		OSAddAtomic(-1, &wq->wq_thactive_count[old_bucket]);
		OSAddAtomic(1, &wq->wq_thactive_count[new_bucket]);

		wq->wq_thscheduled_count[old_bucket]--;
		wq->wq_thscheduled_count[new_bucket]++;

		tl->th_priority = new_bucket;
	}

	PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_END, tl->th_workq, new_qos, 0, 0, 0);

	PTHREAD_TRACE_WQ(TRACE_wq_runitem | DBG_FUNC_END, wq, 0, 0, 0, 0);

	(void)workqueue_run_nextreq(p, wq, th, RUN_NEXTREQ_DEFAULT, 0, false);
	/*
	 * workqueue_run_nextreq is responsible for
	 * dropping the workqueue lock in all cases
	 */
	return 0;
}
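/*
 * Illustrative sketch (not compiled into this file): after the QoS squash the
 * thread may belong to a different priority bucket, so the code above moves it
 * by decrementing the old bucket's counters and incrementing the new one's.
 * The sketch uses C11 atomics for the active counts the way the kext uses
 * OSAddAtomic; the bucket count and names are hypothetical.
 */
#if 0
#include <stdatomic.h>
#include <stdint.h>

#define NUM_BUCKETS 6   /* hypothetical */

static _Atomic int32_t active_count[NUM_BUCKETS];
static uint16_t        scheduled_count[NUM_BUCKETS];   /* protected by a lock */

static void
move_thread_bucket(unsigned old_bucket, unsigned new_bucket)
{
	if (old_bucket == new_bucket)
		return;

	atomic_fetch_add(&active_count[old_bucket], -1);
	atomic_fetch_add(&active_count[new_bucket],  1);

	/* caller holds the workqueue lock for the non-atomic counters */
	scheduled_count[old_bucket]--;
	scheduled_count[new_bucket]++;
}
#endif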
/*
 * Multiplexed call to interact with the workqueue mechanism
 */
_workq_kernreturn(struct proc *p,
		int options,
		user_addr_t item,
		int arg2,
		int arg3,
		int32_t *retval)
{
	int error = 0;

	if (pthread_kern->proc_get_register(p) == 0) {
		return EINVAL;
	}
	switch (options) {
	case WQOPS_QUEUE_NEWSPISUPP: {
		/*
		 * arg2 = offset of serialno into dispatch queue
		 * arg3 = kevent support
		 */
		int offset = arg2;

		// If we get here, then userspace has indicated support for kevent delivery.

		pthread_kern->proc_set_dispatchqueue_serialno_offset(p, (uint64_t)offset);
		break;
	}
	case WQOPS_QUEUE_REQTHREADS: {
		/*
		 * arg2 = number of threads to start
		 */
		error = wqops_queue_reqthreads(p, arg2, arg3);
		break;
	}
	case WQOPS_SET_EVENT_MANAGER_PRIORITY: {
		/*
		 * arg2 = priority for the manager thread
		 *
		 * if _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG is set, the
		 * ~_PTHREAD_PRIORITY_FLAGS_MASK contains a scheduling priority instead
		 */
		pthread_priority_t pri = arg2;
		struct workqueue *wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);

		workqueue_lock_spin(wq);
		if (pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){
			// If userspace passes a scheduling priority, that takes precedence
			// over any QoS.  (So, userspace should take care not to accidentally
			// lower the priority this way.)
			uint32_t sched_pri = pri & (~_PTHREAD_PRIORITY_FLAGS_MASK);
			if (wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){
				wq->wq_event_manager_priority = MAX(sched_pri, wq->wq_event_manager_priority & (~_PTHREAD_PRIORITY_FLAGS_MASK))
						| _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
			} else {
				wq->wq_event_manager_priority = sched_pri
						| _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
			}
		} else if ((wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){
			int cur_qos = pthread_priority_get_thread_qos(wq->wq_event_manager_priority);
			int new_qos = pthread_priority_get_thread_qos(pri);
			wq->wq_event_manager_priority = (uint32_t)thread_qos_get_pthread_priority(MAX(cur_qos, new_qos)) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
		}
		workqueue_unlock(wq);
		break;
	}
	case WQOPS_THREAD_KEVENT_RETURN: {
		int32_t kevent_retval;
		int ret = kevent_qos_internal(p, -1, item, arg2, item, arg2, NULL, NULL, KEVENT_FLAG_WORKQ | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS, &kevent_retval);
		// We shouldn't be getting more errors out than events we put in, so
		// reusing the input buffer should always provide enough space.  But,
		// the assert is commented out since we get errors in edge cases in the
		// process lifecycle.
		//assert(ret == KERN_SUCCESS && kevent_retval >= 0);
		if (ret != KERN_SUCCESS){
		} else if (kevent_retval > 0){
			assert(kevent_retval <= arg2);
			*retval = kevent_retval;
		}
	}
		/* FALLTHROUGH */
	case WQOPS_THREAD_RETURN:
		error = wqops_thread_return(p);
		// NOT REACHED except in case of error
		break;
	}

	return (error);
}
workqueue_run_one(proc_t p, struct workqueue *wq, boolean_t overcommit, pthread_priority_t priority)
{
	if (wq->wq_thidlecount == 0) {
		if (overcommit == FALSE) {
			if (wq->wq_constrained_threads_scheduled < wq->wq_max_concurrency)
				workqueue_addnewthread(wq, overcommit);
		} else {
			workqueue_addnewthread(wq, overcommit);
		}
		if (wq->wq_thidlecount == 0)
			return (FALSE);
	}
	ran_one = (workqueue_run_nextreq(p, wq, THREAD_NULL, overcommit ? RUN_NEXTREQ_OVERCOMMIT : RUN_NEXTREQ_DEFAULT, priority, false) != THREAD_NULL);
	/*
	 * workqueue_run_nextreq is responsible for
	 * dropping the workqueue lock in all cases
	 */
	workqueue_lock_spin(wq);

	return (ran_one);
}
/*
 * We have no work to do, park ourselves on the idle list.
 *
 * Consumes the workqueue lock and does not return.
 */
parkit(struct workqueue *wq, struct threadlist *tl, thread_t thread)
{
	assert(thread == tl->th_thread);
	assert(thread == current_thread());

	uint32_t us_to_wait = 0;

	TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);

	tl->th_flags &= ~TH_LIST_RUNNING;
	tl->th_flags &= ~TH_LIST_KEVENT;
	assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);

	if (tl->th_flags & TH_LIST_CONSTRAINED) {
		wq->wq_constrained_threads_scheduled--;
		tl->th_flags &= ~TH_LIST_CONSTRAINED;
	}

	OSAddAtomic(-1, &wq->wq_thactive_count[tl->th_priority]);
	wq->wq_thscheduled_count[tl->th_priority]--;
	wq->wq_threads_scheduled--;
	uint32_t thidlecount = ++wq->wq_thidlecount;

	pthread_kern->thread_sched_call(thread, NULL);
	/*
	 * We'd like to always have one manager thread parked so that we can have
	 * low latency when we need to bring a manager thread up.  If that idle
	 * thread list is empty, make this thread a manager thread.
	 *
	 * XXX: This doesn't check that there's not a manager thread outstanding,
	 * so it's based on the assumption that most manager callouts will change
	 * their QoS before parking.  If that stops being true, this may end up
	 * costing us more than we gain.
	 */
	if (TAILQ_EMPTY(&wq->wq_thidlemgrlist) &&
			tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET){
		reset_priority(tl, pthread_priority_from_wq_class_index(wq, WORKQUEUE_EVENT_MANAGER_BUCKET));
		tl->th_priority = WORKQUEUE_EVENT_MANAGER_BUCKET;
	}

	if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET){
		TAILQ_INSERT_HEAD(&wq->wq_thidlemgrlist, tl, th_entry);
	} else {
		TAILQ_INSERT_HEAD(&wq->wq_thidlelist, tl, th_entry);
	}

	PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_START, wq,
			wq->wq_threads_scheduled, wq->wq_thidlecount, us_to_wait, 0);
	/*
	 * When we remove the voucher from the thread, we may lose our importance
	 * causing us to get preempted, so we do this after putting the thread on
	 * the idle list.  That way, when we get our importance back we'll be able
	 * to use this thread from e.g. the kevent call out to deliver a boosting
	 * message.
	 */
	workqueue_unlock(wq);
	kern_return_t kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
	assert(kr == KERN_SUCCESS);
	workqueue_lock_spin(wq);
	if ((tl->th_flags & TH_LIST_RUNNING) == 0) {
		if (thidlecount < 101) {
			us_to_wait = wq_reduce_pool_window_usecs - ((thidlecount-2) * (wq_reduce_pool_window_usecs / 100));
		} else {
			us_to_wait = wq_reduce_pool_window_usecs / 100;
		}

		assert_wait_timeout_with_leeway((caddr_t)tl, (THREAD_INTERRUPTIBLE),
				TIMEOUT_URGENCY_SYS_BACKGROUND|TIMEOUT_URGENCY_LEEWAY, us_to_wait,
				wq_reduce_pool_window_usecs/10, NSEC_PER_USEC);

		workqueue_unlock(wq);

		thread_block(wq_unpark_continue);
		panic("thread_block(wq_unpark_continue) returned!");
	} else {
		workqueue_unlock(wq);

		/*
		 * While we'd dropped the lock to unset our voucher, someone came
		 * around and made us runnable.  But because we weren't waiting on the
		 * event their wakeup() was ineffectual.  To correct for that, we just
		 * run the continuation ourselves.
		 */
		wq_unpark_continue(NULL, THREAD_AWAKENED);
	}
}
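/*
 * Illustrative sketch (not compiled into this file): the park timeout above
 * shrinks as the idle pool grows -- the second idle thread waits roughly the
 * whole reduce-pool window and the hundredth waits about 1% of it.  The window
 * value below is a hypothetical stand-in for wq_reduce_pool_window_usecs.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static uint32_t
park_timeout_usecs(uint32_t thidlecount, uint32_t window_usecs)
{
	if (thidlecount < 101)
		return window_usecs - ((thidlecount - 2) * (window_usecs / 100));
	return window_usecs / 100;
}

int
main(void)
{
	const uint32_t window = 5 * 1000 * 1000;    /* hypothetical: 5 seconds */

	printf("2 idle threads   -> %u us\n", park_timeout_usecs(2, window));
	printf("50 idle threads  -> %u us\n", park_timeout_usecs(50, window));
	printf("200 idle threads -> %u us\n", park_timeout_usecs(200, window));
	return 0;
}
#endif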
static boolean_t
may_start_constrained_thread(struct workqueue *wq, uint32_t at_priclass, uint32_t my_priclass, boolean_t *start_timer){
	if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
		/*
		 * we need 1 or more constrained threads to return to the kernel before
		 * we can dispatch additional work
		 */
		return FALSE;
	}

	uint32_t busycount = 0;
	uint32_t thactive_count = wq->wq_thactive_count[at_priclass];

	// Has our most recently blocked thread blocked recently enough that we
	// should still consider it busy?
	if (wq->wq_thscheduled_count[at_priclass] > wq->wq_thactive_count[at_priclass]) {
		if (wq_thread_is_busy(mach_absolute_time(), &wq->wq_lastblocked_ts[at_priclass])) {
			busycount++;
		}
	}

	if (my_priclass < WORKQUEUE_NUM_BUCKETS && my_priclass == at_priclass){
		/*
		 * don't count this thread as currently active
		 */
		thactive_count--;
	}

	if (thactive_count + busycount >= wq->wq_max_concurrency) {
		if (busycount && start_timer) {
			/*
			 * we found at least 1 thread in the
			 * 'busy' state... make sure we start
			 * the timer because if they are the only
			 * threads keeping us from scheduling
			 * this work request, we won't get a callback
			 * to kick off the timer... we need to
			 * start it now.
			 */
			*start_timer = WQ_TIMER_DELAYED_NEEDED(wq);
		}

		PTHREAD_TRACE_WQ(TRACE_wq_overcommitted|DBG_FUNC_NONE, wq, ((start_timer && *start_timer) ? 1 << _PTHREAD_PRIORITY_FLAGS_SHIFT : 0) | class_index_get_pthread_priority(at_priclass), thactive_count, busycount, 0);

		return FALSE;
	}
	return TRUE;
}
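/*
 * Illustrative sketch (not compiled into this file): the admission test above
 * lets a constrained request through only while "active + recently-blocked
 * (busy)" stays below the concurrency target, and does not count the caller
 * itself when it is re-using a thread of the same class.  The standalone
 * predicate below captures that arithmetic; all names are hypothetical.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
may_start_constrained(uint32_t active, uint32_t busy, bool counting_self,
		uint32_t max_concurrency)
{
	if (counting_self && active > 0)
		active--;                       /* don't count the requesting thread */

	return (active + busy) < max_concurrency;
}
#endif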
static struct threadlist *
pop_from_thidlelist(struct workqueue *wq, uint32_t priclass)
{
	assert(wq->wq_thidlecount);

	struct threadlist *tl = NULL;

	if (!TAILQ_EMPTY(&wq->wq_thidlemgrlist) &&
			(priclass == WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlelist))){
		tl = TAILQ_FIRST(&wq->wq_thidlemgrlist);
		TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
		assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
	} else if (!TAILQ_EMPTY(&wq->wq_thidlelist) &&
			(priclass != WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlemgrlist))){
		tl = TAILQ_FIRST(&wq->wq_thidlelist);
		TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
		assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
	} else {
		panic("pop_from_thidlelist called with no threads available");
	}
	assert((tl->th_flags & TH_LIST_RUNNING) == 0);

	assert(wq->wq_thidlecount);
	wq->wq_thidlecount--;

	TAILQ_INSERT_TAIL(&wq->wq_thrunlist, tl, th_entry);

	tl->th_flags |= TH_LIST_RUNNING | TH_LIST_BUSY;

	wq->wq_threads_scheduled++;
	wq->wq_thscheduled_count[priclass]++;
	OSAddAtomic(1, &wq->wq_thactive_count[priclass]);

	return tl;
}
static pthread_priority_t
pthread_priority_from_wq_class_index(struct workqueue *wq, int index){
	if (index == WORKQUEUE_EVENT_MANAGER_BUCKET){
		return wq->wq_event_manager_priority;
	} else {
		return class_index_get_pthread_priority(index);
	}
}
reset_priority(struct threadlist *tl, pthread_priority_t pri){
	kern_return_t ret;
	thread_t th = tl->th_thread;

	if ((pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){
		ret = pthread_kern->thread_set_workq_qos(th, pthread_priority_get_thread_qos(pri), 0);
		assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);

		if (tl->th_flags & TH_LIST_EVENT_MGR_SCHED_PRI) {

			/* Reset priority to default (masked by QoS) */

			ret = pthread_kern->thread_set_workq_pri(th, 31, POLICY_TIMESHARE);
			assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);

			tl->th_flags &= ~TH_LIST_EVENT_MGR_SCHED_PRI;
		}
	} else {
		ret = pthread_kern->thread_set_workq_qos(th, THREAD_QOS_UNSPECIFIED, 0);
		assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
		ret = pthread_kern->thread_set_workq_pri(th, (pri & (~_PTHREAD_PRIORITY_FLAGS_MASK)), POLICY_TIMESHARE);
		assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);

		tl->th_flags |= TH_LIST_EVENT_MGR_SCHED_PRI;
	}
}
/*
 * grabs a thread for a request
 *
 *  - called with the workqueue lock held...
 *  - responsible for dropping it in all cases
 *  - if provided mode is for overcommit, doesn't consume a reqcount
 */
workqueue_run_nextreq(proc_t p, struct workqueue *wq, thread_t thread,
		enum run_nextreq_mode mode, pthread_priority_t prio,
		bool kevent_bind_via_return)
{
	thread_t th_to_run = THREAD_NULL;
	uint32_t upcall_flags = 0;
	uint32_t priclass;
	struct threadlist *tl = NULL;
	struct uthread *uth = NULL;
	boolean_t start_timer = FALSE;

	if (mode == RUN_NEXTREQ_ADD_TIMER) {
		mode = RUN_NEXTREQ_DEFAULT;
	}

	// valid modes to call this function with
	assert(mode == RUN_NEXTREQ_DEFAULT || mode == RUN_NEXTREQ_DEFAULT_KEVENT ||
			mode == RUN_NEXTREQ_OVERCOMMIT || mode == RUN_NEXTREQ_UNCONSTRAINED ||
			mode == RUN_NEXTREQ_EVENT_MANAGER || mode == RUN_NEXTREQ_OVERCOMMIT_KEVENT);
	// may only have a priority if in OVERCOMMIT or DEFAULT_KEVENT mode
	assert(mode == RUN_NEXTREQ_OVERCOMMIT || mode == RUN_NEXTREQ_OVERCOMMIT_KEVENT ||
			mode == RUN_NEXTREQ_DEFAULT_KEVENT || prio == 0);
	// thread == thread_null means "please spin up a new workqueue thread, we can't reuse this"
	// thread != thread_null is thread reuse, and must be the current thread
	assert(thread == THREAD_NULL || thread == current_thread());

	PTHREAD_TRACE_WQ(TRACE_wq_run_nextitem|DBG_FUNC_START, wq, thread_tid(thread), wq->wq_thidlecount, wq->wq_reqcount, 0);
	if (thread != THREAD_NULL) {
		uth = pthread_kern->get_bsdthread_info(thread);

		if ((tl = pthread_kern->uthread_get_threadlist(uth)) == NULL) {
			panic("wq thread with no threadlist");
		}
	}

	/*
	 * from here until we drop the workq lock we can't be pre-empted since we
	 * hold the lock in spin mode... this is important since we have to
	 * independently update the priority that the thread is associated with and
	 * the priority based counters that "workqueue_callback" also changes and
	 * bases decisions on.
	 */
	/*
	 * This giant monstrosity does three things:
	 *
	 *   - adjusts the mode, if required
	 *   - selects the priclass that we'll be servicing
	 *   - sets any mode-specific upcall flags
	 *
	 * When possible special-cases should be handled here and converted into
	 * non-special cases.
	 */
	if (mode == RUN_NEXTREQ_OVERCOMMIT) {
		priclass = pthread_priority_get_class_index(prio);
		upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
	} else if (mode == RUN_NEXTREQ_OVERCOMMIT_KEVENT){
		priclass = pthread_priority_get_class_index(prio);
		upcall_flags |= WQ_FLAG_THREAD_KEVENT;
	} else if (mode == RUN_NEXTREQ_EVENT_MANAGER){
		assert(wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0);
		priclass = WORKQUEUE_EVENT_MANAGER_BUCKET;
		upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
		if (wq->wq_kevent_requests[WORKQUEUE_EVENT_MANAGER_BUCKET]){
			upcall_flags |= WQ_FLAG_THREAD_KEVENT;
		}
	} else if (wq->wq_reqcount == 0){
		// no work to do.  we'll check again when new work arrives.
	} else if (mode == RUN_NEXTREQ_DEFAULT_KEVENT) {
		assert(kevent_bind_via_return);

		priclass = pthread_priority_get_class_index(prio);
		assert(priclass < WORKQUEUE_EVENT_MANAGER_BUCKET);
		assert(wq->wq_kevent_requests[priclass] > 0);

		upcall_flags |= WQ_FLAG_THREAD_KEVENT;
		mode = RUN_NEXTREQ_DEFAULT;
	} else if (wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] &&
			((wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0) ||
			 (thread != THREAD_NULL && tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET))){
		// There's an event manager request and either:
		//   - no event manager currently running
		//   - we are re-using the event manager
		mode = RUN_NEXTREQ_EVENT_MANAGER;
		priclass = WORKQUEUE_EVENT_MANAGER_BUCKET;
		upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
		if (wq->wq_kevent_requests[WORKQUEUE_EVENT_MANAGER_BUCKET]){
			upcall_flags |= WQ_FLAG_THREAD_KEVENT;
		}
	} else {
		// Find highest priority and check for special request types
		for (priclass = 0; priclass < WORKQUEUE_EVENT_MANAGER_BUCKET; priclass++) {
			if (wq->wq_requests[priclass])
				break;
		}
		if (priclass == WORKQUEUE_EVENT_MANAGER_BUCKET){
			// the only request should have been the event manager's, since it's not in a bucket,
			// but we weren't able to handle it since there's already an event manager running,
			// so we fell into this case
			assert(wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] == 1 &&
					wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 1 &&
					wq->wq_reqcount == 1);
		}

		if (wq->wq_kevent_ocrequests[priclass]){
			mode = RUN_NEXTREQ_DEFERRED_OVERCOMMIT;
			upcall_flags |= WQ_FLAG_THREAD_KEVENT;
			upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
		} else if (wq->wq_ocrequests[priclass]){
			mode = RUN_NEXTREQ_DEFERRED_OVERCOMMIT;
			upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
		} else if (wq->wq_kevent_requests[priclass]){
			upcall_flags |= WQ_FLAG_THREAD_KEVENT;
		}
	}

	assert(mode != RUN_NEXTREQ_EVENT_MANAGER || priclass == WORKQUEUE_EVENT_MANAGER_BUCKET);
	assert(mode == RUN_NEXTREQ_EVENT_MANAGER || priclass != WORKQUEUE_EVENT_MANAGER_BUCKET);
	if (mode == RUN_NEXTREQ_DEFAULT /* non-overcommit */){
		uint32_t my_priclass = (thread != THREAD_NULL) ? tl->th_priority : WORKQUEUE_NUM_BUCKETS;
		if (may_start_constrained_thread(wq, priclass, my_priclass, &start_timer) == FALSE){
			// per policy, we won't start another constrained thread
		}
	}

	if (thread != THREAD_NULL) {
		/*
		 * thread is non-NULL here when we return from userspace
		 * in workq_kernreturn, rather than trying to find a thread
		 * we pick up new work for this specific thread.
		 */
		upcall_flags |= WQ_FLAG_THREAD_REUSE;
	} else if (wq->wq_thidlecount == 0) {
		/*
		 * we have no additional threads waiting to pick up
		 * work, however, there is additional work to do.
		 */
		start_timer = WQ_TIMER_DELAYED_NEEDED(wq);

		PTHREAD_TRACE_WQ(TRACE_wq_stalled, wq, wq->wq_nthreads, start_timer, 0, 0);
	} else {
		// there is both work available and an idle thread, so activate a thread
		tl = pop_from_thidlelist(wq, priclass);
		th_to_run = tl->th_thread;
	}
	// Adjust counters and thread flags AKA consume the request
	// TODO: It would be lovely if OVERCOMMIT consumed reqcount
	switch (mode) {
	case RUN_NEXTREQ_DEFAULT:
	case RUN_NEXTREQ_DEFAULT_KEVENT: /* actually mapped to DEFAULT above */
	case RUN_NEXTREQ_ADD_TIMER: /* actually mapped to DEFAULT above */
	case RUN_NEXTREQ_UNCONSTRAINED:
		wq->wq_requests[priclass]--;

		if (mode == RUN_NEXTREQ_DEFAULT){
			if (!(tl->th_flags & TH_LIST_CONSTRAINED)) {
				wq->wq_constrained_threads_scheduled++;
				tl->th_flags |= TH_LIST_CONSTRAINED;
			}
		} else if (mode == RUN_NEXTREQ_UNCONSTRAINED){
			if (tl->th_flags & TH_LIST_CONSTRAINED) {
				wq->wq_constrained_threads_scheduled--;
				tl->th_flags &= ~TH_LIST_CONSTRAINED;
			}
		}
		if (upcall_flags & WQ_FLAG_THREAD_KEVENT){
			wq->wq_kevent_requests[priclass]--;
		}
		break;

	case RUN_NEXTREQ_EVENT_MANAGER:
		wq->wq_requests[priclass]--;

		if (tl->th_flags & TH_LIST_CONSTRAINED) {
			wq->wq_constrained_threads_scheduled--;
			tl->th_flags &= ~TH_LIST_CONSTRAINED;
		}
		if (upcall_flags & WQ_FLAG_THREAD_KEVENT){
			wq->wq_kevent_requests[priclass]--;
		}
		break;

	case RUN_NEXTREQ_DEFERRED_OVERCOMMIT:
		wq->wq_requests[priclass]--;
		if (upcall_flags & WQ_FLAG_THREAD_KEVENT){
			wq->wq_kevent_ocrequests[priclass]--;
		} else {
			wq->wq_ocrequests[priclass]--;
		}
		break;

	case RUN_NEXTREQ_OVERCOMMIT:
	case RUN_NEXTREQ_OVERCOMMIT_KEVENT:
		if (tl->th_flags & TH_LIST_CONSTRAINED) {
			wq->wq_constrained_threads_scheduled--;
			tl->th_flags &= ~TH_LIST_CONSTRAINED;
		}
		break;
	}

	// Confirm we've maintained our counter invariants
	assert(wq->wq_requests[priclass] < UINT16_MAX);
	assert(wq->wq_ocrequests[priclass] < UINT16_MAX);
	assert(wq->wq_kevent_requests[priclass] < UINT16_MAX);
	assert(wq->wq_kevent_ocrequests[priclass] < UINT16_MAX);
	assert(wq->wq_ocrequests[priclass] + wq->wq_kevent_requests[priclass] +
			wq->wq_kevent_ocrequests[priclass] <=
			wq->wq_requests[priclass]);
	assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
	if (upcall_flags & WQ_FLAG_THREAD_KEVENT) {
		tl->th_flags |= TH_LIST_KEVENT;
	} else {
		tl->th_flags &= ~TH_LIST_KEVENT;
	}

	uint32_t orig_class = tl->th_priority;
	tl->th_priority = (uint8_t)priclass;

	if ((thread != THREAD_NULL) && (orig_class != priclass)) {
		/*
		 * we need to adjust these counters based on this
		 * thread's new disposition w/r to priority
		 */
		OSAddAtomic(-1, &wq->wq_thactive_count[orig_class]);
		OSAddAtomic(1, &wq->wq_thactive_count[priclass]);

		wq->wq_thscheduled_count[orig_class]--;
		wq->wq_thscheduled_count[priclass]++;
	}
	wq->wq_thread_yielded_count = 0;

	pthread_priority_t outgoing_priority = pthread_priority_from_wq_class_index(wq, tl->th_priority);
	PTHREAD_TRACE_WQ(TRACE_wq_reset_priority | DBG_FUNC_START, wq, thread_tid(tl->th_thread), outgoing_priority, 0, 0);
	reset_priority(tl, outgoing_priority);
	PTHREAD_TRACE_WQ(TRACE_wq_reset_priority | DBG_FUNC_END, wq, thread_tid(tl->th_thread), outgoing_priority, 0, 0);

	/*
	 * persist upcall_flags so that it can be retrieved in setup_wqthread
	 */
	tl->th_upcall_flags = upcall_flags >> WQ_FLAG_THREAD_PRIOSHIFT;
	/*
	 * if current thread is reused for work request, does not return via unix_syscall
	 */
	wq_runreq(p, th_to_run, wq, tl, (thread == th_to_run),
			(upcall_flags & WQ_FLAG_THREAD_KEVENT) && !kevent_bind_via_return);

	PTHREAD_TRACE_WQ(TRACE_wq_run_nextitem|DBG_FUNC_END, wq, thread_tid(th_to_run), mode == RUN_NEXTREQ_OVERCOMMIT, 1, 0);

	assert(!kevent_bind_via_return || (upcall_flags & WQ_FLAG_THREAD_KEVENT));
	if (kevent_bind_via_return && (upcall_flags & WQ_FLAG_THREAD_KEVENT)) {
		tl->th_flags |= TH_LIST_KEVENT_BOUND;
	}

	workqueue_unlock(wq);

	workqueue_interval_timer_start(wq);

	PTHREAD_TRACE_WQ(TRACE_wq_run_nextitem | DBG_FUNC_END, wq, thread_tid(thread), start_timer, 3, 0);

	if (thread != THREAD_NULL){
		parkit(wq, tl, thread);
	}

	workqueue_unlock(wq);
}
/*
 * parked thread wakes up
 */
wq_unpark_continue(void* __unused ptr, wait_result_t wait_result)
{
	boolean_t first_use = false;
	thread_t th = current_thread();
	proc_t p = current_proc();

	struct uthread *uth = pthread_kern->get_bsdthread_info(th);
	if (uth == NULL) goto done;

	struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
	if (tl == NULL) goto done;
	struct workqueue *wq = tl->th_workq;

	workqueue_lock_spin(wq);

	assert(tl->th_flags & TH_LIST_INITED);

	if ((tl->th_flags & TH_LIST_NEW)){
		tl->th_flags &= ~(TH_LIST_NEW);
		first_use = true;
	}

	if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) {
		/*
		 * The normal wakeup path.
		 */
		goto return_to_user;
	}
	if ((tl->th_flags & TH_LIST_RUNNING) == 0 &&
			wait_result == THREAD_TIMED_OUT &&
			tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET &&
			TAILQ_FIRST(&wq->wq_thidlemgrlist) == tl &&
			TAILQ_NEXT(tl, th_entry) == NULL){
		/*
		 * If we are the only idle manager and we pop'ed for self-destruction,
		 * then don't actually exit.  Instead, free our stack to save some
		 * memory and re-park.
		 */

		workqueue_unlock(wq);

		vm_map_t vmap = wq->wq_map;

		// Keep this in sync with _setup_wqthread()
		const vm_size_t guardsize = vm_map_page_size(vmap);
		const user_addr_t freeaddr = (user_addr_t)tl->th_stackaddr + guardsize;
		const vm_map_offset_t freesize = vm_map_trunc_page_mask((PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET) - 1, vm_map_page_mask(vmap)) - guardsize;

		kern_return_t kr;
		kr = mach_vm_behavior_set(vmap, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
		assert(kr == KERN_SUCCESS || kr == KERN_INVALID_ADDRESS);

		workqueue_lock_spin(wq);

		if ( !(tl->th_flags & TH_LIST_RUNNING)) {
			assert_wait((caddr_t)tl, (THREAD_INTERRUPTIBLE));

			workqueue_unlock(wq);

			thread_block(wq_unpark_continue);
		}
	}
	if ((tl->th_flags & TH_LIST_RUNNING) == 0) {
		assert((tl->th_flags & TH_LIST_BUSY) == 0);
		/*
		 * We were set running, but not for the purposes of actually running.
		 * This could be because the timer elapsed.  Or it could be because the
		 * thread aborted.  Either way, we need to return to userspace to exit.
		 *
		 * The call to workqueue_removethread will consume the lock.
		 */

		if (tl->th_priority != qos_class_get_class_index(WQ_THREAD_CLEANUP_QOS)) {
			// Reset the QoS to something low for the pthread cleanup
			pthread_priority_t cleanup_pri = _pthread_priority_make_newest(WQ_THREAD_CLEANUP_QOS, 0, 0);
			reset_priority(tl, cleanup_pri);
		}

		workqueue_removethread(tl, 0, first_use);

		if (first_use){
			pthread_kern->thread_bootstrap_return();
		} else {
			pthread_kern->unix_syscall_return(0);
		}
	}
	/*
	 * The timer woke us up or the thread was aborted.  However, we have
	 * already started to make this a runnable thread.  Wait for that to
	 * finish, then continue to userspace.
	 */
	while ((tl->th_flags & TH_LIST_BUSY)) {
		assert_wait((caddr_t)tl, (THREAD_UNINT));

		workqueue_unlock(wq);

		thread_block(THREAD_CONTINUE_NULL);

		workqueue_lock_spin(wq);
	}

return_to_user:
	workqueue_unlock(wq);
	_setup_wqthread(p, th, wq, tl, first_use);
	pthread_kern->thread_sched_call(th, workqueue_callback);
done:
	if (first_use){
		pthread_kern->thread_bootstrap_return();
	} else {
		pthread_kern->unix_syscall_return(EJUSTRETURN);
	}
	panic("Our attempt to return to userspace failed...");
}
/* called with workqueue lock held */
wq_runreq(proc_t p, thread_t th, struct workqueue *wq, struct threadlist *tl,
		boolean_t return_directly, boolean_t needs_kevent_bind)
{
	PTHREAD_TRACE1_WQ(TRACE_wq_runitem | DBG_FUNC_START, tl->th_workq, 0, 0, thread_tid(current_thread()), thread_tid(th));

	unsigned int kevent_flags = KEVENT_FLAG_WORKQ;
	if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
		kevent_flags |= KEVENT_FLAG_WORKQ_MANAGER;
	}

	if (return_directly) {
		if (needs_kevent_bind) {
			assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
			tl->th_flags |= TH_LIST_KEVENT_BOUND;
		}

		workqueue_unlock(wq);

		if (needs_kevent_bind) {
			kevent_qos_internal_bind(p, class_index_get_thread_qos(tl->th_priority), th, kevent_flags);
		}

		/*
		 * For preemption reasons, we want to reset the voucher as late as
		 * possible, so we do it in two places:
		 *   - Just before parking (i.e. in parkit())
		 *   - Prior to doing the setup for the next workitem (i.e. here)
		 *
		 * Those two places are sufficient to ensure we always reset it before
		 * it goes back out to user space, but be careful to not break that
		 * invariant.
		 */
		kern_return_t kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
		assert(kr == KERN_SUCCESS);

		_setup_wqthread(p, th, wq, tl, false);

		PTHREAD_TRACE_WQ(TRACE_wq_run_nextitem|DBG_FUNC_END, tl->th_workq, 0, 0, 4, 0);

		pthread_kern->unix_syscall_return(EJUSTRETURN);
	}

	if (needs_kevent_bind) {
		// Leave TH_LIST_BUSY set so that the thread can't beat us to calling kevent
		workqueue_unlock(wq);
		assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
		kevent_qos_internal_bind(p, class_index_get_thread_qos(tl->th_priority), th, kevent_flags);
		tl->th_flags |= TH_LIST_KEVENT_BOUND;
		workqueue_lock_spin(wq);
	}
	tl->th_flags &= ~(TH_LIST_BUSY);
	thread_wakeup_thread(tl,th);
}
#define KEVENT_LIST_LEN 16 // WORKQ_KEVENT_EVENT_BUFFER_LEN
#define KEVENT_DATA_SIZE (32 * 1024)
/*
 * configures initial thread stack/registers to jump into:
 * _pthread_wqthread(pthread_t self, mach_port_t kport, void *stackaddr, void *keventlist, int upcall_flags, int nkevents);
 * to get there we jump through assembly stubs in pthread_asm.s.  Those
 * routines setup a stack frame, using the current stack pointer, and marshall
 * arguments from registers to the stack as required by the ABI.
 *
 * One odd thing we do here is to start the pthread_t 4k below what would be the
 * top of the stack otherwise.  This is because usually only the first 4k of the
 * pthread_t will be used and so we want to put it on the same 16k page as the
 * top of the stack to save memory.
 *
 * When we are done the stack will look like:
 * |-----------| th_stackaddr + th_allocsize
 * |pthread_t  | th_stackaddr + DEFAULT_STACKSIZE + guardsize + PTHREAD_STACK_OFFSET
 * |kevent list| optionally - at most KEVENT_LIST_LEN events
 * |kevent data| optionally - at most KEVENT_DATA_SIZE bytes
 * |stack gap  | bottom aligned to 16 bytes, and at least as big as stack_gap_min
 * |   STACK   |
 * |     |     |
 * |guard page | guardsize
 * |-----------| th_stackaddr
 */
_setup_wqthread(proc_t p, thread_t th, struct workqueue *wq, struct threadlist *tl,
		bool first_use)
{
	uint32_t upcall_flags;

	pthread_priority_t priority = pthread_priority_from_wq_class_index(wq, tl->th_priority);

	const vm_size_t guardsize = vm_map_page_size(tl->th_workq->wq_map);
	const vm_size_t stack_gap_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_REDZONE_LEN;
	const vm_size_t stack_align_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_STK_ALIGN;

	user_addr_t pthread_self_addr = (user_addr_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET);
	user_addr_t stack_top_addr = (user_addr_t)((pthread_self_addr - stack_gap_min) & -stack_align_min);
	user_addr_t stack_bottom_addr = (user_addr_t)(tl->th_stackaddr + guardsize);

	user_addr_t wqstart_fnptr = pthread_kern->proc_get_wqthread(p);
	if (!wqstart_fnptr) {
		panic("workqueue thread start function pointer is NULL");
	}
	/* Put the QoS class value into the lower bits of the reuse_thread register, this is where
	 * the thread priority used to be stored anyway.
	 */
	upcall_flags  = tl->th_upcall_flags << WQ_FLAG_THREAD_PRIOSHIFT;
	upcall_flags |= (_pthread_priority_get_qos_newest(priority) & WQ_FLAG_THREAD_PRIOMASK);

	upcall_flags |= WQ_FLAG_THREAD_NEWSPI;

	uint32_t tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
	if (tsd_offset) {
		mach_vm_offset_t th_tsd_base = (mach_vm_offset_t)pthread_self_addr + tsd_offset;
		kern_return_t kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
		if (kret == KERN_SUCCESS) {
			upcall_flags |= WQ_FLAG_THREAD_TSD_BASE_SET;
		}
	}
	/*
	 * Pre-fault the first page of the new thread's stack and the page that will
	 * contain the pthread_t structure.
	 */
	if (first_use) {
		vm_map_t vmap = pthread_kern->current_map();
		if (vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)) !=
				vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap))){
			vm_fault(vmap,
					vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)),
					VM_PROT_READ | VM_PROT_WRITE,
					FALSE,
					THREAD_UNINT, NULL, 0);
		}
		vm_fault(vmap,
				vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap)),
				VM_PROT_READ | VM_PROT_WRITE,
				FALSE,
				THREAD_UNINT, NULL, 0);
	} else {
		upcall_flags |= WQ_FLAG_THREAD_REUSE;
	}
	user_addr_t kevent_list = NULL;
	int kevent_count = 0;
	if (upcall_flags & WQ_FLAG_THREAD_KEVENT){
		kevent_list = pthread_self_addr - KEVENT_LIST_LEN * sizeof(struct kevent_qos_s);
		kevent_count = KEVENT_LIST_LEN;

		user_addr_t kevent_data_buf = kevent_list - KEVENT_DATA_SIZE;
		user_size_t kevent_data_available = KEVENT_DATA_SIZE;

		int32_t events_out = 0;

		assert(tl->th_flags & TH_LIST_KEVENT_BOUND);
		unsigned int flags = KEVENT_FLAG_WORKQ | KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE;
		if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
			flags |= KEVENT_FLAG_WORKQ_MANAGER;
		}
		int ret = kevent_qos_internal(p, class_index_get_thread_qos(tl->th_priority), NULL, 0, kevent_list, kevent_count,
				kevent_data_buf, &kevent_data_available,
				flags, &events_out);

		// turns out there are a lot of edge cases where this will fail, so not enabled by default
		//assert((ret == KERN_SUCCESS && events_out != -1) || ret == KERN_ABORTED);

		// squash any errors into just empty output
		if (ret != KERN_SUCCESS || events_out == -1){
			events_out = 0;
			kevent_data_available = KEVENT_DATA_SIZE;
		}

		// We shouldn't get data out if there aren't events available
		assert(events_out != 0 || kevent_data_available == KEVENT_DATA_SIZE);

		if (events_out > 0){
			if (kevent_data_available == KEVENT_DATA_SIZE){
				stack_top_addr = (kevent_list - stack_gap_min) & -stack_align_min;
			} else {
				stack_top_addr = (kevent_data_buf + kevent_data_available - stack_gap_min) & -stack_align_min;
			}

			kevent_count = events_out;
		}
	}
#if defined(__i386__) || defined(__x86_64__)
	if (proc_is64bit(p) == 0) {
		x86_thread_state32_t state = {
			.eip = (unsigned int)wqstart_fnptr,
			.eax = /* arg0 */ (unsigned int)pthread_self_addr,
			.ebx = /* arg1 */ (unsigned int)tl->th_thport,
			.ecx = /* arg2 */ (unsigned int)stack_bottom_addr,
			.edx = /* arg3 */ (unsigned int)kevent_list,
			.edi = /* arg4 */ (unsigned int)upcall_flags,
			.esi = /* arg5 */ (unsigned int)kevent_count,

			.esp = (int)((vm_offset_t)stack_top_addr),
		};

		error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
		if (error != KERN_SUCCESS) {
			panic(__func__ ": thread_set_wq_state failed: %d", error);
		}
	} else {
		x86_thread_state64_t state64 = {
			// x86-64 already passes all the arguments in registers, so we just put them in their final place here
			.rip = (uint64_t)wqstart_fnptr,
			.rdi = (uint64_t)pthread_self_addr,
			.rsi = (uint64_t)tl->th_thport,
			.rdx = (uint64_t)stack_bottom_addr,
			.rcx = (uint64_t)kevent_list,
			.r8 = (uint64_t)upcall_flags,
			.r9 = (uint64_t)kevent_count,

			.rsp = (uint64_t)(stack_top_addr)
		};

		error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
		if (error != KERN_SUCCESS) {
			panic(__func__ ": thread_set_wq_state failed: %d", error);
		}
	}
#else
#error setup_wqthread not defined for this architecture
#endif
}
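/*
 * Illustrative sketch (not compiled into this file): the stack top computed in
 * _setup_wqthread() is "pthread_self address minus the ABI gap, rounded down to
 * the ABI alignment", i.e. (addr - gap) & -align.  The constants below are
 * hypothetical stand-ins for C_64_REDZONE_LEN / C_64_STK_ALIGN.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint64_t gap   = 128;     /* hypothetical red zone */
	const uint64_t align = 16;      /* hypothetical stack alignment */
	uint64_t pthread_self_addr = 0x70000add1000ULL + 0x123;

	/* -align == ~(align - 1) for a power-of-two alignment */
	uint64_t stack_top = (pthread_self_addr - gap) & -align;

	printf("stack top: 0x%llx (16-byte aligned: %s)\n",
			(unsigned long long)stack_top,
			(stack_top % 16 == 0) ? "yes" : "no");
	return 0;
}
#endif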
static int wq_kevent_test SYSCTL_HANDLER_ARGS
{
	//(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
#pragma unused(oidp, arg1, arg2)
	int error;
	struct workq_reqthreads_req_s requests[64] = {};

	if (req->newlen > sizeof(requests) || req->newlen < sizeof(struct workq_reqthreads_req_s))
		return EINVAL;

	error = copyin(req->newptr, requests, req->newlen);
	if (error) return error;

	_workq_reqthreads(req->p, (int)(req->newlen / sizeof(struct workq_reqthreads_req_s)), requests);

	return 0;
}
_fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
{
	struct workqueue * wq;
	int activecount = 0;
	uint32_t pri;

	if ((wq = pthread_kern->proc_get_wqptr(p)) == NULL) {
		return EINVAL;
	}

	workqueue_lock_spin(wq);

	for (pri = 0; pri < WORKQUEUE_NUM_BUCKETS; pri++) {
		activecount += wq->wq_thactive_count[pri];
	}
	pwqinfo->pwq_nthreads = wq->wq_nthreads;
	pwqinfo->pwq_runthreads = activecount;
	pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
	pwqinfo->pwq_state = 0;

	if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
		pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
	}

	if (wq->wq_nthreads >= wq_max_threads) {
		pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
	}

	workqueue_unlock(wq);

	return 0;
}
_get_pwq_state_kdp(proc_t p)
{
	struct workqueue *wq = pthread_kern->proc_get_wqptr(p);

	if (wq == NULL || workqueue_lock_spin_is_acquired_kdp(wq)) {
		return 0;
	}

	uint32_t pwq_state = WQ_FLAGS_AVAILABLE;

	if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
		pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
	}

	if (wq->wq_nthreads >= wq_max_threads) {
		pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
	}

	return pwq_state;
}
_thread_selfid(__unused struct proc *p, uint64_t *retval)
{
	thread_t thread = current_thread();
	*retval = thread_tid(thread);
	return KERN_SUCCESS;
}
	pthread_lck_grp_attr = lck_grp_attr_alloc_init();
	pthread_lck_grp = lck_grp_alloc_init("pthread", pthread_lck_grp_attr);

	/*
	 * allocate the lock attribute for pthread synchronizers
	 */
	pthread_lck_attr = lck_attr_alloc_init();

	pthread_list_mlock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);

	pth_global_hashinit();
	psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL);

	sysctl_register_oid(&sysctl__kern_wq_yielded_threshold);
	sysctl_register_oid(&sysctl__kern_wq_yielded_window_usecs);
	sysctl_register_oid(&sysctl__kern_wq_stalled_window_usecs);
	sysctl_register_oid(&sysctl__kern_wq_reduce_pool_window_usecs);
	sysctl_register_oid(&sysctl__kern_wq_max_timer_interval_usecs);
	sysctl_register_oid(&sysctl__kern_wq_max_threads);
	sysctl_register_oid(&sysctl__kern_wq_max_constrained_threads);
	sysctl_register_oid(&sysctl__kern_pthread_debug_tracing);

	sysctl_register_oid(&sysctl__kern_wq_max_concurrency);
	sysctl_register_oid(&sysctl__debug_wq_kevent_test);

	wq_max_concurrency = pthread_kern->ml_get_max_cpus();