1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995-2005 Apple Computer, Inc. All Rights Reserved */
29 /*
30 * pthread_synch.c
31 */
32
33 #pragma mark - Front Matter
34
35 #define _PTHREAD_CONDATTR_T
36 #define _PTHREAD_COND_T
37 #define _PTHREAD_MUTEXATTR_T
38 #define _PTHREAD_MUTEX_T
39 #define _PTHREAD_RWLOCKATTR_T
40 #define _PTHREAD_RWLOCK_T
41
42 #undef pthread_mutexattr_t
43 #undef pthread_mutex_t
44 #undef pthread_condattr_t
45 #undef pthread_cond_t
46 #undef pthread_rwlockattr_t
47 #undef pthread_rwlock_t
48
49 #include <sys/cdefs.h>
50
51 // <rdar://problem/26158937> panic() should be marked noreturn
52 extern void panic(const char *string, ...) __printflike(1,2) __dead2;
53
54 #include <sys/param.h>
55 #include <sys/queue.h>
56 #include <sys/resourcevar.h>
57 //#include <sys/proc_internal.h>
58 #include <sys/kauth.h>
59 #include <sys/systm.h>
60 #include <sys/timeb.h>
61 #include <sys/times.h>
62 #include <sys/acct.h>
63 #include <sys/kernel.h>
64 #include <sys/wait.h>
65 #include <sys/signalvar.h>
66 #include <sys/sysctl.h>
67 #include <sys/syslog.h>
68 #include <sys/stat.h>
69 #include <sys/lock.h>
70 #include <sys/kdebug.h>
71 //#include <sys/sysproto.h>
72 #include <sys/vm.h>
73 #include <sys/user.h> /* for coredump */
74 #include <sys/proc_info.h> /* for fill_procworkqueue */
75
76 #include <mach/mach_port.h>
77 #include <mach/mach_types.h>
78 #include <mach/semaphore.h>
79 #include <mach/sync_policy.h>
80 #include <mach/task.h>
81 #include <mach/vm_prot.h>
82 #include <kern/kern_types.h>
83 #include <kern/task.h>
84 #include <kern/clock.h>
85 #include <mach/kern_return.h>
86 #include <kern/thread.h>
87 #include <kern/sched_prim.h>
88 #include <kern/kalloc.h>
89 #include <kern/sched_prim.h> /* for thread_exception_return */
90 #include <kern/processor.h>
91 #include <kern/assert.h>
92 #include <mach/mach_vm.h>
93 #include <mach/mach_param.h>
94 #include <mach/thread_status.h>
95 #include <mach/thread_policy.h>
96 #include <mach/message.h>
97 #include <mach/port.h>
98 //#include <vm/vm_protos.h>
99 #include <vm/vm_fault.h>
100 #include <vm/vm_map.h>
101 #include <mach/thread_act.h> /* for thread_resume */
102 #include <machine/machine_routines.h>
103 #include <mach/shared_region.h>
104
105 #include <libkern/OSAtomic.h>
106 #include <libkern/libkern.h>
107
108 #include <sys/pthread_shims.h>
109 #include "kern_internal.h"
110
111 // XXX: Dirty import for sys/signalvar.h that's wrapped in BSD_KERNEL_PRIVATE
112 #define sigcantmask (sigmask(SIGKILL) | sigmask(SIGSTOP))
113
114 // XXX: Ditto for thread tags from kern/thread.h
115 #define THREAD_TAG_MAINTHREAD 0x1
116 #define THREAD_TAG_PTHREAD 0x10
117 #define THREAD_TAG_WORKQUEUE 0x20
118
119 lck_grp_attr_t *pthread_lck_grp_attr;
120 lck_grp_t *pthread_lck_grp;
121 lck_attr_t *pthread_lck_attr;
122
123 extern void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64);
124 extern void workqueue_thread_yielded(void);
125
126 enum run_nextreq_mode {
127 RUN_NEXTREQ_DEFAULT,
128 RUN_NEXTREQ_DEFAULT_KEVENT,
129 RUN_NEXTREQ_OVERCOMMIT,
130 RUN_NEXTREQ_OVERCOMMIT_KEVENT,
131 RUN_NEXTREQ_DEFERRED_OVERCOMMIT,
132 RUN_NEXTREQ_UNCONSTRAINED,
133 RUN_NEXTREQ_EVENT_MANAGER,
134 RUN_NEXTREQ_ADD_TIMER
135 };
136 static thread_t workqueue_run_nextreq(proc_t p, struct workqueue *wq, thread_t th,
137 enum run_nextreq_mode mode, pthread_priority_t prio,
138 bool kevent_bind_via_return);
139
140 static boolean_t workqueue_run_one(proc_t p, struct workqueue *wq, boolean_t overcommit, pthread_priority_t priority);
141
142 static void wq_runreq(proc_t p, thread_t th, struct workqueue *wq,
143 struct threadlist *tl, boolean_t return_directly, boolean_t deferred_kevent);
144
145 static void _setup_wqthread(proc_t p, thread_t th, struct workqueue *wq, struct threadlist *tl, bool first_use);
146
147 static void reset_priority(struct threadlist *tl, pthread_priority_t pri);
148 static pthread_priority_t pthread_priority_from_wq_class_index(struct workqueue *wq, int index);
149
150 static void wq_unpark_continue(void* ptr, wait_result_t wait_result) __dead2;
151
152 static boolean_t workqueue_addnewthread(struct workqueue *wq, boolean_t ignore_constrained_thread_limit);
153
154 static void workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use);
155 static void workqueue_lock_spin(struct workqueue *);
156 static void workqueue_unlock(struct workqueue *);
157
158 static boolean_t may_start_constrained_thread(struct workqueue *wq, uint32_t at_priclass, uint32_t my_priclass, boolean_t *start_timer);
159
160 static mach_vm_offset_t stack_addr_hint(proc_t p, vm_map_t vmap);
161
162 int proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc);
163 int proc_setalltargetconc(pid_t pid, int32_t * targetconcp);
164
165 #define WQ_MAXPRI_MIN 0 /* low prio queue num */
166 #define WQ_MAXPRI_MAX 2 /* max prio queuenum */
167 #define WQ_PRI_NUM 3 /* number of prio work queues */
168
169 #define C_32_STK_ALIGN 16
170 #define C_64_STK_ALIGN 16
171 #define C_64_REDZONE_LEN 128
172
173 #define PTHREAD_T_OFFSET 0
174
175 /*
176 * Flags field passed to bsdthread_create and back in pthread_start
177 31 <---------------------------------> 0
178 _________________________________________
179 | flags(8) | policy(8) | importance(16) |
180 -----------------------------------------
181 */
182
183 #define PTHREAD_START_CUSTOM 0x01000000
184 #define PTHREAD_START_SETSCHED 0x02000000
185 #define PTHREAD_START_DETACHED 0x04000000
186 #define PTHREAD_START_QOSCLASS 0x08000000
187 #define PTHREAD_START_TSD_BASE_SET 0x10000000
188 #define PTHREAD_START_QOSCLASS_MASK 0x00ffffff
189 #define PTHREAD_START_POLICY_BITSHIFT 16
190 #define PTHREAD_START_POLICY_MASK 0xff
191 #define PTHREAD_START_IMPORTANCE_MASK 0xffff
192
193 #define SCHED_OTHER POLICY_TIMESHARE
194 #define SCHED_FIFO POLICY_FIFO
195 #define SCHED_RR POLICY_RR
196
197 #define BASEPRI_DEFAULT 31
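/*
 * Illustrative sketch (not part of the original source): how a caller such as
 * pthread's userspace layer might pack the flags word described in the layout
 * comment above. The `policy` and `importance` values here are hypothetical.
 *
 *     uint32_t flags = PTHREAD_START_SETSCHED
 *         | ((policy & PTHREAD_START_POLICY_MASK) << PTHREAD_START_POLICY_BITSHIFT)
 *         | (importance & PTHREAD_START_IMPORTANCE_MASK);
 *
 * _bsdthread_create() below reverses this: it extracts the policy with the
 * shift and mask, and converts the low 16 importance bits into a precedence
 * relative to BASEPRI_DEFAULT.
 */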
198
199 #pragma mark sysctls
200
201 uint32_t wq_yielded_threshold = WQ_YIELDED_THRESHOLD;
202 uint32_t wq_yielded_window_usecs = WQ_YIELDED_WINDOW_USECS;
203 uint32_t wq_stalled_window_usecs = WQ_STALLED_WINDOW_USECS;
204 uint32_t wq_reduce_pool_window_usecs = WQ_REDUCE_POOL_WINDOW_USECS;
205 uint32_t wq_max_timer_interval_usecs = WQ_MAX_TIMER_INTERVAL_USECS;
206 uint32_t wq_max_threads = WORKQUEUE_MAXTHREADS;
207 uint32_t wq_max_constrained_threads = WORKQUEUE_MAXTHREADS / 8;
208 uint32_t wq_max_concurrency = 1; // set to ncpus on load
209
210 SYSCTL_INT(_kern, OID_AUTO, wq_yielded_threshold, CTLFLAG_RW | CTLFLAG_LOCKED,
211 &wq_yielded_threshold, 0, "");
212
213 SYSCTL_INT(_kern, OID_AUTO, wq_yielded_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
214 &wq_yielded_window_usecs, 0, "");
215
216 SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
217 &wq_stalled_window_usecs, 0, "");
218
219 SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
220 &wq_reduce_pool_window_usecs, 0, "");
221
222 SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
223 &wq_max_timer_interval_usecs, 0, "");
224
225 SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
226 &wq_max_threads, 0, "");
227
228 SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
229 &wq_max_constrained_threads, 0, "");
230
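/*
 * Illustrative userspace sketch (not part of the original source): the
 * tunables above are exported under the "kern" sysctl namespace, so, for
 * example, the workqueue thread-pool ceiling can be inspected with
 * sysctlbyname(3) from a program that includes <sys/sysctl.h>:
 *
 *     uint32_t max_threads = 0;
 *     size_t len = sizeof(max_threads);
 *     if (sysctlbyname("kern.wq_max_threads", &max_threads, &len, NULL, 0) == 0) {
 *         printf("wq_max_threads = %u\n", max_threads);
 *     }
 */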
231 #ifdef DEBUG
232 SYSCTL_INT(_kern, OID_AUTO, wq_max_concurrency, CTLFLAG_RW | CTLFLAG_LOCKED,
233 &wq_max_concurrency, 0, "");
234
235 static int wq_kevent_test SYSCTL_HANDLER_ARGS;
236 SYSCTL_PROC(_debug, OID_AUTO, wq_kevent_test, CTLFLAG_MASKED | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLTYPE_OPAQUE, NULL, 0, wq_kevent_test, 0, "-");
237 #endif
238
239 static uint32_t wq_init_constrained_limit = 1;
240
241 uint32_t pthread_debug_tracing = 1;
242
243 SYSCTL_INT(_kern, OID_AUTO, pthread_debug_tracing, CTLFLAG_RW | CTLFLAG_LOCKED,
244 &pthread_debug_tracing, 0, "");
245
246
247 #pragma mark - Process/Thread Setup/Teardown syscalls
248
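/**
 * Compute a randomized base-address hint for pthread stack allocations.
 * The ASLR offset and the base it is applied to depend on the architecture
 * and on whether the process is 64-bit; the result is truncated to the
 * target map's page mask.
 */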
249 static mach_vm_offset_t
250 stack_addr_hint(proc_t p, vm_map_t vmap)
251 {
252 mach_vm_offset_t stackaddr;
253 mach_vm_offset_t aslr_offset;
254 bool proc64bit = proc_is64bit(p);
255
256 // We can't safely take random values % something unless it's a power of two
257 _Static_assert(powerof2(PTH_DEFAULT_STACKSIZE), "PTH_DEFAULT_STACKSIZE is a power-of-two");
258
259 #if defined(__i386__) || defined(__x86_64__)
260 if (proc64bit) {
261 // Matches vm_map_get_max_aslr_slide_pages's image shift in xnu
262 aslr_offset = random() % (1 << 28); // about 512 stacks
263 } else {
264 // Actually bigger than the image shift, we've got ~256MB to work with
265 aslr_offset = random() % (16 * PTH_DEFAULT_STACKSIZE);
266 }
267 aslr_offset = vm_map_trunc_page_mask(aslr_offset, vm_map_page_mask(vmap));
268 if (proc64bit) {
269 // Above nanomalloc range (see NANOZONE_SIGNATURE)
270 stackaddr = 0x700000000000 + aslr_offset;
271 } else {
272 stackaddr = SHARED_REGION_BASE_I386 + SHARED_REGION_SIZE_I386 + aslr_offset;
273 }
274 #elif defined(__arm__) || defined(__arm64__)
275 // vm_map_get_max_aslr_slide_pages ensures 1MB of slide; we do better
276 aslr_offset = random() % ((proc64bit ? 4 : 2) * PTH_DEFAULT_STACKSIZE);
277 aslr_offset = vm_map_trunc_page_mask((vm_map_offset_t)aslr_offset, vm_map_page_mask(vmap));
278 if (proc64bit) {
279 // 64 stacks below nanomalloc (see NANOZONE_SIGNATURE)
280 stackaddr = 0x170000000 - 64 * PTH_DEFAULT_STACKSIZE - aslr_offset;
281 } else {
282 // If you try to slide down from this point, you risk ending up in memory consumed by malloc
283 stackaddr = SHARED_REGION_BASE_ARM - 32 * PTH_DEFAULT_STACKSIZE + aslr_offset;
284 }
285 #else
286 #error Need to define a stack address hint for this architecture
287 #endif
288 return stackaddr;
289 }
290
291 /**
292 * bsdthread_create system call. Used by pthread_create.
293 */
294 int
295 _bsdthread_create(struct proc *p, user_addr_t user_func, user_addr_t user_funcarg, user_addr_t user_stack, user_addr_t user_pthread, uint32_t flags, user_addr_t *retval)
296 {
297 kern_return_t kret;
298 void * sright;
299 int error = 0;
300 int allocated = 0;
301 mach_vm_offset_t stackaddr;
302 mach_vm_size_t th_allocsize = 0;
303 mach_vm_size_t th_guardsize;
304 mach_vm_offset_t th_stack;
305 mach_vm_offset_t th_pthread;
306 mach_vm_offset_t th_tsd_base;
307 mach_port_name_t th_thport;
308 thread_t th;
309 vm_map_t vmap = pthread_kern->current_map();
310 task_t ctask = current_task();
311 unsigned int policy, importance;
312 uint32_t tsd_offset;
313
314 int isLP64 = 0;
315
316 if (pthread_kern->proc_get_register(p) == 0) {
317 return EINVAL;
318 }
319
320 PTHREAD_TRACE(TRACE_pthread_thread_create | DBG_FUNC_START, flags, 0, 0, 0, 0);
321
322 isLP64 = proc_is64bit(p);
323 th_guardsize = vm_map_page_size(vmap);
324
325 stackaddr = pthread_kern->proc_get_stack_addr_hint(p);
326 kret = pthread_kern->thread_create(ctask, &th);
327 if (kret != KERN_SUCCESS)
328 return(ENOMEM);
329 thread_reference(th);
330
331 pthread_kern->thread_set_tag(th, THREAD_TAG_PTHREAD);
332
333 sright = (void *)pthread_kern->convert_thread_to_port(th);
334 th_thport = pthread_kern->ipc_port_copyout_send(sright, pthread_kern->task_get_ipcspace(ctask));
335
336 if ((flags & PTHREAD_START_CUSTOM) == 0) {
337 mach_vm_size_t pthread_size =
338 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(vmap));
339 th_allocsize = th_guardsize + user_stack + pthread_size;
340 user_stack += PTHREAD_T_OFFSET;
341
342 kret = mach_vm_map(vmap, &stackaddr,
343 th_allocsize,
344 page_size-1,
345 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE , NULL,
346 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
347 VM_INHERIT_DEFAULT);
348 if (kret != KERN_SUCCESS){
349 kret = mach_vm_allocate(vmap,
350 &stackaddr, th_allocsize,
351 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE);
352 }
353 if (kret != KERN_SUCCESS) {
354 error = ENOMEM;
355 goto out;
356 }
357
358 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, th_allocsize, stackaddr, 0, 2, 0);
359
360 allocated = 1;
361 /*
362 * The guard page is at the lowest address
363 * The stack base is the highest address
364 */
365 kret = mach_vm_protect(vmap, stackaddr, th_guardsize, FALSE, VM_PROT_NONE);
366
367 if (kret != KERN_SUCCESS) {
368 error = ENOMEM;
369 goto out1;
370 }
371
372 th_pthread = stackaddr + th_guardsize + user_stack;
373 th_stack = th_pthread;
374
375 /*
376 * Pre-fault the first page of the new thread's stack and the page that will
377 * contain the pthread_t structure.
378 */
379 if (vm_map_trunc_page_mask((vm_map_offset_t)(th_stack - C_64_REDZONE_LEN), vm_map_page_mask(vmap)) !=
380 vm_map_trunc_page_mask((vm_map_offset_t)th_pthread, vm_map_page_mask(vmap))){
381 vm_fault( vmap,
382 vm_map_trunc_page_mask((vm_map_offset_t)(th_stack - C_64_REDZONE_LEN), vm_map_page_mask(vmap)),
383 VM_PROT_READ | VM_PROT_WRITE,
384 FALSE,
385 THREAD_UNINT, NULL, 0);
386 }
387
388 vm_fault( vmap,
389 vm_map_trunc_page_mask((vm_map_offset_t)th_pthread, vm_map_page_mask(vmap)),
390 VM_PROT_READ | VM_PROT_WRITE,
391 FALSE,
392 THREAD_UNINT, NULL, 0);
393
394 } else {
395 th_stack = user_stack;
396 th_pthread = user_pthread;
397
398 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, 0, 0, 0, 3, 0);
399 }
400
401 tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
402 if (tsd_offset) {
403 th_tsd_base = th_pthread + tsd_offset;
404 kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
405 if (kret == KERN_SUCCESS) {
406 flags |= PTHREAD_START_TSD_BASE_SET;
407 }
408 }
409
410 #if defined(__i386__) || defined(__x86_64__)
411 /*
412 * Set up i386 registers & function call.
413 */
414 if (isLP64 == 0) {
415 x86_thread_state32_t state = {
416 .eip = (unsigned int)pthread_kern->proc_get_threadstart(p),
417 .eax = (unsigned int)th_pthread,
418 .ebx = (unsigned int)th_thport,
419 .ecx = (unsigned int)user_func,
420 .edx = (unsigned int)user_funcarg,
421 .edi = (unsigned int)user_stack,
422 .esi = (unsigned int)flags,
423 /*
424 * set stack pointer
425 */
426 .esp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN))
427 };
428
429 error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
430 if (error != KERN_SUCCESS) {
431 error = EINVAL;
432 goto out;
433 }
434 } else {
435 x86_thread_state64_t state64 = {
436 .rip = (uint64_t)pthread_kern->proc_get_threadstart(p),
437 .rdi = (uint64_t)th_pthread,
438 .rsi = (uint64_t)(th_thport),
439 .rdx = (uint64_t)user_func,
440 .rcx = (uint64_t)user_funcarg,
441 .r8 = (uint64_t)user_stack,
442 .r9 = (uint64_t)flags,
443 /*
444 * set stack pointer aligned to 16 byte boundary
445 */
446 .rsp = (uint64_t)(th_stack - C_64_REDZONE_LEN)
447 };
448
449 error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
450 if (error != KERN_SUCCESS) {
451 error = EINVAL;
452 goto out;
453 }
454
455 }
456 #elif defined(__arm__)
457 arm_thread_state_t state = {
458 .pc = (int)pthread_kern->proc_get_threadstart(p),
459 .r[0] = (unsigned int)th_pthread,
460 .r[1] = (unsigned int)th_thport,
461 .r[2] = (unsigned int)user_func,
462 .r[3] = (unsigned int)user_funcarg,
463 .r[4] = (unsigned int)user_stack,
464 .r[5] = (unsigned int)flags,
465
466 /* Set r7 & lr to 0 for better back tracing */
467 .r[7] = 0,
468 .lr = 0,
469
470 /*
471 * set stack pointer
472 */
473 .sp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN))
474 };
475
476 (void) pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
477
478 #else
479 #error bsdthread_create not defined for this architecture
480 #endif
481
482 if ((flags & PTHREAD_START_SETSCHED) != 0) {
483 /* Set scheduling parameters if needed */
484 thread_extended_policy_data_t extinfo;
485 thread_precedence_policy_data_t precedinfo;
486
487 importance = (flags & PTHREAD_START_IMPORTANCE_MASK);
488 policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;
489
490 if (policy == SCHED_OTHER) {
491 extinfo.timeshare = 1;
492 } else {
493 extinfo.timeshare = 0;
494 }
495
496 thread_policy_set(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT);
497
498 precedinfo.importance = (importance - BASEPRI_DEFAULT);
499 thread_policy_set(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT);
500 } else if ((flags & PTHREAD_START_QOSCLASS) != 0) {
501 /* Set thread QoS class if requested. */
502 pthread_priority_t priority = (pthread_priority_t)(flags & PTHREAD_START_QOSCLASS_MASK);
503
504 thread_qos_policy_data_t qos;
505 qos.qos_tier = pthread_priority_get_thread_qos(priority);
506 qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 :
507 _pthread_priority_get_relpri(priority);
508
509 pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
510 }
511
512 kret = pthread_kern->thread_resume(th);
513 if (kret != KERN_SUCCESS) {
514 error = EINVAL;
515 goto out1;
516 }
517 thread_deallocate(th); /* drop the creator reference */
518
519 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_END, error, th_pthread, 0, 0, 0);
520
521 // cast required as mach_vm_offset_t is always 64 bits even on 32-bit platforms
522 *retval = (user_addr_t)th_pthread;
523
524 return(0);
525
526 out1:
527 if (allocated != 0) {
528 (void)mach_vm_deallocate(vmap, stackaddr, th_allocsize);
529 }
530 out:
531 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(ctask), th_thport);
532 (void)thread_terminate(th);
533 (void)thread_deallocate(th);
534 return(error);
535 }
536
537 /**
538 * bsdthread_terminate system call. Used by pthread_terminate
539 */
540 int
541 _bsdthread_terminate(__unused struct proc *p,
542 user_addr_t stackaddr,
543 size_t size,
544 uint32_t kthport,
545 uint32_t sem,
546 __unused int32_t *retval)
547 {
548 mach_vm_offset_t freeaddr;
549 mach_vm_size_t freesize;
550 kern_return_t kret;
551 thread_t th = current_thread();
552
553 freeaddr = (mach_vm_offset_t)stackaddr;
554 freesize = size;
555
556 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_START, freeaddr, freesize, kthport, 0xff, 0);
557
558 if ((freesize != (mach_vm_size_t)0) && (freeaddr != (mach_vm_offset_t)0)) {
559 if (pthread_kern->thread_get_tag(th) & THREAD_TAG_MAINTHREAD){
560 vm_map_t user_map = pthread_kern->current_map();
561 freesize = vm_map_trunc_page_mask((vm_map_offset_t)freesize - 1, vm_map_page_mask(user_map));
562 kret = mach_vm_behavior_set(user_map, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
563 assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
564 kret = kret ? kret : mach_vm_protect(user_map, freeaddr, freesize, FALSE, VM_PROT_NONE);
565 assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
566 } else {
567 kret = mach_vm_deallocate(pthread_kern->current_map(), freeaddr, freesize);
568 if (kret != KERN_SUCCESS) {
569 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);
570 return(EINVAL);
571 }
572 }
573 }
574
575 (void) thread_terminate(th);
576 if (sem != MACH_PORT_NULL) {
577 kret = pthread_kern->semaphore_signal_internal_trap(sem);
578 if (kret != KERN_SUCCESS) {
579 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);
580 return(EINVAL);
581 }
582 }
583
584 if (kthport != MACH_PORT_NULL) {
585 pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(current_task()), kthport);
586 }
587
588 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0, 0, 0, 0);
589
590 pthread_kern->thread_exception_return();
591 panic("bsdthread_terminate: still running\n");
592
593 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0xff, 0, 0, 0);
594
595 return(0);
596 }
597
598 /**
599 * bsdthread_register system call. Performs per-process setup. Responsible for
600 * returning capability bits to userspace and receiving userspace function addresses.
601 */
602 int
603 _bsdthread_register(struct proc *p,
604 user_addr_t threadstart,
605 user_addr_t wqthread,
606 int pthsize,
607 user_addr_t pthread_init_data,
608 user_addr_t pthread_init_data_size,
609 uint64_t dispatchqueue_offset,
610 int32_t *retval)
611 {
612 /* We have to do this first so that it resets after fork */
613 pthread_kern->proc_set_stack_addr_hint(p, (user_addr_t)stack_addr_hint(p, pthread_kern->current_map()));
614
615 /* prevent multiple registrations */
616 if (pthread_kern->proc_get_register(p) != 0) {
617 return(EINVAL);
618 }
619 /* syscall randomizer test can pass bogus values */
620 if (pthsize < 0 || pthsize > MAX_PTHREAD_SIZE) {
621 return(EINVAL);
622 }
623 pthread_kern->proc_set_threadstart(p, threadstart);
624 pthread_kern->proc_set_wqthread(p, wqthread);
625 pthread_kern->proc_set_pthsize(p, pthsize);
626 pthread_kern->proc_set_register(p);
627
628 /* if we have pthread_init_data, then we use that and target_concptr (which is an offset) to get the data. */
629 if (pthread_init_data != 0) {
630 thread_qos_policy_data_t qos;
631
632 struct _pthread_registration_data data = {};
633 size_t pthread_init_sz = MIN(sizeof(struct _pthread_registration_data), (size_t)pthread_init_data_size);
634
635 kern_return_t kr = copyin(pthread_init_data, &data, pthread_init_sz);
636 if (kr != KERN_SUCCESS) {
637 return EINVAL;
638 }
639
640 /* Incoming data from the data structure */
641 pthread_kern->proc_set_dispatchqueue_offset(p, data.dispatch_queue_offset);
642 if (data.version > offsetof(struct _pthread_registration_data, tsd_offset)
643 && data.tsd_offset < (uint32_t)pthsize) {
644 pthread_kern->proc_set_pthread_tsd_offset(p, data.tsd_offset);
645 }
646
647 /* Outgoing data that userspace expects as a reply */
648 data.version = sizeof(struct _pthread_registration_data);
649 if (pthread_kern->qos_main_thread_active()) {
650 mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
651 boolean_t gd = FALSE;
652
653 kr = pthread_kern->thread_policy_get(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
654 if (kr != KERN_SUCCESS || qos.qos_tier == THREAD_QOS_UNSPECIFIED) {
655 /* An unspecified QoS means the kernel wants us to impose the legacy QoS class on the thread. */
656 qos.qos_tier = THREAD_QOS_LEGACY;
657 qos.tier_importance = 0;
658
659 kr = pthread_kern->thread_policy_set_internal(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
660 }
661
662 if (kr == KERN_SUCCESS) {
663 data.main_qos = thread_qos_get_pthread_priority(qos.qos_tier);
664 } else {
665 data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);
666 }
667 } else {
668 data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);
669 }
670
671 kr = copyout(&data, pthread_init_data, pthread_init_sz);
672 if (kr != KERN_SUCCESS) {
673 return EINVAL;
674 }
675 } else {
676 pthread_kern->proc_set_dispatchqueue_offset(p, dispatchqueue_offset);
677 }
678
679 /* return the supported feature set as the return value. */
680 *retval = PTHREAD_FEATURE_SUPPORTED;
681
682 return(0);
683 }
684
685 #pragma mark - QoS Manipulation
686
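/**
 * bsdthread_ctl(BSDTHREAD_CTL_SET_QOS): copy the requested pthread_priority_t
 * in from the thread's TSD slot and apply it through the SET_SELF path.
 * Only the calling thread may be targeted.
 */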
687 int
688 _bsdthread_ctl_set_qos(struct proc *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t tsd_priority_addr, user_addr_t arg3, int *retval)
689 {
690 kern_return_t kr;
691 thread_t th;
692
693 pthread_priority_t priority;
694
695 /* Unused parameters must be zero. */
696 if (arg3 != 0) {
697 return EINVAL;
698 }
699
700 /* QoS is stored in a given slot in the pthread TSD. We need to copy that in and set our QoS based on it. */
701 if (proc_is64bit(p)) {
702 uint64_t v;
703 kr = copyin(tsd_priority_addr, &v, sizeof(v));
704 if (kr != KERN_SUCCESS) {
705 return kr;
706 }
707 priority = (int)(v & 0xffffffff);
708 } else {
709 uint32_t v;
710 kr = copyin(tsd_priority_addr, &v, sizeof(v));
711 if (kr != KERN_SUCCESS) {
712 return kr;
713 }
714 priority = v;
715 }
716
717 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
718 return ESRCH;
719 }
720
721 /* <rdar://problem/16211829> Disable pthread_set_qos_class_np() on threads other than pthread_self */
722 if (th != current_thread()) {
723 thread_deallocate(th);
724 return EPERM;
725 }
726
727 int rv = _bsdthread_ctl_set_self(p, 0, priority, 0, _PTHREAD_SET_SELF_QOS_FLAG, retval);
728
729 /* Static-param the thread: we just set QoS on it, so it's stuck in QoS land now. */
730 /* pthread_kern->thread_static_param(th, TRUE); */ // see <rdar://problem/16433744>, for details
731
732 thread_deallocate(th);
733
734 return rv;
735 }
736
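/**
 * Return the workqueue threadlist entry for a thread, or NULL if the thread
 * is not a workqueue thread.
 */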
737 static inline struct threadlist *
738 util_get_thread_threadlist_entry(thread_t th)
739 {
740 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
741 if (uth) {
742 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
743 return tl;
744 }
745 return NULL;
746 }
747
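/**
 * bsdthread_ctl(BSDTHREAD_CTL_SET_SELF): apply some combination of QoS class,
 * voucher, and fixed-priority/timeshare changes to the calling thread, as
 * selected by flags. Each sub-operation records its own error; they are
 * combined at the end.
 */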
748 int
749 _bsdthread_ctl_set_self(struct proc *p, user_addr_t __unused cmd, pthread_priority_t priority, mach_port_name_t voucher, _pthread_set_flags_t flags, int __unused *retval)
750 {
751 thread_qos_policy_data_t qos;
752 mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
753 boolean_t gd = FALSE;
754 bool was_manager_thread = false;
755 thread_t th = current_thread();
756 struct workqueue *wq = NULL;
757 struct threadlist *tl = NULL;
758
759 kern_return_t kr;
760 int qos_rv = 0, voucher_rv = 0, fixedpri_rv = 0;
761
762 if ((flags & _PTHREAD_SET_SELF_WQ_KEVENT_UNBIND) != 0) {
763 tl = util_get_thread_threadlist_entry(th);
764 if (tl) {
765 wq = tl->th_workq;
766 } else {
767 goto qos;
768 }
769
770 workqueue_lock_spin(wq);
771 if (tl->th_flags & TH_LIST_KEVENT_BOUND) {
772 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
773 unsigned int kevent_flags = KEVENT_FLAG_WORKQ;
774 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
775 kevent_flags |= KEVENT_FLAG_WORKQ_MANAGER;
776 }
777
778 workqueue_unlock(wq);
779 kevent_qos_internal_unbind(p, class_index_get_thread_qos(tl->th_priority), th, kevent_flags);
780 } else {
781 workqueue_unlock(wq);
782 }
783 }
784
785 qos:
786 if ((flags & _PTHREAD_SET_SELF_QOS_FLAG) != 0) {
787 kr = pthread_kern->thread_policy_get(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
788 if (kr != KERN_SUCCESS) {
789 qos_rv = EINVAL;
790 goto voucher;
791 }
792
793 /* If we have main-thread QoS then we don't allow a thread to come out of QOS_CLASS_UNSPECIFIED. */
794 if (pthread_kern->qos_main_thread_active() && qos.qos_tier == THREAD_QOS_UNSPECIFIED) {
795 qos_rv = EPERM;
796 goto voucher;
797 }
798
799 /* Get the work queue for tracing, and also the threadlist for bucket manipulation. */
800 if (!tl) {
801 tl = util_get_thread_threadlist_entry(th);
802 if (tl) wq = tl->th_workq;
803 }
804
805 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self | DBG_FUNC_START, wq, qos.qos_tier, qos.tier_importance, 0, 0);
806
807 qos.qos_tier = pthread_priority_get_thread_qos(priority);
808 qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 : _pthread_priority_get_relpri(priority);
809
810 if (qos.qos_tier == QOS_CLASS_UNSPECIFIED) {
811 qos_rv = EINVAL;
812 goto voucher;
813 }
814
815 /* If we're a workqueue thread, the threadlist item priority needs adjusting, along with the bucket we were running in. */
816 if (tl) {
817 workqueue_lock_spin(wq);
818 bool now_under_constrained_limit = false;
819
820 assert(!(tl->th_flags & TH_LIST_KEVENT_BOUND));
821
822 kr = pthread_kern->thread_set_workq_qos(th, qos.qos_tier, qos.tier_importance);
823 assert(kr == KERN_SUCCESS || kr == KERN_TERMINATED);
824
825 /* Fix up counters. */
826 uint8_t old_bucket = tl->th_priority;
827 uint8_t new_bucket = pthread_priority_get_class_index(priority);
828 if (old_bucket == WORKQUEUE_EVENT_MANAGER_BUCKET) {
829 was_manager_thread = true;
830 }
831
832 uint32_t old_active = OSAddAtomic(-1, &wq->wq_thactive_count[old_bucket]);
833 OSAddAtomic(1, &wq->wq_thactive_count[new_bucket]);
834
835 wq->wq_thscheduled_count[old_bucket]--;
836 wq->wq_thscheduled_count[new_bucket]++;
837
838 bool old_overcommit = !(tl->th_flags & TH_LIST_CONSTRAINED);
839 bool new_overcommit = priority & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
840 if (!old_overcommit && new_overcommit) {
841 wq->wq_constrained_threads_scheduled--;
842 tl->th_flags &= ~TH_LIST_CONSTRAINED;
843 if (wq->wq_constrained_threads_scheduled == wq_max_constrained_threads - 1) {
844 now_under_constrained_limit = true;
845 }
846 } else if (old_overcommit && !new_overcommit) {
847 wq->wq_constrained_threads_scheduled++;
848 tl->th_flags |= TH_LIST_CONSTRAINED;
849 }
850
851 tl->th_priority = new_bucket;
852
853 /* If we were at the ceiling of threads for a given bucket, we have
854 * to reevaluate whether we should start more work.
855 */
856 if (old_active == wq->wq_reqconc[old_bucket] || now_under_constrained_limit) {
857 /* workqueue_run_nextreq will drop the workqueue lock in all exit paths. */
858 (void)workqueue_run_nextreq(p, wq, THREAD_NULL, RUN_NEXTREQ_DEFAULT, 0, false);
859 } else {
860 workqueue_unlock(wq);
861 }
862 } else {
863 kr = pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
864 if (kr != KERN_SUCCESS) {
865 qos_rv = EINVAL;
866 }
867 }
868
869 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self | DBG_FUNC_END, wq, qos.qos_tier, qos.tier_importance, 0, 0);
870 }
871
872 voucher:
873 if ((flags & _PTHREAD_SET_SELF_VOUCHER_FLAG) != 0) {
874 kr = pthread_kern->thread_set_voucher_name(voucher);
875 if (kr != KERN_SUCCESS) {
876 voucher_rv = ENOENT;
877 goto fixedpri;
878 }
879 }
880
881 fixedpri:
882 if (qos_rv) goto done;
883 if ((flags & _PTHREAD_SET_SELF_FIXEDPRIORITY_FLAG) != 0) {
884 thread_extended_policy_data_t extpol = {.timeshare = 0};
885
886 if (!tl) tl = util_get_thread_threadlist_entry(th);
887 if (tl) {
888 /* Not allowed on workqueue threads */
889 fixedpri_rv = ENOTSUP;
890 goto done;
891 }
892
893 kr = pthread_kern->thread_policy_set_internal(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
894 if (kr != KERN_SUCCESS) {
895 fixedpri_rv = EINVAL;
896 goto done;
897 }
898 } else if ((flags & _PTHREAD_SET_SELF_TIMESHARE_FLAG) != 0) {
899 thread_extended_policy_data_t extpol = {.timeshare = 1};
900
901 if (!tl) tl = util_get_thread_threadlist_entry(th);
902 if (tl) {
903 /* Not allowed on workqueue threads */
904 fixedpri_rv = ENOTSUP;
905 goto done;
906 }
907
908 kr = pthread_kern->thread_policy_set_internal(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
909 if (kr != KERN_SUCCESS) {
910 fixedpri_rv = EINVAL;
911 goto done;
912 }
913 }
914
915 done:
916 if (qos_rv && voucher_rv) {
917 /* Both failed, give that a unique error. */
918 return EBADMSG;
919 }
920
921 if (qos_rv) {
922 return qos_rv;
923 }
924
925 if (voucher_rv) {
926 return voucher_rv;
927 }
928
929 if (fixedpri_rv) {
930 return fixedpri_rv;
931 }
932
933 return 0;
934 }
935
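/**
 * bsdthread_ctl(BSDTHREAD_CTL_QOS_OVERRIDE_START): add an explicit pthread
 * QoS override on the thread named by kport for the given resource.
 */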
936 int
937 _bsdthread_ctl_qos_override_start(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval)
938 {
939 thread_t th;
940 int rv = 0;
941
942 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
943 return ESRCH;
944 }
945
946 int override_qos = pthread_priority_get_thread_qos(priority);
947
948 struct threadlist *tl = util_get_thread_threadlist_entry(th);
949 if (tl) {
950 PTHREAD_TRACE_WQ(TRACE_wq_override_start | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0);
951 }
952
953 /* The only failure case here would be passing a tid and having it look up the thread; since we pass the thread itself, this always succeeds. */
954 pthread_kern->proc_usynch_thread_qos_add_override_for_resource_check_owner(th, override_qos, TRUE,
955 resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE, USER_ADDR_NULL, MACH_PORT_NULL);
956 thread_deallocate(th);
957 return rv;
958 }
959
960 int
961 _bsdthread_ctl_qos_override_end(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t resource, user_addr_t arg3, int __unused *retval)
962 {
963 thread_t th;
964 int rv = 0;
965
966 if (arg3 != 0) {
967 return EINVAL;
968 }
969
970 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
971 return ESRCH;
972 }
973
974 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
975
976 struct threadlist *tl = util_get_thread_threadlist_entry(th);
977 if (tl) {
978 PTHREAD_TRACE_WQ(TRACE_wq_override_end | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 0, 0, 0);
979 }
980
981 pthread_kern->proc_usynch_thread_qos_remove_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);
982
983 thread_deallocate(th);
984 return rv;
985 }
986
987 static int
988 _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, user_addr_t ulock_addr)
989 {
990 thread_t th;
991 int rv = 0;
992
993 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
994 return ESRCH;
995 }
996
997 int override_qos = pthread_priority_get_thread_qos(priority);
998
999 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1000 if (!tl) {
1001 thread_deallocate(th);
1002 return EPERM;
1003 }
1004
1005 PTHREAD_TRACE_WQ(TRACE_wq_override_dispatch | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0);
1006
1007 rv = pthread_kern->proc_usynch_thread_qos_add_override_for_resource_check_owner(th, override_qos, TRUE,
1008 resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE, ulock_addr, kport);
1009
1010 thread_deallocate(th);
1011 return rv;
1012 }
1013
1014 int _bsdthread_ctl_qos_dispatch_asynchronous_override_add(struct proc __unused *p, user_addr_t __unused cmd,
1015 mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval)
1016 {
1017 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport, priority, resource, USER_ADDR_NULL);
1018 }
1019
1020 int
1021 _bsdthread_ctl_qos_override_dispatch(struct proc *p __unused, user_addr_t cmd __unused, mach_port_name_t kport, pthread_priority_t priority, user_addr_t ulock_addr, int __unused *retval)
1022 {
1023 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport, priority, USER_ADDR_NULL, ulock_addr);
1024 }
1025
1026 int
1027 _bsdthread_ctl_qos_override_reset(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval)
1028 {
1029 if (arg1 != 0 || arg2 != 0 || arg3 != 0) {
1030 return EINVAL;
1031 }
1032
1033 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, 1 /* reset_all */, 0, 0, retval);
1034 }
1035
1036 int
1037 _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(struct proc __unused *p, user_addr_t __unused cmd, int reset_all, user_addr_t resource, user_addr_t arg3, int __unused *retval)
1038 {
1039 if ((reset_all && (resource != 0)) || arg3 != 0) {
1040 return EINVAL;
1041 }
1042
1043 thread_t th = current_thread();
1044 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
1045 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
1046
1047 if (!tl) {
1048 return EPERM;
1049 }
1050
1051 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_NONE, tl->th_workq, 0, 0, 0, 0);
1052
1053 resource = reset_all ? THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD : resource;
1054 pthread_kern->proc_usynch_thread_qos_reset_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE);
1055
1056 return 0;
1057 }
1058
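/**
 * Demux for the bsdthread_ctl system call: dispatch to the handler selected
 * by cmd.
 */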
1059 int
1060 _bsdthread_ctl(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval)
1061 {
1062 switch (cmd) {
1063 case BSDTHREAD_CTL_SET_QOS:
1064 return _bsdthread_ctl_set_qos(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
1065 case BSDTHREAD_CTL_QOS_OVERRIDE_START:
1066 return _bsdthread_ctl_qos_override_start(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1067 case BSDTHREAD_CTL_QOS_OVERRIDE_END:
1068 return _bsdthread_ctl_qos_override_end(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
1069 case BSDTHREAD_CTL_QOS_OVERRIDE_RESET:
1070 return _bsdthread_ctl_qos_override_reset(p, cmd, arg1, arg2, arg3, retval);
1071 case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH:
1072 return _bsdthread_ctl_qos_override_dispatch(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1073 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD:
1074 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1075 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET:
1076 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, (int)arg1, arg2, arg3, retval);
1077 case BSDTHREAD_CTL_SET_SELF:
1078 return _bsdthread_ctl_set_self(p, cmd, (pthread_priority_t)arg1, (mach_port_name_t)arg2, (_pthread_set_flags_t)arg3, retval);
1079 default:
1080 return EINVAL;
1081 }
1082 }
1083
1084 #pragma mark - Workqueue Implementation
1085 #pragma mark workqueue lock
1086
1087 static boolean_t workqueue_lock_spin_is_acquired_kdp(struct workqueue *wq) {
1088 return kdp_lck_spin_is_acquired(&wq->wq_lock);
1089 }
1090
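/**
 * The workqueue lock is a spinlock taken with interrupts disabled; the
 * previous interrupt state is stashed in the workqueue so that
 * workqueue_unlock() can restore it.
 */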
1091 static void
1092 workqueue_lock_spin(struct workqueue *wq)
1093 {
1094 boolean_t interrupt_state = ml_set_interrupts_enabled(FALSE);
1095 lck_spin_lock(&wq->wq_lock);
1096 wq->wq_interrupt_state = interrupt_state;
1097 }
1098
1099 static void
1100 workqueue_unlock(struct workqueue *wq)
1101 {
1102 boolean_t interrupt_state = wq->wq_interrupt_state;
1103 lck_spin_unlock(&wq->wq_lock);
1104 ml_set_interrupts_enabled(interrupt_state);
1105 }
1106
1107 #pragma mark workqueue add timer
1108
1109 /**
1110 * Sets up the timer which will call out to workqueue_add_timer
1111 */
1112 static void
1113 workqueue_interval_timer_start(struct workqueue *wq)
1114 {
1115 uint64_t deadline;
1116
1117 /* n.b. wq_timer_interval is reset to 0 in workqueue_add_timer if the
1118 ATIMER_RUNNING flag is not present. The net effect here is that if a
1119 sequence of threads is required, we'll double the time before we give out
1120 the next one. */
1121 if (wq->wq_timer_interval == 0) {
1122 wq->wq_timer_interval = wq_stalled_window_usecs;
1123
1124 } else {
1125 wq->wq_timer_interval = wq->wq_timer_interval * 2;
1126
1127 if (wq->wq_timer_interval > wq_max_timer_interval_usecs) {
1128 wq->wq_timer_interval = wq_max_timer_interval_usecs;
1129 }
1130 }
1131 clock_interval_to_deadline(wq->wq_timer_interval, 1000, &deadline);
1132
1133 PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount, wq->wq_flags, wq->wq_timer_interval, 0);
1134
1135 boolean_t ret = thread_call_enter1_delayed(wq->wq_atimer_delayed_call, wq->wq_atimer_delayed_call, deadline);
1136 if (ret) {
1137 panic("delayed_call was already enqueued");
1138 }
1139 }
1140
1141 /**
1142 * Immediately trigger the workqueue_add_timer
1143 */
1144 static void
1145 workqueue_interval_timer_trigger(struct workqueue *wq)
1146 {
1147 PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount, wq->wq_flags, 0, 0);
1148
1149 boolean_t ret = thread_call_enter1(wq->wq_atimer_immediate_call, wq->wq_atimer_immediate_call);
1150 if (ret) {
1151 panic("immediate_call was already enqueued");
1152 }
1153 }
1154
1155 /**
1156 * returns whether lastblocked_tsp is within wq_stalled_window_usecs of cur_ts
1157 */
1158 static boolean_t
1159 wq_thread_is_busy(uint64_t cur_ts, uint64_t *lastblocked_tsp)
1160 {
1161 clock_sec_t secs;
1162 clock_usec_t usecs;
1163 uint64_t lastblocked_ts;
1164 uint64_t elapsed;
1165
1166 /*
1167 * the timestamp is updated atomically w/o holding the workqueue lock
1168 * so we need to do an atomic read of the 64 bits so that we don't see
1169 * a mismatched pair of 32 bit reads... we accomplish this in an architecturally
1170 * independent fashion by using OSCompareAndSwap64 to write back the
1171 * value we grabbed... if it succeeds, then we have a good timestamp to
1172 * evaluate... if it fails, we straddled grabbing the timestamp while it
1173 * was being updated... treat a failed update as a busy thread since
1174 * it implies we are about to see a really fresh timestamp anyway
1175 */
1176 lastblocked_ts = *lastblocked_tsp;
1177
1178 if ( !OSCompareAndSwap64((UInt64)lastblocked_ts, (UInt64)lastblocked_ts, lastblocked_tsp))
1179 return (TRUE);
1180
1181 if (lastblocked_ts >= cur_ts) {
1182 /*
1183 * because the update of the timestamp when a thread blocks isn't
1184 * serialized against us looking at it (i.e. we don't hold the workq lock)
1185 * it's possible to have a timestamp that matches the current time or
1186 * that even looks to be in the future relative to when we grabbed the current
1187 * time... just treat this as a busy thread since it must have just blocked.
1188 */
1189 return (TRUE);
1190 }
1191 elapsed = cur_ts - lastblocked_ts;
1192
1193 pthread_kern->absolutetime_to_microtime(elapsed, &secs, &usecs);
1194
1195 if (secs == 0 && usecs < wq_stalled_window_usecs)
1196 return (TRUE);
1197 return (FALSE);
1198 }
1199
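/**
 * Atomically set WQ_ATIMER_DELAYED_RUNNING unless the workqueue is exiting or
 * the flag is already set; returns true if the caller won the race and should
 * enqueue the delayed thread_call. WQ_TIMER_IMMEDIATE_NEEDED below implements
 * the same protocol for the immediate thread_call.
 */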
1200 static inline bool
1201 WQ_TIMER_DELAYED_NEEDED(struct workqueue *wq)
1202 {
1203 int oldflags;
1204 retry:
1205 oldflags = wq->wq_flags;
1206 if ( !(oldflags & (WQ_EXITING | WQ_ATIMER_DELAYED_RUNNING))) {
1207 if (OSCompareAndSwap(oldflags, oldflags | WQ_ATIMER_DELAYED_RUNNING, (UInt32 *)&wq->wq_flags)) {
1208 return true;
1209 } else {
1210 goto retry;
1211 }
1212 }
1213 return false;
1214 }
1215
1216 static inline bool
1217 WQ_TIMER_IMMEDIATE_NEEDED(struct workqueue *wq)
1218 {
1219 int oldflags;
1220 retry:
1221 oldflags = wq->wq_flags;
1222 if ( !(oldflags & (WQ_EXITING | WQ_ATIMER_IMMEDIATE_RUNNING))) {
1223 if (OSCompareAndSwap(oldflags, oldflags | WQ_ATIMER_IMMEDIATE_RUNNING, (UInt32 *)&wq->wq_flags)) {
1224 return true;
1225 } else {
1226 goto retry;
1227 }
1228 }
1229 return false;
1230 }
1231
1232 /**
1233 * handler function for the timer
1234 */
1235 static void
1236 workqueue_add_timer(struct workqueue *wq, thread_call_t thread_call_self)
1237 {
1238 proc_t p;
1239 boolean_t start_timer = FALSE;
1240 boolean_t retval;
1241
1242 PTHREAD_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_START, wq, wq->wq_flags, wq->wq_nthreads, wq->wq_thidlecount, 0);
1243
1244 p = wq->wq_proc;
1245
1246 workqueue_lock_spin(wq);
1247
1248 /*
1249 * There are two tricky issues here.
1250 *
1251 * First issue: we start the thread_call's that invoke this routine without
1252 * the workqueue lock held. The scheduler callback needs to trigger
1253 * reevaluation of the number of running threads but shouldn't take that
1254 * lock, so we can't use it to synchronize state around the thread_call.
1255 * As a result, it might re-enter the thread_call while this routine is
1256 * already running. This could cause it to fire a second time and we'll
1257 * have two add_timers running at once. Obviously, we don't want that to
1258 * keep stacking, so we need to keep it at two timers.
1259 *
1260 * Solution: use wq_flags (accessed via atomic CAS) to synchronize the
1261 * enqueue of the thread_call itself. When a thread needs to trigger the
1262 * add_timer, it checks for ATIMER_DELAYED_RUNNING and, when not set, sets
1263 * the flag then does a thread_call_enter. We'll then remove that flag
1264 * only once we've got the lock and it's safe for the thread_call to be
1265 * entered again.
1266 *
1267 * Second issue: we need to make sure that the two timers don't execute this
1268 * routine concurrently. We can't use the workqueue lock for this because
1269 * we'll need to drop it during our execution.
1270 *
1271 * Solution: use WQL_ATIMER_BUSY as a condition variable to indicate that
1272 * we are currently executing the routine and the next thread should wait.
1273 *
1274 * After all that, we arrive at the following four possible states:
1275 * !WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY no pending timer, no active timer
1276 * !WQ_ATIMER_DELAYED_RUNNING && WQL_ATIMER_BUSY no pending timer, 1 active timer
1277 * WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY 1 pending timer, no active timer
1278 * WQ_ATIMER_DELAYED_RUNNING && WQL_ATIMER_BUSY 1 pending timer, 1 active timer
1279 *
1280 * A further complication: sometimes we need to trigger this function to run
1281 * without delay. Because we aren't under a lock between setting
1282 * WQ_ATIMER_DELAYED_RUNNING and calling thread_call_enter, we can't simply
1283 * re-enter the thread call: if thread_call_enter() returned false, we
1284 * wouldn't be able to distinguish the case where the thread_call had
1285 * already fired from the case where it hadn't been entered yet from the
1286 * other thread. So, we use a separate thread_call for immediate
1287 * invocations, and a separate RUNNING flag, WQ_ATIMER_IMMEDIATE_RUNNING.
1288 */
1289
1290 while (wq->wq_lflags & WQL_ATIMER_BUSY) {
1291 wq->wq_lflags |= WQL_ATIMER_WAITING;
1292
1293 assert_wait((caddr_t)wq, (THREAD_UNINT));
1294 workqueue_unlock(wq);
1295
1296 thread_block(THREAD_CONTINUE_NULL);
1297
1298 workqueue_lock_spin(wq);
1299 }
1300 wq->wq_lflags |= WQL_ATIMER_BUSY;
1301
1302 /*
1303 * Decide which timer we are and remove the RUNNING flag.
1304 */
1305 if (thread_call_self == wq->wq_atimer_delayed_call) {
1306 if ((wq->wq_flags & WQ_ATIMER_DELAYED_RUNNING) == 0) {
1307 panic("workqueue_add_timer is the delayed timer but the delayed running flag isn't set");
1308 }
1309 WQ_UNSETFLAG(wq, WQ_ATIMER_DELAYED_RUNNING);
1310 } else if (thread_call_self == wq->wq_atimer_immediate_call) {
1311 if ((wq->wq_flags & WQ_ATIMER_IMMEDIATE_RUNNING) == 0) {
1312 panic("workqueue_add_timer is the immediate timer but the immediate running flag isn't set");
1313 }
1314 WQ_UNSETFLAG(wq, WQ_ATIMER_IMMEDIATE_RUNNING);
1315 } else {
1316 panic("workqueue_add_timer can't figure out which timer it is");
1317 }
1318
1319 again:
1320 retval = TRUE;
1321 if ( !(wq->wq_flags & WQ_EXITING)) {
1322 boolean_t add_thread = FALSE;
1323 /*
1324 * check to see if the stall frequency was beyond our tolerance
1325 * or we have work on the queue, but haven't scheduled any
1326 * new work within our acceptable time interval because
1327 * there were no idle threads left to schedule
1328 */
1329 if (wq->wq_reqcount) {
1330 uint32_t priclass = 0;
1331 uint32_t thactive_count = 0;
1332 uint64_t curtime = mach_absolute_time();
1333 uint64_t busycount = 0;
1334
1335 if (wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] &&
1336 wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0){
1337 priclass = WORKQUEUE_EVENT_MANAGER_BUCKET;
1338 } else {
1339 for (priclass = 0; priclass < WORKQUEUE_NUM_BUCKETS; priclass++) {
1340 if (wq->wq_requests[priclass])
1341 break;
1342 }
1343 }
1344
1345 if (priclass < WORKQUEUE_EVENT_MANAGER_BUCKET){
1346 /*
1347 * Compute a metric for how many threads are active. We
1348 * find the highest priority request outstanding and then add up
1349 * the number of active threads in that and all higher-priority
1350 * buckets. We'll also add any "busy" threads which are not
1351 * active but blocked recently enough that we can't be sure
1352 * they've gone idle yet. We'll then compare this metric to our
1353 * max concurrency to decide whether to add a new thread.
1354 */
1355 for (uint32_t i = 0; i <= priclass; i++) {
1356 thactive_count += wq->wq_thactive_count[i];
1357
1358 if (wq->wq_thscheduled_count[i] < wq->wq_thactive_count[i]) {
1359 if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i]))
1360 busycount++;
1361 }
1362 }
1363 }
1364
1365 if (thactive_count + busycount < wq->wq_max_concurrency ||
1366 priclass == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1367
1368 if (wq->wq_thidlecount == 0) {
1369 /*
1370 * if we have no idle threads, try to add one
1371 */
1372 retval = workqueue_addnewthread(wq, priclass == WORKQUEUE_EVENT_MANAGER_BUCKET);
1373 }
1374 add_thread = TRUE;
1375 }
1376
1377 if (wq->wq_reqcount) {
1378 /*
1379 * as long as we have threads to schedule, and we successfully
1380 * scheduled new work, keep trying
1381 */
1382 while (wq->wq_thidlecount && !(wq->wq_flags & WQ_EXITING)) {
1383 /*
1384 * workqueue_run_nextreq is responsible for
1385 * dropping the workqueue lock in all cases
1386 */
1387 retval = (workqueue_run_nextreq(p, wq, THREAD_NULL, RUN_NEXTREQ_ADD_TIMER, 0, false) != THREAD_NULL);
1388 workqueue_lock_spin(wq);
1389
1390 if (retval == FALSE)
1391 break;
1392 }
1393 if ( !(wq->wq_flags & WQ_EXITING) && wq->wq_reqcount) {
1394
1395 if (wq->wq_thidlecount == 0 && retval == TRUE && add_thread == TRUE)
1396 goto again;
1397
1398 if (wq->wq_thidlecount == 0 || busycount) {
1399 start_timer = WQ_TIMER_DELAYED_NEEDED(wq);
1400 }
1401
1402 PTHREAD_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_NONE, wq, wq->wq_reqcount, wq->wq_thidlecount, busycount, 0);
1403 }
1404 }
1405 }
1406 }
1407
1408 /*
1409 * If we called WQ_TIMER_DELAYED_NEEDED above, then this flag will be set if that
1410 * call marked the timer running. If so, we let the timer interval grow.
1411 * Otherwise, we reset it back to 0.
1412 */
1413 if (!(wq->wq_flags & WQ_ATIMER_DELAYED_RUNNING)) {
1414 wq->wq_timer_interval = 0;
1415 }
1416
1417 wq->wq_lflags &= ~WQL_ATIMER_BUSY;
1418
1419 if ((wq->wq_flags & WQ_EXITING) || (wq->wq_lflags & WQL_ATIMER_WAITING)) {
1420 /*
1421 * wakeup the thread hung up in _workqueue_mark_exiting or workqueue_add_timer waiting for this timer
1422 * to finish getting out of the way
1423 */
1424 wq->wq_lflags &= ~WQL_ATIMER_WAITING;
1425 wakeup(wq);
1426 }
1427
1428 PTHREAD_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_END, wq, start_timer, wq->wq_nthreads, wq->wq_thidlecount, 0);
1429
1430 workqueue_unlock(wq);
1431
1432 if (start_timer == TRUE)
1433 workqueue_interval_timer_start(wq);
1434 }
1435
1436 #pragma mark thread state tracking
1437
1438 // called by spinlock code when trying to yield to lock owner
1439 void
1440 _workqueue_thread_yielded(void)
1441 {
1442 struct workqueue *wq;
1443 proc_t p;
1444
1445 p = current_proc();
1446
1447 if ((wq = pthread_kern->proc_get_wqptr(p)) == NULL || wq->wq_reqcount == 0)
1448 return;
1449
1450 workqueue_lock_spin(wq);
1451
1452 if (wq->wq_reqcount) {
1453 uint64_t curtime;
1454 uint64_t elapsed;
1455 clock_sec_t secs;
1456 clock_usec_t usecs;
1457
1458 if (wq->wq_thread_yielded_count++ == 0)
1459 wq->wq_thread_yielded_timestamp = mach_absolute_time();
1460
1461 if (wq->wq_thread_yielded_count < wq_yielded_threshold) {
1462 workqueue_unlock(wq);
1463 return;
1464 }
1465
1466 PTHREAD_TRACE_WQ(TRACE_wq_thread_yielded | DBG_FUNC_START, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 0, 0);
1467
1468 wq->wq_thread_yielded_count = 0;
1469
1470 curtime = mach_absolute_time();
1471 elapsed = curtime - wq->wq_thread_yielded_timestamp;
1472 pthread_kern->absolutetime_to_microtime(elapsed, &secs, &usecs);
1473
1474 if (secs == 0 && usecs < wq_yielded_window_usecs) {
1475
1476 if (wq->wq_thidlecount == 0) {
1477 workqueue_addnewthread(wq, TRUE);
1478 /*
1479 * 'workqueue_addnewthread' drops the workqueue lock
1480 * when creating the new thread and then retakes it before
1481 * returning... this window allows other threads to process
1482 * requests, so we need to recheck for available work.
1483 * If none is found, we just return... the newly created thread
1484 * will eventually get used (if it hasn't already)...
1485 */
1486 if (wq->wq_reqcount == 0) {
1487 workqueue_unlock(wq);
1488 return;
1489 }
1490 }
1491 if (wq->wq_thidlecount) {
1492 (void)workqueue_run_nextreq(p, wq, THREAD_NULL, RUN_NEXTREQ_UNCONSTRAINED, 0, false);
1493 /*
1494 * workqueue_run_nextreq is responsible for
1495 * dropping the workqueue lock in all cases
1496 */
1497 PTHREAD_TRACE_WQ(TRACE_wq_thread_yielded | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 1, 0);
1498
1499 return;
1500 }
1501 }
1502 PTHREAD_TRACE_WQ(TRACE_wq_thread_yielded | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 2, 0);
1503 }
1504 workqueue_unlock(wq);
1505 }
1506
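/**
 * Scheduler callback invoked when a workqueue thread blocks or unblocks.
 * Maintains the per-bucket active-thread counts; on block, it also records
 * the last-blocked timestamp and starts the delayed add timer if there is
 * still queued work.
 */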
1507 static void
1508 workqueue_callback(int type, thread_t thread)
1509 {
1510 struct uthread *uth;
1511 struct threadlist *tl;
1512 struct workqueue *wq;
1513
1514 uth = pthread_kern->get_bsdthread_info(thread);
1515 tl = pthread_kern->uthread_get_threadlist(uth);
1516 wq = tl->th_workq;
1517
1518 switch (type) {
1519 case SCHED_CALL_BLOCK: {
1520 uint32_t old_activecount;
1521 boolean_t start_timer = FALSE;
1522
1523 old_activecount = OSAddAtomic(-1, &wq->wq_thactive_count[tl->th_priority]);
1524
1525 /*
1526 * If we blocked and were at the requested concurrency previously, we may
1527 * need to spin up a new thread. Of course, if it's the event manager
1528 * then that's moot, so ignore that case.
1529 */
1530 if (old_activecount == wq->wq_reqconc[tl->th_priority] &&
1531 tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET) {
1532 uint64_t curtime;
1533 UInt64 *lastblocked_ptr;
1534
1535 /*
1536 * the number of active threads at this priority
1537 * has fallen below the maximum number of concurrent
1538 * threads that we're allowed to run
1539 */
1540 lastblocked_ptr = (UInt64 *)&wq->wq_lastblocked_ts[tl->th_priority];
1541 curtime = mach_absolute_time();
1542
1543 /*
1544 * if we collide with another thread trying to update the last_blocked (really unlikely
1545 * since another thread would have to get scheduled and then block after we start down
1546 * this path), it's not a problem. Either timestamp is adequate, so no need to retry
1547 */
1548
1549 OSCompareAndSwap64(*lastblocked_ptr, (UInt64)curtime, lastblocked_ptr);
1550
1551 if (wq->wq_reqcount) {
1552 /*
1553 * We have work to do so start up the timer if it's not
1554 * running; it'll sort out whether we need to start another
1555 * thread
1556 */
1557 start_timer = WQ_TIMER_DELAYED_NEEDED(wq);
1558 }
1559
1560 if (start_timer == TRUE) {
1561 workqueue_interval_timer_start(wq);
1562 }
1563 }
1564 PTHREAD_TRACE1_WQ(TRACE_wq_thread_block | DBG_FUNC_START, wq, old_activecount, tl->th_priority, start_timer, thread_tid(thread));
1565 break;
1566 }
1567 case SCHED_CALL_UNBLOCK:
1568 /*
1569 * we cannot take the workqueue_lock here...
1570 * an UNBLOCK can occur from a timer event which
1571 * is run from an interrupt context... if the workqueue_lock
1572 * is already held by this processor, we'll deadlock...
1573 * the thread lock for the thread being UNBLOCKED
1574 * is also held
1575 */
1576 OSAddAtomic(1, &wq->wq_thactive_count[tl->th_priority]);
1577
1578 PTHREAD_TRACE1_WQ(TRACE_wq_thread_block | DBG_FUNC_END, wq, wq->wq_threads_scheduled, tl->th_priority, 0, thread_tid(thread));
1579
1580 break;
1581 }
1582 }
1583
1584 sched_call_t
1585 _workqueue_get_sched_callback(void)
1586 {
1587 return workqueue_callback;
1588 }
1589
1590 #pragma mark thread addition/removal
1591
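/**
 * Total VM reservation for a workqueue thread: guard page, default stack,
 * and the page-rounded pthread_t area.
 */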
1592 static mach_vm_size_t
1593 _workqueue_allocsize(struct workqueue *wq)
1594 {
1595 proc_t p = wq->wq_proc;
1596 mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);
1597 mach_vm_size_t pthread_size =
1598 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(wq->wq_map));
1599 return guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
1600 }
1601
1602 /**
1603 * pop goes the thread
1604 *
1605 * If fromexit is set, the call is from workqueue_exit(),
1606 * so some cleanups are to be avoided.
1607 */
1608 static void
1609 workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use)
1610 {
1611 struct uthread * uth;
1612 struct workqueue * wq = tl->th_workq;
1613
1614 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET){
1615 TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
1616 } else {
1617 TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
1618 }
1619
1620 if (fromexit == 0) {
1621 assert(wq->wq_nthreads && wq->wq_thidlecount);
1622 wq->wq_nthreads--;
1623 wq->wq_thidlecount--;
1624 }
1625
1626 /*
1627 * Clear the threadlist pointer in uthread so
1628 * blocked thread on wakeup for termination will
1629 * not access the thread list as it is going to be
1630 * freed.
1631 */
1632 pthread_kern->thread_sched_call(tl->th_thread, NULL);
1633
1634 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
1635 if (uth != (struct uthread *)0) {
1636 pthread_kern->uthread_set_threadlist(uth, NULL);
1637 }
1638 if (fromexit == 0) {
1639 /* during exit the lock is not held */
1640 workqueue_unlock(wq);
1641 }
1642
1643 if ( (tl->th_flags & TH_LIST_NEW) || first_use ) {
1644 /*
1645 * thread was created, but never used...
1646 * need to clean up the stack and port ourselves
1647 * since we're not going to spin up through the
1648 * normal exit path triggered from Libc
1649 */
1650 if (fromexit == 0) {
1651 /* vm map is already deallocated when this is called from exit */
1652 (void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, _workqueue_allocsize(wq));
1653 }
1654 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(wq->wq_task), tl->th_thport);
1655
1656 } else {
1657
1658 PTHREAD_TRACE1_WQ(TRACE_wq_thread_park | DBG_FUNC_END, wq, (uintptr_t)thread_tid(current_thread()), wq->wq_nthreads, 0xdead, thread_tid(tl->th_thread));
1659 }
1660 /*
1661 * drop our ref on the thread
1662 */
1663 thread_deallocate(tl->th_thread);
1664
1665 kfree(tl, sizeof(struct threadlist));
1666 }
1667
1668
1669 /**
1670 * Try to add a new workqueue thread.
1671 *
1672 * - called with workq lock held
1673 * - dropped and retaken around thread creation
1674 * - return with workq lock held
1675 */
1676 static boolean_t
1677 workqueue_addnewthread(struct workqueue *wq, boolean_t ignore_constrained_thread_limit)
1678 {
1679 struct threadlist *tl;
1680 struct uthread *uth;
1681 kern_return_t kret;
1682 thread_t th;
1683 proc_t p;
1684 void *sright;
1685 mach_vm_offset_t stackaddr;
1686
1687 if ((wq->wq_flags & WQ_EXITING) == WQ_EXITING) {
1688 PTHREAD_TRACE_WQ(TRACE_wq_thread_add_during_exit | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
1689 return (FALSE);
1690 }
1691
1692 if (wq->wq_nthreads >= wq_max_threads) {
1693 PTHREAD_TRACE_WQ(TRACE_wq_thread_limit_exceeded | DBG_FUNC_NONE, wq, wq->wq_nthreads, wq_max_threads, 0, 0);
1694 return (FALSE);
1695 }
1696
1697 if (ignore_constrained_thread_limit == FALSE &&
1698 wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
1699 /*
1700 * If we're not creating this thread to service an overcommit or
1701 * event manager request, then we check to see if we are over our
1702 * constrained thread limit, in which case we error out.
1703 */
1704 PTHREAD_TRACE_WQ(TRACE_wq_thread_constrained_maxed | DBG_FUNC_NONE, wq, wq->wq_constrained_threads_scheduled,
1705 wq_max_constrained_threads, 0, 0);
1706 return (FALSE);
1707 }
1708
1709 wq->wq_nthreads++;
1710
1711 p = wq->wq_proc;
1712 workqueue_unlock(wq);
1713
1714 tl = kalloc(sizeof(struct threadlist));
1715 bzero(tl, sizeof(struct threadlist));
1716
1717 kret = pthread_kern->thread_create_workq_waiting(wq->wq_task, wq_unpark_continue, tl, &th);
1718 if (kret != KERN_SUCCESS) {
1719 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 0, 0, 0);
1720 kfree(tl, sizeof(struct threadlist));
1721 goto failed;
1722 }
1723
1724 stackaddr = pthread_kern->proc_get_stack_addr_hint(p);
1725
1726 mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);
1727 mach_vm_size_t pthread_size =
1728 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(wq->wq_map));
1729 mach_vm_size_t th_allocsize = guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
1730
1731 kret = mach_vm_map(wq->wq_map, &stackaddr,
1732 th_allocsize, page_size-1,
1733 VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE, NULL,
1734 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
1735 VM_INHERIT_DEFAULT);
1736
1737 if (kret != KERN_SUCCESS) {
1738 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 1, 0, 0);
1739
1740 kret = mach_vm_allocate(wq->wq_map,
1741 &stackaddr, th_allocsize,
1742 VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE);
1743 }
1744 if (kret == KERN_SUCCESS) {
1745 /*
1746 * The guard page is at the lowest address
1747 * The stack base is the highest address
1748 */
1749 kret = mach_vm_protect(wq->wq_map, stackaddr, guardsize, FALSE, VM_PROT_NONE);
1750
1751 if (kret != KERN_SUCCESS) {
1752 (void) mach_vm_deallocate(wq->wq_map, stackaddr, th_allocsize);
1753 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 2, 0, 0);
1754 }
1755 }
1756 if (kret != KERN_SUCCESS) {
1757 (void) thread_terminate(th);
1758 thread_deallocate(th);
1759
1760 kfree(tl, sizeof(struct threadlist));
1761 goto failed;
1762 }
1763 thread_reference(th);
1764
1765 pthread_kern->thread_set_tag(th, THREAD_TAG_PTHREAD | THREAD_TAG_WORKQUEUE);
1766
1767 sright = (void *)pthread_kern->convert_thread_to_port(th);
1768 tl->th_thport = pthread_kern->ipc_port_copyout_send(sright, pthread_kern->task_get_ipcspace(wq->wq_task));
1769
1770 pthread_kern->thread_static_param(th, TRUE);
1771
1772 tl->th_flags = TH_LIST_INITED | TH_LIST_NEW;
1773
1774 tl->th_thread = th;
1775 tl->th_workq = wq;
1776 tl->th_stackaddr = stackaddr;
1777 tl->th_priority = WORKQUEUE_NUM_BUCKETS;
1778
1779 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
1780
1781 workqueue_lock_spin(wq);
1782
1783 pthread_kern->uthread_set_threadlist(uth, tl);
1784 TAILQ_INSERT_TAIL(&wq->wq_thidlelist, tl, th_entry);
1785
1786 wq->wq_thidlecount++;
1787
1788 PTHREAD_TRACE_WQ(TRACE_wq_thread_create | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
1789
1790 return (TRUE);
1791
1792 failed:
1793 workqueue_lock_spin(wq);
1794 wq->wq_nthreads--;
1795
1796 return (FALSE);
1797 }
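
/*
 * The failure handling above is a reservation pattern: bump wq_nthreads while
 * holding the lock, drop the lock for the slow thread/stack setup, and give
 * the slot back under the lock if anything fails.  A reduced sketch of that
 * pattern (illustrative only, not compiled); example_try_create() is a
 * hypothetical stand-in for the thread_create_workq_waiting/mach_vm_map steps.
 */
#if 0
static boolean_t
example_add_with_reservation(struct workqueue *wq)
{
	wq->wq_nthreads++;			/* reserve a slot so concurrent adds still see the limit */
	workqueue_unlock(wq);

	boolean_t ok = example_try_create(wq);	/* thread + stack + guard page setup */

	workqueue_lock_spin(wq);
	if (!ok) {
		wq->wq_nthreads--;		/* undo the reservation on failure */
	}
	return ok;
}
#endif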
1798
1799 /**
1800 * Setup per-process state for the workqueue.
1801 */
1802 int
1803 _workq_open(struct proc *p, __unused int32_t *retval)
1804 {
1805 struct workqueue * wq;
1806 int wq_size;
1807 char * ptr;
1808 uint32_t i;
1809 uint32_t num_cpus;
1810 int error = 0;
1811
1812 if (pthread_kern->proc_get_register(p) == 0) {
1813 return EINVAL;
1814 }
1815
1816 num_cpus = pthread_kern->ml_get_max_cpus();
1817
1818 if (wq_init_constrained_limit) {
1819 uint32_t limit;
1820 /*
1821 * set up the limit for the constrained pool
1822 * this is a virtual pool in that we don't
1823 * maintain it on a separate idle and run list
1824 */
1825 limit = num_cpus * WORKQUEUE_CONSTRAINED_FACTOR;
1826
1827 if (limit > wq_max_constrained_threads)
1828 wq_max_constrained_threads = limit;
1829
1830 wq_init_constrained_limit = 0;
1831
1832 if (wq_max_threads > pthread_kern->config_thread_max - 20) {
1833 wq_max_threads = pthread_kern->config_thread_max - 20;
1834 }
1835 }
1836
1837 if (pthread_kern->proc_get_wqptr(p) == NULL) {
1838 if (pthread_kern->proc_init_wqptr_or_wait(p) == FALSE) {
1839 assert(pthread_kern->proc_get_wqptr(p) != NULL);
1840 goto out;
1841 }
1842
1843 wq_size = sizeof(struct workqueue);
1844
1845 ptr = (char *)kalloc(wq_size);
1846 bzero(ptr, wq_size);
1847
1848 wq = (struct workqueue *)ptr;
1849 wq->wq_flags = WQ_LIST_INITED;
1850 wq->wq_proc = p;
1851 wq->wq_max_concurrency = wq_max_concurrency;
1852 wq->wq_task = current_task();
1853 wq->wq_map = pthread_kern->current_map();
1854
1855 for (i = 0; i < WORKQUEUE_NUM_BUCKETS; i++)
1856 wq->wq_reqconc[i] = (uint16_t)wq->wq_max_concurrency;
1857
1858 // The event manager bucket is special, so it gets a concurrency of 1,
1859 // though we shouldn't ever read this value for that bucket.
1860 wq->wq_reqconc[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1;
1861
1862 // Start the event manager at the priority hinted at by the policy engine
1863 int mgr_priority_hint = pthread_kern->task_get_default_manager_qos(current_task());
1864 wq->wq_event_manager_priority = (uint32_t)thread_qos_get_pthread_priority(mgr_priority_hint) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
1865
1866 TAILQ_INIT(&wq->wq_thrunlist);
1867 TAILQ_INIT(&wq->wq_thidlelist);
1868
1869 wq->wq_atimer_delayed_call = thread_call_allocate((thread_call_func_t)workqueue_add_timer, (thread_call_param_t)wq);
1870 wq->wq_atimer_immediate_call = thread_call_allocate((thread_call_func_t)workqueue_add_timer, (thread_call_param_t)wq);
1871
1872 lck_spin_init(&wq->wq_lock, pthread_lck_grp, pthread_lck_attr);
1873
1874 pthread_kern->proc_set_wqptr(p, wq);
1875
1876 }
1877 out:
1878
1879 return(error);
1880 }
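
/*
 * Worked example of the one-time limit setup above, under assumed values:
 * with 8 CPUs and a constrained factor of 5 the constrained pool allows 40
 * threads, and wq_max_threads is kept 20 below the system-wide thread limit
 * so workqueues can never exhaust it.  Illustrative sketch only, not
 * compiled; the numeric values are assumptions.
 */
#if 0
static void
example_init_limits(uint32_t num_cpus, uint32_t factor, uint32_t config_thread_max,
    uint32_t *max_constrained, uint32_t *max_threads)
{
	uint32_t limit = num_cpus * factor;		/* e.g. 8 * 5 = 40 */

	if (limit > *max_constrained)
		*max_constrained = limit;

	if (*max_threads > config_thread_max - 20)
		*max_threads = config_thread_max - 20;	/* leave headroom for the rest of the system */
}
#endif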
1881
1882 /*
1883 * Routine: workqueue_mark_exiting
1884 *
1885 * Function: Mark the work queue such that new threads will not be added to the
1886 * work queue after we return.
1887 *
1888 * Conditions: Called against the current process.
1889 */
1890 void
1891 _workqueue_mark_exiting(struct proc *p)
1892 {
1893 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
1894
1895 if (wq != NULL) {
1896
1897 PTHREAD_TRACE_WQ(TRACE_wq_pthread_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);
1898
1899 workqueue_lock_spin(wq);
1900
1901 /*
1902 * We arm the add timer without holding the workqueue lock so we need
1903 * to synchronize with any running or soon to be running timers.
1904 *
1905 * Threads that intend to arm the timer atomically OR
1906 * WQ_ATIMER_{DELAYED,IMMEDIATE}_RUNNING into the wq_flags, only if
1907 * WQ_EXITING is not present. So, once we have set WQ_EXITING, we can
1908 * be sure that no new RUNNING flags will be set, but still need to
1909 * wait for the already running timers to complete.
1910 *
1911 * We always hold the workq lock when dropping WQ_ATIMER_RUNNING, so
1912 * the check for and sleep until clear is protected.
1913 */
1914 WQ_SETFLAG(wq, WQ_EXITING);
1915
1916 if (wq->wq_flags & WQ_ATIMER_DELAYED_RUNNING) {
1917 if (thread_call_cancel(wq->wq_atimer_delayed_call) == TRUE) {
1918 WQ_UNSETFLAG(wq, WQ_ATIMER_DELAYED_RUNNING);
1919 }
1920 }
1921 if (wq->wq_flags & WQ_ATIMER_IMMEDIATE_RUNNING) {
1922 if (thread_call_cancel(wq->wq_atimer_immediate_call) == TRUE) {
1923 WQ_UNSETFLAG(wq, WQ_ATIMER_IMMEDIATE_RUNNING);
1924 }
1925 }
1926 while (wq->wq_flags & (WQ_ATIMER_DELAYED_RUNNING | WQ_ATIMER_IMMEDIATE_RUNNING) ||
1927 (wq->wq_lflags & WQL_ATIMER_BUSY)) {
1928 assert_wait((caddr_t)wq, (THREAD_UNINT));
1929 workqueue_unlock(wq);
1930
1931 thread_block(THREAD_CONTINUE_NULL);
1932
1933 workqueue_lock_spin(wq);
1934 }
1935 workqueue_unlock(wq);
1936
1937 PTHREAD_TRACE(TRACE_wq_pthread_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
1938 }
1939 }
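
/*
 * The shutdown handshake above depends on the arming side of the add timers:
 * a RUNNING flag is only set while WQ_EXITING is absent, so once WQ_EXITING
 * is published the exit path only has to cancel or drain timers already in
 * flight.  A reduced sketch of that arming side (illustrative only, not
 * compiled); the real code expresses this through the
 * WQ_TIMER_DELAYED_NEEDED() / workqueue_interval_timer_start() pair.
 */
#if 0
static boolean_t
example_try_arm_delayed_timer(struct workqueue *wq, uint64_t deadline)
{
	uint32_t flags = wq->wq_flags;

	if (flags & (WQ_EXITING | WQ_ATIMER_DELAYED_RUNNING))
		return FALSE;		/* exiting, or a delayed timer is already armed */

	/* atomically publish "a delayed timer is in flight" */
	if (!OSCompareAndSwap(flags, flags | WQ_ATIMER_DELAYED_RUNNING, (UInt32 *)&wq->wq_flags))
		return FALSE;		/* raced with another updater; caller may retry */

	thread_call_enter_delayed(wq->wq_atimer_delayed_call, deadline);
	return TRUE;
}
#endif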
1940
1941 /*
1942 * Routine: workqueue_exit
1943 *
1944 * Function: clean up the work queue structure(s) now that there are no threads
1945 * left running inside the work queue (except possibly current_thread).
1946 *
1947 * Conditions: Called by the last thread in the process.
1948 * Called against current process.
1949 */
1950 void
1951 _workqueue_exit(struct proc *p)
1952 {
1953 struct workqueue * wq;
1954 struct threadlist * tl, *tlist;
1955 struct uthread *uth;
1956 size_t wq_size = sizeof(struct workqueue);
1957
1958 wq = pthread_kern->proc_get_wqptr(p);
1959 if (wq != NULL) {
1960
1961 PTHREAD_TRACE_WQ(TRACE_wq_workqueue_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);
1962
1963 pthread_kern->proc_set_wqptr(p, NULL);
1964
1965 /*
1966 * Clean up workqueue data structures for threads that exited and
1967 * didn't get a chance to clean up after themselves.
1968 */
1969 TAILQ_FOREACH_SAFE(tl, &wq->wq_thrunlist, th_entry, tlist) {
1970 assert((tl->th_flags & TH_LIST_RUNNING) != 0);
1971
1972 pthread_kern->thread_sched_call(tl->th_thread, NULL);
1973
1974 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
1975 if (uth != (struct uthread *)0) {
1976 pthread_kern->uthread_set_threadlist(uth, NULL);
1977 }
1978 TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
1979
1980 /*
1981 * drop our last ref on the thread
1982 */
1983 thread_deallocate(tl->th_thread);
1984
1985 kfree(tl, sizeof(struct threadlist));
1986 }
1987 TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlelist, th_entry, tlist) {
1988 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
1989 assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
1990 workqueue_removethread(tl, true, false);
1991 }
1992 TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlemgrlist, th_entry, tlist) {
1993 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
1994 assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
1995 workqueue_removethread(tl, true, false);
1996 }
1997 thread_call_free(wq->wq_atimer_delayed_call);
1998 thread_call_free(wq->wq_atimer_immediate_call);
1999 lck_spin_destroy(&wq->wq_lock, pthread_lck_grp);
2000
2001 kfree(wq, wq_size);
2002
2003 PTHREAD_TRACE(TRACE_wq_workqueue_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
2004 }
2005 }
2006
2007
2008 #pragma mark workqueue thread manipulation
2009
2010 /**
2011 * Entry point for libdispatch to ask for threads
2012 */
2013 static int wqops_queue_reqthreads(struct proc *p, int reqcount, pthread_priority_t priority){
2014 struct workqueue *wq;
2015 boolean_t start_timer = FALSE;
2016
2017 boolean_t overcommit = (_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0;
2018 int class = pthread_priority_get_class_index(priority);
2019
2020 boolean_t event_manager = (_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) != 0;
2021 if (event_manager){
2022 class = WORKQUEUE_EVENT_MANAGER_BUCKET;
2023 }
2024
2025 if ((reqcount <= 0) || (class < 0) || (class >= WORKQUEUE_NUM_BUCKETS) || (overcommit && event_manager)) {
2026 return EINVAL;
2027 }
2028
2029
2030 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2031 return EINVAL;
2032 }
2033
2034 workqueue_lock_spin(wq);
2035
2036 if (overcommit == 0 && event_manager == 0) {
2037 wq->wq_reqcount += reqcount;
2038 wq->wq_requests[class] += reqcount;
2039
2040 PTHREAD_TRACE_WQ(TRACE_wq_req_threads | DBG_FUNC_NONE, wq, priority, wq->wq_requests[class], reqcount, 0);
2041
2042 while (wq->wq_reqcount) {
2043 if (!workqueue_run_one(p, wq, overcommit, 0))
2044 break;
2045 }
2046 } else if (overcommit) {
2047 PTHREAD_TRACE_WQ(TRACE_wq_req_octhreads | DBG_FUNC_NONE, wq, priority, wq->wq_ocrequests[class], reqcount, 0);
2048
2049 while (reqcount) {
2050 if (!workqueue_run_one(p, wq, overcommit, priority))
2051 break;
2052 reqcount--;
2053 }
2054 if (reqcount) {
2055 /*
2056 * We need to delay starting some of the overcommit requests.
2057 * We'll record the request here and as existing threads return to
2058 * the kernel, we'll notice the ocrequests and spin them back to
2059 * user space as the overcommit variety.
2060 */
2061 wq->wq_reqcount += reqcount;
2062 wq->wq_requests[class] += reqcount;
2063 wq->wq_ocrequests[class] += reqcount;
2064
2065 PTHREAD_TRACE_WQ(TRACE_wq_delay_octhreads | DBG_FUNC_NONE, wq, priority, wq->wq_ocrequests[class], reqcount, 0);
2066
2067 /*
2068 * If we delayed this thread coming up but we're not constrained
2069 * or at max threads then we need to start the timer so we don't
2070 * risk dropping this request on the floor.
2071 */
2072 if ((wq->wq_constrained_threads_scheduled < wq_max_constrained_threads) &&
2073 (wq->wq_nthreads < wq_max_threads)){
2074 start_timer = WQ_TIMER_DELAYED_NEEDED(wq);
2075 }
2076 }
2077 } else if (event_manager) {
2078 PTHREAD_TRACE_WQ(TRACE_wq_req_event_manager | DBG_FUNC_NONE, wq, wq->wq_event_manager_priority, wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET], wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET], 0);
2079
2080 if (wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0){
2081 wq->wq_reqcount += 1;
2082 wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1;
2083 }
2084
2085 // We've recorded the request for an event manager thread above. We'll
2086 // let the timer pick it up as we would for a kernel callout. We can
2087 // do a direct add/wakeup when that support is added for the kevent path.
2088 if (wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0){
2089 start_timer = WQ_TIMER_DELAYED_NEEDED(wq);
2090 }
2091 }
2092
2093 if (start_timer) {
2094 workqueue_interval_timer_start(wq);
2095 }
2096
2097 workqueue_unlock(wq);
2098
2099 return 0;
2100 }
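
/*
 * The request accounting above, gathered in one place (illustrative sketch
 * only, not compiled).  Every pending request is counted in wq_reqcount and
 * wq_requests[class]; the overcommit and kevent variants are additionally
 * counted in their own arrays, which is why workqueue_run_nextreq() can
 * assert that
 *
 *     wq_ocrequests[c] + wq_kevent_requests[c] + wq_kevent_ocrequests[c]
 *         <= wq_requests[c]
 */
#if 0
static void
example_record_request(struct workqueue *wq, int class, boolean_t overcommit, boolean_t kevent)
{
	wq->wq_reqcount += 1;
	wq->wq_requests[class] += 1;

	if (overcommit && kevent)
		wq->wq_kevent_ocrequests[class] += 1;
	else if (overcommit)
		wq->wq_ocrequests[class] += 1;
	else if (kevent)
		wq->wq_kevent_requests[class] += 1;
}
#endif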
2101
2102 /*
2103 * Used by the kevent system to request threads.
2104 *
2105 * Currently count is ignored and we always return one thread per invocation.
2106 */
2107 thread_t _workq_reqthreads(struct proc *p, int requests_count, workq_reqthreads_req_t requests){
2108 thread_t th = THREAD_NULL;
2109 boolean_t do_thread_call = FALSE;
2110 boolean_t emergency_thread = FALSE;
2111 assert(requests_count > 0);
2112
2113 #if DEBUG
2114 // Make sure that the requests array is sorted, highest priority first
2115 if (requests_count > 1){
2116 __assert_only qos_class_t priority = _pthread_priority_get_qos_newest(requests[0].priority);
2117 __assert_only unsigned long flags = ((_pthread_priority_get_flags(requests[0].priority) & (_PTHREAD_PRIORITY_OVERCOMMIT_FLAG|_PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)) != 0);
2118 for (int i = 1; i < requests_count; i++){
2119 if (requests[i].count == 0) continue;
2120 __assert_only qos_class_t next_priority = _pthread_priority_get_qos_newest(requests[i].priority);
2121 __assert_only unsigned long next_flags = ((_pthread_priority_get_flags(requests[i].priority) & (_PTHREAD_PRIORITY_OVERCOMMIT_FLAG|_PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)) != 0);
2122 if (next_flags != flags){
2123 flags = next_flags;
2124 priority = next_priority;
2125 } else {
2126 assert(next_priority <= priority);
2127 }
2128 }
2129 }
2130 #endif // DEBUG
2131
2132 struct workqueue *wq;
2133 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2134 return THREAD_NULL;
2135 }
2136
2137 workqueue_lock_spin(wq);
2138
2139 PTHREAD_TRACE_WQ(TRACE_wq_kevent_req_threads | DBG_FUNC_START, wq, requests_count, 0, 0, 0);
2140
2141 // Look for overcommit or event-manager-only requests.
2142 boolean_t have_overcommit = FALSE;
2143 pthread_priority_t priority = 0;
2144 for (int i = 0; i < requests_count; i++){
2145 if (requests[i].count == 0)
2146 continue;
2147 priority = requests[i].priority;
2148 if (_pthread_priority_get_qos_newest(priority) == QOS_CLASS_UNSPECIFIED){
2149 priority |= _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2150 }
2151 if ((_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) != 0){
2152 goto event_manager;
2153 }
2154 if ((_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0){
2155 have_overcommit = TRUE;
2156 break;
2157 }
2158 }
2159
2160 if (have_overcommit){
2161 if (wq->wq_thidlecount){
2162 th = workqueue_run_nextreq(p, wq, THREAD_NULL, RUN_NEXTREQ_OVERCOMMIT_KEVENT, priority, true);
2163 if (th != THREAD_NULL){
2164 goto out;
2165 } else {
2166 workqueue_lock_spin(wq); // reacquire lock
2167 }
2168 }
2169
2170 int class = pthread_priority_get_class_index(priority);
2171 wq->wq_reqcount += 1;
2172 wq->wq_requests[class] += 1;
2173 wq->wq_kevent_ocrequests[class] += 1;
2174
2175 do_thread_call = WQ_TIMER_IMMEDIATE_NEEDED(wq);
2176 goto deferred;
2177 }
2178
2179 // Having no overcommit requests, try to find any request that can start
2180 // There's no TOCTTOU since we hold the workqueue lock
2181 for (int i = 0; i < requests_count; i++){
2182 workq_reqthreads_req_t req = requests + i;
2183 priority = req->priority;
2184 int class = pthread_priority_get_class_index(priority);
2185
2186 if (req->count == 0)
2187 continue;
2188
2189 if (!may_start_constrained_thread(wq, class, WORKQUEUE_NUM_BUCKETS, NULL))
2190 continue;
2191
2192 wq->wq_reqcount += 1;
2193 wq->wq_requests[class] += 1;
2194 wq->wq_kevent_requests[class] += 1;
2195
2196 PTHREAD_TRACE_WQ(TRACE_wq_req_kevent_threads | DBG_FUNC_NONE, wq, priority, wq->wq_kevent_requests[class], 1, 0);
2197
2198 if (wq->wq_thidlecount){
2199 th = workqueue_run_nextreq(p, wq, THREAD_NULL, RUN_NEXTREQ_DEFAULT_KEVENT, priority, true);
2200 goto out;
2201 } else {
2202 do_thread_call = WQ_TIMER_IMMEDIATE_NEEDED(wq);
2203 goto deferred;
2204 }
2205 }
2206
2207 // Okay, here's the fun case: we can't spin up any of the non-overcommit threads
2208 // that we've seen a request for, so we kick this over to the event manager thread
2209 emergency_thread = TRUE;
2210
2211 event_manager:
2212 if (wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0){
2213 wq->wq_reqcount += 1;
2214 wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1;
2215 PTHREAD_TRACE_WQ(TRACE_wq_req_event_manager | DBG_FUNC_NONE, wq, 0, wq->wq_kevent_requests[WORKQUEUE_EVENT_MANAGER_BUCKET], 1, 0);
2216 } else {
2217 PTHREAD_TRACE_WQ(TRACE_wq_req_event_manager | DBG_FUNC_NONE, wq, 0, wq->wq_kevent_requests[WORKQUEUE_EVENT_MANAGER_BUCKET], 0, 0);
2218 }
2219 wq->wq_kevent_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1;
2220
2221 if (wq->wq_thidlecount && wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0){
2222 th = workqueue_run_nextreq(p, wq, THREAD_NULL, RUN_NEXTREQ_EVENT_MANAGER, 0, true);
2223 assert(th != THREAD_NULL);
2224 goto out;
2225 }
2226 do_thread_call = WQ_TIMER_IMMEDIATE_NEEDED(wq);
2227
2228 deferred:
2229 workqueue_unlock(wq);
2230
2231 if (do_thread_call == TRUE){
2232 workqueue_interval_timer_trigger(wq);
2233 }
2234
2235 out:
2236 PTHREAD_TRACE_WQ(TRACE_wq_kevent_req_threads | DBG_FUNC_END, wq, do_thread_call, 0, 0, 0);
2237
2238 return emergency_thread ? (void*)-1 : th;
2239 }
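
/*
 * The return value above is overloaded, so a caller has three cases to
 * handle.  A hypothetical caller-side sketch (illustrative only, not
 * compiled); WORKQ_EMERGENCY_THREAD is an invented name for the (void *)-1
 * sentinel, not a constant defined by this file.
 */
#if 0
#define WORKQ_EMERGENCY_THREAD	((thread_t)-1)	/* hypothetical name for the sentinel */

static void
example_handle_reqthreads_result(thread_t th)
{
	if (th == THREAD_NULL) {
		/* no thread could be produced right now; the immediate thread call will create one */
	} else if (th == WORKQ_EMERGENCY_THREAD) {
		/* the request was redirected to the event manager thread */
	} else {
		/* a specific idle thread was woken and kevent-bound for this request */
	}
}
#endif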
2240
2241
2242 static int wqops_thread_return(struct proc *p){
2243 thread_t th = current_thread();
2244 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
2245 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
2246
2247 /* reset signal mask on the workqueue thread to default state */
2248 if (pthread_kern->uthread_get_sigmask(uth) != (sigset_t)(~workq_threadmask)) {
2249 pthread_kern->proc_lock(p);
2250 pthread_kern->uthread_set_sigmask(uth, ~workq_threadmask);
2251 pthread_kern->proc_unlock(p);
2252 }
2253
2254 struct workqueue *wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2255 if (wq == NULL || !tl) {
2256 return EINVAL;
2257 }
2258
2259 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_START, tl->th_workq, 0, 0, 0, 0);
2260
2261 /*
2262 * This squash call has neat semantics: it removes the specified overrides,
2263 * replacing the current requested QoS with the previous effective QoS from
2264 * those overrides. This means we won't be preempted due to having our QoS
2265 * lowered. Of course, now our understanding of the thread's QoS is wrong,
2266 * so we'll adjust below.
2267 */
2268 int new_qos =
2269 pthread_kern->proc_usynch_thread_qos_squash_override_for_resource(th,
2270 THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD,
2271 THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE);
2272
2273 workqueue_lock_spin(wq);
2274
2275 if (tl->th_flags & TH_LIST_KEVENT_BOUND) {
2276 unsigned int flags = KEVENT_FLAG_WORKQ;
2277 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
2278 flags |= KEVENT_FLAG_WORKQ_MANAGER;
2279 }
2280
2281 workqueue_unlock(wq);
2282 kevent_qos_internal_unbind(p, class_index_get_thread_qos(tl->th_priority), th, flags);
2283 workqueue_lock_spin(wq);
2284
2285 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
2286 }
2287
2288 /* Fix up counters from the squash operation. */
2289 uint8_t old_bucket = tl->th_priority;
2290 uint8_t new_bucket = thread_qos_get_class_index(new_qos);
2291
2292 if (old_bucket != new_bucket) {
2293 OSAddAtomic(-1, &wq->wq_thactive_count[old_bucket]);
2294 OSAddAtomic(1, &wq->wq_thactive_count[new_bucket]);
2295
2296 wq->wq_thscheduled_count[old_bucket]--;
2297 wq->wq_thscheduled_count[new_bucket]++;
2298
2299 tl->th_priority = new_bucket;
2300 }
2301
2302 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_END, tl->th_workq, new_qos, 0, 0, 0);
2303
2304 PTHREAD_TRACE_WQ(TRACE_wq_runitem | DBG_FUNC_END, wq, 0, 0, 0, 0);
2305
2306 (void)workqueue_run_nextreq(p, wq, th, RUN_NEXTREQ_DEFAULT, 0, false);
2307 /*
2308 * workqueue_run_nextreq is responsible for
2309 * dropping the workqueue lock in all cases
2310 */
2311 return 0;
2312 }
2313
2314 /**
2315 * Multiplexed call to interact with the workqueue mechanism
2316 */
2317 int
2318 _workq_kernreturn(struct proc *p,
2319 int options,
2320 user_addr_t item,
2321 int arg2,
2322 int arg3,
2323 int32_t *retval)
2324 {
2325 int error = 0;
2326
2327 if (pthread_kern->proc_get_register(p) == 0) {
2328 return EINVAL;
2329 }
2330
2331 switch (options) {
2332 case WQOPS_QUEUE_NEWSPISUPP: {
2333 /*
2334 * arg2 = offset of serialno into dispatch queue
2335 * arg3 = kevent support
2336 */
2337 int offset = arg2;
2338 if (arg3 & 0x01){
2339 // If we get here, then userspace has indicated support for kevent delivery.
2340 }
2341
2342 pthread_kern->proc_set_dispatchqueue_serialno_offset(p, (uint64_t)offset);
2343 break;
2344 }
2345 case WQOPS_QUEUE_REQTHREADS: {
2346 /*
2347 * arg2 = number of threads to start
2348 * arg3 = priority
2349 */
2350 error = wqops_queue_reqthreads(p, arg2, arg3);
2351 break;
2352 }
2353 case WQOPS_SET_EVENT_MANAGER_PRIORITY: {
2354 /*
2355 * arg2 = priority for the manager thread
2356 *
2357 * if _PTHREAD_PRIORITY_SCHED_PRI_FLAG is set, the bits outside
2358 * _PTHREAD_PRIORITY_FLAGS_MASK contain a scheduling priority instead
2359 * of a QoS value
2360 */
2361 pthread_priority_t pri = arg2;
2362
2363 struct workqueue *wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2364 if (wq == NULL) {
2365 error = EINVAL;
2366 break;
2367 }
2368 workqueue_lock_spin(wq);
2369 if (pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){
2370 // If userspace passes a scheduling priority, that takes precedence
2371 // over any QoS. (So, userspace should take care not to accidentally
2372 // lower the priority this way.)
2373 uint32_t sched_pri = pri & (~_PTHREAD_PRIORITY_FLAGS_MASK);
2374 if (wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){
2375 wq->wq_event_manager_priority = MAX(sched_pri, wq->wq_event_manager_priority & (~_PTHREAD_PRIORITY_FLAGS_MASK))
2376 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2377 } else {
2378 wq->wq_event_manager_priority = sched_pri
2379 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2380 }
2381 } else if ((wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){
2382 int cur_qos = pthread_priority_get_thread_qos(wq->wq_event_manager_priority);
2383 int new_qos = pthread_priority_get_thread_qos(pri);
2384 wq->wq_event_manager_priority = (uint32_t)thread_qos_get_pthread_priority(MAX(cur_qos, new_qos)) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2385 }
2386 workqueue_unlock(wq);
2387 break;
2388 }
2389 case WQOPS_THREAD_KEVENT_RETURN:
2390 if (item != 0) {
2391 int32_t kevent_retval;
2392 int ret = kevent_qos_internal(p, -1, item, arg2, item, arg2, NULL, NULL, KEVENT_FLAG_WORKQ | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS, &kevent_retval);
2393 // We shouldn't be getting more errors out than events we put in, so
2394 // reusing the input buffer should always provide enough space. But,
2395 // the assert is commented out since we get errors in edge cases in the
2396 // process lifecycle.
2397 //assert(ret == KERN_SUCCESS && kevent_retval >= 0);
2398 if (ret != KERN_SUCCESS){
2399 error = ret;
2400 break;
2401 } else if (kevent_retval > 0){
2402 assert(kevent_retval <= arg2);
2403 *retval = kevent_retval;
2404 error = 0;
2405 break;
2406 }
2407 }
2408 // FALLTHRU
2409 case WQOPS_THREAD_RETURN:
2410 error = wqops_thread_return(p);
2411 // NOT REACHED except in case of error
2412 assert(error);
2413 break;
2414 default:
2415 error = EINVAL;
2416 break;
2417 }
2418 return (error);
2419 }
2420
2421
2422 static boolean_t
2423 workqueue_run_one(proc_t p, struct workqueue *wq, boolean_t overcommit, pthread_priority_t priority)
2424 {
2425 boolean_t ran_one;
2426
2427 if (wq->wq_thidlecount == 0) {
2428 if (overcommit == FALSE) {
2429 if (wq->wq_constrained_threads_scheduled < wq->wq_max_concurrency)
2430 workqueue_addnewthread(wq, overcommit);
2431 } else {
2432 workqueue_addnewthread(wq, overcommit);
2433
2434 if (wq->wq_thidlecount == 0)
2435 return (FALSE);
2436 }
2437 }
2438 ran_one = (workqueue_run_nextreq(p, wq, THREAD_NULL, overcommit ? RUN_NEXTREQ_OVERCOMMIT : RUN_NEXTREQ_DEFAULT, priority, false) != THREAD_NULL);
2439 /*
2440 * workqueue_run_nextreq is responsible for
2441 * dropping the workqueue lock in all cases
2442 */
2443 workqueue_lock_spin(wq);
2444
2445 return (ran_one);
2446 }
2447
2448 /*
2449 * We have no work to do, park ourselves on the idle list.
2450 *
2451 * Consumes the workqueue lock and does not return.
2452 */
2453 static void __dead2
2454 parkit(struct workqueue *wq, struct threadlist *tl, thread_t thread)
2455 {
2456 assert(thread == tl->th_thread);
2457 assert(thread == current_thread());
2458
2459 uint32_t us_to_wait = 0;
2460
2461 TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
2462
2463 tl->th_flags &= ~TH_LIST_RUNNING;
2464 tl->th_flags &= ~TH_LIST_KEVENT;
2465 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
2466
2467 if (tl->th_flags & TH_LIST_CONSTRAINED) {
2468 wq->wq_constrained_threads_scheduled--;
2469 tl->th_flags &= ~TH_LIST_CONSTRAINED;
2470 }
2471
2472 OSAddAtomic(-1, &wq->wq_thactive_count[tl->th_priority]);
2473 wq->wq_thscheduled_count[tl->th_priority]--;
2474 wq->wq_threads_scheduled--;
2475 uint32_t thidlecount = ++wq->wq_thidlecount;
2476
2477 pthread_kern->thread_sched_call(thread, NULL);
2478
2479 /*
2480 * We'd like to always have one manager thread parked so that we can have
2481 * low latency when we need to bring a manager thread up. If the idle
2482 * manager list is empty, make this thread a manager thread.
2483 *
2484 * XXX: This doesn't check that there's not a manager thread outstanding,
2485 * so it's based on the assumption that most manager callouts will change
2486 * their QoS before parking. If that stops being true, this may end up
2487 * costing us more than we gain.
2488 */
2489 if (TAILQ_EMPTY(&wq->wq_thidlemgrlist) &&
2490 tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET){
2491 reset_priority(tl, pthread_priority_from_wq_class_index(wq, WORKQUEUE_EVENT_MANAGER_BUCKET));
2492 tl->th_priority = WORKQUEUE_EVENT_MANAGER_BUCKET;
2493 }
2494
2495 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET){
2496 TAILQ_INSERT_HEAD(&wq->wq_thidlemgrlist, tl, th_entry);
2497 } else {
2498 TAILQ_INSERT_HEAD(&wq->wq_thidlelist, tl, th_entry);
2499 }
2500
2501 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_START, wq,
2502 wq->wq_threads_scheduled, wq->wq_thidlecount, us_to_wait, 0);
2503
2504 /*
2505 * When we remove the voucher from the thread, we may lose our importance
2506 * causing us to get preempted, so we do this after putting the thread on
2507 * the idle list. That way, when we get our importance back we'll be able
2508 * to use this thread from e.g. the kevent call out to deliver a boosting
2509 * message.
2510 */
2511 workqueue_unlock(wq);
2512 kern_return_t kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
2513 assert(kr == KERN_SUCCESS);
2514 workqueue_lock_spin(wq);
2515
2516 if ((tl->th_flags & TH_LIST_RUNNING) == 0) {
2517 if (thidlecount < 101) {
2518 us_to_wait = wq_reduce_pool_window_usecs - ((thidlecount-2) * (wq_reduce_pool_window_usecs / 100));
2519 } else {
2520 us_to_wait = wq_reduce_pool_window_usecs / 100;
2521 }
2522
2523 assert_wait_timeout_with_leeway((caddr_t)tl, (THREAD_INTERRUPTIBLE),
2524 TIMEOUT_URGENCY_SYS_BACKGROUND|TIMEOUT_URGENCY_LEEWAY, us_to_wait,
2525 wq_reduce_pool_window_usecs/10, NSEC_PER_USEC);
2526
2527 workqueue_unlock(wq);
2528
2529 thread_block(wq_unpark_continue);
2530 panic("thread_block(wq_unpark_continue) returned!");
2531 } else {
2532 workqueue_unlock(wq);
2533
2534 /*
2535 * While we'd dropped the lock to unset our voucher, someone came
2536 * around and made us runnable. But because we weren't waiting on the
2537 * event, their wakeup() was ineffectual. To correct for that, we just
2538 * run the continuation ourselves.
2539 */
2540 wq_unpark_continue(NULL, THREAD_AWAKENED);
2541 }
2542 }
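
/*
 * Worked example of the parking timeout chosen above, assuming a reduce-pool
 * window of 5,000,000us (5 seconds): the 2nd idle thread waits the full
 * window, the 51st roughly half of it, and the 101st and beyond only 1/100th
 * of it (50ms), so an oversized idle pool drains quickly while the last few
 * idle threads linger.  Illustrative sketch only, not compiled; the window
 * value is an assumption.
 */
#if 0
static uint32_t
example_park_timeout_usecs(uint32_t thidlecount, uint32_t window_usecs)
{
	if (thidlecount < 101) {
		return window_usecs - ((thidlecount - 2) * (window_usecs / 100));
	}
	return window_usecs / 100;
}
#endif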
2543
2544 static boolean_t may_start_constrained_thread(struct workqueue *wq, uint32_t at_priclass, uint32_t my_priclass, boolean_t *start_timer){
2545 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
2546 /*
2547 * we need 1 or more constrained threads to return to the kernel before
2548 * we can dispatch additional work
2549 */
2550 return FALSE;
2551 }
2552
2553 uint32_t busycount = 0;
2554 uint32_t thactive_count = wq->wq_thactive_count[at_priclass];
2555
2556 // Has our most recently blocked thread blocked recently enough that we
2557 // should still consider it busy?
2558 if (wq->wq_thscheduled_count[at_priclass] > wq->wq_thactive_count[at_priclass]) {
2559 if (wq_thread_is_busy(mach_absolute_time(), &wq->wq_lastblocked_ts[at_priclass])) {
2560 busycount++;
2561 }
2562 }
2563
2564 if (my_priclass < WORKQUEUE_NUM_BUCKETS && my_priclass == at_priclass){
2565 /*
2566 * don't count this thread as currently active
2567 */
2568 thactive_count--;
2569 }
2570
2571 if (thactive_count + busycount >= wq->wq_max_concurrency) {
2572 if (busycount && start_timer) {
2573 /*
2574 * we found at least 1 thread in the
2575 * 'busy' state... make sure we start
2576 * the timer because if they are the only
2577 * threads keeping us from scheduling
2578 * this work request, we won't get a callback
2579 * to kick off the timer... we need to
2580 * start it now...
2581 */
2582 *start_timer = WQ_TIMER_DELAYED_NEEDED(wq);
2583 }
2584
2585 PTHREAD_TRACE_WQ(TRACE_wq_overcommitted|DBG_FUNC_NONE, wq, ((start_timer && *start_timer) ? 1 << _PTHREAD_PRIORITY_FLAGS_SHIFT : 0) | class_index_get_pthread_priority(at_priclass), thactive_count, busycount, 0);
2586
2587 return FALSE;
2588 }
2589 return TRUE;
2590 }
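
/*
 * The admission test above, reduced to its core (illustrative sketch only,
 * not compiled): a constrained request may start only while the active
 * threads at its class, plus any thread that blocked recently enough to
 * still count as busy, remain below the allowed concurrency.  A thread
 * asking on its own behalf does not count itself as active.
 */
#if 0
static boolean_t
example_may_start_constrained(uint32_t thactive, uint32_t busycount,
    uint32_t max_concurrency, boolean_t requester_is_active)
{
	if (requester_is_active)
		thactive--;			/* don't count the requesting thread itself */

	return (thactive + busycount < max_concurrency) ? TRUE : FALSE;
}
#endif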
2591
2592 static struct threadlist *
2593 pop_from_thidlelist(struct workqueue *wq, uint32_t priclass)
2594 {
2595 assert(wq->wq_thidlecount);
2596
2597 struct threadlist *tl = NULL;
2598
2599 if (!TAILQ_EMPTY(&wq->wq_thidlemgrlist) &&
2600 (priclass == WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlelist))){
2601 tl = TAILQ_FIRST(&wq->wq_thidlemgrlist);
2602 TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
2603 assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
2604 } else if (!TAILQ_EMPTY(&wq->wq_thidlelist) &&
2605 (priclass != WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlemgrlist))){
2606 tl = TAILQ_FIRST(&wq->wq_thidlelist);
2607 TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
2608 assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
2609 } else {
2610 panic("pop_from_thidlelist called with no threads available");
2611 }
2612 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
2613
2614 assert(wq->wq_thidlecount);
2615 wq->wq_thidlecount--;
2616
2617 TAILQ_INSERT_TAIL(&wq->wq_thrunlist, tl, th_entry);
2618
2619 tl->th_flags |= TH_LIST_RUNNING | TH_LIST_BUSY;
2620
2621 wq->wq_threads_scheduled++;
2622 wq->wq_thscheduled_count[priclass]++;
2623 OSAddAtomic(1, &wq->wq_thactive_count[priclass]);
2624
2625 return tl;
2626 }
2627
2628 static pthread_priority_t
2629 pthread_priority_from_wq_class_index(struct workqueue *wq, int index){
2630 if (index == WORKQUEUE_EVENT_MANAGER_BUCKET){
2631 return wq->wq_event_manager_priority;
2632 } else {
2633 return class_index_get_pthread_priority(index);
2634 }
2635 }
2636
2637 static void
2638 reset_priority(struct threadlist *tl, pthread_priority_t pri){
2639 kern_return_t ret;
2640 thread_t th = tl->th_thread;
2641
2642 if ((pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){
2643 ret = pthread_kern->thread_set_workq_qos(th, pthread_priority_get_thread_qos(pri), 0);
2644 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
2645
2646 if (tl->th_flags & TH_LIST_EVENT_MGR_SCHED_PRI) {
2647
2648 /* Reset priority to default (masked by QoS) */
2649
2650 ret = pthread_kern->thread_set_workq_pri(th, 31, POLICY_TIMESHARE);
2651 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
2652
2653 tl->th_flags &= ~TH_LIST_EVENT_MGR_SCHED_PRI;
2654 }
2655 } else {
2656 ret = pthread_kern->thread_set_workq_qos(th, THREAD_QOS_UNSPECIFIED, 0);
2657 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
2658 ret = pthread_kern->thread_set_workq_pri(th, (pri & (~_PTHREAD_PRIORITY_FLAGS_MASK)), POLICY_TIMESHARE);
2659 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
2660
2661 tl->th_flags |= TH_LIST_EVENT_MGR_SCHED_PRI;
2662 }
2663 }
2664
2665 /**
2666 * grabs a thread for a request
2667 *
2668 * - called with the workqueue lock held...
2669 * - responsible for dropping it in all cases
2670 * - if provided mode is for overcommit, doesn't consume a reqcount
2671 *
2672 */
2673 static thread_t
2674 workqueue_run_nextreq(proc_t p, struct workqueue *wq, thread_t thread,
2675 enum run_nextreq_mode mode, pthread_priority_t prio,
2676 bool kevent_bind_via_return)
2677 {
2678 thread_t th_to_run = THREAD_NULL;
2679 uint32_t upcall_flags = 0;
2680 uint32_t priclass;
2681 struct threadlist *tl = NULL;
2682 struct uthread *uth = NULL;
2683 boolean_t start_timer = FALSE;
2684
2685 if (mode == RUN_NEXTREQ_ADD_TIMER) {
2686 mode = RUN_NEXTREQ_DEFAULT;
2687 }
2688
2689 // valid modes to call this function with
2690 assert(mode == RUN_NEXTREQ_DEFAULT || mode == RUN_NEXTREQ_DEFAULT_KEVENT ||
2691 mode == RUN_NEXTREQ_OVERCOMMIT || mode == RUN_NEXTREQ_UNCONSTRAINED ||
2692 mode == RUN_NEXTREQ_EVENT_MANAGER || mode == RUN_NEXTREQ_OVERCOMMIT_KEVENT);
2693 // may only have a priority if in OVERCOMMIT or DEFAULT_KEVENT mode
2694 assert(mode == RUN_NEXTREQ_OVERCOMMIT || mode == RUN_NEXTREQ_OVERCOMMIT_KEVENT ||
2695 mode == RUN_NEXTREQ_DEFAULT_KEVENT || prio == 0);
2696 // thread == thread_null means "please spin up a new workqueue thread, we can't reuse this"
2697 // thread != thread_null is thread reuse, and must be the current thread
2698 assert(thread == THREAD_NULL || thread == current_thread());
2699
2700 PTHREAD_TRACE_WQ(TRACE_wq_run_nextitem|DBG_FUNC_START, wq, thread_tid(thread), wq->wq_thidlecount, wq->wq_reqcount, 0);
2701
2702 if (thread != THREAD_NULL) {
2703 uth = pthread_kern->get_bsdthread_info(thread);
2704
2705 if ((tl = pthread_kern->uthread_get_threadlist(uth)) == NULL) {
2706 panic("wq thread with no threadlist");
2707 }
2708 }
2709
2710 /*
2711 * from here until we drop the workq lock we can't be pre-empted since we
2712 * hold the lock in spin mode... this is important since we have to
2713 * independently update the priority that the thread is associated with and
2714 * the priority-based counters that "workqueue_callback" also changes and
2715 * bases decisions on.
2716 */
2717
2718 /*
2719 * This giant monstrosity does three things:
2720 *
2721 * - adjusts the mode, if required
2722 * - selects the priclass that we'll be servicing
2723 * - sets any mode-specific upcall flags
2724 *
2725 * When possible special-cases should be handled here and converted into
2726 * non-special cases.
2727 */
2728 if (mode == RUN_NEXTREQ_OVERCOMMIT) {
2729 priclass = pthread_priority_get_class_index(prio);
2730 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
2731 } else if (mode == RUN_NEXTREQ_OVERCOMMIT_KEVENT){
2732 priclass = pthread_priority_get_class_index(prio);
2733 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
2734 } else if (mode == RUN_NEXTREQ_EVENT_MANAGER){
2735 assert(wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0);
2736 priclass = WORKQUEUE_EVENT_MANAGER_BUCKET;
2737 upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
2738 if (wq->wq_kevent_requests[WORKQUEUE_EVENT_MANAGER_BUCKET]){
2739 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
2740 }
2741 } else if (wq->wq_reqcount == 0){
2742 // no work to do. we'll check again when new work arrives.
2743 goto done;
2744 } else if (mode == RUN_NEXTREQ_DEFAULT_KEVENT) {
2745 assert(kevent_bind_via_return);
2746
2747 priclass = pthread_priority_get_class_index(prio);
2748 assert(priclass < WORKQUEUE_EVENT_MANAGER_BUCKET);
2749 assert(wq->wq_kevent_requests[priclass] > 0);
2750
2751 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
2752 mode = RUN_NEXTREQ_DEFAULT;
2753 } else if (wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] &&
2754 ((wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0) ||
2755 (thread != THREAD_NULL && tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET))){
2756 // There's an event manager request and either:
2757 // - no event manager currently running
2758 // - we are re-using the event manager
2759 mode = RUN_NEXTREQ_EVENT_MANAGER;
2760 priclass = WORKQUEUE_EVENT_MANAGER_BUCKET;
2761 upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
2762 if (wq->wq_kevent_requests[WORKQUEUE_EVENT_MANAGER_BUCKET]){
2763 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
2764 }
2765 } else {
2766 // Find highest priority and check for special request types
2767 for (priclass = 0; priclass < WORKQUEUE_EVENT_MANAGER_BUCKET; priclass++) {
2768 if (wq->wq_requests[priclass])
2769 break;
2770 }
2771 if (priclass == WORKQUEUE_EVENT_MANAGER_BUCKET){
2772 // the only pending request must be the event manager's (it isn't tracked in a
2773 // regular bucket), but we couldn't handle it since an event manager is already
2774 // running, so we fell through to this case
2775 assert(wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] == 1 &&
2776 wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 1 &&
2777 wq->wq_reqcount == 1);
2778 goto done;
2779 }
2780
2781 if (wq->wq_kevent_ocrequests[priclass]){
2782 mode = RUN_NEXTREQ_DEFERRED_OVERCOMMIT;
2783 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
2784 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
2785 } else if (wq->wq_ocrequests[priclass]){
2786 mode = RUN_NEXTREQ_DEFERRED_OVERCOMMIT;
2787 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
2788 } else if (wq->wq_kevent_requests[priclass]){
2789 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
2790 }
2791 }
2792
2793 assert(mode != RUN_NEXTREQ_EVENT_MANAGER || priclass == WORKQUEUE_EVENT_MANAGER_BUCKET);
2794 assert(mode == RUN_NEXTREQ_EVENT_MANAGER || priclass != WORKQUEUE_EVENT_MANAGER_BUCKET);
2795
2796 if (mode == RUN_NEXTREQ_DEFAULT /* non-overcommit */){
2797 uint32_t my_priclass = (thread != THREAD_NULL) ? tl->th_priority : WORKQUEUE_NUM_BUCKETS;
2798 if (may_start_constrained_thread(wq, priclass, my_priclass, &start_timer) == FALSE){
2799 // per policy, we won't start another constrained thread
2800 goto done;
2801 }
2802 }
2803
2804 if (thread != THREAD_NULL) {
2805 /*
2806 * thread is non-NULL here when we return from userspace
2807 * in workq_kernreturn, rather than trying to find a thread
2808 * we pick up new work for this specific thread.
2809 */
2810 th_to_run = thread;
2811 upcall_flags |= WQ_FLAG_THREAD_REUSE;
2812 } else if (wq->wq_thidlecount == 0) {
2813 /*
2814 * we have no additional threads waiting to pick up
2815 * work, however, there is additional work to do.
2816 */
2817 start_timer = WQ_TIMER_DELAYED_NEEDED(wq);
2818
2819 PTHREAD_TRACE_WQ(TRACE_wq_stalled, wq, wq->wq_nthreads, start_timer, 0, 0);
2820
2821 goto done;
2822 } else {
2823 // there is both work available and an idle thread, so activate a thread
2824 tl = pop_from_thidlelist(wq, priclass);
2825 th_to_run = tl->th_thread;
2826 }
2827
2828 // Adjust counters and thread flags AKA consume the request
2829 // TODO: It would be lovely if OVERCOMMIT consumed reqcount
2830 switch (mode) {
2831 case RUN_NEXTREQ_DEFAULT:
2832 case RUN_NEXTREQ_DEFAULT_KEVENT: /* actually mapped to DEFAULT above */
2833 case RUN_NEXTREQ_ADD_TIMER: /* actually mapped to DEFAULT above */
2834 case RUN_NEXTREQ_UNCONSTRAINED:
2835 wq->wq_reqcount--;
2836 wq->wq_requests[priclass]--;
2837
2838 if (mode == RUN_NEXTREQ_DEFAULT){
2839 if (!(tl->th_flags & TH_LIST_CONSTRAINED)) {
2840 wq->wq_constrained_threads_scheduled++;
2841 tl->th_flags |= TH_LIST_CONSTRAINED;
2842 }
2843 } else if (mode == RUN_NEXTREQ_UNCONSTRAINED){
2844 if (tl->th_flags & TH_LIST_CONSTRAINED) {
2845 wq->wq_constrained_threads_scheduled--;
2846 tl->th_flags &= ~TH_LIST_CONSTRAINED;
2847 }
2848 }
2849 if (upcall_flags & WQ_FLAG_THREAD_KEVENT){
2850 wq->wq_kevent_requests[priclass]--;
2851 }
2852 break;
2853
2854 case RUN_NEXTREQ_EVENT_MANAGER:
2855 wq->wq_reqcount--;
2856 wq->wq_requests[priclass]--;
2857
2858 if (tl->th_flags & TH_LIST_CONSTRAINED) {
2859 wq->wq_constrained_threads_scheduled--;
2860 tl->th_flags &= ~TH_LIST_CONSTRAINED;
2861 }
2862 if (upcall_flags & WQ_FLAG_THREAD_KEVENT){
2863 wq->wq_kevent_requests[priclass]--;
2864 }
2865 break;
2866
2867 case RUN_NEXTREQ_DEFERRED_OVERCOMMIT:
2868 wq->wq_reqcount--;
2869 wq->wq_requests[priclass]--;
2870 if (upcall_flags & WQ_FLAG_THREAD_KEVENT){
2871 wq->wq_kevent_ocrequests[priclass]--;
2872 } else {
2873 wq->wq_ocrequests[priclass]--;
2874 }
2875 /* FALLTHROUGH */
2876 case RUN_NEXTREQ_OVERCOMMIT:
2877 case RUN_NEXTREQ_OVERCOMMIT_KEVENT:
2878 if (tl->th_flags & TH_LIST_CONSTRAINED) {
2879 wq->wq_constrained_threads_scheduled--;
2880 tl->th_flags &= ~TH_LIST_CONSTRAINED;
2881 }
2882 break;
2883 }
2884
2885 // Confirm we've maintained our counter invariants
2886 assert(wq->wq_requests[priclass] < UINT16_MAX);
2887 assert(wq->wq_ocrequests[priclass] < UINT16_MAX);
2888 assert(wq->wq_kevent_requests[priclass] < UINT16_MAX);
2889 assert(wq->wq_kevent_ocrequests[priclass] < UINT16_MAX);
2890 assert(wq->wq_ocrequests[priclass] + wq->wq_kevent_requests[priclass] +
2891 wq->wq_kevent_ocrequests[priclass] <=
2892 wq->wq_requests[priclass]);
2893
2894 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
2895 if (upcall_flags & WQ_FLAG_THREAD_KEVENT) {
2896 tl->th_flags |= TH_LIST_KEVENT;
2897 } else {
2898 tl->th_flags &= ~TH_LIST_KEVENT;
2899 }
2900
2901 uint32_t orig_class = tl->th_priority;
2902 tl->th_priority = (uint8_t)priclass;
2903
2904 if ((thread != THREAD_NULL) && (orig_class != priclass)) {
2905 /*
2906 * we need to adjust these counters based on this
2907 * thread's new disposition w/r to priority
2908 */
2909 OSAddAtomic(-1, &wq->wq_thactive_count[orig_class]);
2910 OSAddAtomic(1, &wq->wq_thactive_count[priclass]);
2911
2912 wq->wq_thscheduled_count[orig_class]--;
2913 wq->wq_thscheduled_count[priclass]++;
2914 }
2915 wq->wq_thread_yielded_count = 0;
2916
2917 pthread_priority_t outgoing_priority = pthread_priority_from_wq_class_index(wq, tl->th_priority);
2918 PTHREAD_TRACE_WQ(TRACE_wq_reset_priority | DBG_FUNC_START, wq, thread_tid(tl->th_thread), outgoing_priority, 0, 0);
2919 reset_priority(tl, outgoing_priority);
2920 PTHREAD_TRACE_WQ(TRACE_wq_reset_priority | DBG_FUNC_END, wq, thread_tid(tl->th_thread), outgoing_priority, 0, 0);
2921
2922 /*
2923 * persist upcall_flags so that it can be retrieved in _setup_wqthread
2924 */
2925 tl->th_upcall_flags = upcall_flags >> WQ_FLAG_THREAD_PRIOSHIFT;
2926
2927 /*
2928 * if the current thread is reused for this work request, wq_runreq() does not return here; it goes straight back to user space
2929 */
2930 wq_runreq(p, th_to_run, wq, tl, (thread == th_to_run),
2931 (upcall_flags & WQ_FLAG_THREAD_KEVENT) && !kevent_bind_via_return);
2932
2933 PTHREAD_TRACE_WQ(TRACE_wq_run_nextitem|DBG_FUNC_END, wq, thread_tid(th_to_run), mode == RUN_NEXTREQ_OVERCOMMIT, 1, 0);
2934
2935 assert(!kevent_bind_via_return || (upcall_flags & WQ_FLAG_THREAD_KEVENT));
2936 if (kevent_bind_via_return && (upcall_flags & WQ_FLAG_THREAD_KEVENT)) {
2937 tl->th_flags |= TH_LIST_KEVENT_BOUND;
2938 }
2939
2940 workqueue_unlock(wq);
2941
2942 return th_to_run;
2943
2944 done:
2945 if (start_timer)
2946 workqueue_interval_timer_start(wq);
2947
2948 PTHREAD_TRACE_WQ(TRACE_wq_run_nextitem | DBG_FUNC_END, wq, thread_tid(thread), start_timer, 3, 0);
2949
2950 if (thread != THREAD_NULL){
2951 parkit(wq, tl, thread);
2952 /* NOT REACHED */
2953 }
2954
2955 workqueue_unlock(wq);
2956
2957 return THREAD_NULL;
2958 }
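
/*
 * The priority selection inside workqueue_run_nextreq() ultimately reduces
 * to a scan of the request buckets, which are ordered from the highest
 * priority class downward; the event manager bucket sits outside that
 * ordering and is handled by the special cases above.  A reduced sketch
 * (illustrative only, not compiled).
 */
#if 0
static int
example_highest_pending_bucket(struct workqueue *wq)
{
	for (int priclass = 0; priclass < WORKQUEUE_EVENT_MANAGER_BUCKET; priclass++) {
		if (wq->wq_requests[priclass])
			return priclass;	/* first non-empty bucket is the highest priority one */
	}
	return -1;				/* nothing pending outside the event manager bucket */
}
#endif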
2959
2960 /**
2961 * parked thread wakes up
2962 */
2963 static void __dead2
2964 wq_unpark_continue(void* __unused ptr, wait_result_t wait_result)
2965 {
2966 boolean_t first_use = false;
2967 thread_t th = current_thread();
2968 proc_t p = current_proc();
2969
2970 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
2971 if (uth == NULL) goto done;
2972
2973 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
2974 if (tl == NULL) goto done;
2975
2976 struct workqueue *wq = tl->th_workq;
2977
2978 workqueue_lock_spin(wq);
2979
2980 assert(tl->th_flags & TH_LIST_INITED);
2981
2982 if ((tl->th_flags & TH_LIST_NEW)){
2983 tl->th_flags &= ~(TH_LIST_NEW);
2984 first_use = true;
2985 }
2986
2987 if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) {
2988 /*
2989 * The normal wakeup path.
2990 */
2991 goto return_to_user;
2992 }
2993
2994 if ((tl->th_flags & TH_LIST_RUNNING) == 0 &&
2995 wait_result == THREAD_TIMED_OUT &&
2996 tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET &&
2997 TAILQ_FIRST(&wq->wq_thidlemgrlist) == tl &&
2998 TAILQ_NEXT(tl, th_entry) == NULL){
2999 /*
3000 * If we are the only idle manager and we popped for self-destruction,
3001 * then don't actually exit. Instead, free our stack to save some
3002 * memory and re-park.
3003 */
3004
3005 workqueue_unlock(wq);
3006
3007 vm_map_t vmap = wq->wq_map;
3008
3009 // Keep this in sync with _setup_wqthread()
3010 const vm_size_t guardsize = vm_map_page_size(vmap);
3011 const user_addr_t freeaddr = (user_addr_t)tl->th_stackaddr + guardsize;
3012 const vm_map_offset_t freesize = vm_map_trunc_page_mask((PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET) - 1, vm_map_page_mask(vmap)) - guardsize;
3013
3014 int kr;
3015 kr = mach_vm_behavior_set(vmap, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
3016 assert(kr == KERN_SUCCESS || kr == KERN_INVALID_ADDRESS);
3017
3018 workqueue_lock_spin(wq);
3019
3020 if ( !(tl->th_flags & TH_LIST_RUNNING)) {
3021 assert_wait((caddr_t)tl, (THREAD_INTERRUPTIBLE));
3022
3023 workqueue_unlock(wq);
3024
3025 thread_block(wq_unpark_continue);
3026 /* NOT REACHED */
3027 }
3028 }
3029
3030 if ((tl->th_flags & TH_LIST_RUNNING) == 0) {
3031 assert((tl->th_flags & TH_LIST_BUSY) == 0);
3032 /*
3033 * We were set running, but not for the purposes of actually running.
3034 * This could be because the timer elapsed. Or it could be because the
3035 * thread aborted. Either way, we need to return to userspace to exit.
3036 *
3037 * The call to workqueue_removethread will consume the lock.
3038 */
3039
3040 if (!first_use &&
3041 tl->th_priority != qos_class_get_class_index(WQ_THREAD_CLEANUP_QOS)) {
3042 // Reset the QoS to something low for the pthread cleanup
3043 pthread_priority_t cleanup_pri = _pthread_priority_make_newest(WQ_THREAD_CLEANUP_QOS, 0, 0);
3044 reset_priority(tl, cleanup_pri);
3045 }
3046
3047 workqueue_removethread(tl, 0, first_use);
3048
3049 if (first_use){
3050 pthread_kern->thread_bootstrap_return();
3051 } else {
3052 pthread_kern->unix_syscall_return(0);
3053 }
3054 /* NOT REACHED */
3055 }
3056
3057 /*
3058 * The timer woke us up or the thread was aborted. However, we have
3059 * already started to make this a runnable thread. Wait for that to
3060 * finish, then continue to userspace.
3061 */
3062 while ((tl->th_flags & TH_LIST_BUSY)) {
3063 assert_wait((caddr_t)tl, (THREAD_UNINT));
3064
3065 workqueue_unlock(wq);
3066
3067 thread_block(THREAD_CONTINUE_NULL);
3068
3069 workqueue_lock_spin(wq);
3070 }
3071
3072 return_to_user:
3073 workqueue_unlock(wq);
3074 _setup_wqthread(p, th, wq, tl, first_use);
3075 pthread_kern->thread_sched_call(th, workqueue_callback);
3076 done:
3077 if (first_use){
3078 pthread_kern->thread_bootstrap_return();
3079 } else {
3080 pthread_kern->unix_syscall_return(EJUSTRETURN);
3081 }
3082 panic("Our attempt to return to userspace failed...");
3083 }
3084
3085 /* called with workqueue lock held */
3086 static void
3087 wq_runreq(proc_t p, thread_t th, struct workqueue *wq, struct threadlist *tl,
3088 boolean_t return_directly, boolean_t needs_kevent_bind)
3089 {
3090 PTHREAD_TRACE1_WQ(TRACE_wq_runitem | DBG_FUNC_START, tl->th_workq, 0, 0, thread_tid(current_thread()), thread_tid(th));
3091
3092 unsigned int kevent_flags = KEVENT_FLAG_WORKQ;
3093 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
3094 kevent_flags |= KEVENT_FLAG_WORKQ_MANAGER;
3095 }
3096
3097 if (return_directly) {
3098 if (needs_kevent_bind) {
3099 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
3100 tl->th_flags |= TH_LIST_KEVENT_BOUND;
3101 }
3102
3103 workqueue_unlock(wq);
3104
3105 if (needs_kevent_bind) {
3106 kevent_qos_internal_bind(p, class_index_get_thread_qos(tl->th_priority), th, kevent_flags);
3107 }
3108
3109 /*
3110 * For preemption reasons, we want to reset the voucher as late as
3111 * possible, so we do it in two places:
3112 * - Just before parking (i.e. in parkit())
3113 * - Prior to doing the setup for the next workitem (i.e. here)
3114 *
3115 * Those two places are sufficient to ensure we always reset it before
3116 * it goes back out to user space, but be careful to not break that
3117 * guarantee.
3118 */
3119 kern_return_t kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
3120 assert(kr == KERN_SUCCESS);
3121
3122 _setup_wqthread(p, th, wq, tl, false);
3123
3124 PTHREAD_TRACE_WQ(TRACE_wq_run_nextitem|DBG_FUNC_END, tl->th_workq, 0, 0, 4, 0);
3125
3126 pthread_kern->unix_syscall_return(EJUSTRETURN);
3127 /* NOT REACHED */
3128 }
3129
3130 if (needs_kevent_bind) {
3131 // Leave TH_LIST_BUSY set so that the thread can't beat us to calling kevent
3132 workqueue_unlock(wq);
3133 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
3134 kevent_qos_internal_bind(p, class_index_get_thread_qos(tl->th_priority), th, kevent_flags);
3135 tl->th_flags |= TH_LIST_KEVENT_BOUND;
3136 workqueue_lock_spin(wq);
3137 }
3138 tl->th_flags &= ~(TH_LIST_BUSY);
3139 thread_wakeup_thread(tl, th);
3140 }
3141
3142 #define KEVENT_LIST_LEN 16 // WORKQ_KEVENT_EVENT_BUFFER_LEN
3143 #define KEVENT_DATA_SIZE (32 * 1024)
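
/*
 * The block comment below describes the final stack layout; this sketch
 * shows the corresponding address arithmetic for the 64-bit case, where the
 * stack gap is the 128-byte red zone (C_64_REDZONE_LEN) and alignment is 16
 * bytes (C_64_STK_ALIGN).  When kevents are delivered, _setup_wqthread()
 * simply moves the stack top down below the kevent list (or below the kevent
 * data buffer, if out-of-line data was copied out).  Illustrative sketch
 * only, not compiled; the two constants are assumptions.
 */
#if 0
static user_addr_t
example_stack_top(user_addr_t stackaddr, vm_size_t guardsize,
    vm_size_t stacksize, vm_size_t pthread_t_offset)
{
	const vm_size_t gap = 128;	/* assumed C_64_REDZONE_LEN */
	const vm_size_t align = 16;	/* assumed C_64_STK_ALIGN */

	/* the pthread_t sits above the stack; the usable stack top is just below it */
	user_addr_t pthread_self_addr = stackaddr + stacksize + guardsize + pthread_t_offset;
	return (pthread_self_addr - gap) & -align;
}
#endif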
3144
3145 /**
3146 * configures initial thread stack/registers to jump into:
3147 * _pthread_wqthread(pthread_t self, mach_port_t kport, void *stackaddr, void *keventlist, int upcall_flags, int nkevents);
3148 * to get there we jump through assembly stubs in pthread_asm.s. Those
3149 * routines set up a stack frame, using the current stack pointer, and marshal
3150 * arguments from registers to the stack as required by the ABI.
3151 *
3152 * One odd thing we do here is to start the pthread_t 4k below what would be the
3153 * top of the stack otherwise. This is because usually only the first 4k of the
3154 * pthread_t will be used and so we want to put it on the same 16k page as the
3155 * top of the stack to save memory.
3156 *
3157 * When we are done the stack will look like:
3158 * |-----------| th_stackaddr + th_allocsize
3159 * |pthread_t | th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET
3160 * |kevent list| optionally - at most KEVENT_LIST_LEN events
3161 * |kevent data| optionally - at most KEVENT_DATA_SIZE bytes
3162 * |stack gap | bottom aligned to 16 bytes, and at least as big as stack_gap_min
3163 * | STACK |
3164 * | ⇓ |
3165 * | |
3166 * |guard page | guardsize
3167 * |-----------| th_stackaddr
3168 */
3169 void
3170 _setup_wqthread(proc_t p, thread_t th, struct workqueue *wq, struct threadlist *tl,
3171 bool first_use)
3172 {
3173 int error;
3174 uint32_t upcall_flags;
3175
3176 pthread_priority_t priority = pthread_priority_from_wq_class_index(wq, tl->th_priority);
3177
3178 const vm_size_t guardsize = vm_map_page_size(tl->th_workq->wq_map);
3179 const vm_size_t stack_gap_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_REDZONE_LEN;
3180 const vm_size_t stack_align_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_STK_ALIGN;
3181
3182 user_addr_t pthread_self_addr = (user_addr_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET);
3183 user_addr_t stack_top_addr = (user_addr_t)((pthread_self_addr - stack_gap_min) & -stack_align_min);
3184 user_addr_t stack_bottom_addr = (user_addr_t)(tl->th_stackaddr + guardsize);
3185
3186 user_addr_t wqstart_fnptr = pthread_kern->proc_get_wqthread(p);
3187 if (!wqstart_fnptr) {
3188 panic("workqueue thread start function pointer is NULL");
3189 }
3190
3191 /* Put the QoS class value into the lower bits of the reuse_thread register; this is where
3192 * the thread priority used to be stored anyway.
3193 */
3194 upcall_flags = tl->th_upcall_flags << WQ_FLAG_THREAD_PRIOSHIFT;
3195 upcall_flags |= (_pthread_priority_get_qos_newest(priority) & WQ_FLAG_THREAD_PRIOMASK);
3196
3197 upcall_flags |= WQ_FLAG_THREAD_NEWSPI;
3198
3199 uint32_t tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
3200 if (tsd_offset) {
3201 mach_vm_offset_t th_tsd_base = (mach_vm_offset_t)pthread_self_addr + tsd_offset;
3202 kern_return_t kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
3203 if (kret == KERN_SUCCESS) {
3204 upcall_flags |= WQ_FLAG_THREAD_TSD_BASE_SET;
3205 }
3206 }
3207
3208 if (first_use) {
3209 /*
3210 * Pre-fault the first page of the new thread's stack and the page that will
3211 * contain the pthread_t structure.
3212 */
3213 vm_map_t vmap = pthread_kern->current_map();
3214 if (vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)) !=
3215 vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap))){
3216 vm_fault( vmap,
3217 vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)),
3218 VM_PROT_READ | VM_PROT_WRITE,
3219 FALSE,
3220 THREAD_UNINT, NULL, 0);
3221 }
3222 vm_fault( vmap,
3223 vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap)),
3224 VM_PROT_READ | VM_PROT_WRITE,
3225 FALSE,
3226 THREAD_UNINT, NULL, 0);
3227 } else {
3228 upcall_flags |= WQ_FLAG_THREAD_REUSE;
3229 }
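/*
 * Illustrative note on the pre-fault logic above (hypothetical numbers, reusing
 * the worked example preceding this function): with a 16KB VM page,
 * stack_top_addr - C_64_REDZONE_LEN = 0x10084f00 and pthread_self_addr = 0x10085000
 * both truncate to the same page base 0x10084000, so only the second vm_fault()
 * runs and a single page covers both the hot end of the stack and the start of
 * the pthread_t. When the two addresses fall on different pages, both faults are
 * taken up front so the new thread does not stall on page-in the first time it
 * touches its stack in user space.
 */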
3230
3231 user_addr_t kevent_list = NULL;
3232 int kevent_count = 0;
3233 if (upcall_flags & WQ_FLAG_THREAD_KEVENT){
3234 kevent_list = pthread_self_addr - KEVENT_LIST_LEN * sizeof(struct kevent_qos_s);
3235 kevent_count = KEVENT_LIST_LEN;
3236
3237 user_addr_t kevent_data_buf = kevent_list - KEVENT_DATA_SIZE;
3238 user_size_t kevent_data_available = KEVENT_DATA_SIZE;
3239
3240 int32_t events_out = 0;
3241
3242 assert(tl->th_flags & TH_LIST_KEVENT_BOUND);
3243 unsigned int flags = KEVENT_FLAG_WORKQ | KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE;
3244 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
3245 flags |= KEVENT_FLAG_WORKQ_MANAGER;
3246 }
3247 int ret = kevent_qos_internal(p, class_index_get_thread_qos(tl->th_priority), NULL, 0, kevent_list, kevent_count,
3248 kevent_data_buf, &kevent_data_available,
3249 flags, &events_out);
3250
3251 // turns out there are a lot of edge cases where this will fail, so not enabled by default
3252 //assert((ret == KERN_SUCCESS && events_out != -1) || ret == KERN_ABORTED);
3253
3254 // squash any errors into just empty output
3255 if (ret != KERN_SUCCESS || events_out == -1){
3256 events_out = 0;
3257 kevent_data_available = KEVENT_DATA_SIZE;
3258 }
3259
3260 // We shouldn't get data out if there aren't events available
3261 assert(events_out != 0 || kevent_data_available == KEVENT_DATA_SIZE);
3262
3263 if (events_out > 0){
3264 if (kevent_data_available == KEVENT_DATA_SIZE){
3265 stack_top_addr = (kevent_list - stack_gap_min) & -stack_align_min;
3266 } else {
3267 stack_top_addr = (kevent_data_buf + kevent_data_available - stack_gap_min) & -stack_align_min;
3268 }
3269
3270 kevent_count = events_out;
3271 } else {
3272 kevent_list = NULL;
3273 kevent_count = 0;
3274 }
3275 }
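/*
 * Illustrative note on the kevent carve-out above (an interpretation of the
 * arithmetic, not authoritative): the list of up to KEVENT_LIST_LEN events sits
 * immediately below the pthread_t, with the KEVENT_DATA_SIZE (32KB) out-of-line
 * data window below that. If, say, kevent_qos_internal() returns events_out == 3
 * and leaves kevent_data_available == KEVENT_DATA_SIZE - 256, the 256 bytes of
 * event data occupy the top of that window and the thread's initial stack top is
 * re-aligned to just below kevent_data_buf + (KEVENT_DATA_SIZE - 256); with no
 * data used, the stack top lands just below kevent_list instead.
 */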
3276
3277 #if defined(__i386__) || defined(__x86_64__)
3278 if (proc_is64bit(p) == 0) {
3279 x86_thread_state32_t state = {
3280 .eip = (unsigned int)wqstart_fnptr,
3281 .eax = /* arg0 */ (unsigned int)pthread_self_addr,
3282 .ebx = /* arg1 */ (unsigned int)tl->th_thport,
3283 .ecx = /* arg2 */ (unsigned int)stack_bottom_addr,
3284 .edx = /* arg3 */ (unsigned int)kevent_list,
3285 .edi = /* arg4 */ (unsigned int)upcall_flags,
3286 .esi = /* arg5 */ (unsigned int)kevent_count,
3287
3288 .esp = (int)((vm_offset_t)stack_top_addr),
3289 };
3290
3291 error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
3292 if (error != KERN_SUCCESS) {
3293 panic("%s: thread_set_wq_state failed: %d", __func__, error);
3294 }
3295 } else {
3296 x86_thread_state64_t state64 = {
3297 // x86-64 already passes all the arguments in registers, so we just put them in their final place here
3298 .rip = (uint64_t)wqstart_fnptr,
3299 .rdi = (uint64_t)pthread_self_addr,
3300 .rsi = (uint64_t)tl->th_thport,
3301 .rdx = (uint64_t)stack_bottom_addr,
3302 .rcx = (uint64_t)kevent_list,
3303 .r8 = (uint64_t)upcall_flags,
3304 .r9 = (uint64_t)kevent_count,
3305
3306 .rsp = (uint64_t)(stack_top_addr)
3307 };
3308
3309 error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
3310 if (error != KERN_SUCCESS) {
3311 panic("%s: thread_set_wq_state failed: %d", __func__, error);
3312 }
3313 }
3314 #else
3315 #error setup_wqthread not defined for this architecture
3316 #endif
3317 }
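/*
 * Illustrative summary of the register setup above: on x86-64 the six arguments
 * of the entry point documented before _setup_wqthread(),
 *
 *   _pthread_wqthread(self, kport, stackaddr, keventlist, upcall_flags, nkevents)
 *
 * are pre-loaded straight into the System V AMD64 argument registers
 * %rdi, %rsi, %rdx, %rcx, %r8 and %r9 respectively, with %rip pointing at
 * wqstart_fnptr and %rsp at stack_top_addr, so resuming the thread looks like an
 * ordinary call into user space. The 32-bit path instead parks the values in
 * scratch registers and relies on the stubs in pthread_asm.s to build the stack
 * frame the IA-32 calling convention expects.
 */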
3318
3319 #if DEBUG
3320 static int wq_kevent_test SYSCTL_HANDLER_ARGS {
3321 //(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
3322 #pragma unused(oidp, arg1, arg2)
3323 int error;
3324 struct workq_reqthreads_req_s requests[64] = {};
3325
3326 if (req->newlen > sizeof(requests) || req->newlen < sizeof(struct workq_reqthreads_req_s))
3327 return EINVAL;
3328
3329 error = copyin(req->newptr, requests, req->newlen);
3330 if (error) return error;
3331
3332 _workq_reqthreads(req->p, (int)(req->newlen / sizeof(struct workq_reqthreads_req_s)), requests);
3333
3334 return 0;
3335 }
3336 #endif // DEBUG
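/*
 * Illustrative user-space sketch (not part of the original source): the DEBUG
 * handler above consumes an array of workq_reqthreads_req_s written through the
 * sysctl's new-value buffer. Assuming the OID registered below surfaces as
 * "debug.wq_kevent_test" (inferred from the symbol name, not documented here),
 * and that the struct definition comes from the project's private workqueue
 * header, a test harness could drive it roughly like this:
 */
#if 0
#include <sys/sysctl.h>

static int
drive_wq_kevent_test(struct workq_reqthreads_req_s *reqs, size_t nreqs)
{
	/* write-only sysctl: pass the requests as the new value, read nothing back */
	return sysctlbyname("debug.wq_kevent_test", NULL, NULL,
	    reqs, nreqs * sizeof(*reqs));
}
#endif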
3337
3338 #pragma mark - Misc
3339
3340 int
3341 _fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
3342 {
3343 struct workqueue * wq;
3344 int error = 0;
3345 int activecount;
3346 uint32_t pri;
3347
3348 if ((wq = pthread_kern->proc_get_wqptr(p)) == NULL) {
3349 return EINVAL;
3350 }
3351
3352 workqueue_lock_spin(wq);
3353 activecount = 0;
3354
3355 for (pri = 0; pri < WORKQUEUE_NUM_BUCKETS; pri++) {
3356 activecount += wq->wq_thactive_count[pri];
3357 }
3358 pwqinfo->pwq_nthreads = wq->wq_nthreads;
3359 pwqinfo->pwq_runthreads = activecount;
3360 pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
3361 pwqinfo->pwq_state = 0;
3362
3363 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
3364 pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
3365 }
3366
3367 if (wq->wq_nthreads >= wq_max_threads) {
3368 pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
3369 }
3370
3371 workqueue_unlock(wq);
3372 return(error);
3373 }
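/*
 * Illustrative user-space sketch (not part of the original source): the structure
 * filled in above is surfaced through proc_pidinfo()'s PROC_PIDWORKQUEUEINFO
 * flavor from <sys/proc_info.h>, so a minimal consumer might look like this
 * (error handling trimmed):
 */
#if 0
#include <sys/types.h>
#include <sys/proc_info.h>
#include <libproc.h>
#include <stdio.h>

static void
print_workqueue_info(pid_t pid)
{
	struct proc_workqueueinfo pwqinfo;
	int ret = proc_pidinfo(pid, PROC_PIDWORKQUEUEINFO, 0,
	    &pwqinfo, sizeof(pwqinfo));
	if (ret == (int)sizeof(pwqinfo)) {
		printf("threads=%u running=%u blocked=%u state=0x%x\n",
		    pwqinfo.pwq_nthreads, pwqinfo.pwq_runthreads,
		    pwqinfo.pwq_blockedthreads, pwqinfo.pwq_state);
	}
}
#endif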
3374
3375 uint32_t
3376 _get_pwq_state_kdp(proc_t p)
3377 {
3378 if (p == NULL) {
3379 return 0;
3380 }
3381
3382 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
3383
3384 if (wq == NULL || workqueue_lock_spin_is_acquired_kdp(wq)) {
3385 return 0;
3386 }
3387
3388 uint32_t pwq_state = WQ_FLAGS_AVAILABLE;
3389
3390 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
3391 pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
3392 }
3393
3394 if (wq->wq_nthreads >= wq_max_threads) {
3395 pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
3396 }
3397
3398 return pwq_state;
3399 }
3400
3401 int
3402 _thread_selfid(__unused struct proc *p, uint64_t *retval)
3403 {
3404 thread_t thread = current_thread();
3405 *retval = thread_tid(thread);
3406 return KERN_SUCCESS;
3407 }
3408
3409 void
3410 _pthread_init(void)
3411 {
3412 pthread_lck_grp_attr = lck_grp_attr_alloc_init();
3413 pthread_lck_grp = lck_grp_alloc_init("pthread", pthread_lck_grp_attr);
3414
3415 /*
3416 * allocate the lock attribute for pthread synchronizers
3417 */
3418 pthread_lck_attr = lck_attr_alloc_init();
3419
3420 pthread_list_mlock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);
3421
3422 pth_global_hashinit();
3423 psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL);
3424 psynch_zoneinit();
3425
3426 /*
3427 * register sysctls
3428 */
3429 sysctl_register_oid(&sysctl__kern_wq_yielded_threshold);
3430 sysctl_register_oid(&sysctl__kern_wq_yielded_window_usecs);
3431 sysctl_register_oid(&sysctl__kern_wq_stalled_window_usecs);
3432 sysctl_register_oid(&sysctl__kern_wq_reduce_pool_window_usecs);
3433 sysctl_register_oid(&sysctl__kern_wq_max_timer_interval_usecs);
3434 sysctl_register_oid(&sysctl__kern_wq_max_threads);
3435 sysctl_register_oid(&sysctl__kern_wq_max_constrained_threads);
3436 sysctl_register_oid(&sysctl__kern_pthread_debug_tracing);
3437
3438 #if DEBUG
3439 sysctl_register_oid(&sysctl__kern_wq_max_concurrency);
3440 sysctl_register_oid(&sysctl__debug_wq_kevent_test);
3441 #endif
3442
3443 wq_max_concurrency = pthread_kern->ml_get_max_cpus();
3444
3445 }
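/*
 * Illustrative user-space sketch (not part of the original source): the OIDs
 * registered above surface as ordinary sysctls. Assuming the conventional
 * symbol-to-name mapping (sysctl__kern_wq_max_threads -> "kern.wq_max_threads";
 * an inference, not documented here), one of the knobs can be read like this:
 */
#if 0
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

static void
print_wq_max_threads(void)
{
	uint32_t max_threads = 0;
	size_t len = sizeof(max_threads);
	if (sysctlbyname("kern.wq_max_threads", &max_threads, &len, NULL, 0) == 0) {
		printf("kern.wq_max_threads = %u\n", max_threads);
	}
}
#endif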