[apple/libpthread.git] / kern / kern_support.c
1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995-2005 Apple Computer, Inc. All Rights Reserved */
29 /*
30 * pthread_synch.c
31 */
32
33 #pragma mark - Front Matter
34
35 #define _PTHREAD_CONDATTR_T
36 #define _PTHREAD_COND_T
37 #define _PTHREAD_MUTEXATTR_T
38 #define _PTHREAD_MUTEX_T
39 #define _PTHREAD_RWLOCKATTR_T
40 #define _PTHREAD_RWLOCK_T
41
42 #undef pthread_mutexattr_t
43 #undef pthread_mutex_t
44 #undef pthread_condattr_t
45 #undef pthread_cond_t
46 #undef pthread_rwlockattr_t
47 #undef pthread_rwlock_t
48
49 #include <sys/cdefs.h>
50 #include <os/log.h>
51
52 // <rdar://problem/26158937> panic() should be marked noreturn
53 extern void panic(const char *string, ...) __printflike(1,2) __dead2;
54
55 #include <sys/param.h>
56 #include <sys/queue.h>
57 #include <sys/resourcevar.h>
58 //#include <sys/proc_internal.h>
59 #include <sys/kauth.h>
60 #include <sys/systm.h>
61 #include <sys/timeb.h>
62 #include <sys/times.h>
63 #include <sys/acct.h>
64 #include <sys/kernel.h>
65 #include <sys/wait.h>
66 #include <sys/signalvar.h>
67 #include <sys/sysctl.h>
68 #include <sys/syslog.h>
69 #include <sys/stat.h>
70 #include <sys/lock.h>
71 #include <sys/kdebug.h>
72 //#include <sys/sysproto.h>
73 #include <sys/vm.h>
74 #include <sys/user.h> /* for coredump */
75 #include <sys/proc_info.h> /* for fill_procworkqueue */
76
77 #include <mach/mach_port.h>
78 #include <mach/mach_types.h>
79 #include <mach/semaphore.h>
80 #include <mach/sync_policy.h>
81 #include <mach/task.h>
82 #include <mach/vm_prot.h>
83 #include <kern/kern_types.h>
84 #include <kern/task.h>
85 #include <kern/clock.h>
86 #include <mach/kern_return.h>
87 #include <kern/thread.h>
88 #include <kern/zalloc.h>
89 #include <kern/sched_prim.h> /* for thread_exception_return */
90 #include <kern/processor.h>
91 #include <kern/assert.h>
92 #include <mach/mach_vm.h>
93 #include <mach/mach_param.h>
94 #include <mach/thread_status.h>
95 #include <mach/thread_policy.h>
96 #include <mach/message.h>
97 #include <mach/port.h>
98 //#include <vm/vm_protos.h>
99 #include <vm/vm_fault.h>
100 #include <vm/vm_map.h>
101 #include <mach/thread_act.h> /* for thread_resume */
102 #include <machine/machine_routines.h>
103 #include <mach/shared_region.h>
104
105 #include <libkern/OSAtomic.h>
106 #include <libkern/libkern.h>
107
108 #include <sys/pthread_shims.h>
109 #include "kern_internal.h"
110
111 // XXX: Dirty import for sys/signalvar.h that's wrapped in BSD_KERNEL_PRIVATE
112 #define sigcantmask (sigmask(SIGKILL) | sigmask(SIGSTOP))
113
114 // XXX: Ditto for thread tags from kern/thread.h
115 #define THREAD_TAG_MAINTHREAD 0x1
116 #define THREAD_TAG_PTHREAD 0x10
117 #define THREAD_TAG_WORKQUEUE 0x20
118
119 lck_grp_attr_t *pthread_lck_grp_attr;
120 lck_grp_t *pthread_lck_grp;
121 lck_attr_t *pthread_lck_attr;
122
123 zone_t pthread_zone_workqueue;
124 zone_t pthread_zone_threadlist;
125 zone_t pthread_zone_threadreq;
126
127 extern void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64);
128 extern void workqueue_thread_yielded(void);
129
130 #define WQ_SETUP_FIRST_USE 1
131 #define WQ_SETUP_CLEAR_VOUCHER 2
132 static void _setup_wqthread(proc_t p, thread_t th, struct workqueue *wq,
133 struct threadlist *tl, int flags);
134
135 static void reset_priority(struct threadlist *tl, pthread_priority_t pri);
136 static pthread_priority_t pthread_priority_from_wq_class_index(struct workqueue *wq, int index);
137
138 static void wq_unpark_continue(void* ptr, wait_result_t wait_result) __dead2;
139
140 static bool workqueue_addnewthread(proc_t p, struct workqueue *wq);
141 static void workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use);
142 static void workqueue_lock_spin(struct workqueue *);
143 static void workqueue_unlock(struct workqueue *);
144
145 #define WQ_RUN_TR_THROTTLED 0
146 #define WQ_RUN_TR_THREAD_NEEDED 1
147 #define WQ_RUN_TR_THREAD_STARTED 2
148 #define WQ_RUN_TR_EXITING 3
149 static int workqueue_run_threadreq_and_unlock(proc_t p, struct workqueue *wq,
150 struct threadlist *tl, struct threadreq *req, bool may_add_new_thread);
151
152 static bool may_start_constrained_thread(struct workqueue *wq,
153 uint32_t at_priclass, struct threadlist *tl, bool may_start_timer);
154
155 static mach_vm_offset_t stack_addr_hint(proc_t p, vm_map_t vmap);
156 static boolean_t wq_thread_is_busy(uint64_t cur_ts,
157 _Atomic uint64_t *lastblocked_tsp);
158
159 int proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc);
160 int proc_setalltargetconc(pid_t pid, int32_t * targetconcp);
161
162 #define WQ_MAXPRI_MIN 0 /* low prio queue num */
163 #define WQ_MAXPRI_MAX 2 /* max prio queuenum */
164 #define WQ_PRI_NUM 3 /* number of prio work queues */
165
166 #define C_32_STK_ALIGN 16
167 #define C_64_STK_ALIGN 16
168 #define C_64_REDZONE_LEN 128
169
170 #define PTHREAD_T_OFFSET 0
171
172 /*
173 * Flags field passed to bsdthread_create and back in pthread_start
174 31 <---------------------------------> 0
175 _________________________________________
176 | flags(8) | policy(8) | importance(16) |
177 -----------------------------------------
178 */
179
180 #define PTHREAD_START_CUSTOM 0x01000000
181 #define PTHREAD_START_SETSCHED 0x02000000
182 #define PTHREAD_START_DETACHED 0x04000000
183 #define PTHREAD_START_QOSCLASS 0x08000000
184 #define PTHREAD_START_TSD_BASE_SET 0x10000000
185 #define PTHREAD_START_QOSCLASS_MASK 0x00ffffff
186 #define PTHREAD_START_POLICY_BITSHIFT 16
187 #define PTHREAD_START_POLICY_MASK 0xff
188 #define PTHREAD_START_IMPORTANCE_MASK 0xffff
189
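/*
 * Illustrative sketch (not part of the original source): how a flags word
 * built with PTHREAD_START_SETSCHED decomposes, using the masks above.
 * _example_decode_start_flags is a hypothetical helper for exposition only;
 * _bsdthread_create performs the same extraction inline.
 */
#if 0
static inline void
_example_decode_start_flags(uint32_t flags, unsigned int *policy,
		unsigned int *importance)
{
	/* the low 16 bits carry the importance, the next 8 bits the policy */
	*importance = flags & PTHREAD_START_IMPORTANCE_MASK;
	*policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;
	/* the top 8 bits are the PTHREAD_START_* behavior flags themselves */
}
#endif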
190 #define SCHED_OTHER POLICY_TIMESHARE
191 #define SCHED_FIFO POLICY_FIFO
192 #define SCHED_RR POLICY_RR
193
194 #define BASEPRI_DEFAULT 31
195
196 #pragma mark sysctls
197
198 static uint32_t wq_stalled_window_usecs = WQ_STALLED_WINDOW_USECS;
199 static uint32_t wq_reduce_pool_window_usecs = WQ_REDUCE_POOL_WINDOW_USECS;
200 static uint32_t wq_max_timer_interval_usecs = WQ_MAX_TIMER_INTERVAL_USECS;
201 static uint32_t wq_max_threads = WORKQUEUE_MAXTHREADS;
202 static uint32_t wq_max_constrained_threads = WORKQUEUE_MAXTHREADS / 8;
203 static uint32_t wq_max_concurrency[WORKQUEUE_NUM_BUCKETS + 1]; // set to ncpus on load
204
205 SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
206 &wq_stalled_window_usecs, 0, "");
207
208 SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
209 &wq_reduce_pool_window_usecs, 0, "");
210
211 SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
212 &wq_max_timer_interval_usecs, 0, "");
213
214 SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
215 &wq_max_threads, 0, "");
216
217 SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
218 &wq_max_constrained_threads, 0, "");
219
220 #ifdef DEBUG
221 static int wq_kevent_test SYSCTL_HANDLER_ARGS;
222 SYSCTL_PROC(_debug, OID_AUTO, wq_kevent_test, CTLFLAG_MASKED | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLTYPE_OPAQUE, NULL, 0, wq_kevent_test, 0, "-");
223 #endif
224
225 static uint32_t wq_init_constrained_limit = 1;
226
227 uint32_t pthread_debug_tracing = 1;
228
229 SYSCTL_INT(_kern, OID_AUTO, pthread_debug_tracing, CTLFLAG_RW | CTLFLAG_LOCKED,
230 &pthread_debug_tracing, 0, "");
231
232 static uint32_t pthread_mutex_default_policy;
233
234 SYSCTL_INT(_kern, OID_AUTO, pthread_mutex_default_policy, CTLFLAG_RW | CTLFLAG_LOCKED,
235 &pthread_mutex_default_policy, 0, "");
236
237 /*
238 * +-----+-----+-----+-----+-----+-----+-----+
239 * | MT | BG | UT | DE | IN | UN | mgr |
240 * +-----+-----+-----+-----+-----+-----+-----+-----+
241 * | pri | 5 | 4 | 3 | 2 | 1 | 0 | 6 |
242 * | qos | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
243 * +-----+-----+-----+-----+-----+-----+-----+-----+
244 */
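/*
 * Worked example (illustrative, derived from the table above): a regular
 * bucket pri maps to thread QoS WORKQUEUE_EVENT_MANAGER_BUCKET - pri
 * (bucket 0 -> QoS 6, bucket 5 -> QoS 1), and the event manager bucket maps
 * to WORKQUEUE_EVENT_MANAGER_BUCKET + 1, i.e. thread QoS 7.
 */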
245 static inline uint32_t
246 _wq_bucket_to_thread_qos(int pri)
247 {
248 if (pri == WORKQUEUE_EVENT_MANAGER_BUCKET) {
249 return WORKQUEUE_EVENT_MANAGER_BUCKET + 1;
250 }
251 return WORKQUEUE_EVENT_MANAGER_BUCKET - pri;
252 }
253
254 #pragma mark wq_thactive
255
256 #if defined(__LP64__)
257 // Layout is:
258 // 7 * 16 bits for each QoS bucket request count (including manager)
259 // 3 bits of best QoS among all pending constrained requests
260 // 13 bits of zeroes
261 #define WQ_THACTIVE_BUCKET_WIDTH 16
262 #define WQ_THACTIVE_QOS_SHIFT (7 * WQ_THACTIVE_BUCKET_WIDTH)
263 #else
264 // Layout is:
265 // 6 * 10 bits for each QoS bucket request count (except manager)
266 // 1 bit for the manager bucket
267 // 3 bits of best QoS among all pending constrained requests
268 #define WQ_THACTIVE_BUCKET_WIDTH 10
269 #define WQ_THACTIVE_QOS_SHIFT (6 * WQ_THACTIVE_BUCKET_WIDTH + 1)
270 #endif
271 #define WQ_THACTIVE_BUCKET_MASK ((1U << WQ_THACTIVE_BUCKET_WIDTH) - 1)
272 #define WQ_THACTIVE_BUCKET_HALF (1U << (WQ_THACTIVE_BUCKET_WIDTH - 1))
273 #define WQ_THACTIVE_NO_PENDING_REQUEST 6
274
275 _Static_assert(sizeof(wq_thactive_t) * CHAR_BIT - WQ_THACTIVE_QOS_SHIFT >= 3,
276 "Make sure we have space to encode a QoS");
277
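/*
 * Illustrative sketch (not part of the original source): extracting a single
 * regular bucket count from a wq_thactive_t snapshot, per the layout above
 * (on 32-bit the manager bucket is only one bit wide, see the comment).
 * _example_bucket_count is a hypothetical helper; the accessors below do the
 * real work with atomics and aggregation.
 */
#if 0
static inline uint32_t
_example_bucket_count(wq_thactive_t v, int bucket)
{
	/* bucket i occupies WQ_THACTIVE_BUCKET_WIDTH bits starting at bit
	 * i * WQ_THACTIVE_BUCKET_WIDTH */
	return (uint32_t)(v >> (bucket * WQ_THACTIVE_BUCKET_WIDTH)) &
			WQ_THACTIVE_BUCKET_MASK;
}
#endif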
278 static inline wq_thactive_t
279 _wq_thactive_fetch_and_add(struct workqueue *wq, wq_thactive_t offset)
280 {
281 #if PTHREAD_INLINE_RMW_ATOMICS || !defined(__LP64__)
282 return atomic_fetch_add_explicit(&wq->wq_thactive, offset,
283 memory_order_relaxed);
284 #else
285 return pthread_kern->atomic_fetch_add_128_relaxed(&wq->wq_thactive, offset);
286 #endif
287 }
288
289 static inline wq_thactive_t
290 _wq_thactive(struct workqueue *wq)
291 {
292 #if PTHREAD_INLINE_RMW_ATOMICS || !defined(__LP64__)
293 return atomic_load_explicit(&wq->wq_thactive, memory_order_relaxed);
294 #else
295 return pthread_kern->atomic_load_128_relaxed(&wq->wq_thactive);
296 #endif
297 }
298
299 #define WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(tha) \
300 ((tha) >> WQ_THACTIVE_QOS_SHIFT)
301
302 static inline uint32_t
303 _wq_thactive_best_constrained_req_qos(struct workqueue *wq)
304 {
305 // Avoid expensive atomic operations: the three bits we're loading are in
306 // a single byte, and always updated under the workqueue lock
307 wq_thactive_t v = *(wq_thactive_t *)&wq->wq_thactive;
308 return WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(v);
309 }
310
311 static inline wq_thactive_t
312 _wq_thactive_set_best_constrained_req_qos(struct workqueue *wq,
313 uint32_t orig_qos, uint32_t new_qos)
314 {
315 wq_thactive_t v;
316 v = (wq_thactive_t)(new_qos - orig_qos) << WQ_THACTIVE_QOS_SHIFT;
317 /*
318 * We can do an atomic add relative to the initial load because updates
319 * to this qos are always serialized under the workqueue lock.
320 */
321 return _wq_thactive_fetch_and_add(wq, v) + v;
322 }
323
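/*
 * Worked example (illustrative): moving the recorded best constrained QoS
 * from q0 to q1 adds (q1 - q0) << WQ_THACTIVE_QOS_SHIFT.  Since that field
 * sits above all the per-bucket counters, the add cannot disturb them, and
 * the workqueue lock keeps concurrent updates to the field serialized.
 */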
324 static inline wq_thactive_t
325 _wq_thactive_offset_for_qos(int qos)
326 {
327 return (wq_thactive_t)1 << (qos * WQ_THACTIVE_BUCKET_WIDTH);
328 }
329
330 static inline wq_thactive_t
331 _wq_thactive_inc(struct workqueue *wq, int qos)
332 {
333 return _wq_thactive_fetch_and_add(wq, _wq_thactive_offset_for_qos(qos));
334 }
335
336 static inline wq_thactive_t
337 _wq_thactive_dec(struct workqueue *wq, int qos)
338 {
339 return _wq_thactive_fetch_and_add(wq, -_wq_thactive_offset_for_qos(qos));
340 }
341
342 static inline wq_thactive_t
343 _wq_thactive_move(struct workqueue *wq, int oldqos, int newqos)
344 {
345 return _wq_thactive_fetch_and_add(wq, _wq_thactive_offset_for_qos(newqos) -
346 _wq_thactive_offset_for_qos(oldqos));
347 }
348
349 static inline uint32_t
350 _wq_thactive_aggregate_downto_qos(struct workqueue *wq, wq_thactive_t v,
351 int qos, uint32_t *busycount, uint32_t *max_busycount)
352 {
353 uint32_t count = 0, active;
354 uint64_t curtime;
355
356 #ifndef __LP64__
357 /*
358 * On 32-bit, the manager bucket is a single bit and the best constrained
359 * request QoS 3 bits are where the 10 bits of a regular QoS bucket count
360 * would be. Mask them out.
361 */
362 v &= ~(~0ull << WQ_THACTIVE_QOS_SHIFT);
363 #endif
364 if (busycount) {
365 curtime = mach_absolute_time();
366 *busycount = 0;
367 }
368 if (max_busycount) {
369 *max_busycount = qos + 1;
370 }
371 for (int i = 0; i <= qos; i++, v >>= WQ_THACTIVE_BUCKET_WIDTH) {
372 active = v & WQ_THACTIVE_BUCKET_MASK;
373 count += active;
374 if (busycount && wq->wq_thscheduled_count[i] > active) {
375 if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i])) {
376 /*
377 * We only consider the last blocked thread for a given bucket
378 * as busy because we don't want to take the list lock in each
379 * sched callback. However this is an approximation that could
380 * contribute to thread creation storms.
381 */
382 (*busycount)++;
383 }
384 }
385 }
386 return count;
387 }
388
389 #pragma mark - Process/Thread Setup/Teardown syscalls
390
391 static mach_vm_offset_t
392 stack_addr_hint(proc_t p, vm_map_t vmap)
393 {
394 mach_vm_offset_t stackaddr;
395 mach_vm_offset_t aslr_offset;
396 bool proc64bit = proc_is64bit(p);
397
398 // We can't safely take random values % something unless it's a power of two
399 _Static_assert(powerof2(PTH_DEFAULT_STACKSIZE), "PTH_DEFAULT_STACKSIZE is a power-of-two");
400
401 #if defined(__i386__) || defined(__x86_64__)
402 if (proc64bit) {
403 // Matches vm_map_get_max_aslr_slide_pages's image shift in xnu
404 aslr_offset = random() % (1 << 28); // about 512 stacks
405 } else {
406 // Actually bigger than the image shift, we've got ~256MB to work with
407 aslr_offset = random() % (16 * PTH_DEFAULT_STACKSIZE);
408 }
409 aslr_offset = vm_map_trunc_page_mask(aslr_offset, vm_map_page_mask(vmap));
410 if (proc64bit) {
411 // Above nanomalloc range (see NANOZONE_SIGNATURE)
412 stackaddr = 0x700000000000 + aslr_offset;
413 } else {
414 stackaddr = SHARED_REGION_BASE_I386 + SHARED_REGION_SIZE_I386 + aslr_offset;
415 }
416 #elif defined(__arm__) || defined(__arm64__)
417 user_addr_t main_thread_stack_top = 0;
418 if (pthread_kern->proc_get_user_stack) {
419 main_thread_stack_top = pthread_kern->proc_get_user_stack(p);
420 }
421 if (proc64bit && main_thread_stack_top) {
422 // The main thread stack position is randomly slid by xnu (c.f.
423 // load_main() in mach_loader.c), so basing pthread stack allocations
424 // where the main thread stack ends is already ASLRd and doing so
425 // avoids creating a gap in the process address space that may cause
426 // extra PTE memory usage. rdar://problem/33328206
427 stackaddr = vm_map_trunc_page_mask((vm_map_offset_t)main_thread_stack_top,
428 vm_map_page_mask(vmap));
429 } else {
430 // vm_map_get_max_aslr_slide_pages ensures 1MB of slide; we do better
431 aslr_offset = random() % ((proc64bit ? 4 : 2) * PTH_DEFAULT_STACKSIZE);
432 aslr_offset = vm_map_trunc_page_mask((vm_map_offset_t)aslr_offset,
433 vm_map_page_mask(vmap));
434 if (proc64bit) {
435 // 64 stacks below shared region
436 stackaddr = SHARED_REGION_BASE_ARM64 - 64 * PTH_DEFAULT_STACKSIZE - aslr_offset;
437 } else {
438 // If you try to slide down from this point, you risk ending up in memory consumed by malloc
439 stackaddr = SHARED_REGION_BASE_ARM - 32 * PTH_DEFAULT_STACKSIZE + aslr_offset;
440 }
441 }
442 #else
443 #error Need to define a stack address hint for this architecture
444 #endif
445 return stackaddr;
446 }
447
448 /**
449 * bsdthread_create system call. Used by pthread_create.
450 */
451 int
452 _bsdthread_create(struct proc *p, user_addr_t user_func, user_addr_t user_funcarg, user_addr_t user_stack, user_addr_t user_pthread, uint32_t flags, user_addr_t *retval)
453 {
454 kern_return_t kret;
455 void * sright;
456 int error = 0;
457 int allocated = 0;
458 mach_vm_offset_t stackaddr;
459 mach_vm_size_t th_allocsize = 0;
460 mach_vm_size_t th_guardsize;
461 mach_vm_offset_t th_stack;
462 mach_vm_offset_t th_pthread;
463 mach_vm_offset_t th_tsd_base;
464 mach_port_name_t th_thport;
465 thread_t th;
466 vm_map_t vmap = pthread_kern->current_map();
467 task_t ctask = current_task();
468 unsigned int policy, importance;
469 uint32_t tsd_offset;
470
471 int isLP64 = 0;
472
473 if (pthread_kern->proc_get_register(p) == 0) {
474 return EINVAL;
475 }
476
477 PTHREAD_TRACE(TRACE_pthread_thread_create | DBG_FUNC_START, flags, 0, 0, 0, 0);
478
479 isLP64 = proc_is64bit(p);
480 th_guardsize = vm_map_page_size(vmap);
481
482 stackaddr = pthread_kern->proc_get_stack_addr_hint(p);
483 kret = pthread_kern->thread_create(ctask, &th);
484 if (kret != KERN_SUCCESS)
485 return(ENOMEM);
486 thread_reference(th);
487
488 pthread_kern->thread_set_tag(th, THREAD_TAG_PTHREAD);
489
490 sright = (void *)pthread_kern->convert_thread_to_port(th);
491 th_thport = pthread_kern->ipc_port_copyout_send(sright, pthread_kern->task_get_ipcspace(ctask));
492 if (!MACH_PORT_VALID(th_thport)) {
493 error = EMFILE; // userland will convert this into a crash
494 goto out;
495 }
496
497 if ((flags & PTHREAD_START_CUSTOM) == 0) {
498 mach_vm_size_t pthread_size =
499 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(vmap));
500 th_allocsize = th_guardsize + user_stack + pthread_size;
501 user_stack += PTHREAD_T_OFFSET;
502
503 kret = mach_vm_map(vmap, &stackaddr,
504 th_allocsize,
505 page_size-1,
506 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE , NULL,
507 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
508 VM_INHERIT_DEFAULT);
509 if (kret != KERN_SUCCESS){
510 kret = mach_vm_allocate(vmap,
511 &stackaddr, th_allocsize,
512 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE);
513 }
514 if (kret != KERN_SUCCESS) {
515 error = ENOMEM;
516 goto out;
517 }
518
519 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, th_allocsize, stackaddr, 0, 2, 0);
520
521 allocated = 1;
522 /*
523 * The guard page is at the lowest address
524 * The stack base is the highest address
525 */
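/*
 * Illustrative layout of the allocation (derived from the sizes computed in
 * this block):
 *
 *   stackaddr                                th_stack == th_pthread
 *   | guard (th_guardsize) | stack (user_stack) | pthread_t + TSD (pthread_size) |
 *
 * The stack grows down from th_stack toward the guard page, and the
 * pthread_t with its TSD sits immediately above the stack.
 */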
526 kret = mach_vm_protect(vmap, stackaddr, th_guardsize, FALSE, VM_PROT_NONE);
527
528 if (kret != KERN_SUCCESS) {
529 error = ENOMEM;
530 goto out1;
531 }
532
533 th_pthread = stackaddr + th_guardsize + user_stack;
534 th_stack = th_pthread;
535
536 /*
537 * Pre-fault the first page of the new thread's stack and the page that will
538 * contain the pthread_t structure.
539 */
540 if (vm_map_trunc_page_mask((vm_map_offset_t)(th_stack - C_64_REDZONE_LEN), vm_map_page_mask(vmap)) !=
541 vm_map_trunc_page_mask((vm_map_offset_t)th_pthread, vm_map_page_mask(vmap))){
542 vm_fault( vmap,
543 vm_map_trunc_page_mask((vm_map_offset_t)(th_stack - C_64_REDZONE_LEN), vm_map_page_mask(vmap)),
544 VM_PROT_READ | VM_PROT_WRITE,
545 FALSE,
546 THREAD_UNINT, NULL, 0);
547 }
548
549 vm_fault( vmap,
550 vm_map_trunc_page_mask((vm_map_offset_t)th_pthread, vm_map_page_mask(vmap)),
551 VM_PROT_READ | VM_PROT_WRITE,
552 FALSE,
553 THREAD_UNINT, NULL, 0);
554
555 } else {
556 th_stack = user_stack;
557 th_pthread = user_pthread;
558
559 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, 0, 0, 0, 3, 0);
560 }
561
562 tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
563 if (tsd_offset) {
564 th_tsd_base = th_pthread + tsd_offset;
565 kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
566 if (kret == KERN_SUCCESS) {
567 flags |= PTHREAD_START_TSD_BASE_SET;
568 }
569 }
570
571 #if defined(__i386__) || defined(__x86_64__)
572 /*
573 * Set up i386 registers & function call.
574 */
575 if (isLP64 == 0) {
576 x86_thread_state32_t state = {
577 .eip = (unsigned int)pthread_kern->proc_get_threadstart(p),
578 .eax = (unsigned int)th_pthread,
579 .ebx = (unsigned int)th_thport,
580 .ecx = (unsigned int)user_func,
581 .edx = (unsigned int)user_funcarg,
582 .edi = (unsigned int)user_stack,
583 .esi = (unsigned int)flags,
584 /*
585 * set stack pointer
586 */
587 .esp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN))
588 };
589
590 error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
591 if (error != KERN_SUCCESS) {
592 error = EINVAL;
593 goto out;
594 }
595 } else {
596 x86_thread_state64_t state64 = {
597 .rip = (uint64_t)pthread_kern->proc_get_threadstart(p),
598 .rdi = (uint64_t)th_pthread,
599 .rsi = (uint64_t)(th_thport),
600 .rdx = (uint64_t)user_func,
601 .rcx = (uint64_t)user_funcarg,
602 .r8 = (uint64_t)user_stack,
603 .r9 = (uint64_t)flags,
604 /*
605 * set stack pointer aligned to 16 byte boundary
606 */
607 .rsp = (uint64_t)(th_stack - C_64_REDZONE_LEN)
608 };
609
610 error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
611 if (error != KERN_SUCCESS) {
612 error = EINVAL;
613 goto out;
614 }
615
616 }
617 #elif defined(__arm__)
618 arm_thread_state_t state = {
619 .pc = (int)pthread_kern->proc_get_threadstart(p),
620 .r[0] = (unsigned int)th_pthread,
621 .r[1] = (unsigned int)th_thport,
622 .r[2] = (unsigned int)user_func,
623 .r[3] = (unsigned int)user_funcarg,
624 .r[4] = (unsigned int)user_stack,
625 .r[5] = (unsigned int)flags,
626
627 /* Set r7 & lr to 0 for better back tracing */
628 .r[7] = 0,
629 .lr = 0,
630
631 /*
632 * set stack pointer
633 */
634 .sp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN))
635 };
636
637 (void) pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
638
639 #else
640 #error bsdthread_create not defined for this architecture
641 #endif
642
643 if ((flags & PTHREAD_START_SETSCHED) != 0) {
644 /* Set scheduling parameters if needed */
645 thread_extended_policy_data_t extinfo;
646 thread_precedence_policy_data_t precedinfo;
647
648 importance = (flags & PTHREAD_START_IMPORTANCE_MASK);
649 policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;
650
651 if (policy == SCHED_OTHER) {
652 extinfo.timeshare = 1;
653 } else {
654 extinfo.timeshare = 0;
655 }
656
657 thread_policy_set(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT);
658
659 precedinfo.importance = (importance - BASEPRI_DEFAULT);
660 thread_policy_set(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT);
661 } else if ((flags & PTHREAD_START_QOSCLASS) != 0) {
662 /* Set thread QoS class if requested. */
663 pthread_priority_t priority = (pthread_priority_t)(flags & PTHREAD_START_QOSCLASS_MASK);
664
665 thread_qos_policy_data_t qos;
666 qos.qos_tier = pthread_priority_get_thread_qos(priority);
667 qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 :
668 _pthread_priority_get_relpri(priority);
669
670 pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
671 }
672
673 if (pthread_kern->proc_get_mach_thread_self_tsd_offset) {
674 uint64_t mach_thread_self_offset =
675 pthread_kern->proc_get_mach_thread_self_tsd_offset(p);
676 if (mach_thread_self_offset && tsd_offset) {
677 bool proc64bit = proc_is64bit(p);
678 if (proc64bit) {
679 uint64_t th_thport_tsd = (uint64_t)th_thport;
680 error = copyout(&th_thport_tsd, th_pthread + tsd_offset +
681 mach_thread_self_offset, sizeof(th_thport_tsd));
682 } else {
683 uint32_t th_thport_tsd = (uint32_t)th_thport;
684 error = copyout(&th_thport_tsd, th_pthread + tsd_offset +
685 mach_thread_self_offset, sizeof(th_thport_tsd));
686 }
687 if (error) {
688 goto out1;
689 }
690 }
691 }
692
693 kret = pthread_kern->thread_resume(th);
694 if (kret != KERN_SUCCESS) {
695 error = EINVAL;
696 goto out1;
697 }
698 thread_deallocate(th); /* drop the creator reference */
699
700 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_END, error, th_pthread, 0, 0, 0);
701
702 // cast required as mach_vm_offset_t is always 64 bits even on 32-bit platforms
703 *retval = (user_addr_t)th_pthread;
704
705 return(0);
706
707 out1:
708 if (allocated != 0) {
709 (void)mach_vm_deallocate(vmap, stackaddr, th_allocsize);
710 }
711 out:
712 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(ctask), th_thport);
713 if (pthread_kern->thread_will_park_or_terminate) {
714 pthread_kern->thread_will_park_or_terminate(th);
715 }
716 (void)thread_terminate(th);
717 (void)thread_deallocate(th);
718 return(error);
719 }
720
721 /**
722 * bsdthread_terminate system call. Used by pthread_terminate.
723 */
724 int
725 _bsdthread_terminate(__unused struct proc *p,
726 user_addr_t stackaddr,
727 size_t size,
728 uint32_t kthport,
729 uint32_t sem,
730 __unused int32_t *retval)
731 {
732 mach_vm_offset_t freeaddr;
733 mach_vm_size_t freesize;
734 kern_return_t kret;
735 thread_t th = current_thread();
736
737 freeaddr = (mach_vm_offset_t)stackaddr;
738 freesize = size;
739
740 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_START, freeaddr, freesize, kthport, 0xff, 0);
741
742 if ((freesize != (mach_vm_size_t)0) && (freeaddr != (mach_vm_offset_t)0)) {
743 if (pthread_kern->thread_get_tag(th) & THREAD_TAG_MAINTHREAD){
744 vm_map_t user_map = pthread_kern->current_map();
745 freesize = vm_map_trunc_page_mask((vm_map_offset_t)freesize - 1, vm_map_page_mask(user_map));
746 kret = mach_vm_behavior_set(user_map, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
747 assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
748 kret = kret ? kret : mach_vm_protect(user_map, freeaddr, freesize, FALSE, VM_PROT_NONE);
749 assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
750 } else {
751 kret = mach_vm_deallocate(pthread_kern->current_map(), freeaddr, freesize);
752 if (kret != KERN_SUCCESS) {
753 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);
754 return(EINVAL);
755 }
756 }
757 }
758
759 if (pthread_kern->thread_will_park_or_terminate) {
760 pthread_kern->thread_will_park_or_terminate(th);
761 }
762 (void)thread_terminate(th);
763 if (sem != MACH_PORT_NULL) {
764 kret = pthread_kern->semaphore_signal_internal_trap(sem);
765 if (kret != KERN_SUCCESS) {
766 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);
767 return(EINVAL);
768 }
769 }
770
771 if (kthport != MACH_PORT_NULL) {
772 pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(current_task()), kthport);
773 }
774
775 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0, 0, 0, 0);
776
777 pthread_kern->thread_exception_return();
778 panic("bsdthread_terminate: still running\n");
779
780 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0xff, 0, 0, 0);
781
782 return(0);
783 }
784
785 /**
786 * bsdthread_register system call. Performs per-process setup. Responsible for
787 * returning capability bits to userspace and receiving userspace function addresses.
788 */
789 int
790 _bsdthread_register(struct proc *p,
791 user_addr_t threadstart,
792 user_addr_t wqthread,
793 int pthsize,
794 user_addr_t pthread_init_data,
795 user_addr_t pthread_init_data_size,
796 uint64_t dispatchqueue_offset,
797 int32_t *retval)
798 {
799 struct _pthread_registration_data data = {};
800 uint32_t max_tsd_offset;
801 kern_return_t kr;
802 size_t pthread_init_sz = 0;
803
804 /* syscall randomizer test can pass bogus values */
805 if (pthsize < 0 || pthsize > MAX_PTHREAD_SIZE) {
806 return(EINVAL);
807 }
808 /*
809 * If we have pthread_init_data, then we use that and target_concptr
810 * (which is an offset) to get the data.
811 */
812 if (pthread_init_data != 0) {
813 if (pthread_init_data_size < sizeof(data.version)) {
814 return EINVAL;
815 }
816 pthread_init_sz = MIN(sizeof(data), (size_t)pthread_init_data_size);
817 int ret = copyin(pthread_init_data, &data, pthread_init_sz);
818 if (ret) {
819 return ret;
820 }
821 if (data.version != (size_t)pthread_init_data_size) {
822 return EINVAL;
823 }
824 } else {
825 data.dispatch_queue_offset = dispatchqueue_offset;
826 }
827
828 /* We have to do this before proc_get_register so that it resets after fork */
829 mach_vm_offset_t stackaddr = stack_addr_hint(p, pthread_kern->current_map());
830 pthread_kern->proc_set_stack_addr_hint(p, (user_addr_t)stackaddr);
831
832 /* prevent multiple registrations */
833 if (pthread_kern->proc_get_register(p) != 0) {
834 return(EINVAL);
835 }
836
837 pthread_kern->proc_set_threadstart(p, threadstart);
838 pthread_kern->proc_set_wqthread(p, wqthread);
839 pthread_kern->proc_set_pthsize(p, pthsize);
840 pthread_kern->proc_set_register(p);
841
842 uint32_t tsd_slot_sz = proc_is64bit(p) ? sizeof(uint64_t) : sizeof(uint32_t);
843 if ((uint32_t)pthsize >= tsd_slot_sz &&
844 data.tsd_offset <= (uint32_t)(pthsize - tsd_slot_sz)) {
845 max_tsd_offset = ((uint32_t)pthsize - data.tsd_offset - tsd_slot_sz);
846 } else {
847 data.tsd_offset = 0;
848 max_tsd_offset = 0;
849 }
850 pthread_kern->proc_set_pthread_tsd_offset(p, data.tsd_offset);
851
852 if (data.dispatch_queue_offset > max_tsd_offset) {
853 data.dispatch_queue_offset = 0;
854 }
855 pthread_kern->proc_set_dispatchqueue_offset(p, data.dispatch_queue_offset);
856
857 if (pthread_kern->proc_set_return_to_kernel_offset) {
858 if (data.return_to_kernel_offset > max_tsd_offset) {
859 data.return_to_kernel_offset = 0;
860 }
861 pthread_kern->proc_set_return_to_kernel_offset(p,
862 data.return_to_kernel_offset);
863 }
864
865 if (pthread_kern->proc_set_mach_thread_self_tsd_offset) {
866 if (data.mach_thread_self_offset > max_tsd_offset) {
867 data.mach_thread_self_offset = 0;
868 }
869 pthread_kern->proc_set_mach_thread_self_tsd_offset(p,
870 data.mach_thread_self_offset);
871 }
872
873 if (pthread_init_data != 0) {
874 /* Outgoing data that userspace expects as a reply */
875 data.version = sizeof(struct _pthread_registration_data);
876 if (pthread_kern->qos_main_thread_active()) {
877 mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
878 thread_qos_policy_data_t qos;
879 boolean_t gd = FALSE;
880
881 kr = pthread_kern->thread_policy_get(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
882 if (kr != KERN_SUCCESS || qos.qos_tier == THREAD_QOS_UNSPECIFIED) {
883 /* An unspecified QoS means the kernel wants us to impose the legacy QoS class upon the thread. */
884 qos.qos_tier = THREAD_QOS_LEGACY;
885 qos.tier_importance = 0;
886
887 kr = pthread_kern->thread_policy_set_internal(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
888 }
889
890 if (kr == KERN_SUCCESS) {
891 data.main_qos = thread_qos_get_pthread_priority(qos.qos_tier);
892 } else {
893 data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);
894 }
895 } else {
896 data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);
897 }
898
899 data.mutex_default_policy = pthread_mutex_default_policy;
900
901 kr = copyout(&data, pthread_init_data, pthread_init_sz);
902 if (kr != KERN_SUCCESS) {
903 return EINVAL;
904 }
905 }
906
907 /* return the supported feature set as the return value. */
908 *retval = PTHREAD_FEATURE_SUPPORTED;
909
910 return(0);
911 }
912
913 #pragma mark - QoS Manipulation
914
915 int
916 _bsdthread_ctl_set_qos(struct proc *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t tsd_priority_addr, user_addr_t arg3, int *retval)
917 {
918 int rv;
919 thread_t th;
920
921 pthread_priority_t priority;
922
923 /* Unused parameters must be zero. */
924 if (arg3 != 0) {
925 return EINVAL;
926 }
927
928 /* QoS is stored in a given slot in the pthread TSD. We need to copy that in and set our QoS based on it. */
929 if (proc_is64bit(p)) {
930 uint64_t v;
931 rv = copyin(tsd_priority_addr, &v, sizeof(v));
932 if (rv) goto out;
933 priority = (int)(v & 0xffffffff);
934 } else {
935 uint32_t v;
936 rv = copyin(tsd_priority_addr, &v, sizeof(v));
937 if (rv) goto out;
938 priority = v;
939 }
940
941 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
942 return ESRCH;
943 }
944
945 /* <rdar://problem/16211829> Disable pthread_set_qos_class_np() on threads other than pthread_self */
946 if (th != current_thread()) {
947 thread_deallocate(th);
948 return EPERM;
949 }
950
951 rv = _bsdthread_ctl_set_self(p, 0, priority, 0, _PTHREAD_SET_SELF_QOS_FLAG, retval);
952
953 /* Static param the thread; we just set QoS on it, so it's stuck in QoS land now. */
954 /* pthread_kern->thread_static_param(th, TRUE); */ // see <rdar://problem/16433744> for details
955
956 thread_deallocate(th);
957
958 out:
959 return rv;
960 }
961
962 static inline struct threadlist *
963 util_get_thread_threadlist_entry(thread_t th)
964 {
965 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
966 if (uth) {
967 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
968 return tl;
969 }
970 return NULL;
971 }
972
973 boolean_t
974 _workq_thread_has_been_unbound(thread_t th, int qos_class)
975 {
976 struct threadlist *tl = util_get_thread_threadlist_entry(th);
977 if (!tl) {
978 return FALSE;
979 }
980
981 struct workqueue *wq = tl->th_workq;
982 workqueue_lock_spin(wq);
983
984 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
985 goto failure;
986 } else if (qos_class != class_index_get_thread_qos(tl->th_priority)) {
987 goto failure;
988 }
989
990 if ((tl->th_flags & TH_LIST_KEVENT_BOUND)){
991 goto failure;
992 }
993 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
994
995 workqueue_unlock(wq);
996 return TRUE;
997
998 failure:
999 workqueue_unlock(wq);
1000 return FALSE;
1001 }
1002
1003 int
1004 _bsdthread_ctl_set_self(struct proc *p, user_addr_t __unused cmd, pthread_priority_t priority, mach_port_name_t voucher, _pthread_set_flags_t flags, int __unused *retval)
1005 {
1006 thread_qos_policy_data_t qos;
1007 mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
1008 boolean_t gd = FALSE;
1009 thread_t th = current_thread();
1010 struct workqueue *wq = NULL;
1011 struct threadlist *tl = NULL;
1012
1013 kern_return_t kr;
1014 int qos_rv = 0, voucher_rv = 0, fixedpri_rv = 0;
1015
1016 if ((flags & _PTHREAD_SET_SELF_WQ_KEVENT_UNBIND) != 0) {
1017 tl = util_get_thread_threadlist_entry(th);
1018 if (tl) {
1019 wq = tl->th_workq;
1020 } else {
1021 goto qos;
1022 }
1023
1024 workqueue_lock_spin(wq);
1025 if (tl->th_flags & TH_LIST_KEVENT_BOUND) {
1026 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
1027 unsigned int kevent_flags = KEVENT_FLAG_WORKQ | KEVENT_FLAG_UNBIND_CHECK_FLAGS;
1028 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1029 kevent_flags |= KEVENT_FLAG_WORKQ_MANAGER;
1030 }
1031
1032 workqueue_unlock(wq);
1033 __assert_only int ret = kevent_qos_internal_unbind(p, class_index_get_thread_qos(tl->th_priority), th, kevent_flags);
1034 assert(ret == 0);
1035 } else {
1036 workqueue_unlock(wq);
1037 }
1038 }
1039
1040 qos:
1041 if ((flags & _PTHREAD_SET_SELF_QOS_FLAG) != 0) {
1042 kr = pthread_kern->thread_policy_get(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
1043 if (kr != KERN_SUCCESS) {
1044 qos_rv = EINVAL;
1045 goto voucher;
1046 }
1047
1048 /*
1049 * If we have main-thread QoS then we don't allow a thread to come out
1050 * of QOS_CLASS_UNSPECIFIED.
1051 */
1052 if (pthread_kern->qos_main_thread_active() && qos.qos_tier ==
1053 THREAD_QOS_UNSPECIFIED) {
1054 qos_rv = EPERM;
1055 goto voucher;
1056 }
1057
1058 if (!tl) {
1059 tl = util_get_thread_threadlist_entry(th);
1060 if (tl) wq = tl->th_workq;
1061 }
1062
1063 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self | DBG_FUNC_START, wq, qos.qos_tier, qos.tier_importance, 0, 0);
1064
1065 qos.qos_tier = pthread_priority_get_thread_qos(priority);
1066 qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 : _pthread_priority_get_relpri(priority);
1067
1068 if (qos.qos_tier == QOS_CLASS_UNSPECIFIED ||
1069 qos.tier_importance > 0 || qos.tier_importance < THREAD_QOS_MIN_TIER_IMPORTANCE) {
1070 qos_rv = EINVAL;
1071 goto voucher;
1072 }
1073
1074 /*
1075 * If we're a workqueue thread, the threadlist item priority needs adjusting,
1076 * along with the bucket we were running in.
1077 */
1078 if (tl) {
1079 bool try_run_threadreq = false;
1080
1081 workqueue_lock_spin(wq);
1082 kr = pthread_kern->thread_set_workq_qos(th, qos.qos_tier, qos.tier_importance);
1083 assert(kr == KERN_SUCCESS || kr == KERN_TERMINATED);
1084
1085 /* Fix up counters. */
1086 uint8_t old_bucket = tl->th_priority;
1087 uint8_t new_bucket = pthread_priority_get_class_index(priority);
1088
1089 if (old_bucket != new_bucket) {
1090 _wq_thactive_move(wq, old_bucket, new_bucket);
1091 wq->wq_thscheduled_count[old_bucket]--;
1092 wq->wq_thscheduled_count[new_bucket]++;
1093 if (old_bucket == WORKQUEUE_EVENT_MANAGER_BUCKET ||
1094 old_bucket < new_bucket) {
1095 /*
1096 * if the QoS of the thread was lowered, then this could
1097 * allow for a higher QoS thread request to run, so we need
1098 * to reevaluate.
1099 */
1100 try_run_threadreq = true;
1101 }
1102 tl->th_priority = new_bucket;
1103 }
1104
1105 bool old_overcommit = !(tl->th_flags & TH_LIST_CONSTRAINED);
1106 bool new_overcommit = priority & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
1107 if (!old_overcommit && new_overcommit) {
1108 if (wq->wq_constrained_threads_scheduled-- ==
1109 wq_max_constrained_threads) {
1110 try_run_threadreq = true;
1111 }
1112 tl->th_flags &= ~TH_LIST_CONSTRAINED;
1113 } else if (old_overcommit && !new_overcommit) {
1114 wq->wq_constrained_threads_scheduled++;
1115 tl->th_flags |= TH_LIST_CONSTRAINED;
1116 }
1117
1118 if (try_run_threadreq) {
1119 workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
1120 } else {
1121 workqueue_unlock(wq);
1122 }
1123 } else {
1124 kr = pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
1125 if (kr != KERN_SUCCESS) {
1126 qos_rv = EINVAL;
1127 }
1128 }
1129
1130 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self | DBG_FUNC_END, wq, qos.qos_tier, qos.tier_importance, 0, 0);
1131 }
1132
1133 voucher:
1134 if ((flags & _PTHREAD_SET_SELF_VOUCHER_FLAG) != 0) {
1135 kr = pthread_kern->thread_set_voucher_name(voucher);
1136 if (kr != KERN_SUCCESS) {
1137 voucher_rv = ENOENT;
1138 goto fixedpri;
1139 }
1140 }
1141
1142 fixedpri:
1143 if (qos_rv) goto done;
1144 if ((flags & _PTHREAD_SET_SELF_FIXEDPRIORITY_FLAG) != 0) {
1145 thread_extended_policy_data_t extpol = {.timeshare = 0};
1146
1147 if (!tl) tl = util_get_thread_threadlist_entry(th);
1148 if (tl) {
1149 /* Not allowed on workqueue threads */
1150 fixedpri_rv = ENOTSUP;
1151 goto done;
1152 }
1153
1154 kr = pthread_kern->thread_policy_set_internal(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
1155 if (kr != KERN_SUCCESS) {
1156 fixedpri_rv = EINVAL;
1157 goto done;
1158 }
1159 } else if ((flags & _PTHREAD_SET_SELF_TIMESHARE_FLAG) != 0) {
1160 thread_extended_policy_data_t extpol = {.timeshare = 1};
1161
1162 if (!tl) tl = util_get_thread_threadlist_entry(th);
1163 if (tl) {
1164 /* Not allowed on workqueue threads */
1165 fixedpri_rv = ENOTSUP;
1166 goto done;
1167 }
1168
1169 kr = pthread_kern->thread_policy_set_internal(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
1170 if (kr != KERN_SUCCESS) {
1171 fixedpri_rv = EINVAL;
1172 goto done;
1173 }
1174 }
1175
1176 done:
1177 if (qos_rv && voucher_rv) {
1178 /* Both failed, give that a unique error. */
1179 return EBADMSG;
1180 }
1181
1182 if (qos_rv) {
1183 return qos_rv;
1184 }
1185
1186 if (voucher_rv) {
1187 return voucher_rv;
1188 }
1189
1190 if (fixedpri_rv) {
1191 return fixedpri_rv;
1192 }
1193
1194 return 0;
1195 }
1196
1197 int
1198 _bsdthread_ctl_qos_override_start(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval)
1199 {
1200 thread_t th;
1201 int rv = 0;
1202
1203 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
1204 return ESRCH;
1205 }
1206
1207 int override_qos = pthread_priority_get_thread_qos(priority);
1208
1209 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1210 if (tl) {
1211 PTHREAD_TRACE_WQ(TRACE_wq_override_start | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0);
1212 }
1213
1214 /* The only failure case here is if we pass a tid and have it look up the thread; since we pass the uthread, this always succeeds. */
1215 pthread_kern->proc_usynch_thread_qos_add_override_for_resource_check_owner(th, override_qos, TRUE,
1216 resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE, USER_ADDR_NULL, MACH_PORT_NULL);
1217 thread_deallocate(th);
1218 return rv;
1219 }
1220
1221 int
1222 _bsdthread_ctl_qos_override_end(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t resource, user_addr_t arg3, int __unused *retval)
1223 {
1224 thread_t th;
1225 int rv = 0;
1226
1227 if (arg3 != 0) {
1228 return EINVAL;
1229 }
1230
1231 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
1232 return ESRCH;
1233 }
1234
1235 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
1236
1237 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1238 if (tl) {
1239 PTHREAD_TRACE_WQ(TRACE_wq_override_end | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 0, 0, 0);
1240 }
1241
1242 pthread_kern->proc_usynch_thread_qos_remove_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);
1243
1244 thread_deallocate(th);
1245 return rv;
1246 }
1247
1248 static int
1249 _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, user_addr_t ulock_addr)
1250 {
1251 thread_t th;
1252 int rv = 0;
1253
1254 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
1255 return ESRCH;
1256 }
1257
1258 int override_qos = pthread_priority_get_thread_qos(priority);
1259
1260 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1261 if (!tl) {
1262 thread_deallocate(th);
1263 return EPERM;
1264 }
1265
1266 PTHREAD_TRACE_WQ(TRACE_wq_override_dispatch | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0);
1267
1268 rv = pthread_kern->proc_usynch_thread_qos_add_override_for_resource_check_owner(th, override_qos, TRUE,
1269 resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE, ulock_addr, kport);
1270
1271 thread_deallocate(th);
1272 return rv;
1273 }
1274
1275 int _bsdthread_ctl_qos_dispatch_asynchronous_override_add(struct proc __unused *p, user_addr_t __unused cmd,
1276 mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval)
1277 {
1278 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport, priority, resource, USER_ADDR_NULL);
1279 }
1280
1281 int
1282 _bsdthread_ctl_qos_override_dispatch(struct proc *p __unused, user_addr_t cmd __unused, mach_port_name_t kport, pthread_priority_t priority, user_addr_t ulock_addr, int __unused *retval)
1283 {
1284 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport, priority, USER_ADDR_NULL, ulock_addr);
1285 }
1286
1287 int
1288 _bsdthread_ctl_qos_override_reset(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval)
1289 {
1290 if (arg1 != 0 || arg2 != 0 || arg3 != 0) {
1291 return EINVAL;
1292 }
1293
1294 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, 1 /* reset_all */, 0, 0, retval);
1295 }
1296
1297 int
1298 _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(struct proc __unused *p, user_addr_t __unused cmd, int reset_all, user_addr_t resource, user_addr_t arg3, int __unused *retval)
1299 {
1300 if ((reset_all && (resource != 0)) || arg3 != 0) {
1301 return EINVAL;
1302 }
1303
1304 thread_t th = current_thread();
1305 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
1306 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
1307
1308 if (!tl) {
1309 return EPERM;
1310 }
1311
1312 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_NONE, tl->th_workq, 0, 0, 0, 0);
1313
1314 resource = reset_all ? THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD : resource;
1315 pthread_kern->proc_usynch_thread_qos_reset_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE);
1316
1317 return 0;
1318 }
1319
1320 static int
1321 _bsdthread_ctl_max_parallelism(struct proc __unused *p, user_addr_t __unused cmd,
1322 int qos, unsigned long flags, int *retval)
1323 {
1324 _Static_assert(QOS_PARALLELISM_COUNT_LOGICAL ==
1325 _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL, "logical");
1326 _Static_assert(QOS_PARALLELISM_REALTIME ==
1327 _PTHREAD_QOS_PARALLELISM_REALTIME, "realtime");
1328
1329 if (flags & ~(QOS_PARALLELISM_REALTIME | QOS_PARALLELISM_COUNT_LOGICAL)) {
1330 return EINVAL;
1331 }
1332
1333 if (flags & QOS_PARALLELISM_REALTIME) {
1334 if (qos) {
1335 return EINVAL;
1336 }
1337 } else if (qos == THREAD_QOS_UNSPECIFIED || qos >= THREAD_QOS_LAST) {
1338 return EINVAL;
1339 }
1340
1341 *retval = pthread_kern->qos_max_parallelism(qos, flags);
1342 return 0;
1343 }
1344
1345 int
1346 _bsdthread_ctl(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval)
1347 {
1348 switch (cmd) {
1349 case BSDTHREAD_CTL_SET_QOS:
1350 return _bsdthread_ctl_set_qos(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
1351 case BSDTHREAD_CTL_QOS_OVERRIDE_START:
1352 return _bsdthread_ctl_qos_override_start(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1353 case BSDTHREAD_CTL_QOS_OVERRIDE_END:
1354 return _bsdthread_ctl_qos_override_end(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
1355 case BSDTHREAD_CTL_QOS_OVERRIDE_RESET:
1356 return _bsdthread_ctl_qos_override_reset(p, cmd, arg1, arg2, arg3, retval);
1357 case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH:
1358 return _bsdthread_ctl_qos_override_dispatch(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1359 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD:
1360 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1361 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET:
1362 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, (int)arg1, arg2, arg3, retval);
1363 case BSDTHREAD_CTL_SET_SELF:
1364 return _bsdthread_ctl_set_self(p, cmd, (pthread_priority_t)arg1, (mach_port_name_t)arg2, (_pthread_set_flags_t)arg3, retval);
1365 case BSDTHREAD_CTL_QOS_MAX_PARALLELISM:
1366 return _bsdthread_ctl_max_parallelism(p, cmd, (int)arg1, (unsigned long)arg2, retval);
1367 default:
1368 return EINVAL;
1369 }
1370 }
1371
1372 #pragma mark - Workqueue Implementation
1373
1374 #pragma mark wq_flags
1375
1376 static inline uint32_t
1377 _wq_flags(struct workqueue *wq)
1378 {
1379 return atomic_load_explicit(&wq->wq_flags, memory_order_relaxed);
1380 }
1381
1382 static inline bool
1383 _wq_exiting(struct workqueue *wq)
1384 {
1385 return _wq_flags(wq) & WQ_EXITING;
1386 }
1387
1388 static inline uint32_t
1389 _wq_flags_or_orig(struct workqueue *wq, uint32_t v)
1390 {
1391 #if PTHREAD_INLINE_RMW_ATOMICS
1392 uint32_t state;
1393 do {
1394 state = _wq_flags(wq);
1395 } while (!OSCompareAndSwap(state, state | v, &wq->wq_flags));
1396 return state;
1397 #else
1398 return atomic_fetch_or_explicit(&wq->wq_flags, v, memory_order_relaxed);
1399 #endif
1400 }
1401
1402 static inline uint32_t
1403 _wq_flags_and_orig(struct workqueue *wq, uint32_t v)
1404 {
1405 #if PTHREAD_INLINE_RMW_ATOMICS
1406 uint32_t state;
1407 do {
1408 state = _wq_flags(wq);
1409 } while (!OSCompareAndSwap(state, state & v, &wq->wq_flags));
1410 return state;
1411 #else
1412 return atomic_fetch_and_explicit(&wq->wq_flags, v, memory_order_relaxed);
1413 #endif
1414 }
1415
1416 static inline bool
1417 WQ_TIMER_DELAYED_NEEDED(struct workqueue *wq)
1418 {
1419 uint32_t oldflags, newflags;
1420 do {
1421 oldflags = _wq_flags(wq);
1422 if (oldflags & (WQ_EXITING | WQ_ATIMER_DELAYED_RUNNING)) {
1423 return false;
1424 }
1425 newflags = oldflags | WQ_ATIMER_DELAYED_RUNNING;
1426 } while (!OSCompareAndSwap(oldflags, newflags, &wq->wq_flags));
1427 return true;
1428 }
1429
1430 static inline bool
1431 WQ_TIMER_IMMEDIATE_NEEDED(struct workqueue *wq)
1432 {
1433 uint32_t oldflags, newflags;
1434 do {
1435 oldflags = _wq_flags(wq);
1436 if (oldflags & (WQ_EXITING | WQ_ATIMER_IMMEDIATE_RUNNING)) {
1437 return false;
1438 }
1439 newflags = oldflags | WQ_ATIMER_IMMEDIATE_RUNNING;
1440 } while (!OSCompareAndSwap(oldflags, newflags, &wq->wq_flags));
1441 return true;
1442 }
1443
1444 #pragma mark thread requests pacing
1445
1446 static inline uint32_t
1447 _wq_pacing_shift_for_pri(int pri)
1448 {
1449 return _wq_bucket_to_thread_qos(pri) - 1;
1450 }
1451
1452 static inline int
1453 _wq_highest_paced_priority(struct workqueue *wq)
1454 {
1455 uint8_t paced = wq->wq_paced;
1456 int msb = paced ? 32 - __builtin_clz(paced) : 0; // fls(paced) == bit + 1
1457 return WORKQUEUE_EVENT_MANAGER_BUCKET - msb;
1458 }
1459
1460 static inline uint8_t
1461 _wq_pacing_bit_for_pri(int pri)
1462 {
1463 return 1u << _wq_pacing_shift_for_pri(pri);
1464 }
1465
1466 static inline bool
1467 _wq_should_pace_priority(struct workqueue *wq, int pri)
1468 {
1469 return wq->wq_paced >= _wq_pacing_bit_for_pri(pri);
1470 }
1471
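/*
 * Worked example (illustrative): a thread pacing in the IN bucket (pri 1,
 * thread QoS 5 per the table at the top of the file) sets 1u << 4 in
 * wq_paced.  While that bit is set, _wq_should_pace_priority() is true for
 * every bucket of equal or lower QoS (pri >= 1), because wq_paced >= the
 * bucket's pacing bit exactly when some bit at or above that position is set.
 */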
1472 static inline void
1473 _wq_pacing_start(struct workqueue *wq, struct threadlist *tl)
1474 {
1475 uint8_t bit = _wq_pacing_bit_for_pri(tl->th_priority);
1476 assert((tl->th_flags & TH_LIST_PACING) == 0);
1477 assert((wq->wq_paced & bit) == 0);
1478 wq->wq_paced |= bit;
1479 tl->th_flags |= TH_LIST_PACING;
1480 }
1481
1482 static inline bool
1483 _wq_pacing_end(struct workqueue *wq, struct threadlist *tl)
1484 {
1485 if (tl->th_flags & TH_LIST_PACING) {
1486 uint8_t bit = _wq_pacing_bit_for_pri(tl->th_priority);
1487 assert((wq->wq_paced & bit) != 0);
1488 wq->wq_paced ^= bit;
1489 tl->th_flags &= ~TH_LIST_PACING;
1490 return wq->wq_paced < bit; // !_wq_should_pace_priority
1491 }
1492 return false;
1493 }
1494
1495 #pragma mark thread requests
1496
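/*
 * Overview (added for clarity, derived from the functions below): a thread
 * request starts out TR_STATE_NEW (_threadreq_init_alloced/_init_stack),
 * becomes TR_STATE_WAITING once queued by _threadreq_enqueue (on-stack
 * requests are copied into the cached zone request at that point and the
 * original is marked TR_STATE_DEAD), and finishes as TR_STATE_COMPLETE via
 * _threadreq_complete_and_unlock (which also accepts a TR_STATE_NEW request
 * on the synchronous path) or as TR_STATE_DEAD via _threadreq_cancel.
 */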
1497 static void
1498 _threadreq_init_alloced(struct threadreq *req, int priority, int flags)
1499 {
1500 assert((flags & TR_FLAG_ONSTACK) == 0);
1501 req->tr_state = TR_STATE_NEW;
1502 req->tr_priority = priority;
1503 req->tr_flags = flags;
1504 }
1505
1506 static void
1507 _threadreq_init_stack(struct threadreq *req, int priority, int flags)
1508 {
1509 req->tr_state = TR_STATE_NEW;
1510 req->tr_priority = priority;
1511 req->tr_flags = flags | TR_FLAG_ONSTACK;
1512 }
1513
1514 static void
1515 _threadreq_copy_prepare(struct workqueue *wq)
1516 {
1517 again:
1518 if (wq->wq_cached_threadreq) {
1519 return;
1520 }
1521
1522 workqueue_unlock(wq);
1523 struct threadreq *req = zalloc(pthread_zone_threadreq);
1524 workqueue_lock_spin(wq);
1525
1526 if (wq->wq_cached_threadreq) {
1527 /*
1528 * We lost the race and someone left behind an extra threadreq for us
1529 * to use. Throw away our request and retry.
1530 */
1531 workqueue_unlock(wq);
1532 zfree(pthread_zone_threadreq, req);
1533 workqueue_lock_spin(wq);
1534 goto again;
1535 } else {
1536 wq->wq_cached_threadreq = req;
1537 }
1538
1539 assert(wq->wq_cached_threadreq);
1540 }
1541
1542 static bool
1543 _threadreq_copy_prepare_noblock(struct workqueue *wq)
1544 {
1545 if (wq->wq_cached_threadreq) {
1546 return true;
1547 }
1548
1549 wq->wq_cached_threadreq = zalloc_noblock(pthread_zone_threadreq);
1550
1551 return wq->wq_cached_threadreq != NULL;
1552 }
1553
1554 static inline struct threadreq_head *
1555 _threadreq_list_for_req(struct workqueue *wq, const struct threadreq *req)
1556 {
1557 if (req->tr_flags & TR_FLAG_OVERCOMMIT) {
1558 return &wq->wq_overcommit_reqlist[req->tr_priority];
1559 } else {
1560 return &wq->wq_reqlist[req->tr_priority];
1561 }
1562 }
1563
1564 static void
1565 _threadreq_enqueue(struct workqueue *wq, struct threadreq *req)
1566 {
1567 assert(req && req->tr_state == TR_STATE_NEW);
1568 if (req->tr_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1569 assert(wq->wq_event_manager_threadreq.tr_state != TR_STATE_WAITING);
1570 memcpy(&wq->wq_event_manager_threadreq, req, sizeof(struct threadreq));
1571 req = &wq->wq_event_manager_threadreq;
1572 req->tr_flags &= ~(TR_FLAG_ONSTACK | TR_FLAG_NO_PACING);
1573 } else {
1574 if (req->tr_flags & TR_FLAG_ONSTACK) {
1575 assert(wq->wq_cached_threadreq);
1576 struct threadreq *newreq = wq->wq_cached_threadreq;
1577 wq->wq_cached_threadreq = NULL;
1578
1579 memcpy(newreq, req, sizeof(struct threadreq));
1580 newreq->tr_flags &= ~(TR_FLAG_ONSTACK | TR_FLAG_NO_PACING);
1581 req->tr_state = TR_STATE_DEAD;
1582 req = newreq;
1583 }
1584 TAILQ_INSERT_TAIL(_threadreq_list_for_req(wq, req), req, tr_entry);
1585 }
1586 req->tr_state = TR_STATE_WAITING;
1587 wq->wq_reqcount++;
1588 }
1589
1590 static void
1591 _threadreq_dequeue(struct workqueue *wq, struct threadreq *req)
1592 {
1593 if (req->tr_priority != WORKQUEUE_EVENT_MANAGER_BUCKET) {
1594 struct threadreq_head *req_list = _threadreq_list_for_req(wq, req);
1595 #if DEBUG
1596 struct threadreq *cursor = NULL;
1597 TAILQ_FOREACH(cursor, req_list, tr_entry) {
1598 if (cursor == req) break;
1599 }
1600 assert(cursor == req);
1601 #endif
1602 TAILQ_REMOVE(req_list, req, tr_entry);
1603 }
1604 wq->wq_reqcount--;
1605 }
1606
1607 /*
1608 * Mark a thread request as complete. At this point, it is treated as owned by
1609 * the submitting subsystem and you should assume it could be freed.
1610 *
1611 * Called with the workqueue lock held.
1612 */
1613 static int
1614 _threadreq_complete_and_unlock(proc_t p, struct workqueue *wq,
1615 struct threadreq *req, struct threadlist *tl)
1616 {
1617 struct threadreq *req_tofree = NULL;
1618 bool sync = (req->tr_state == TR_STATE_NEW);
1619 bool workloop = req->tr_flags & TR_FLAG_WORKLOOP;
1620 bool onstack = req->tr_flags & TR_FLAG_ONSTACK;
1621 bool kevent = req->tr_flags & TR_FLAG_KEVENT;
1622 bool unbinding = tl->th_flags & TH_LIST_UNBINDING;
1623 bool locked = true;
1624 bool waking_parked_thread = (tl->th_flags & TH_LIST_BUSY);
1625 int ret;
1626
1627 req->tr_state = TR_STATE_COMPLETE;
1628
1629 if (!workloop && !onstack && req != &wq->wq_event_manager_threadreq) {
1630 if (wq->wq_cached_threadreq) {
1631 req_tofree = req;
1632 } else {
1633 wq->wq_cached_threadreq = req;
1634 }
1635 }
1636
1637 if (tl->th_flags & TH_LIST_UNBINDING) {
1638 tl->th_flags &= ~TH_LIST_UNBINDING;
1639 assert((tl->th_flags & TH_LIST_KEVENT_BOUND));
1640 } else if (workloop || kevent) {
1641 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
1642 tl->th_flags |= TH_LIST_KEVENT_BOUND;
1643 }
1644
1645 if (workloop) {
1646 workqueue_unlock(wq);
1647 ret = pthread_kern->workloop_fulfill_threadreq(wq->wq_proc, (void*)req,
1648 tl->th_thread, sync ? WORKLOOP_FULFILL_THREADREQ_SYNC : 0);
1649 assert(ret == 0);
1650 locked = false;
1651 } else if (kevent) {
1652 unsigned int kevent_flags = KEVENT_FLAG_WORKQ;
1653 if (sync) {
1654 kevent_flags |= KEVENT_FLAG_SYNCHRONOUS_BIND;
1655 }
1656 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1657 kevent_flags |= KEVENT_FLAG_WORKQ_MANAGER;
1658 }
1659 workqueue_unlock(wq);
1660 ret = kevent_qos_internal_bind(wq->wq_proc,
1661 class_index_get_thread_qos(tl->th_priority), tl->th_thread,
1662 kevent_flags);
1663 if (ret != 0) {
1664 workqueue_lock_spin(wq);
1665 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
1666 locked = true;
1667 } else {
1668 locked = false;
1669 }
1670 }
1671
1672 /*
1673 * Run Thread, Run!
1674 */
1675 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 0, 0, 0, 0);
1676 PTHREAD_TRACE_WQ_REQ(TRACE_wq_runitem | DBG_FUNC_START, wq, req, tl->th_priority,
1677 thread_tid(current_thread()), thread_tid(tl->th_thread));
1678
1679 if (waking_parked_thread) {
1680 if (!locked) {
1681 workqueue_lock_spin(wq);
1682 }
1683 tl->th_flags &= ~(TH_LIST_BUSY);
1684 if ((tl->th_flags & TH_LIST_REMOVING_VOUCHER) == 0) {
1685 /*
1686 * If the thread is in the process of removing its voucher, then it
1687 * isn't actually in the wait event yet and we don't need to wake
1688 * it up. Save the trouble (and potential lock-ordering issues
1689 * (see 30617015)).
1690 */
1691 thread_wakeup_thread(tl, tl->th_thread);
1692 }
1693 workqueue_unlock(wq);
1694
1695 if (req_tofree) zfree(pthread_zone_threadreq, req_tofree);
1696 return WQ_RUN_TR_THREAD_STARTED;
1697 }
1698
1699 assert ((tl->th_flags & TH_LIST_PACING) == 0);
1700 if (locked) {
1701 workqueue_unlock(wq);
1702 }
1703 if (req_tofree) zfree(pthread_zone_threadreq, req_tofree);
1704 if (unbinding) {
1705 return WQ_RUN_TR_THREAD_STARTED;
1706 }
1707 _setup_wqthread(p, tl->th_thread, wq, tl, WQ_SETUP_CLEAR_VOUCHER);
1708 pthread_kern->unix_syscall_return(EJUSTRETURN);
1709 __builtin_unreachable();
1710 }
1711
1712 /*
1713 * Mark a thread request as cancelled. Has similar ownership semantics to the
1714 * complete call above.
1715 */
1716 static void
1717 _threadreq_cancel(struct workqueue *wq, struct threadreq *req)
1718 {
1719 assert(req->tr_state == TR_STATE_WAITING);
1720 req->tr_state = TR_STATE_DEAD;
1721
1722 assert((req->tr_flags & TR_FLAG_ONSTACK) == 0);
1723 if (req->tr_flags & TR_FLAG_WORKLOOP) {
1724 __assert_only int ret;
1725 ret = pthread_kern->workloop_fulfill_threadreq(wq->wq_proc, (void*)req,
1726 THREAD_NULL, WORKLOOP_FULFILL_THREADREQ_CANCEL);
1727 assert(ret == 0 || ret == ECANCELED);
1728 } else if (req != &wq->wq_event_manager_threadreq) {
1729 zfree(pthread_zone_threadreq, req);
1730 }
1731 }
1732
1733 #pragma mark workqueue lock
1734
1735 static boolean_t workqueue_lock_spin_is_acquired_kdp(struct workqueue *wq) {
1736 return kdp_lck_spin_is_acquired(&wq->wq_lock);
1737 }
1738
1739 static void
1740 workqueue_lock_spin(struct workqueue *wq)
1741 {
1742 assert(ml_get_interrupts_enabled() == TRUE);
1743 lck_spin_lock(&wq->wq_lock);
1744 }
1745
1746 static bool
1747 workqueue_lock_try(struct workqueue *wq)
1748 {
1749 return lck_spin_try_lock(&wq->wq_lock);
1750 }
1751
1752 static void
1753 workqueue_unlock(struct workqueue *wq)
1754 {
1755 lck_spin_unlock(&wq->wq_lock);
1756 }
1757
1758 #pragma mark workqueue add timer
1759
1760 /**
1761 * Sets up the timer which will call out to workqueue_add_timer
1762 */
1763 static void
1764 workqueue_interval_timer_start(struct workqueue *wq)
1765 {
1766 uint64_t deadline;
1767
1768 /* n.b. wq_timer_interval is reset to 0 in workqueue_add_timer if the
1769 WQ_ATIMER_DELAYED_RUNNING flag is not present. The net effect here is that if a
1770 sequence of threads is required, we'll double the time before we give out
1771 the next one. */
1772 if (wq->wq_timer_interval == 0) {
1773 wq->wq_timer_interval = wq_stalled_window_usecs;
1774
1775 } else {
1776 wq->wq_timer_interval = wq->wq_timer_interval * 2;
1777
1778 if (wq->wq_timer_interval > wq_max_timer_interval_usecs) {
1779 wq->wq_timer_interval = wq_max_timer_interval_usecs;
1780 }
1781 }
1782 clock_interval_to_deadline(wq->wq_timer_interval, 1000, &deadline);
1783
1784 PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1785 _wq_flags(wq), wq->wq_timer_interval, 0);
1786
1787 thread_call_t call = wq->wq_atimer_delayed_call;
1788 if (thread_call_enter1_delayed(call, call, deadline)) {
1789 panic("delayed_call was already enqueued");
1790 }
1791 }
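
/*
 * Illustrative sketch (not compiled): the back-off behaviour of the delayed
 * add-timer interval computed above.  It starts at wq_stalled_window_usecs,
 * doubles on every re-arm, and is capped at wq_max_timer_interval_usecs;
 * workqueue_add_timer() resets it to 0 once the timer stops being marked
 * running.  The helper name below is hypothetical.
 */
#if 0
static uint32_t
wq_next_timer_interval_usecs(uint32_t current_interval)
{
	if (current_interval == 0) {
		/* first arm: wait one stall window */
		return wq_stalled_window_usecs;
	}
	if (current_interval > wq_max_timer_interval_usecs / 2) {
		/* doubling would overshoot the cap */
		return wq_max_timer_interval_usecs;
	}
	/* exponential back-off between arms */
	return current_interval * 2;
}
#endif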
1792
1793 /**
1794 * Immediately trigger the workqueue_add_timer
1795 */
1796 static void
1797 workqueue_interval_timer_trigger(struct workqueue *wq)
1798 {
1799 PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1800 _wq_flags(wq), 0, 0);
1801
1802 thread_call_t call = wq->wq_atimer_immediate_call;
1803 if (thread_call_enter1(call, call)) {
1804 panic("immediate_call was already enqueued");
1805 }
1806 }
1807
1808 /**
1809 * returns whether lastblocked_tsp is within wq_stalled_window_usecs of cur_ts
1810 */
1811 static boolean_t
1812 wq_thread_is_busy(uint64_t cur_ts, _Atomic uint64_t *lastblocked_tsp)
1813 {
1814 clock_sec_t secs;
1815 clock_usec_t usecs;
1816 uint64_t lastblocked_ts;
1817 uint64_t elapsed;
1818
1819 lastblocked_ts = atomic_load_explicit(lastblocked_tsp, memory_order_relaxed);
1820 if (lastblocked_ts >= cur_ts) {
1821 /*
1822 * because the update of the timestamp when a thread blocks isn't
1823 * serialized against us looking at it (i.e. we don't hold the workq lock),
1824 * it's possible to have a timestamp that matches the current time or
1825 * that even looks to be in the future relative to when we grabbed the current
1826 * time... just treat this as a busy thread since it must have just blocked.
1827 */
1828 return (TRUE);
1829 }
1830 elapsed = cur_ts - lastblocked_ts;
1831
1832 pthread_kern->absolutetime_to_microtime(elapsed, &secs, &usecs);
1833
1834 return (secs == 0 && usecs < wq_stalled_window_usecs);
1835 }
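
/*
 * Sketch (not compiled) of how a caller can use the check above: a bucket
 * counts as "busy" when its most recent blocker did so within the stall
 * window, which is how the aggregation code charges a busycount against the
 * concurrency budget.  The helper name is hypothetical.
 */
#if 0
static boolean_t
wq_bucket_recently_blocked(struct workqueue *wq, uint32_t bucket)
{
	/* compare the per-bucket last-blocked timestamp against "now" */
	return wq_thread_is_busy(mach_absolute_time(),
			&wq->wq_lastblocked_ts[bucket]);
}
#endif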
1836
1837 /**
1838 * handler function for the timer
1839 */
1840 static void
1841 workqueue_add_timer(struct workqueue *wq, thread_call_t thread_call_self)
1842 {
1843 proc_t p = wq->wq_proc;
1844
1845 workqueue_lock_spin(wq);
1846
1847 PTHREAD_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_START, wq,
1848 _wq_flags(wq), wq->wq_nthreads, wq->wq_thidlecount, 0);
1849
1850 /*
1851 * There's two tricky issues here.
1852 *
1853 * First issue: we start the thread_calls that invoke this routine without
1854 * the workqueue lock held. The scheduler callback needs to trigger
1855 * reevaluation of the number of running threads but shouldn't take that
1856 * lock, so we can't use it to synchronize state around the thread_call.
1857 * As a result, it might re-enter the thread_call while this routine is
1858 * already running. This could cause it to fire a second time and we'll
1859 * have two add_timers running at once. Obviously, we don't want that to
1860 * keep stacking, so we need to keep it at two timers.
1861 *
1862 * Solution: use wq_flags (accessed via atomic CAS) to synchronize the
1863 * enqueue of the thread_call itself. When a thread needs to trigger the
1864 * add_timer, it checks for ATIMER_DELAYED_RUNNING and, when not set, sets
1865 * the flag then does a thread_call_enter. We'll then remove that flag
1866 * only once we've got the lock and it's safe for the thread_call to be
1867 * entered again.
1868 *
1869 * Second issue: we need to make sure that the two timers don't execute this
1870 * routine concurrently. We can't use the workqueue lock for this because
1871 * we'll need to drop it during our execution.
1872 *
1873 * Solution: use WQL_ATIMER_BUSY as a condition variable to indicate that
1874 * we are currently executing the routine and the next thread should wait.
1875 *
1876 * After all that, we arrive at the following four possible states:
1877 * !WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY no pending timer, no active timer
1878 * !WQ_ATIMER_DELAYED_RUNNING && WQL_ATIMER_BUSY no pending timer, 1 active timer
1879 * WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY 1 pending timer, no active timer
1880 * WQ_ATIMER_DELAYED_RUNNING && WQL_ATIMER_BUSY 1 pending timer, 1 active timer
1881 *
1882 * A further complication: sometimes we need to trigger this function to run
1883 * without delay. Because we aren't under a lock between setting
1884 * WQ_ATIMER_DELAYED_RUNNING and calling thread_call_enter, we can't simply
1885 * re-enter the thread call: if thread_call_enter() returned false, we
1886 * wouldn't be able to distinguish the case where the thread_call had
1887 * already fired from the case where it hadn't been entered yet from the
1888 * other thread. So, we use a separate thread_call for immediate
1889 * invocations, and a separate RUNNING flag, WQ_ATIMER_IMMEDIATE_RUNNING.
1890 */
1891
1892 while (wq->wq_lflags & WQL_ATIMER_BUSY) {
1893 wq->wq_lflags |= WQL_ATIMER_WAITING;
1894
1895 assert_wait((caddr_t)wq, (THREAD_UNINT));
1896 workqueue_unlock(wq);
1897
1898 thread_block(THREAD_CONTINUE_NULL);
1899
1900 workqueue_lock_spin(wq);
1901 }
1902 /*
1903 * Mark ourselves busy so that _workqueue_mark_exiting() waits for us to finish
1904 */
1905 wq->wq_lflags |= WQL_ATIMER_BUSY;
1906
1907 /*
1908 * Decide which timer we are and remove the RUNNING flag.
1909 */
1910 if (thread_call_self == wq->wq_atimer_delayed_call) {
1911 uint64_t wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_DELAYED_RUNNING);
1912 if ((wq_flags & WQ_ATIMER_DELAYED_RUNNING) == 0) {
1913 panic("workqueue_add_timer(delayed) w/o WQ_ATIMER_DELAYED_RUNNING");
1914 }
1915 } else if (thread_call_self == wq->wq_atimer_immediate_call) {
1916 uint64_t wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_IMMEDIATE_RUNNING);
1917 if ((wq_flags & WQ_ATIMER_IMMEDIATE_RUNNING) == 0) {
1918 panic("workqueue_add_timer(immediate) w/o WQ_ATIMER_IMMEDIATE_RUNNING");
1919 }
1920 } else {
1921 panic("workqueue_add_timer can't figure out which timer it is");
1922 }
1923
1924 int ret = WQ_RUN_TR_THREAD_STARTED;
1925 while (ret == WQ_RUN_TR_THREAD_STARTED && wq->wq_reqcount) {
1926 ret = workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
1927
1928 workqueue_lock_spin(wq);
1929 }
1930 _threadreq_copy_prepare(wq);
1931
1932 /*
1933 * If we called WQ_TIMER_NEEDED above, then this flag will be set if that
1934 * call marked the timer running. If so, we let the timer interval grow.
1935 * Otherwise, we reset it back to 0.
1936 */
1937 uint32_t wq_flags = _wq_flags(wq);
1938 if (!(wq_flags & WQ_ATIMER_DELAYED_RUNNING)) {
1939 wq->wq_timer_interval = 0;
1940 }
1941
1942 wq->wq_lflags &= ~WQL_ATIMER_BUSY;
1943
1944 if ((wq_flags & WQ_EXITING) || (wq->wq_lflags & WQL_ATIMER_WAITING)) {
1945 /*
1946 * wakeup the thread hung up in _workqueue_mark_exiting or
1947 * workqueue_add_timer waiting for this timer to finish getting out of
1948 * the way
1949 */
1950 wq->wq_lflags &= ~WQL_ATIMER_WAITING;
1951 wakeup(wq);
1952 }
1953
1954 PTHREAD_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_END, wq, 0, wq->wq_nthreads, wq->wq_thidlecount, 0);
1955
1956 workqueue_unlock(wq);
1957 }
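
/*
 * Illustrative sketch (not compiled) of the arming protocol described in the
 * comment at the top of workqueue_add_timer(): a thread that wants the
 * delayed timer atomically sets WQ_ATIMER_DELAYED_RUNNING (skipping if
 * WQ_EXITING or the flag is already set) and only the winner enqueues the
 * thread_call.  The helper name, the wq_flags field access and the exact
 * atomics below are assumptions; the real check lives behind the
 * WQ_TIMER_DELAYED_NEEDED() macro.
 */
#if 0
static bool
wq_try_arm_delayed_timer(struct workqueue *wq)
{
	uint32_t old_flags = _wq_flags(wq);
	do {
		if (old_flags & (WQ_EXITING | WQ_ATIMER_DELAYED_RUNNING)) {
			/* exiting, or someone else already armed the timer */
			return false;
		}
	} while (!atomic_compare_exchange_weak(&wq->wq_flags, &old_flags,
			old_flags | WQ_ATIMER_DELAYED_RUNNING));

	/* we won the race: actually enqueue the thread_call */
	workqueue_interval_timer_start(wq);
	return true;
}
#endif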
1958
1959 #pragma mark thread state tracking
1960
1961 // called by spinlock code when trying to yield to lock owner
1962 void
1963 _workqueue_thread_yielded(void)
1964 {
1965 }
1966
1967 static void
1968 workqueue_callback(int type, thread_t thread)
1969 {
1970 struct uthread *uth = pthread_kern->get_bsdthread_info(thread);
1971 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
1972 struct workqueue *wq = tl->th_workq;
1973 uint32_t old_count, req_qos, qos = tl->th_priority;
1974 wq_thactive_t old_thactive;
1975
1976 switch (type) {
1977 case SCHED_CALL_BLOCK: {
1978 bool start_timer = false;
1979
1980 old_thactive = _wq_thactive_dec(wq, tl->th_priority);
1981 req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
1982 old_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
1983 qos, NULL, NULL);
1984
1985 if (old_count == wq_max_concurrency[tl->th_priority]) {
1986 /*
1987 * The number of active threads at this priority has fallen below
1988 * the maximum number of concurrent threads that are allowed to run
1989 *
1990 * if we collide with another thread trying to update the
1991 * last_blocked (really unlikely since another thread would have to
1992 * get scheduled and then block after we start down this path), it's
1993 * not a problem. Either timestamp is adequate, so no need to retry
1994 */
1995 atomic_store_explicit(&wq->wq_lastblocked_ts[qos],
1996 mach_absolute_time(), memory_order_relaxed);
1997 }
1998
1999 if (req_qos == WORKQUEUE_EVENT_MANAGER_BUCKET || qos > req_qos) {
2000 /*
2001 * The blocking thread is at a lower QoS than the highest currently
2002 * pending constrained request, nothing has to be redriven
2003 */
2004 } else {
2005 uint32_t max_busycount, old_req_count;
2006 old_req_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
2007 req_qos, NULL, &max_busycount);
2008 /*
2009 * If it is possible that may_start_constrained_thread had refused
2010 * admission due to being over the max concurrency, we may need to
2011 * spin up a new thread.
2012 *
2013 * We take into account the maximum number of busy threads
2014 * that can affect may_start_constrained_thread as looking at the
2015 * actual number may_start_constrained_thread will see is racy.
2016 *
2017 * IOW at NCPU = 4, for IN (req_qos = 1), if the old req count is
2018 * between NCPU (4) and NCPU - 2 (2) we need to redrive.
2019 */
2020 if (wq_max_concurrency[req_qos] <= old_req_count + max_busycount &&
2021 old_req_count <= wq_max_concurrency[req_qos]) {
2022 if (WQ_TIMER_DELAYED_NEEDED(wq)) {
2023 start_timer = true;
2024 workqueue_interval_timer_start(wq);
2025 }
2026 }
2027 }
2028
2029 PTHREAD_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_START, wq,
2030 old_count - 1, qos | (req_qos << 8),
2031 wq->wq_reqcount << 1 | start_timer, 0);
2032 break;
2033 }
2034 case SCHED_CALL_UNBLOCK: {
2035 /*
2036 * we cannot take the workqueue_lock here...
2037 * an UNBLOCK can occur from a timer event which
2038 * is run from an interrupt context... if the workqueue_lock
2039 * is already held by this processor, we'll deadlock...
2040 * the thread lock for the thread being UNBLOCKED
2041 * is also held
2042 */
2043 old_thactive = _wq_thactive_inc(wq, qos);
2044 if (pthread_debug_tracing) {
2045 req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
2046 old_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
2047 qos, NULL, NULL);
2048 PTHREAD_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_END, wq,
2049 old_count + 1, qos | (req_qos << 8),
2050 wq->wq_threads_scheduled, 0);
2051 }
2052 break;
2053 }
2054 }
2055 }
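
/*
 * Worked example (sketch, not compiled) of the redrive window used in the
 * SCHED_CALL_BLOCK case above, assuming NCPU = 4 so
 * wq_max_concurrency[req_qos] == 4 and a max_busycount of 2: the check fires
 * for old request-level counts of 2, 3 or 4, i.e. anywhere a racy
 * may_start_constrained_thread() could have seen the bucket at or above its
 * limit.  The constants are assumptions for illustration only.
 */
#if 0
static bool
wq_should_redrive_example(uint32_t old_req_count)
{
	const uint32_t max_concurrency = 4;	/* assumed: NCPU = 4 */
	const uint32_t max_busycount = 2;	/* assumed for illustration */
	return max_concurrency <= old_req_count + max_busycount &&
			old_req_count <= max_concurrency;
}
#endif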
2056
2057 sched_call_t
2058 _workqueue_get_sched_callback(void)
2059 {
2060 return workqueue_callback;
2061 }
2062
2063 #pragma mark thread addition/removal
2064
2065 static mach_vm_size_t
2066 _workqueue_allocsize(struct workqueue *wq)
2067 {
2068 proc_t p = wq->wq_proc;
2069 mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);
2070 mach_vm_size_t pthread_size =
2071 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(wq->wq_map));
2072 return guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
2073 }
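
/*
 * Layout sketch (not compiled) of the region sized above, lowest address
 * first: one guard page, then the default stack, then the rounded pthread_t
 * area.  The offsets follow from the allocation math here and from the
 * mach_vm_protect() of the guard page in workqueue_addnewthread(); the
 * helper name is hypothetical.
 */
#if 0
static void
wq_stack_layout_example(struct workqueue *wq, mach_vm_offset_t stackaddr)
{
	mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);

	mach_vm_offset_t guard_lo = stackaddr;				/* VM_PROT_NONE */
	mach_vm_offset_t stack_lo = stackaddr + guardsize;		/* stack limit */
	mach_vm_offset_t stack_hi = stack_lo + PTH_DEFAULT_STACKSIZE;	/* initial sp grows down from here */

	/* the pthread_t / TSD area occupies the rounded pthread_size bytes above stack_hi */
	(void)guard_lo; (void)stack_hi;
}
#endif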
2074
2075 /**
2076 * pop goes the thread
2077 *
2078 * If fromexit is set, the call is from workqueue_exit(),
2079 * so some cleanups are to be avoided.
2080 */
2081 static void
2082 workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use)
2083 {
2084 struct uthread * uth;
2085 struct workqueue * wq = tl->th_workq;
2086
2087 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET){
2088 TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
2089 } else {
2090 TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
2091 }
2092
2093 if (fromexit == 0) {
2094 assert(wq->wq_nthreads && wq->wq_thidlecount);
2095 wq->wq_nthreads--;
2096 wq->wq_thidlecount--;
2097 }
2098
2099 /*
2100 * Clear the threadlist pointer in the uthread so that a
2101 * blocked thread, woken up for termination, will not
2102 * access the threadlist as it is about to be
2103 * freed.
2104 */
2105 pthread_kern->thread_sched_call(tl->th_thread, NULL);
2106
2107 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
2108 if (uth != (struct uthread *)0) {
2109 pthread_kern->uthread_set_threadlist(uth, NULL);
2110 }
2111 if (fromexit == 0) {
2112 /* during exit the lock is not held */
2113 workqueue_unlock(wq);
2114 }
2115
2116 if ( (tl->th_flags & TH_LIST_NEW) || first_use ) {
2117 /*
2118 * thread was created, but never used...
2119 * need to clean up the stack and port ourselves
2120 * since we're not going to spin up through the
2121 * normal exit path triggered from Libc
2122 */
2123 if (fromexit == 0) {
2124 /* vm map is already deallocated when this is called from exit */
2125 (void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, _workqueue_allocsize(wq));
2126 }
2127 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(wq->wq_task), tl->th_thport);
2128 }
2129 /*
2130 * drop our ref on the thread
2131 */
2132 thread_deallocate(tl->th_thread);
2133
2134 zfree(pthread_zone_threadlist, tl);
2135 }
2136
2137
2138 /**
2139 * Try to add a new workqueue thread.
2140 *
2141 * - called with workq lock held
2142 * - dropped and retaken around thread creation
2143 * - return with workq lock held
2144 */
2145 static bool
2146 workqueue_addnewthread(proc_t p, struct workqueue *wq)
2147 {
2148 kern_return_t kret;
2149
2150 wq->wq_nthreads++;
2151
2152 workqueue_unlock(wq);
2153
2154 struct threadlist *tl = zalloc(pthread_zone_threadlist);
2155 bzero(tl, sizeof(struct threadlist));
2156
2157 thread_t th;
2158 kret = pthread_kern->thread_create_workq_waiting(wq->wq_task, wq_unpark_continue, tl, &th);
2159 if (kret != KERN_SUCCESS) {
2160 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 0, 0, 0);
2161 goto fail_free;
2162 }
2163
2164 mach_vm_offset_t stackaddr = pthread_kern->proc_get_stack_addr_hint(p);
2165
2166 mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);
2167 mach_vm_size_t pthread_size =
2168 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(wq->wq_map));
2169 mach_vm_size_t th_allocsize = guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
2170
2171 kret = mach_vm_map(wq->wq_map, &stackaddr,
2172 th_allocsize, page_size-1,
2173 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE, NULL,
2174 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
2175 VM_INHERIT_DEFAULT);
2176
2177 if (kret != KERN_SUCCESS) {
2178 kret = mach_vm_allocate(wq->wq_map,
2179 &stackaddr, th_allocsize,
2180 VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE);
2181 }
2182
2183 if (kret != KERN_SUCCESS) {
2184 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 1, 0, 0);
2185 goto fail_terminate;
2186 }
2187
2188 /*
2189 * The guard page is at the lowest address
2190 * The stack base is the highest address
2191 */
2192 kret = mach_vm_protect(wq->wq_map, stackaddr, guardsize, FALSE, VM_PROT_NONE);
2193 if (kret != KERN_SUCCESS) {
2194 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 2, 0, 0);
2195 goto fail_vm_deallocate;
2196 }
2197
2198
2199 pthread_kern->thread_set_tag(th, THREAD_TAG_PTHREAD | THREAD_TAG_WORKQUEUE);
2200 pthread_kern->thread_static_param(th, TRUE);
2201
2202 /*
2203 * convert_thread_to_port() consumes a reference
2204 */
2205 thread_reference(th);
2206 void *sright = (void *)pthread_kern->convert_thread_to_port(th);
2207 tl->th_thport = pthread_kern->ipc_port_copyout_send(sright,
2208 pthread_kern->task_get_ipcspace(wq->wq_task));
2209
2210 tl->th_flags = TH_LIST_INITED | TH_LIST_NEW;
2211 tl->th_thread = th;
2212 tl->th_workq = wq;
2213 tl->th_stackaddr = stackaddr;
2214 tl->th_priority = WORKQUEUE_NUM_BUCKETS;
2215
2216 struct uthread *uth;
2217 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
2218
2219 workqueue_lock_spin(wq);
2220
2221 void *current_tl = pthread_kern->uthread_get_threadlist(uth);
2222 if (current_tl == NULL) {
2223 pthread_kern->uthread_set_threadlist(uth, tl);
2224 TAILQ_INSERT_TAIL(&wq->wq_thidlelist, tl, th_entry);
2225 wq->wq_thidlecount++;
2226 } else if (current_tl == WQ_THREADLIST_EXITING_POISON) {
2227 /*
2228 * Failed thread creation race: The thread already woke up and has exited.
2229 */
2230 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 3, 0, 0);
2231 goto fail_unlock;
2232 } else {
2233 panic("Unexpected initial threadlist value");
2234 }
2235
2236 PTHREAD_TRACE_WQ(TRACE_wq_thread_create | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
2237
2238 return (TRUE);
2239
2240 fail_unlock:
2241 workqueue_unlock(wq);
2242 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(wq->wq_task),
2243 tl->th_thport);
2244
2245 fail_vm_deallocate:
2246 (void) mach_vm_deallocate(wq->wq_map, stackaddr, th_allocsize);
2247
2248 fail_terminate:
2249 if (pthread_kern->thread_will_park_or_terminate) {
2250 pthread_kern->thread_will_park_or_terminate(th);
2251 }
2252 (void)thread_terminate(th);
2253 thread_deallocate(th);
2254
2255 fail_free:
2256 zfree(pthread_zone_threadlist, tl);
2257
2258 workqueue_lock_spin(wq);
2259 wq->wq_nthreads--;
2260
2261 return (FALSE);
2262 }
2263
2264 /**
2265 * Setup per-process state for the workqueue.
2266 */
2267 int
2268 _workq_open(struct proc *p, __unused int32_t *retval)
2269 {
2270 struct workqueue * wq;
2271 char * ptr;
2272 uint32_t num_cpus;
2273 int error = 0;
2274
2275 if (pthread_kern->proc_get_register(p) == 0) {
2276 return EINVAL;
2277 }
2278
2279 num_cpus = pthread_kern->ml_get_max_cpus();
2280
2281 if (wq_init_constrained_limit) {
2282 uint32_t limit;
2283 /*
2284 * Set up the limit for the constrained pool.
2285 * This is a virtual pool in that we don't
2286 * maintain it on separate idle and run lists.
2287 */
2288 limit = num_cpus * WORKQUEUE_CONSTRAINED_FACTOR;
2289
2290 if (limit > wq_max_constrained_threads)
2291 wq_max_constrained_threads = limit;
2292
2293 wq_init_constrained_limit = 0;
2294
2295 if (wq_max_threads > WQ_THACTIVE_BUCKET_HALF) {
2296 wq_max_threads = WQ_THACTIVE_BUCKET_HALF;
2297 }
2298 if (wq_max_threads > pthread_kern->config_thread_max - 20) {
2299 wq_max_threads = pthread_kern->config_thread_max - 20;
2300 }
2301 }
2302
2303 if (pthread_kern->proc_get_wqptr(p) == NULL) {
2304 if (pthread_kern->proc_init_wqptr_or_wait(p) == FALSE) {
2305 assert(pthread_kern->proc_get_wqptr(p) != NULL);
2306 goto out;
2307 }
2308
2309 ptr = (char *)zalloc(pthread_zone_workqueue);
2310 bzero(ptr, sizeof(struct workqueue));
2311
2312 wq = (struct workqueue *)ptr;
2313 wq->wq_proc = p;
2314 wq->wq_task = current_task();
2315 wq->wq_map = pthread_kern->current_map();
2316
2317 // Start the event manager at the priority hinted at by the policy engine
2318 int mgr_priority_hint = pthread_kern->task_get_default_manager_qos(current_task());
2319 wq->wq_event_manager_priority = (uint32_t)thread_qos_get_pthread_priority(mgr_priority_hint) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2320
2321 TAILQ_INIT(&wq->wq_thrunlist);
2322 TAILQ_INIT(&wq->wq_thidlelist);
2323 for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
2324 TAILQ_INIT(&wq->wq_overcommit_reqlist[i]);
2325 TAILQ_INIT(&wq->wq_reqlist[i]);
2326 }
2327
2328 wq->wq_atimer_delayed_call =
2329 thread_call_allocate_with_priority((thread_call_func_t)workqueue_add_timer,
2330 (thread_call_param_t)wq, THREAD_CALL_PRIORITY_KERNEL);
2331 wq->wq_atimer_immediate_call =
2332 thread_call_allocate_with_priority((thread_call_func_t)workqueue_add_timer,
2333 (thread_call_param_t)wq, THREAD_CALL_PRIORITY_KERNEL);
2334
2335 lck_spin_init(&wq->wq_lock, pthread_lck_grp, pthread_lck_attr);
2336
2337 wq->wq_cached_threadreq = zalloc(pthread_zone_threadreq);
2338 *(wq_thactive_t *)&wq->wq_thactive =
2339 (wq_thactive_t)WQ_THACTIVE_NO_PENDING_REQUEST <<
2340 WQ_THACTIVE_QOS_SHIFT;
2341
2342 pthread_kern->proc_set_wqptr(p, wq);
2343
2344 }
2345 out:
2346
2347 return(error);
2348 }
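
/*
 * Sketch (not compiled) of the one-time limits computed above: the
 * constrained-pool cap is only ever raised to num_cpus *
 * WORKQUEUE_CONSTRAINED_FACTOR, and the overall thread cap is clamped by
 * both the thactive encoding (WQ_THACTIVE_BUCKET_HALF) and the global
 * thread limit with 20 threads of headroom.  The helper name is
 * hypothetical.
 */
#if 0
static uint32_t
wq_effective_max_threads(uint32_t requested, uint32_t config_thread_max)
{
	uint32_t max = requested;
	if (max > WQ_THACTIVE_BUCKET_HALF) {
		max = WQ_THACTIVE_BUCKET_HALF;	/* must fit the thactive counters */
	}
	if (max > config_thread_max - 20) {
		max = config_thread_max - 20;	/* leave headroom for other threads */
	}
	return max;
}
#endif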
2349
2350 /*
2351 * Routine: workqueue_mark_exiting
2352 *
2353 * Function: Mark the work queue such that new threads will not be added to the
2354 * work queue after we return.
2355 *
2356 * Conditions: Called against the current process.
2357 */
2358 void
2359 _workqueue_mark_exiting(struct proc *p)
2360 {
2361 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
2362 if (!wq) return;
2363
2364 PTHREAD_TRACE_WQ(TRACE_wq_pthread_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);
2365
2366 workqueue_lock_spin(wq);
2367
2368 /*
2369 * We arm the add timer without holding the workqueue lock so we need
2370 * to synchronize with any running or soon to be running timers.
2371 *
2372 * Threads that intend to arm the timer atomically OR
2373 * WQ_ATIMER_{DELAYED,IMMEDIATE}_RUNNING into the wq_flags, only if
2374 * WQ_EXITING is not present. So, once we have set WQ_EXITING, we can
2375 * be sure that no new RUNNING flags will be set, but still need to
2376 * wait for the already running timers to complete.
2377 *
2378 * We always hold the workq lock when dropping WQ_ATIMER_RUNNING, so
2379 * the check for and sleep until clear is protected.
2380 */
2381 uint64_t wq_flags = _wq_flags_or_orig(wq, WQ_EXITING);
2382
2383 if (wq_flags & WQ_ATIMER_DELAYED_RUNNING) {
2384 if (thread_call_cancel(wq->wq_atimer_delayed_call) == TRUE) {
2385 wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_DELAYED_RUNNING);
2386 }
2387 }
2388 if (wq_flags & WQ_ATIMER_IMMEDIATE_RUNNING) {
2389 if (thread_call_cancel(wq->wq_atimer_immediate_call) == TRUE) {
2390 wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_IMMEDIATE_RUNNING);
2391 }
2392 }
2393 while ((_wq_flags(wq) & (WQ_ATIMER_DELAYED_RUNNING | WQ_ATIMER_IMMEDIATE_RUNNING)) ||
2394 (wq->wq_lflags & WQL_ATIMER_BUSY)) {
2395 assert_wait((caddr_t)wq, (THREAD_UNINT));
2396 workqueue_unlock(wq);
2397
2398 thread_block(THREAD_CONTINUE_NULL);
2399
2400 workqueue_lock_spin(wq);
2401 }
2402
2403 /*
2404 * Save off pending requests, will complete/free them below after unlocking
2405 */
2406 TAILQ_HEAD(, threadreq) local_list = TAILQ_HEAD_INITIALIZER(local_list);
2407
2408 for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
2409 TAILQ_CONCAT(&local_list, &wq->wq_overcommit_reqlist[i], tr_entry);
2410 TAILQ_CONCAT(&local_list, &wq->wq_reqlist[i], tr_entry);
2411 }
2412
2413 /*
2414 * XXX: Can't do a deferred cancel of the event manager request, so just smash it.
2415 */
2416 assert((wq->wq_event_manager_threadreq.tr_flags & TR_FLAG_WORKLOOP) == 0);
2417 wq->wq_event_manager_threadreq.tr_state = TR_STATE_DEAD;
2418
2419 workqueue_unlock(wq);
2420
2421 struct threadreq *tr, *tr_temp;
2422 TAILQ_FOREACH_SAFE(tr, &local_list, tr_entry, tr_temp) {
2423 _threadreq_cancel(wq, tr);
2424 }
2425 PTHREAD_TRACE(TRACE_wq_pthread_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
2426 }
2427
2428 /*
2429 * Routine: workqueue_exit
2430 *
2431 * Function: clean up the work queue structure(s) now that there are no threads
2432 * left running inside the work queue (except possibly current_thread).
2433 *
2434 * Conditions: Called by the last thread in the process.
2435 * Called against current process.
2436 */
2437 void
2438 _workqueue_exit(struct proc *p)
2439 {
2440 struct workqueue * wq;
2441 struct threadlist * tl, *tlist;
2442 struct uthread *uth;
2443
2444 wq = pthread_kern->proc_get_wqptr(p);
2445 if (wq != NULL) {
2446
2447 PTHREAD_TRACE_WQ(TRACE_wq_workqueue_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);
2448
2449 pthread_kern->proc_set_wqptr(p, NULL);
2450
2451 /*
2452 * Clean up workqueue data structures for threads that exited and
2453 * didn't get a chance to clean up after themselves.
2454 */
2455 TAILQ_FOREACH_SAFE(tl, &wq->wq_thrunlist, th_entry, tlist) {
2456 assert((tl->th_flags & TH_LIST_RUNNING) != 0);
2457
2458 pthread_kern->thread_sched_call(tl->th_thread, NULL);
2459
2460 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
2461 if (uth != (struct uthread *)0) {
2462 pthread_kern->uthread_set_threadlist(uth, NULL);
2463 }
2464 TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
2465
2466 /*
2467 * drop our last ref on the thread
2468 */
2469 thread_deallocate(tl->th_thread);
2470
2471 zfree(pthread_zone_threadlist, tl);
2472 }
2473 TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlelist, th_entry, tlist) {
2474 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
2475 assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
2476 workqueue_removethread(tl, true, false);
2477 }
2478 TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlemgrlist, th_entry, tlist) {
2479 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
2480 assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
2481 workqueue_removethread(tl, true, false);
2482 }
2483 if (wq->wq_cached_threadreq) {
2484 zfree(pthread_zone_threadreq, wq->wq_cached_threadreq);
2485 }
2486 thread_call_free(wq->wq_atimer_delayed_call);
2487 thread_call_free(wq->wq_atimer_immediate_call);
2488 lck_spin_destroy(&wq->wq_lock, pthread_lck_grp);
2489
2490 for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
2491 assert(TAILQ_EMPTY(&wq->wq_overcommit_reqlist[i]));
2492 assert(TAILQ_EMPTY(&wq->wq_reqlist[i]));
2493 }
2494
2495 zfree(pthread_zone_workqueue, wq);
2496
2497 PTHREAD_TRACE(TRACE_wq_workqueue_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
2498 }
2499 }
2500
2501
2502 #pragma mark workqueue thread manipulation
2503
2504
2505 /**
2506 * Entry point for libdispatch to ask for threads
2507 */
2508 static int
2509 wqops_queue_reqthreads(struct proc *p, int reqcount,
2510 pthread_priority_t priority)
2511 {
2512 bool overcommit = _pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
2513 bool event_manager = _pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2514 int class = event_manager ? WORKQUEUE_EVENT_MANAGER_BUCKET :
2515 pthread_priority_get_class_index(priority);
2516
2517 if ((reqcount <= 0) || (class < 0) || (class >= WORKQUEUE_NUM_BUCKETS) ||
2518 (overcommit && event_manager)) {
2519 return EINVAL;
2520 }
2521
2522 struct workqueue *wq;
2523 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2524 return EINVAL;
2525 }
2526
2527 workqueue_lock_spin(wq);
2528 _threadreq_copy_prepare(wq);
2529
2530 PTHREAD_TRACE_WQ(TRACE_wq_wqops_reqthreads | DBG_FUNC_NONE, wq, reqcount, priority, 0, 0);
2531
2532 int tr_flags = 0;
2533 if (overcommit) tr_flags |= TR_FLAG_OVERCOMMIT;
2534 if (reqcount > 1) {
2535 /*
2536 * When libdispatch asks for more than one thread, it wants to achieve
2537 * parallelism. Pacing would be detrimental to that goal, so treat
2538 * these requests specially and skip the pacing admission check.
2539 */
2540 tr_flags |= TR_FLAG_NO_PACING;
2541 }
2542
2543 while (reqcount-- && !_wq_exiting(wq)) {
2544 struct threadreq req;
2545 _threadreq_init_stack(&req, class, tr_flags);
2546
2547 workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, true);
2548
2549 workqueue_lock_spin(wq); /* reacquire */
2550 _threadreq_copy_prepare(wq);
2551 }
2552
2553 workqueue_unlock(wq);
2554
2555 return 0;
2556 }
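
/*
 * Userspace-side sketch (not compiled here) of how this entry point is
 * reached: libdispatch issues the workq_kernreturn syscall with
 * WQOPS_QUEUE_REQTHREADS, a thread count in arg2 and a pthread_priority_t
 * in arg3 (see _workq_kernreturn below).  The wrapper name and exact
 * signature are assumptions for illustration.
 */
#if 0
extern int __workq_kernreturn(int options, void *item, int arg2, int arg3);

static void
request_two_overcommit_threads(pthread_priority_t pri)
{
	/* ask the kernel for two overcommit threads at the given priority */
	__workq_kernreturn(WQOPS_QUEUE_REQTHREADS, NULL, 2,
			(int)(pri | _PTHREAD_PRIORITY_OVERCOMMIT_FLAG));
}
#endif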
2557
2558 /*
2559 * Used by the kevent system to request threads.
2560 *
2561 * Currently count is ignored and we always return one thread per invocation.
2562 */
2563 static thread_t
2564 _workq_kevent_reqthreads(struct proc *p, pthread_priority_t priority,
2565 bool no_emergency)
2566 {
2567 int wq_run_tr = WQ_RUN_TR_THROTTLED;
2568 bool emergency_thread = false;
2569 struct threadreq req;
2570
2571
2572 struct workqueue *wq;
2573 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2574 return THREAD_NULL;
2575 }
2576
2577 int class = pthread_priority_get_class_index(priority);
2578
2579 workqueue_lock_spin(wq);
2580 bool has_threadreq = _threadreq_copy_prepare_noblock(wq);
2581
2582 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, NULL, priority, 0, 0);
2583
2584 /*
2585 * Skip straight to event manager if that's what was requested
2586 */
2587 if ((_pthread_priority_get_qos_newest(priority) == QOS_CLASS_UNSPECIFIED) ||
2588 (_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)){
2589 goto event_manager;
2590 }
2591
2592 bool will_pace = _wq_should_pace_priority(wq, class);
2593 if ((wq->wq_thidlecount == 0 || will_pace) && has_threadreq == false) {
2594 /*
2595 * We'll need to persist the request and can't, so return the emergency
2596 * thread instead, which has a persistent request object.
2597 */
2598 emergency_thread = true;
2599 goto event_manager;
2600 }
2601
2602 /*
2603 * Handle overcommit requests
2604 */
2605 if ((_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0){
2606 _threadreq_init_stack(&req, class, TR_FLAG_KEVENT | TR_FLAG_OVERCOMMIT);
2607 wq_run_tr = workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, false);
2608 goto done;
2609 }
2610
2611 /*
2612 * Handle constrained requests
2613 */
2614 boolean_t may_start = may_start_constrained_thread(wq, class, NULL, false);
2615 if (may_start || no_emergency) {
2616 _threadreq_init_stack(&req, class, TR_FLAG_KEVENT);
2617 wq_run_tr = workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, false);
2618 goto done;
2619 } else {
2620 emergency_thread = true;
2621 }
2622
2623
2624 event_manager:
2625 _threadreq_init_stack(&req, WORKQUEUE_EVENT_MANAGER_BUCKET, TR_FLAG_KEVENT);
2626 wq_run_tr = workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, false);
2627
2628 done:
2629 if (wq_run_tr == WQ_RUN_TR_THREAD_NEEDED && WQ_TIMER_IMMEDIATE_NEEDED(wq)) {
2630 workqueue_interval_timer_trigger(wq);
2631 }
2632 return emergency_thread ? (void*)-1 : 0;
2633 }
2634
2635 thread_t
2636 _workq_reqthreads(struct proc *p, __assert_only int requests_count,
2637 workq_reqthreads_req_t request)
2638 {
2639 assert(requests_count == 1);
2640
2641 pthread_priority_t priority = request->priority;
2642 bool no_emergency = request->count & WORKQ_REQTHREADS_NOEMERGENCY;
2643
2644 return _workq_kevent_reqthreads(p, priority, no_emergency);
2645 }
2646
2647
2648 int
2649 workq_kern_threadreq(struct proc *p, workq_threadreq_t _req,
2650 enum workq_threadreq_type type, unsigned long priority, int flags)
2651 {
2652 struct workqueue *wq;
2653 int ret;
2654
2655 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2656 return EINVAL;
2657 }
2658
2659 switch (type) {
2660 case WORKQ_THREADREQ_KEVENT: {
2661 bool no_emergency = flags & WORKQ_THREADREQ_FLAG_NOEMERGENCY;
2662 (void)_workq_kevent_reqthreads(p, priority, no_emergency);
2663 return 0;
2664 }
2665 case WORKQ_THREADREQ_WORKLOOP:
2666 case WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL: {
2667 struct threadreq *req = (struct threadreq *)_req;
2668 int req_class = pthread_priority_get_class_index(priority);
2669 int req_flags = TR_FLAG_WORKLOOP;
2670 if ((_pthread_priority_get_flags(priority) &
2671 _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0){
2672 req_flags |= TR_FLAG_OVERCOMMIT;
2673 }
2674
2675 thread_t thread = current_thread();
2676 struct threadlist *tl = util_get_thread_threadlist_entry(thread);
2677
2678 if (tl && tl != WQ_THREADLIST_EXITING_POISON &&
2679 (tl->th_flags & TH_LIST_UNBINDING)) {
2680 /*
2681 * We're called back synchronously from the context of
2682 * kevent_qos_internal_unbind() from within wqops_thread_return(),
2683 * so we can try to match up this thread with this request!
2684 */
2685 } else {
2686 tl = NULL;
2687 }
2688
2689 _threadreq_init_alloced(req, req_class, req_flags);
2690 workqueue_lock_spin(wq);
2691 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, req, priority, 1, 0);
2692 ret = workqueue_run_threadreq_and_unlock(p, wq, tl, req, false);
2693 if (ret == WQ_RUN_TR_EXITING) {
2694 return ECANCELED;
2695 }
2696 if (ret == WQ_RUN_TR_THREAD_NEEDED) {
2697 if (type == WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL) {
2698 return EAGAIN;
2699 }
2700 if (WQ_TIMER_IMMEDIATE_NEEDED(wq)) {
2701 workqueue_interval_timer_trigger(wq);
2702 }
2703 }
2704 return 0;
2705 }
2706 case WORKQ_THREADREQ_REDRIVE:
2707 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, 0, 0, 4, 0);
2708 workqueue_lock_spin(wq);
2709 ret = workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
2710 if (ret == WQ_RUN_TR_EXITING) {
2711 return ECANCELED;
2712 }
2713 return 0;
2714 default:
2715 return ENOTSUP;
2716 }
2717 }
2718
2719 int
2720 workq_kern_threadreq_modify(struct proc *p, workq_threadreq_t _req,
2721 enum workq_threadreq_op operation, unsigned long arg1,
2722 unsigned long __unused arg2)
2723 {
2724 struct threadreq *req = (struct threadreq *)_req;
2725 struct workqueue *wq;
2726 int priclass, ret = 0, wq_tr_rc = WQ_RUN_TR_THROTTLED;
2727
2728 if (req == NULL || (wq = pthread_kern->proc_get_wqptr(p)) == NULL) {
2729 return EINVAL;
2730 }
2731
2732 workqueue_lock_spin(wq);
2733
2734 if (_wq_exiting(wq)) {
2735 ret = ECANCELED;
2736 goto out_unlock;
2737 }
2738
2739 /*
2740 * Find/validate the referenced request structure
2741 */
2742 if (req->tr_state != TR_STATE_WAITING) {
2743 ret = EINVAL;
2744 goto out_unlock;
2745 }
2746 assert(req->tr_priority < WORKQUEUE_EVENT_MANAGER_BUCKET);
2747 assert(req->tr_flags & TR_FLAG_WORKLOOP);
2748
2749 switch (operation) {
2750 case WORKQ_THREADREQ_CHANGE_PRI:
2751 case WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL:
2752 priclass = pthread_priority_get_class_index(arg1);
2753 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, req, arg1, 2, 0);
2754 if (req->tr_priority == priclass) {
2755 goto out_unlock;
2756 }
2757 _threadreq_dequeue(wq, req);
2758 req->tr_priority = priclass;
2759 req->tr_state = TR_STATE_NEW; // what was old is new again
2760 wq_tr_rc = workqueue_run_threadreq_and_unlock(p, wq, NULL, req, false);
2761 goto out;
2762
2763 case WORKQ_THREADREQ_CANCEL:
2764 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, req, 0, 3, 0);
2765 _threadreq_dequeue(wq, req);
2766 req->tr_state = TR_STATE_DEAD;
2767 break;
2768
2769 default:
2770 ret = ENOTSUP;
2771 break;
2772 }
2773
2774 out_unlock:
2775 workqueue_unlock(wq);
2776 out:
2777 if (wq_tr_rc == WQ_RUN_TR_THREAD_NEEDED) {
2778 if (operation == WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL) {
2779 ret = EAGAIN;
2780 } else if (WQ_TIMER_IMMEDIATE_NEEDED(wq)) {
2781 workqueue_interval_timer_trigger(wq);
2782 }
2783 }
2784 return ret;
2785 }
2786
2787
2788 static int
2789 wqops_thread_return(struct proc *p, struct workqueue *wq)
2790 {
2791 thread_t th = current_thread();
2792 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
2793 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
2794
2795 /* reset signal mask on the workqueue thread to default state */
2796 if (pthread_kern->uthread_get_sigmask(uth) != (sigset_t)(~workq_threadmask)) {
2797 pthread_kern->proc_lock(p);
2798 pthread_kern->uthread_set_sigmask(uth, ~workq_threadmask);
2799 pthread_kern->proc_unlock(p);
2800 }
2801
2802 if (wq == NULL || !tl) {
2803 return EINVAL;
2804 }
2805
2806 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_START, tl->th_workq, 0, 0, 0, 0);
2807
2808 /*
2809 * This squash call has neat semantics: it removes the specified overrides,
2810 * replacing the current requested QoS with the previous effective QoS from
2811 * those overrides. This means we won't be preempted due to having our QoS
2812 * lowered. Of course, now our understanding of the thread's QoS is wrong,
2813 * so we'll adjust below.
2814 */
2815 bool was_manager = (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
2816 int new_qos;
2817
2818 if (!was_manager) {
2819 new_qos = pthread_kern->proc_usynch_thread_qos_squash_override_for_resource(th,
2820 THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD,
2821 THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE);
2822 }
2823
2824 PTHREAD_TRACE_WQ(TRACE_wq_runitem | DBG_FUNC_END, wq, tl->th_priority, 0, 0, 0);
2825
2826 workqueue_lock_spin(wq);
2827
2828 if (tl->th_flags & TH_LIST_KEVENT_BOUND) {
2829 unsigned int flags = KEVENT_FLAG_WORKQ;
2830 if (was_manager) {
2831 flags |= KEVENT_FLAG_WORKQ_MANAGER;
2832 }
2833
2834 tl->th_flags |= TH_LIST_UNBINDING;
2835 workqueue_unlock(wq);
2836 kevent_qos_internal_unbind(p, class_index_get_thread_qos(tl->th_priority), th, flags);
2837 if (!(tl->th_flags & TH_LIST_UNBINDING)) {
2838 _setup_wqthread(p, th, wq, tl, WQ_SETUP_CLEAR_VOUCHER);
2839 pthread_kern->unix_syscall_return(EJUSTRETURN);
2840 __builtin_unreachable();
2841 }
2842 workqueue_lock_spin(wq);
2843 tl->th_flags &= ~(TH_LIST_KEVENT_BOUND | TH_LIST_UNBINDING);
2844 }
2845
2846 if (!was_manager) {
2847 /* Fix up counters from the squash operation. */
2848 uint8_t old_bucket = tl->th_priority;
2849 uint8_t new_bucket = thread_qos_get_class_index(new_qos);
2850
2851 if (old_bucket != new_bucket) {
2852 _wq_thactive_move(wq, old_bucket, new_bucket);
2853 wq->wq_thscheduled_count[old_bucket]--;
2854 wq->wq_thscheduled_count[new_bucket]++;
2855
2856 PTHREAD_TRACE_WQ(TRACE_wq_thread_squash | DBG_FUNC_NONE, wq, tl->th_priority, new_bucket, 0, 0);
2857 tl->th_priority = new_bucket;
2858 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_END, tl->th_workq, new_qos, 0, 0, 0);
2859 }
2860 }
2861
2862 workqueue_run_threadreq_and_unlock(p, wq, tl, NULL, false);
2863 return 0;
2864 }
2865
2866 /**
2867 * Multiplexed call to interact with the workqueue mechanism
2868 */
2869 int
2870 _workq_kernreturn(struct proc *p,
2871 int options,
2872 user_addr_t item,
2873 int arg2,
2874 int arg3,
2875 int32_t *retval)
2876 {
2877 struct workqueue *wq;
2878 int error = 0;
2879
2880 if (pthread_kern->proc_get_register(p) == 0) {
2881 return EINVAL;
2882 }
2883
2884 switch (options) {
2885 case WQOPS_QUEUE_NEWSPISUPP: {
2886 /*
2887 * arg2 = offset of serialno into dispatch queue
2888 * arg3 = kevent support
2889 */
2890 int offset = arg2;
2891 if (arg3 & 0x01){
2892 // If we get here, then userspace has indicated support for kevent delivery.
2893 }
2894
2895 pthread_kern->proc_set_dispatchqueue_serialno_offset(p, (uint64_t)offset);
2896 break;
2897 }
2898 case WQOPS_QUEUE_REQTHREADS: {
2899 /*
2900 * arg2 = number of threads to start
2901 * arg3 = priority
2902 */
2903 error = wqops_queue_reqthreads(p, arg2, arg3);
2904 break;
2905 }
2906 case WQOPS_SET_EVENT_MANAGER_PRIORITY: {
2907 /*
2908 * arg2 = priority for the manager thread
2909 *
2910 * if _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG is set, the
2911 * ~_PTHREAD_PRIORITY_FLAGS_MASK contains a scheduling priority instead
2912 * of a QOS value
2913 */
2914 pthread_priority_t pri = arg2;
2915
2916 wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2917 if (wq == NULL) {
2918 error = EINVAL;
2919 break;
2920 }
2921 workqueue_lock_spin(wq);
2922 if (pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){
2923 /*
2924 * If userspace passes a scheduling priority, that takes precedence
2925 * over any QoS. (So, userspace should take care not to accidentally
2926 * lower the priority this way.)
2927 */
2928 uint32_t sched_pri = pri & _PTHREAD_PRIORITY_SCHED_PRI_MASK;
2929 if (wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){
2930 wq->wq_event_manager_priority = MAX(sched_pri, wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_MASK)
2931 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2932 } else {
2933 wq->wq_event_manager_priority = sched_pri
2934 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2935 }
2936 } else if ((wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){
2937 int cur_qos = pthread_priority_get_thread_qos(wq->wq_event_manager_priority);
2938 int new_qos = pthread_priority_get_thread_qos(pri);
2939 wq->wq_event_manager_priority = (uint32_t)thread_qos_get_pthread_priority(MAX(cur_qos, new_qos)) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2940 }
2941 workqueue_unlock(wq);
2942 break;
2943 }
2944 case WQOPS_THREAD_KEVENT_RETURN:
2945 case WQOPS_THREAD_WORKLOOP_RETURN:
2946 wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2947 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_END, wq, options, 0, 0, 0);
2948 if (item != 0 && arg2 != 0) {
2949 int32_t kevent_retval;
2950 int ret;
2951 if (options == WQOPS_THREAD_KEVENT_RETURN) {
2952 ret = kevent_qos_internal(p, -1, item, arg2, item, arg2, NULL, NULL,
2953 KEVENT_FLAG_WORKQ | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS,
2954 &kevent_retval);
2955 } else /* options == WQOPS_THREAD_WORKLOOP_RETURN */ {
2956 kqueue_id_t kevent_id = -1;
2957 ret = kevent_id_internal(p, &kevent_id, item, arg2, item, arg2,
2958 NULL, NULL,
2959 KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS,
2960 &kevent_retval);
2961 }
2962 /*
2963 * We shouldn't be getting more errors out than events we put in, so
2964 * reusing the input buffer should always provide enough space. But,
2965 * the assert is commented out since we get errors in edge cases in the
2966 * process lifecycle.
2967 */
2968 //assert(ret == KERN_SUCCESS && kevent_retval >= 0);
2969 if (ret != KERN_SUCCESS){
2970 error = ret;
2971 break;
2972 } else if (kevent_retval > 0){
2973 assert(kevent_retval <= arg2);
2974 *retval = kevent_retval;
2975 error = 0;
2976 break;
2977 }
2978 }
2979 goto thread_return;
2980
2981 case WQOPS_THREAD_RETURN:
2982 wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2983 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_END, wq, options, 0, 0, 0);
2984 thread_return:
2985 error = wqops_thread_return(p, wq);
2986 // NOT REACHED except in case of error
2987 assert(error);
2988 break;
2989
2990 case WQOPS_SHOULD_NARROW: {
2991 /*
2992 * arg2 = priority to test
2993 * arg3 = unused
2994 */
2995 pthread_priority_t priority = arg2;
2996 thread_t th = current_thread();
2997 struct threadlist *tl = util_get_thread_threadlist_entry(th);
2998
2999 if (tl == NULL || (tl->th_flags & TH_LIST_CONSTRAINED) == 0) {
3000 error = EINVAL;
3001 break;
3002 }
3003
3004 int class = pthread_priority_get_class_index(priority);
3005 wq = tl->th_workq;
3006 workqueue_lock_spin(wq);
3007 bool should_narrow = !may_start_constrained_thread(wq, class, tl, false);
3008 workqueue_unlock(wq);
3009
3010 *retval = should_narrow;
3011 break;
3012 }
3013 default:
3014 error = EINVAL;
3015 break;
3016 }
3017
3018 switch (options) {
3019 case WQOPS_THREAD_KEVENT_RETURN:
3020 case WQOPS_THREAD_WORKLOOP_RETURN:
3021 case WQOPS_THREAD_RETURN:
3022 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START, wq, options, 0, 0, 0);
3023 break;
3024 }
3025 return (error);
3026 }
3027
3028 /*
3029 * We have no work to do, park ourselves on the idle list.
3030 *
3031 * Consumes the workqueue lock and does not return.
3032 */
3033 static void __dead2
3034 parkit(struct workqueue *wq, struct threadlist *tl, thread_t thread)
3035 {
3036 assert(thread == tl->th_thread);
3037 assert(thread == current_thread());
3038
3039 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_START, wq, 0, 0, 0, 0);
3040
3041 uint32_t us_to_wait = 0;
3042
3043 TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
3044
3045 tl->th_flags &= ~TH_LIST_RUNNING;
3046 tl->th_flags &= ~TH_LIST_KEVENT;
3047 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
3048
3049 if (tl->th_flags & TH_LIST_CONSTRAINED) {
3050 wq->wq_constrained_threads_scheduled--;
3051 tl->th_flags &= ~TH_LIST_CONSTRAINED;
3052 }
3053
3054 _wq_thactive_dec(wq, tl->th_priority);
3055 wq->wq_thscheduled_count[tl->th_priority]--;
3056 wq->wq_threads_scheduled--;
3057 uint32_t thidlecount = ++wq->wq_thidlecount;
3058
3059 pthread_kern->thread_sched_call(thread, NULL);
3060
3061 /*
3062 * We'd like to always have one manager thread parked so that we can have
3063 * low latency when we need to bring a manager thread up. If that idle
3064 * thread list is empty, make this thread a manager thread.
3065 *
3066 * XXX: This doesn't check that there's not a manager thread outstanding,
3067 * so it's based on the assumption that most manager callouts will change
3068 * their QoS before parking. If that stops being true, this may end up
3069 * costing us more than we gain.
3070 */
3071 if (TAILQ_EMPTY(&wq->wq_thidlemgrlist) &&
3072 tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET){
3073 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority | DBG_FUNC_NONE,
3074 wq, thread_tid(thread),
3075 (tl->th_priority << 16) | WORKQUEUE_EVENT_MANAGER_BUCKET, 2, 0);
3076 reset_priority(tl, pthread_priority_from_wq_class_index(wq, WORKQUEUE_EVENT_MANAGER_BUCKET));
3077 tl->th_priority = WORKQUEUE_EVENT_MANAGER_BUCKET;
3078 }
3079
3080 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET){
3081 TAILQ_INSERT_HEAD(&wq->wq_thidlemgrlist, tl, th_entry);
3082 } else {
3083 TAILQ_INSERT_HEAD(&wq->wq_thidlelist, tl, th_entry);
3084 }
3085
3086 /*
3087 * When we remove the voucher from the thread, we may lose our importance
3088 * causing us to get preempted, so we do this after putting the thread on
3089 * the idle list. That way, when we get our importance back, we'll be able
3090 * to use this thread from e.g. the kevent call out to deliver a boosting
3091 * message.
3092 */
3093 tl->th_flags |= TH_LIST_REMOVING_VOUCHER;
3094 workqueue_unlock(wq);
3095 if (pthread_kern->thread_will_park_or_terminate) {
3096 pthread_kern->thread_will_park_or_terminate(tl->th_thread);
3097 }
3098 __assert_only kern_return_t kr;
3099 kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
3100 assert(kr == KERN_SUCCESS);
3101 workqueue_lock_spin(wq);
3102 tl->th_flags &= ~(TH_LIST_REMOVING_VOUCHER);
3103
3104 if ((tl->th_flags & TH_LIST_RUNNING) == 0) {
3105 if (thidlecount < 101) {
3106 us_to_wait = wq_reduce_pool_window_usecs - ((thidlecount-2) * (wq_reduce_pool_window_usecs / 100));
3107 } else {
3108 us_to_wait = wq_reduce_pool_window_usecs / 100;
3109 }
3110
3111 thread_set_pending_block_hint(thread, kThreadWaitParkedWorkQueue);
3112 assert_wait_timeout_with_leeway((caddr_t)tl, (THREAD_INTERRUPTIBLE),
3113 TIMEOUT_URGENCY_SYS_BACKGROUND|TIMEOUT_URGENCY_LEEWAY, us_to_wait,
3114 wq_reduce_pool_window_usecs/10, NSEC_PER_USEC);
3115
3116 workqueue_unlock(wq);
3117
3118 thread_block(wq_unpark_continue);
3119 panic("thread_block(wq_unpark_continue) returned!");
3120 } else {
3121 workqueue_unlock(wq);
3122
3123 /*
3124 * While we'd dropped the lock to unset our voucher, someone came
3125 * around and made us runnable. But because we weren't waiting on the
3126 * event their wakeup() was ineffectual. To correct for that, we just
3127 * run the continuation ourselves.
3128 */
3129 wq_unpark_continue(NULL, THREAD_AWAKENED);
3130 }
3131 }
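
/*
 * Sketch (not compiled) of the park timeout computed above: the more idle
 * threads are already parked, the sooner this one gives up and terminates.
 * With the default wq_reduce_pool_window_usecs (on the order of seconds),
 * early parkers wait roughly the full window while the hundredth and later
 * wait only about 1% of it.  The helper name is hypothetical.
 */
#if 0
static uint32_t
wq_park_timeout_usecs(uint32_t thidlecount)
{
	if (thidlecount < 101) {
		return wq_reduce_pool_window_usecs -
				((thidlecount - 2) * (wq_reduce_pool_window_usecs / 100));
	}
	return wq_reduce_pool_window_usecs / 100;
}
#endif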
3132
3133 static bool
3134 may_start_constrained_thread(struct workqueue *wq, uint32_t at_priclass,
3135 struct threadlist *tl, bool may_start_timer)
3136 {
3137 uint32_t req_qos = _wq_thactive_best_constrained_req_qos(wq);
3138 wq_thactive_t thactive;
3139
3140 if (may_start_timer && at_priclass < req_qos) {
3141 /*
3142 * When called from workqueue_run_threadreq_and_unlock(), pre-post the
3143 * new, higher priority into the thactive state so that
3144 * workqueue_callback() makes the right decision.
3145 *
3146 * If the admission check passes, workqueue_run_threadreq_and_unlock
3147 * will reset this value before running the request.
3148 */
3149 thactive = _wq_thactive_set_best_constrained_req_qos(wq, req_qos,
3150 at_priclass);
3151 #ifdef __LP64__
3152 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update, 1, (uint64_t)thactive,
3153 (uint64_t)(thactive >> 64), 0, 0);
3154 #endif
3155 } else {
3156 thactive = _wq_thactive(wq);
3157 }
3158
3159 uint32_t constrained_threads = wq->wq_constrained_threads_scheduled;
3160 if (tl && (tl->th_flags & TH_LIST_CONSTRAINED)) {
3161 /*
3162 * don't count the current thread as scheduled
3163 */
3164 constrained_threads--;
3165 }
3166 if (constrained_threads >= wq_max_constrained_threads) {
3167 PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 1,
3168 wq->wq_constrained_threads_scheduled,
3169 wq_max_constrained_threads, 0);
3170 /*
3171 * we need 1 or more constrained threads to return to the kernel before
3172 * we can dispatch additional work
3173 */
3174 return false;
3175 }
3176
3177 /*
3178 * Compute a metric for how many threads are active. We find the
3179 * highest priority request outstanding and then add up the number of
3180 * active threads in that and all higher-priority buckets. We'll also add
3181 * any "busy" threads which are not active but blocked recently enough that
3182 * we can't be sure they've gone idle yet. We'll then compare this metric
3183 * to our max concurrency to decide whether to add a new thread.
3184 */
3185
3186 uint32_t busycount, thactive_count;
3187
3188 thactive_count = _wq_thactive_aggregate_downto_qos(wq, thactive,
3189 at_priclass, &busycount, NULL);
3190
3191 if (tl && tl->th_priority <= at_priclass) {
3192 /*
3193 * don't count this thread as currently active
3194 */
3195 assert(thactive_count > 0);
3196 thactive_count--;
3197 }
3198
3199 if (thactive_count + busycount < wq_max_concurrency[at_priclass]) {
3200 PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 2,
3201 thactive_count, busycount, 0);
3202 return true;
3203 } else {
3204 PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 3,
3205 thactive_count, busycount, 0);
3206 }
3207
3208 if (busycount && may_start_timer) {
3209 /*
3210 * If this is called from the add timer, we won't have another timer
3211 * fire when the thread exits the "busy" state, so rearm the timer.
3212 */
3213 if (WQ_TIMER_DELAYED_NEEDED(wq)) {
3214 workqueue_interval_timer_start(wq);
3215 }
3216 }
3217
3218 return false;
3219 }
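
/*
 * Worked example (sketch, not compiled) of the admission metric above,
 * assuming wq_max_concurrency[at_priclass] == 4 (NCPU = 4): 2 active threads
 * plus 1 busy thread is 3 < 4, so a new constrained thread is admitted;
 * 3 active plus 1 busy is 4, which is not < 4, so admission is refused and
 * the add timer re-checks once the busy thread settles.  The constant is an
 * assumption for illustration.
 */
#if 0
static bool
wq_constrained_admission_example(uint32_t thactive_count, uint32_t busycount)
{
	const uint32_t max_concurrency = 4;	/* assumed: NCPU = 4 */
	return thactive_count + busycount < max_concurrency;
}
#endif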
3220
3221 static struct threadlist *
3222 pop_from_thidlelist(struct workqueue *wq, uint32_t priclass)
3223 {
3224 assert(wq->wq_thidlecount);
3225
3226 struct threadlist *tl = NULL;
3227
3228 if (!TAILQ_EMPTY(&wq->wq_thidlemgrlist) &&
3229 (priclass == WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlelist))){
3230 tl = TAILQ_FIRST(&wq->wq_thidlemgrlist);
3231 TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
3232 assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
3233 } else if (!TAILQ_EMPTY(&wq->wq_thidlelist) &&
3234 (priclass != WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlemgrlist))){
3235 tl = TAILQ_FIRST(&wq->wq_thidlelist);
3236 TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
3237 assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
3238 } else {
3239 panic("pop_from_thidlelist called with no threads available");
3240 }
3241 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
3242
3243 assert(wq->wq_thidlecount);
3244 wq->wq_thidlecount--;
3245
3246 TAILQ_INSERT_TAIL(&wq->wq_thrunlist, tl, th_entry);
3247
3248 tl->th_flags |= TH_LIST_RUNNING | TH_LIST_BUSY;
3249
3250 wq->wq_threads_scheduled++;
3251 wq->wq_thscheduled_count[priclass]++;
3252 _wq_thactive_inc(wq, priclass);
3253 return tl;
3254 }
3255
3256 static pthread_priority_t
3257 pthread_priority_from_wq_class_index(struct workqueue *wq, int index)
3258 {
3259 if (index == WORKQUEUE_EVENT_MANAGER_BUCKET){
3260 return wq->wq_event_manager_priority;
3261 } else {
3262 return class_index_get_pthread_priority(index);
3263 }
3264 }
3265
3266 static void
3267 reset_priority(struct threadlist *tl, pthread_priority_t pri)
3268 {
3269 kern_return_t ret;
3270 thread_t th = tl->th_thread;
3271
3272 if ((pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){
3273 ret = pthread_kern->thread_set_workq_qos(th, pthread_priority_get_thread_qos(pri), 0);
3274 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3275
3276 if (tl->th_flags & TH_LIST_EVENT_MGR_SCHED_PRI) {
3277
3278 /* Reset priority to default (masked by QoS) */
3279
3280 ret = pthread_kern->thread_set_workq_pri(th, 31, POLICY_TIMESHARE);
3281 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3282
3283 tl->th_flags &= ~TH_LIST_EVENT_MGR_SCHED_PRI;
3284 }
3285 } else {
3286 ret = pthread_kern->thread_set_workq_qos(th, THREAD_QOS_UNSPECIFIED, 0);
3287 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3288 ret = pthread_kern->thread_set_workq_pri(th, (pri & (~_PTHREAD_PRIORITY_FLAGS_MASK)), POLICY_TIMESHARE);
3289 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3290
3291 tl->th_flags |= TH_LIST_EVENT_MGR_SCHED_PRI;
3292 }
3293 }
3294
3295 /*
3296 * Picks the best request to run, and returns the best overcommit fallback
3297 * if the best pick is non-overcommit and risks failing its admission check.
3298 */
3299 static struct threadreq *
3300 workqueue_best_threadreqs(struct workqueue *wq, struct threadlist *tl,
3301 struct threadreq **fallback)
3302 {
3303 struct threadreq *req, *best_req = NULL;
3304 int priclass, prilimit;
3305
3306 if ((wq->wq_event_manager_threadreq.tr_state == TR_STATE_WAITING) &&
3307 ((wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0) ||
3308 (tl && tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET))) {
3309 /*
3310 * There's an event manager request and either:
3311 * - no event manager currently running
3312 * - we are re-using the event manager
3313 */
3314 req = &wq->wq_event_manager_threadreq;
3315 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select | DBG_FUNC_NONE, wq, req, 1, 0, 0);
3316 return req;
3317 }
3318
3319 if (tl) {
3320 prilimit = WORKQUEUE_EVENT_MANAGER_BUCKET;
3321 } else {
3322 prilimit = _wq_highest_paced_priority(wq);
3323 }
3324 for (priclass = 0; priclass < prilimit; priclass++) {
3325 req = TAILQ_FIRST(&wq->wq_overcommit_reqlist[priclass]);
3326 if (req) {
3327 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select | DBG_FUNC_NONE, wq, req, 2, 0, 0);
3328 if (best_req) {
3329 *fallback = req;
3330 } else {
3331 best_req = req;
3332 }
3333 break;
3334 }
3335 if (!best_req) {
3336 best_req = TAILQ_FIRST(&wq->wq_reqlist[priclass]);
3337 if (best_req) {
3338 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select | DBG_FUNC_NONE, wq, best_req, 3, 0, 0);
3339 }
3340 }
3341 }
3342 return best_req;
3343 }
3344
3345 /**
3346 * Runs a thread request on a thread
3347 *
3348 * - if thread is THREAD_NULL, will find a thread and run the request there.
3349 * Otherwise, the thread must be the current thread.
3350 *
3351 * - if req is NULL, will find the highest priority request and run that. If
3352 * it is not NULL, it must be a threadreq object in state NEW. If it can not
3353 * be run immediately, it will be enqueued and moved to state WAITING.
3354 *
3355 * Either way, the thread request object serviced will be moved to state
3356 * PENDING and attached to the threadlist.
3357 *
3358 * Should be called with the workqueue lock held. Will drop it.
3359 *
3360 * WARNING: _workq_kevent_reqthreads needs to be able to preflight any
3361 * admission checks in this function. If you are changing this function,
3362 * keep that one up-to-date.
3363 *
3364 * - if parking_tl is non-NULL, then the current thread is parking. This will
3365 * try to reuse this thread for a request. If no match is found, it will be
3366 * parked.
3367 */
3368 static int
3369 workqueue_run_threadreq_and_unlock(proc_t p, struct workqueue *wq,
3370 struct threadlist *parking_tl, struct threadreq *req,
3371 bool may_add_new_thread)
3372 {
3373 struct threadreq *incoming_req = req;
3374
3375 struct threadlist *tl = parking_tl;
3376 int rc = WQ_RUN_TR_THROTTLED;
3377
3378 assert(tl == NULL || tl->th_thread == current_thread());
3379 assert(req == NULL || req->tr_state == TR_STATE_NEW);
3380 assert(!may_add_new_thread || !tl);
3381
3382 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq | DBG_FUNC_START, wq, req,
3383 tl ? thread_tid(tl->th_thread) : 0,
3384 req ? (req->tr_priority << 16 | req->tr_flags) : 0, 0);
3385
3386 /*
3387 * Special cases when provided an event manager request
3388 */
3389 if (req && req->tr_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
3390 // Clients must not rely on identity of event manager requests
3391 assert(req->tr_flags & TR_FLAG_ONSTACK);
3392 // You can't be both overcommit and event manager
3393 assert((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0);
3394
3395 /*
3396 * We can only ever have one event manager request, so coalesce them if
3397 * there's already one outstanding.
3398 */
3399 if (wq->wq_event_manager_threadreq.tr_state == TR_STATE_WAITING) {
3400 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_mgr_merge | DBG_FUNC_NONE, wq, req, 0, 0, 0);
3401
3402 struct threadreq *existing_req = &wq->wq_event_manager_threadreq;
3403 if (req->tr_flags & TR_FLAG_KEVENT) {
3404 existing_req->tr_flags |= TR_FLAG_KEVENT;
3405 }
3406
3407 req = existing_req;
3408 incoming_req = NULL;
3409 }
3410
3411 if (wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] &&
3412 (!tl || tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET)){
3413 /*
3414 * There can only be one event manager running at a time.
3415 */
3416 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 1, 0, 0, 0);
3417 goto done;
3418 }
3419 }
3420
3421 again: // Start again after creating a thread
3422
3423 if (_wq_exiting(wq)) {
3424 rc = WQ_RUN_TR_EXITING;
3425 goto exiting;
3426 }
3427
3428 /*
3429 * Thread request selection and admission control
3430 */
3431 struct threadreq *fallback = NULL;
3432 if (req) {
3433 if ((req->tr_flags & TR_FLAG_NO_PACING) == 0 &&
3434 _wq_should_pace_priority(wq, req->tr_priority)) {
3435 /*
3436 * If a request fails the pacing admission check, then thread
3437 * requests are redriven when the pacing thread is finally scheduled
3438 * when it calls _wq_pacing_end() in wq_unpark_continue().
3439 */
3440 goto done;
3441 }
3442 } else if (wq->wq_reqcount == 0) {
3443 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 2, 0, 0, 0);
3444 goto done;
3445 } else if ((req = workqueue_best_threadreqs(wq, tl, &fallback)) == NULL) {
3446 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 3, 0, 0, 0);
3447 goto done;
3448 }
3449
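/*
 * Admission control: a constrained (non-overcommit) request below the
 * event manager bucket may only be serviced if
 * may_start_constrained_thread() allows it; otherwise switch to the
 * overcommit fallback found above, if there is one.
 */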
3450 if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0 &&
3451 (req->tr_priority < WORKQUEUE_EVENT_MANAGER_BUCKET)) {
3452 if (!may_start_constrained_thread(wq, req->tr_priority, parking_tl, true)) {
3453 if (!fallback) {
3454 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 4, 0, 0, 0);
3455 goto done;
3456 }
3457 assert(req->tr_state == TR_STATE_WAITING);
3458 req = fallback;
3459 }
3460 }
3461
3462 /*
3463 * Thread selection.
3464 */
3465 if (parking_tl) {
3466 if (tl->th_priority != req->tr_priority) {
3467 _wq_thactive_move(wq, tl->th_priority, req->tr_priority);
3468 wq->wq_thscheduled_count[tl->th_priority]--;
3469 wq->wq_thscheduled_count[req->tr_priority]++;
3470 }
3471 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select | DBG_FUNC_NONE,
3472 wq, 1, thread_tid(tl->th_thread), 0, 0);
3473 } else if (wq->wq_thidlecount) {
3474 tl = pop_from_thidlelist(wq, req->tr_priority);
3475 /*
3476 * This call will update wq_thscheduled_count and wq_thactive_count for
3477 * the provided priority. It will not set the returned thread to that
3478 * priority. This matches the behavior of the parking_tl clause above.
3479 */
3480 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select | DBG_FUNC_NONE,
3481 wq, 2, thread_tid(tl->th_thread), 0, 0);
3482 } else /* no idle threads */ {
3483 if (!may_add_new_thread || wq->wq_nthreads >= wq_max_threads) {
3484 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 5,
3485 may_add_new_thread, wq->wq_nthreads, 0);
3486 if (wq->wq_nthreads < wq_max_threads) {
3487 rc = WQ_RUN_TR_THREAD_NEEDED;
3488 }
3489 goto done;
3490 }
3491
3492 bool added_thread = workqueue_addnewthread(p, wq);
3493 /*
3494 * workqueue_addnewthread will drop and re-take the lock, so we
3495 * need to ensure we still have a cached request.
3496 *
3497 * It also means we have to pick a new request, since our old pick may
3498 * not be valid anymore.
3499 */
3500 req = incoming_req;
3501 if (req && (req->tr_flags & TR_FLAG_ONSTACK)) {
3502 _threadreq_copy_prepare(wq);
3503 }
3504
3505 if (added_thread) {
3506 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select | DBG_FUNC_NONE,
3507 wq, 3, 0, 0, 0);
3508 goto again;
3509 } else if (_wq_exiting(wq)) {
3510 rc = WQ_RUN_TR_EXITING;
3511 goto exiting;
3512 } else {
3513 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 6, 0, 0, 0);
3514 /*
3515 * Something caused thread creation to fail. Kick off the timer in
3516 * the hope that it'll succeed next time.
3517 */
3518 if (WQ_TIMER_DELAYED_NEEDED(wq)) {
3519 workqueue_interval_timer_start(wq);
3520 }
3521 goto done;
3522 }
3523 }
3524
3525 /*
3526 * Setup thread, mark request as complete and run with it.
3527 */
3528 if (req->tr_state == TR_STATE_WAITING) {
3529 _threadreq_dequeue(wq, req);
3530 }
3531 if (tl->th_priority != req->tr_priority) {
3532 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority | DBG_FUNC_NONE,
3533 wq, thread_tid(tl->th_thread),
3534 (tl->th_priority << 16) | req->tr_priority, 1, 0);
3535 reset_priority(tl, pthread_priority_from_wq_class_index(wq, req->tr_priority));
3536 tl->th_priority = (uint8_t)req->tr_priority;
3537 }
3538 if (req->tr_flags & TR_FLAG_OVERCOMMIT) {
3539 if ((tl->th_flags & TH_LIST_CONSTRAINED) != 0) {
3540 tl->th_flags &= ~TH_LIST_CONSTRAINED;
3541 wq->wq_constrained_threads_scheduled--;
3542 }
3543 } else {
3544 if ((tl->th_flags & TH_LIST_CONSTRAINED) == 0) {
3545 tl->th_flags |= TH_LIST_CONSTRAINED;
3546 wq->wq_constrained_threads_scheduled++;
3547 }
3548 }
3549
3550 if (!parking_tl && !(req->tr_flags & TR_FLAG_NO_PACING)) {
3551 _wq_pacing_start(wq, tl);
3552 }
3553 if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) {
3554 uint32_t old_qos, new_qos;
3555
3556 /*
3557 * If we are scheduling a constrained thread request, we may need to
3558 * update the best constrained qos in the thactive atomic state.
3559 */
3560 for (new_qos = 0; new_qos < WQ_THACTIVE_NO_PENDING_REQUEST; new_qos++) {
3561 if (TAILQ_FIRST(&wq->wq_reqlist[new_qos]))
3562 break;
3563 }
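/*
 * new_qos now holds the first class with a pending constrained request,
 * or WQ_THACTIVE_NO_PENDING_REQUEST if every constrained list is empty.
 */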
3564 old_qos = _wq_thactive_best_constrained_req_qos(wq);
3565 if (old_qos != new_qos) {
3566 wq_thactive_t v = _wq_thactive_set_best_constrained_req_qos(wq,
3567 old_qos, new_qos);
3568 #ifdef __LP64__
3569 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update, 2, (uint64_t)v,
3570 (uint64_t)(v >> 64), 0, 0);
3571 #else
3572 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update, 2, v, 0, 0, 0);
3573 #endif
3574 }
3575 }
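/*
 * Compute the upcall flags for userspace. They are stored shifted down by
 * WQ_FLAG_THREAD_PRIOSHIFT; _setup_wqthread() shifts them back up and
 * fills the low bits with the pthread priority before the upcall.
 */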
3576 {
3577 uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI;
3578 if (req->tr_flags & TR_FLAG_OVERCOMMIT)
3579 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
3580 if (req->tr_flags & TR_FLAG_KEVENT)
3581 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
3582 if (req->tr_flags & TR_FLAG_WORKLOOP)
3583 upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
3584 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET)
3585 upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
3586 tl->th_upcall_flags = upcall_flags >> WQ_FLAG_THREAD_PRIOSHIFT;
3587 }
3588 if (req->tr_flags & TR_FLAG_KEVENT) {
3589 tl->th_flags |= TH_LIST_KEVENT;
3590 } else {
3591 tl->th_flags &= ~TH_LIST_KEVENT;
3592 }
3593 return _threadreq_complete_and_unlock(p, wq, req, tl);
3594
3595 done:
3596 if (incoming_req) {
3597 _threadreq_enqueue(wq, incoming_req);
3598 }
3599
3600 exiting:
3601
3602 if (parking_tl && !(parking_tl->th_flags & TH_LIST_UNBINDING)) {
3603 parkit(wq, parking_tl, parking_tl->th_thread);
3604 __builtin_unreachable();
3605 }
3606
3607 workqueue_unlock(wq);
3608
3609 return rc;
3610 }
3611
3612 /**
3613 * Continuation run when a parked workqueue thread wakes up (or its idle timeout fires)
3614 */
3615 static void __dead2
3616 wq_unpark_continue(void* __unused ptr, wait_result_t wait_result)
3617 {
3618 boolean_t first_use = false;
3619 thread_t th = current_thread();
3620 proc_t p = current_proc();
3621
3622 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
3623 if (uth == NULL) goto done;
3624
3625 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
3626 if (wq == NULL) goto done;
3627
3628 workqueue_lock_spin(wq);
3629
3630 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
3631 assert(tl != WQ_THREADLIST_EXITING_POISON);
3632 if (tl == NULL) {
3633 /*
3634 * We woke up before addnewthread() was finished setting us up. Go
3635 * ahead and exit, but first poison the threadlist variable so that
3636 * addnewthread() doesn't think we are still valid.
3637 */
3638 pthread_kern->uthread_set_threadlist(uth, WQ_THREADLIST_EXITING_POISON);
3639 workqueue_unlock(wq);
3640 goto done;
3641 }
3642
3643 assert(tl->th_flags & TH_LIST_INITED);
3644
3645 if ((tl->th_flags & TH_LIST_NEW)){
3646 tl->th_flags &= ~(TH_LIST_NEW);
3647 first_use = true;
3648 }
3649
3650 if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) {
3651 /*
3652 * The normal wakeup path.
3653 */
3654 goto return_to_user;
3655 }
3656
3657 if ((tl->th_flags & TH_LIST_RUNNING) == 0 &&
3658 wait_result == THREAD_TIMED_OUT &&
3659 tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET &&
3660 TAILQ_FIRST(&wq->wq_thidlemgrlist) == tl &&
3661 TAILQ_NEXT(tl, th_entry) == NULL){
3662 /*
3663 * If we are the only idle manager and we popped for self-destruction,
3664 * then don't actually exit. Instead, free our stack to save some
3665 * memory and re-park.
3666 */
3667
3668 workqueue_unlock(wq);
3669
3670 vm_map_t vmap = wq->wq_map;
3671
3672 // Keep this in sync with _setup_wqthread()
3673 const vm_size_t guardsize = vm_map_page_size(vmap);
3674 const user_addr_t freeaddr = (user_addr_t)tl->th_stackaddr + guardsize;
3675 const vm_map_offset_t freesize = vm_map_trunc_page_mask((PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET) - 1, vm_map_page_mask(vmap)) - guardsize;
3676
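// Mark the stack (above the guard page) reusable so the VM system can
// reclaim its pages while we stay parked.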
3677 __assert_only int kr = mach_vm_behavior_set(vmap, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
3678 #if MACH_ASSERT
3679 if (kr != KERN_SUCCESS && kr != KERN_INVALID_ADDRESS) {
3680 os_log_error(OS_LOG_DEFAULT, "unable to make thread stack reusable (kr: %d)", kr);
3681 }
3682 #endif
3683
3684 workqueue_lock_spin(wq);
3685
3686 if ( !(tl->th_flags & TH_LIST_RUNNING)) {
3687 thread_set_pending_block_hint(th, kThreadWaitParkedWorkQueue);
3688 assert_wait((caddr_t)tl, (THREAD_INTERRUPTIBLE));
3689
3690 workqueue_unlock(wq);
3691
3692 thread_block(wq_unpark_continue);
3693 __builtin_unreachable();
3694 }
3695 }
3696
3697 if ((tl->th_flags & TH_LIST_RUNNING) == 0) {
3698 assert((tl->th_flags & TH_LIST_BUSY) == 0);
3699 if (!first_use) {
3700 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_END, wq, 0, 0, 0, 0);
3701 }
3702 /*
3703 * We were set running, but not for the purposes of actually running.
3704 * This could be because the timer elapsed, or because the thread was
3705 * aborted. Either way, we need to return to userspace to exit.
3706 *
3707 * The call to workqueue_removethread will consume the lock.
3708 */
3709
3710 if (!first_use &&
3711 (tl->th_priority < qos_class_get_class_index(WQ_THREAD_CLEANUP_QOS) ||
3712 (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET))) {
3713 // Reset the QoS to something low for the pthread cleanup
3714 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority | DBG_FUNC_NONE,
3715 wq, thread_tid(th),
3716 (tl->th_priority << 16) | qos_class_get_class_index(WQ_THREAD_CLEANUP_QOS), 3, 0);
3717 pthread_priority_t cleanup_pri = _pthread_priority_make_newest(WQ_THREAD_CLEANUP_QOS, 0, 0);
3718 reset_priority(tl, cleanup_pri);
3719 }
3720
3721 workqueue_removethread(tl, 0, first_use);
3722
3723 if (first_use){
3724 pthread_kern->thread_bootstrap_return();
3725 } else {
3726 pthread_kern->unix_syscall_return(0);
3727 }
3728 __builtin_unreachable();
3729 }
3730
3731 /*
3732 * The timer woke us up or the thread was aborted. However, we have
3733 * already started to make this a runnable thread. Wait for that to
3734 * finish, then continue to userspace.
3735 */
3736 while ((tl->th_flags & TH_LIST_BUSY)) {
3737 assert_wait((caddr_t)tl, (THREAD_UNINT));
3738
3739 workqueue_unlock(wq);
3740
3741 thread_block(THREAD_CONTINUE_NULL);
3742
3743 workqueue_lock_spin(wq);
3744 }
3745
3746 return_to_user:
3747 if (!first_use) {
3748 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_END, wq, 0, 0, 0, 0);
3749 }
3750 if (_wq_pacing_end(wq, tl) && wq->wq_reqcount) {
3751 workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
3752 } else {
3753 workqueue_unlock(wq);
3754 }
3755 _setup_wqthread(p, th, wq, tl, first_use ? WQ_SETUP_FIRST_USE : 0);
3756 pthread_kern->thread_sched_call(th, workqueue_callback);
3757 done:
3758 if (first_use){
3759 pthread_kern->thread_bootstrap_return();
3760 } else {
3761 pthread_kern->unix_syscall_return(EJUSTRETURN);
3762 }
3763 panic("Our attempt to return to userspace failed...");
3764 }
3765
3766 /**
3767 * configures initial thread stack/registers to jump into:
3768 * _pthread_wqthread(pthread_t self, mach_port_t kport, void *stackaddr, void *keventlist, int upcall_flags, int nkevents);
3769 * to get there we jump through assembly stubs in pthread_asm.s. Those
3770 * routines set up a stack frame, using the current stack pointer, and marshal
3771 * arguments from registers to the stack as required by the ABI.
3772 *
3773 * One odd thing we do here is to start the pthread_t 4k below what would be the
3774 * top of the stack otherwise. This is because usually only the first 4k of the
3775 * pthread_t will be used and so we want to put it on the same 16k page as the
3776 * top of the stack to save memory.
3777 *
3778 * When we are done the stack will look like:
3779 * |-----------| th_stackaddr + th_allocsize
3780 * |pthread_t | th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET
3781 * |kevent list| optionally - at most WQ_KEVENT_LIST_LEN events
3782 * |kevent data| optionally - at most WQ_KEVENT_DATA_SIZE bytes
3783 * |stack gap | bottom aligned to 16 bytes, and at least as big as stack_gap_min
3784 * | STACK |
3785 * | ⇓ |
3786 * | |
3787 * |guard page | guardsize
3788 * |-----------| th_stackaddr
3789 */
3790 void
3791 _setup_wqthread(proc_t p, thread_t th, struct workqueue *wq,
3792 struct threadlist *tl, int setup_flags)
3793 {
3794 int error;
3795 if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) {
3796 /*
3797 * For preemption reasons, we want to reset the voucher as late as
3798 * possible, so we do it in two places:
3799 * - Just before parking (i.e. in parkit())
3800 * - Prior to doing the setup for the next workitem (i.e. here)
3801 *
3802 * Those two places are sufficient to ensure we always reset it before
3803 * it goes back out to user space, but be careful to not break that
3804 * guarantee.
3805 */
3806 __assert_only kern_return_t kr;
3807 kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
3808 assert(kr == KERN_SUCCESS);
3809 }
3810
3811 uint32_t upcall_flags = tl->th_upcall_flags << WQ_FLAG_THREAD_PRIOSHIFT;
3812 if (!(setup_flags & WQ_SETUP_FIRST_USE)) {
3813 upcall_flags |= WQ_FLAG_THREAD_REUSE;
3814 }
3815
3816 /*
3817 * Put the QoS class value into the lower bits of the reuse_thread register; this is where
3818 * the thread priority used to be stored anyway.
3819 */
3820 pthread_priority_t priority = pthread_priority_from_wq_class_index(wq, tl->th_priority);
3821 upcall_flags |= (_pthread_priority_get_qos_newest(priority) & WQ_FLAG_THREAD_PRIOMASK);
3822
3823 const vm_size_t guardsize = vm_map_page_size(tl->th_workq->wq_map);
3824 const vm_size_t stack_gap_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_REDZONE_LEN;
3825 const vm_size_t stack_align_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_STK_ALIGN;
3826
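/*
 * The following mirror the layout diagram above: pthread_self_addr is
 * where the pthread_t will live, stack_top_addr is the first usable
 * (aligned) stack slot at least stack_gap_min below it, and
 * stack_bottom_addr is the lowest stack address, just above the guard
 * page.
 */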
3827 user_addr_t pthread_self_addr = (user_addr_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET);
3828 user_addr_t stack_top_addr = (user_addr_t)((pthread_self_addr - stack_gap_min) & -stack_align_min);
3829 user_addr_t stack_bottom_addr = (user_addr_t)(tl->th_stackaddr + guardsize);
3830
3831 user_addr_t wqstart_fnptr = pthread_kern->proc_get_wqthread(p);
3832 if (!wqstart_fnptr) {
3833 panic("workqueue thread start function pointer is NULL");
3834 }
3835
3836 if (setup_flags & WQ_SETUP_FIRST_USE) {
3837 uint32_t tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
3838 if (tsd_offset) {
3839 mach_vm_offset_t th_tsd_base = (mach_vm_offset_t)pthread_self_addr + tsd_offset;
3840 kern_return_t kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
3841 if (kret == KERN_SUCCESS) {
3842 upcall_flags |= WQ_FLAG_THREAD_TSD_BASE_SET;
3843 }
3844 }
3845
3846 /*
3847 * Pre-fault the first page of the new thread's stack and the page that will
3848 * contain the pthread_t structure.
3849 */
3850 vm_map_t vmap = pthread_kern->current_map();
3851 if (vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)) !=
3852 vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap))){
3853 vm_fault( vmap,
3854 vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)),
3855 VM_PROT_READ | VM_PROT_WRITE,
3856 FALSE,
3857 THREAD_UNINT, NULL, 0);
3858 }
3859 vm_fault( vmap,
3860 vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap)),
3861 VM_PROT_READ | VM_PROT_WRITE,
3862 FALSE,
3863 THREAD_UNINT, NULL, 0);
3864 }
3865
3866 user_addr_t kevent_list = NULL;
3867 int kevent_count = 0;
3868 if (upcall_flags & WQ_FLAG_THREAD_KEVENT){
3869 bool workloop = upcall_flags & WQ_FLAG_THREAD_WORKLOOP;
3870
3871 kevent_list = pthread_self_addr - WQ_KEVENT_LIST_LEN * sizeof(struct kevent_qos_s);
3872 kevent_count = WQ_KEVENT_LIST_LEN;
3873
3874 user_addr_t kevent_id_addr = kevent_list;
3875 if (workloop) {
3876 /*
3877 * The kevent ID goes just below the kevent list. Sufficiently new
3878 * userspace will know to look there. Old userspace will just
3879 * ignore it.
3880 */
3881 kevent_id_addr -= sizeof(kqueue_id_t);
3882 }
3883
3884 user_addr_t kevent_data_buf = kevent_id_addr - WQ_KEVENT_DATA_SIZE;
3885 user_size_t kevent_data_available = WQ_KEVENT_DATA_SIZE;
3886
3887 int32_t events_out = 0;
3888
3889 assert(tl->th_flags & TH_LIST_KEVENT_BOUND);
3890 unsigned int flags = KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE;
3891 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
3892 flags |= KEVENT_FLAG_WORKQ_MANAGER;
3893 }
3894 int ret = 0;
3895 if (workloop) {
3896 flags |= KEVENT_FLAG_WORKLOOP;
3897 kqueue_id_t kevent_id = -1;
3898 ret = kevent_id_internal(p, &kevent_id,
3899 NULL, 0, kevent_list, kevent_count,
3900 kevent_data_buf, &kevent_data_available,
3901 flags, &events_out);
3902 copyout(&kevent_id, kevent_id_addr, sizeof(kevent_id));
3903 } else {
3904 flags |= KEVENT_FLAG_WORKQ;
3905 ret = kevent_qos_internal(p,
3906 class_index_get_thread_qos(tl->th_priority),
3907 NULL, 0, kevent_list, kevent_count,
3908 kevent_data_buf, &kevent_data_available,
3909 flags, &events_out);
3910 }
3911
3912 // squash any errors into just empty output
3913 if (ret != KERN_SUCCESS || events_out == -1){
3914 events_out = 0;
3915 kevent_data_available = WQ_KEVENT_DATA_SIZE;
3916 }
3917
3918 // We shouldn't get data out if there aren't events available
3919 assert(events_out != 0 || kevent_data_available == WQ_KEVENT_DATA_SIZE);
3920
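// If the kevent call returned events, keep them on the stack and move the
// stack top down so it does not overlap the list or any out-of-line data
// that was used; otherwise pass userspace an empty kevent list.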
3921 if (events_out > 0){
3922 if (kevent_data_available == WQ_KEVENT_DATA_SIZE){
3923 stack_top_addr = (kevent_id_addr - stack_gap_min) & -stack_align_min;
3924 } else {
3925 stack_top_addr = (kevent_data_buf + kevent_data_available - stack_gap_min) & -stack_align_min;
3926 }
3927
3928 kevent_count = events_out;
3929 } else {
3930 kevent_list = NULL;
3931 kevent_count = 0;
3932 }
3933 }
3934
3935 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START, wq, 0, 0, 0, 0);
3936
3937 #if defined(__i386__) || defined(__x86_64__)
3938 if (proc_is64bit(p) == 0) {
3939 x86_thread_state32_t state = {
3940 .eip = (unsigned int)wqstart_fnptr,
3941 .eax = /* arg0 */ (unsigned int)pthread_self_addr,
3942 .ebx = /* arg1 */ (unsigned int)tl->th_thport,
3943 .ecx = /* arg2 */ (unsigned int)stack_bottom_addr,
3944 .edx = /* arg3 */ (unsigned int)kevent_list,
3945 .edi = /* arg4 */ (unsigned int)upcall_flags,
3946 .esi = /* arg5 */ (unsigned int)kevent_count,
3947
3948 .esp = (int)((vm_offset_t)stack_top_addr),
3949 };
3950
3951 error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
3952 if (error != KERN_SUCCESS) {
3953 panic("%s: thread_set_wq_state failed: %d", __func__, error);
3954 }
3955 } else {
3956 x86_thread_state64_t state64 = {
3957 // x86-64 already passes all the arguments in registers, so we just put them in their final place here
3958 .rip = (uint64_t)wqstart_fnptr,
3959 .rdi = (uint64_t)pthread_self_addr,
3960 .rsi = (uint64_t)tl->th_thport,
3961 .rdx = (uint64_t)stack_bottom_addr,
3962 .rcx = (uint64_t)kevent_list,
3963 .r8 = (uint64_t)upcall_flags,
3964 .r9 = (uint64_t)kevent_count,
3965
3966 .rsp = (uint64_t)(stack_top_addr)
3967 };
3968
3969 error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
3970 if (error != KERN_SUCCESS) {
3971 panic("%s: thread_set_wq_state failed: %d", __func__, error);
3972 }
3973 }
3974 #else
3975 #error setup_wqthread not defined for this architecture
3976 #endif
3977 }
3978
3979 #if DEBUG
3980 static int wq_kevent_test SYSCTL_HANDLER_ARGS {
3981 //(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
3982 #pragma unused(oidp, arg1, arg2)
3983 int error;
3984 struct workq_reqthreads_req_s requests[64] = {};
3985
3986 if (req->newlen > sizeof(requests) || req->newlen < sizeof(struct workq_reqthreads_req_s))
3987 return EINVAL;
3988
3989 error = copyin(req->newptr, requests, req->newlen);
3990 if (error) return error;
3991
3992 _workq_reqthreads(req->p, (int)(req->newlen / sizeof(struct workq_reqthreads_req_s)), requests);
3993
3994 return 0;
3995 }
3996 #endif // DEBUG
3997
3998 #pragma mark - Misc
3999
4000 int
4001 _fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
4002 {
4003 struct workqueue * wq;
4004 int error = 0;
4005 int activecount;
4006
4007 if ((wq = pthread_kern->proc_get_wqptr(p)) == NULL) {
4008 return EINVAL;
4009 }
4010
4011 /*
4012 * This is sometimes called from interrupt context by the kperf sampler.
4013 * In that case, it's not safe to spin trying to take the lock since we
4014 * might already hold it. So, we just try-lock it and error out if it's
4015 * already held. Since this is just a debugging aid, and all our callers
4016 * are able to handle an error, that's fine.
4017 */
4018 bool locked = workqueue_lock_try(wq);
4019 if (!locked) {
4020 return EBUSY;
4021 }
4022
4023 activecount = _wq_thactive_aggregate_downto_qos(wq, _wq_thactive(wq),
4024 WORKQUEUE_NUM_BUCKETS - 1, NULL, NULL);
4025 pwqinfo->pwq_nthreads = wq->wq_nthreads;
4026 pwqinfo->pwq_runthreads = activecount;
4027 pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
4028 pwqinfo->pwq_state = 0;
4029
4030 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
4031 pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
4032 }
4033
4034 if (wq->wq_nthreads >= wq_max_threads) {
4035 pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
4036 }
4037
4038 workqueue_unlock(wq);
4039 return(error);
4040 }
4041
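/*
 * Kernel debugger (kdp) variant of the workqueue state query: it must not
 * block, so it returns 0 when there is no workqueue or the workqueue lock
 * is already held.
 */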
4042 uint32_t
4043 _get_pwq_state_kdp(proc_t p)
4044 {
4045 if (p == NULL) {
4046 return 0;
4047 }
4048
4049 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
4050
4051 if (wq == NULL || workqueue_lock_spin_is_acquired_kdp(wq)) {
4052 return 0;
4053 }
4054
4055 uint32_t pwq_state = WQ_FLAGS_AVAILABLE;
4056
4057 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
4058 pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
4059 }
4060
4061 if (wq->wq_nthreads >= wq_max_threads) {
4062 pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
4063 }
4064
4065 return pwq_state;
4066 }
4067
4068 int
4069 _thread_selfid(__unused struct proc *p, uint64_t *retval)
4070 {
4071 thread_t thread = current_thread();
4072 *retval = thread_tid(thread);
4073 return KERN_SUCCESS;
4074 }
4075
4076 void
4077 _pthread_init(void)
4078 {
4079 pthread_lck_grp_attr = lck_grp_attr_alloc_init();
4080 pthread_lck_grp = lck_grp_alloc_init("pthread", pthread_lck_grp_attr);
4081
4082 /*
4083 * allocate the lock attribute for pthread synchronizers
4084 */
4085 pthread_lck_attr = lck_attr_alloc_init();
4086
4087 pthread_list_mlock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);
4088
4089 pth_global_hashinit();
4090 psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL);
4091 psynch_zoneinit();
4092
4093 pthread_zone_workqueue = zinit(sizeof(struct workqueue),
4094 1024 * sizeof(struct workqueue), 8192, "pthread.workqueue");
4095 pthread_zone_threadlist = zinit(sizeof(struct threadlist),
4096 1024 * sizeof(struct threadlist), 8192, "pthread.threadlist");
4097 pthread_zone_threadreq = zinit(sizeof(struct threadreq),
4098 1024 * sizeof(struct threadreq), 8192, "pthread.threadreq");
4099
4100 int policy_bootarg;
4101 if (PE_parse_boot_argn("pthread_mutex_default_policy", &policy_bootarg, sizeof(policy_bootarg))) {
4102 pthread_mutex_default_policy = policy_bootarg;
4103 }
4104
4105 /*
4106 * register sysctls
4107 */
4108 sysctl_register_oid(&sysctl__kern_wq_stalled_window_usecs);
4109 sysctl_register_oid(&sysctl__kern_wq_reduce_pool_window_usecs);
4110 sysctl_register_oid(&sysctl__kern_wq_max_timer_interval_usecs);
4111 sysctl_register_oid(&sysctl__kern_wq_max_threads);
4112 sysctl_register_oid(&sysctl__kern_wq_max_constrained_threads);
4113 sysctl_register_oid(&sysctl__kern_pthread_debug_tracing);
4114 sysctl_register_oid(&sysctl__kern_pthread_mutex_default_policy);
4115
4116 #if DEBUG
4117 sysctl_register_oid(&sysctl__debug_wq_kevent_test);
4118 #endif
4119
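/*
 * Seed the per-bucket concurrency targets from the scheduler's reported
 * parallelism for each thread QoS; the event manager bucket is always
 * serialized to a single thread.
 */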
4120 for (int i = 0; i < WORKQUEUE_NUM_BUCKETS; i++) {
4121 uint32_t thread_qos = _wq_bucket_to_thread_qos(i);
4122 wq_max_concurrency[i] = pthread_kern->qos_max_parallelism(thread_qos,
4123 QOS_PARALLELISM_COUNT_LOGICAL);
4124 }
4125 wq_max_concurrency[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1;
4126 }