1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995-2005 Apple Computer, Inc. All Rights Reserved */
29 /*
30 * pthread_synch.c
31 */
32
33 #pragma mark - Front Matter
34
35 #define _PTHREAD_CONDATTR_T
36 #define _PTHREAD_COND_T
37 #define _PTHREAD_MUTEXATTR_T
38 #define _PTHREAD_MUTEX_T
39 #define _PTHREAD_RWLOCKATTR_T
40 #define _PTHREAD_RWLOCK_T
41
42 #undef pthread_mutexattr_t
43 #undef pthread_mutex_t
44 #undef pthread_condattr_t
45 #undef pthread_cond_t
46 #undef pthread_rwlockattr_t
47 #undef pthread_rwlock_t
48
49 #include <sys/cdefs.h>
50
51 // <rdar://problem/26158937> panic() should be marked noreturn
52 extern void panic(const char *string, ...) __printflike(1,2) __dead2;
53
54 #include <sys/param.h>
55 #include <sys/queue.h>
56 #include <sys/resourcevar.h>
57 //#include <sys/proc_internal.h>
58 #include <sys/kauth.h>
59 #include <sys/systm.h>
60 #include <sys/timeb.h>
61 #include <sys/times.h>
62 #include <sys/acct.h>
63 #include <sys/kernel.h>
64 #include <sys/wait.h>
65 #include <sys/signalvar.h>
66 #include <sys/sysctl.h>
67 #include <sys/syslog.h>
68 #include <sys/stat.h>
69 #include <sys/lock.h>
70 #include <sys/kdebug.h>
71 //#include <sys/sysproto.h>
72 #include <sys/vm.h>
73 #include <sys/user.h> /* for coredump */
74 #include <sys/proc_info.h> /* for fill_procworkqueue */
75
76 #include <mach/mach_port.h>
77 #include <mach/mach_types.h>
78 #include <mach/semaphore.h>
79 #include <mach/sync_policy.h>
80 #include <mach/task.h>
81 #include <mach/vm_prot.h>
82 #include <kern/kern_types.h>
83 #include <kern/task.h>
84 #include <kern/clock.h>
85 #include <mach/kern_return.h>
86 #include <kern/thread.h>
87 #include <kern/zalloc.h>
88 #include <kern/sched_prim.h> /* for thread_exception_return */
89 #include <kern/processor.h>
90 #include <kern/assert.h>
91 #include <mach/mach_vm.h>
92 #include <mach/mach_param.h>
93 #include <mach/thread_status.h>
94 #include <mach/thread_policy.h>
95 #include <mach/message.h>
96 #include <mach/port.h>
97 //#include <vm/vm_protos.h>
98 #include <vm/vm_fault.h>
99 #include <vm/vm_map.h>
100 #include <mach/thread_act.h> /* for thread_resume */
101 #include <machine/machine_routines.h>
102 #include <mach/shared_region.h>
103
104 #include <libkern/OSAtomic.h>
105 #include <libkern/libkern.h>
106
107 #include <sys/pthread_shims.h>
108 #include "kern_internal.h"
109
110 // XXX: Dirty import for sys/signalvar.h that's wrapped in BSD_KERNEL_PRIVATE
111 #define sigcantmask (sigmask(SIGKILL) | sigmask(SIGSTOP))
112
113 // XXX: Ditto for thread tags from kern/thread.h
114 #define THREAD_TAG_MAINTHREAD 0x1
115 #define THREAD_TAG_PTHREAD 0x10
116 #define THREAD_TAG_WORKQUEUE 0x20
117
118 lck_grp_attr_t *pthread_lck_grp_attr;
119 lck_grp_t *pthread_lck_grp;
120 lck_attr_t *pthread_lck_attr;
121
122 zone_t pthread_zone_workqueue;
123 zone_t pthread_zone_threadlist;
124 zone_t pthread_zone_threadreq;
125
126 extern void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64);
127 extern void workqueue_thread_yielded(void);
128
129 #define WQ_SETUP_FIRST_USE 1
130 #define WQ_SETUP_CLEAR_VOUCHER 2
131 static void _setup_wqthread(proc_t p, thread_t th, struct workqueue *wq,
132 struct threadlist *tl, int flags);
133
134 static void reset_priority(struct threadlist *tl, pthread_priority_t pri);
135 static pthread_priority_t pthread_priority_from_wq_class_index(struct workqueue *wq, int index);
136
137 static void wq_unpark_continue(void* ptr, wait_result_t wait_result) __dead2;
138
139 static bool workqueue_addnewthread(proc_t p, struct workqueue *wq);
140 static void workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use);
141 static void workqueue_lock_spin(struct workqueue *);
142 static void workqueue_unlock(struct workqueue *);
143
144 #define WQ_RUN_TR_THROTTLED 0
145 #define WQ_RUN_TR_THREAD_NEEDED 1
146 #define WQ_RUN_TR_THREAD_STARTED 2
147 #define WQ_RUN_TR_EXITING 3
148 static int workqueue_run_threadreq_and_unlock(proc_t p, struct workqueue *wq,
149 struct threadlist *tl, struct threadreq *req, bool may_add_new_thread);
150
151 static bool may_start_constrained_thread(struct workqueue *wq,
152 uint32_t at_priclass, struct threadlist *tl, bool may_start_timer);
153
154 static mach_vm_offset_t stack_addr_hint(proc_t p, vm_map_t vmap);
155 static boolean_t wq_thread_is_busy(uint64_t cur_ts,
156 _Atomic uint64_t *lastblocked_tsp);
157
158 int proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc);
159 int proc_setalltargetconc(pid_t pid, int32_t * targetconcp);
160
161 #define WQ_MAXPRI_MIN 0 /* low prio queue num */
162 #define WQ_MAXPRI_MAX 2 /* max prio queuenum */
163 #define WQ_PRI_NUM 3 /* number of prio work queues */
164
165 #define C_32_STK_ALIGN 16
166 #define C_64_STK_ALIGN 16
167 #define C_64_REDZONE_LEN 128
168
169 #define PTHREAD_T_OFFSET 0
170
171 /*
172 * Flags field passed to bsdthread_create and back in pthread_start
173 31 <---------------------------------> 0
174 _________________________________________
175 | flags(8) | policy(8) | importance(16) |
176 -----------------------------------------
177 */
178
179 #define PTHREAD_START_CUSTOM 0x01000000
180 #define PTHREAD_START_SETSCHED 0x02000000
181 #define PTHREAD_START_DETACHED 0x04000000
182 #define PTHREAD_START_QOSCLASS 0x08000000
183 #define PTHREAD_START_TSD_BASE_SET 0x10000000
184 #define PTHREAD_START_QOSCLASS_MASK 0x00ffffff
185 #define PTHREAD_START_POLICY_BITSHIFT 16
186 #define PTHREAD_START_POLICY_MASK 0xff
187 #define PTHREAD_START_IMPORTANCE_MASK 0xffff
188
189 #define SCHED_OTHER POLICY_TIMESHARE
190 #define SCHED_FIFO POLICY_FIFO
191 #define SCHED_RR POLICY_RR
192
193 #define BASEPRI_DEFAULT 31
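/*
 * Worked example (illustrative only, assuming POLICY_RR == 2 as in
 * <mach/policy.h>): per the layout above, a flags word of 0x02020010
 * decodes as
 *
 *   flags & PTHREAD_START_SETSCHED                      -> set
 *   (flags >> PTHREAD_START_POLICY_BITSHIFT)
 *           & PTHREAD_START_POLICY_MASK                 -> 2 (SCHED_RR)
 *   flags & PTHREAD_START_IMPORTANCE_MASK               -> 0x10 (16)
 *
 * and _bsdthread_create() below turns that importance into a precedence
 * of 16 - BASEPRI_DEFAULT = -15 relative to the default priority.
 */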
194
195 #pragma mark sysctls
196
197 static uint32_t wq_stalled_window_usecs = WQ_STALLED_WINDOW_USECS;
198 static uint32_t wq_reduce_pool_window_usecs = WQ_REDUCE_POOL_WINDOW_USECS;
199 static uint32_t wq_max_timer_interval_usecs = WQ_MAX_TIMER_INTERVAL_USECS;
200 static uint32_t wq_max_threads = WORKQUEUE_MAXTHREADS;
201 static uint32_t wq_max_constrained_threads = WORKQUEUE_MAXTHREADS / 8;
202 static uint32_t wq_max_concurrency[WORKQUEUE_NUM_BUCKETS + 1]; // set to ncpus on load
203
204 SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
205 &wq_stalled_window_usecs, 0, "");
206
207 SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
208 &wq_reduce_pool_window_usecs, 0, "");
209
210 SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
211 &wq_max_timer_interval_usecs, 0, "");
212
213 SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
214 &wq_max_threads, 0, "");
215
216 SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
217 &wq_max_constrained_threads, 0, "");
218
219 #ifdef DEBUG
220 static int wq_kevent_test SYSCTL_HANDLER_ARGS;
221 SYSCTL_PROC(_debug, OID_AUTO, wq_kevent_test, CTLFLAG_MASKED | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLTYPE_OPAQUE, NULL, 0, wq_kevent_test, 0, "-");
222 #endif
223
224 static uint32_t wq_init_constrained_limit = 1;
225
226 uint32_t pthread_debug_tracing = 1;
227
228 SYSCTL_INT(_kern, OID_AUTO, pthread_debug_tracing, CTLFLAG_RW | CTLFLAG_LOCKED,
229 &pthread_debug_tracing, 0, "")
230
231 /*
232  *        +-----+-----+-----+-----+-----+-----+-----+
233  *        | MT  | BG  | UT  | DE  | IN  | UN  | mgr |
234  * +------+-----+-----+-----+-----+-----+-----+-----+
235  * | pri  |  5  |  4  |  3  |  2  |  1  |  0  |  6  |
236  * | qos  |  1  |  2  |  3  |  4  |  5  |  6  |  7  |
237  * +------+-----+-----+-----+-----+-----+-----+-----+
238  */
239 static inline uint32_t
240 _wq_bucket_to_thread_qos(int pri)
241 {
242 if (pri == WORKQUEUE_EVENT_MANAGER_BUCKET) {
243 return WORKQUEUE_EVENT_MANAGER_BUCKET + 1;
244 }
245 return WORKQUEUE_EVENT_MANAGER_BUCKET - pri;
246 }
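/*
 * For example, with the table above and the event manager being bucket 6:
 *
 *   _wq_bucket_to_thread_qos(5) == 1   (MT, maintenance)
 *   _wq_bucket_to_thread_qos(0) == 6   (UN, user-interactive)
 *   _wq_bucket_to_thread_qos(6) == 7   (event manager)
 */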
247
248 #pragma mark wq_thactive
249
250 #if defined(__LP64__)
251 // Layout is:
252 // 7 * 16 bits for each QoS bucket request count (including manager)
253 // 3 bits of best QoS among all pending constrained requests
254 // 13 bits of zeroes
255 #define WQ_THACTIVE_BUCKET_WIDTH 16
256 #define WQ_THACTIVE_QOS_SHIFT (7 * WQ_THACTIVE_BUCKET_WIDTH)
257 #else
258 // Layout is:
259 // 6 * 10 bits for each QoS bucket request count (except manager)
260 // 1 bit for the manager bucket
261 // 3 bits of best QoS among all pending constrained requests
262 #define WQ_THACTIVE_BUCKET_WIDTH 10
263 #define WQ_THACTIVE_QOS_SHIFT (6 * WQ_THACTIVE_BUCKET_WIDTH + 1)
264 #endif
265 #define WQ_THACTIVE_BUCKET_MASK ((1U << WQ_THACTIVE_BUCKET_WIDTH) - 1)
266 #define WQ_THACTIVE_BUCKET_HALF (1U << (WQ_THACTIVE_BUCKET_WIDTH - 1))
267 #define WQ_THACTIVE_NO_PENDING_REQUEST 6
268
269 _Static_assert(sizeof(wq_thactive_t) * CHAR_BIT - WQ_THACTIVE_QOS_SHIFT >= 3,
270 "Make sure we have space to encode a QoS");
271
272 static inline wq_thactive_t
273 _wq_thactive_fetch_and_add(struct workqueue *wq, wq_thactive_t offset)
274 {
275 #if PTHREAD_INLINE_RMW_ATOMICS || !defined(__LP64__)
276 return atomic_fetch_add_explicit(&wq->wq_thactive, offset,
277 memory_order_relaxed);
278 #else
279 return pthread_kern->atomic_fetch_add_128_relaxed(&wq->wq_thactive, offset);
280 #endif
281 }
282
283 static inline wq_thactive_t
284 _wq_thactive(struct workqueue *wq)
285 {
286 #if PTHREAD_INLINE_RMW_ATOMICS || !defined(__LP64__)
287 return atomic_load_explicit(&wq->wq_thactive, memory_order_relaxed);
288 #else
289 return pthread_kern->atomic_load_128_relaxed(&wq->wq_thactive);
290 #endif
291 }
292
293 #define WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(tha) \
294 ((tha) >> WQ_THACTIVE_QOS_SHIFT)
295
296 static inline uint32_t
297 _wq_thactive_best_constrained_req_qos(struct workqueue *wq)
298 {
299 // Avoid expensive atomic operations: the three bits we're loading are in
300 // a single byte, and always updated under the workqueue lock
301 wq_thactive_t v = *(wq_thactive_t *)&wq->wq_thactive;
302 return WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(v);
303 }
304
305 static inline wq_thactive_t
306 _wq_thactive_set_best_constrained_req_qos(struct workqueue *wq,
307 uint32_t orig_qos, uint32_t new_qos)
308 {
309 wq_thactive_t v;
310 v = (wq_thactive_t)(new_qos - orig_qos) << WQ_THACTIVE_QOS_SHIFT;
311 /*
312 * We can do an atomic add relative to the initial load because updates
313 * to this qos are always serialized under the workqueue lock.
314 */
315 return _wq_thactive_fetch_and_add(wq, v) + v;
316 }
317
318 static inline wq_thactive_t
319 _wq_thactive_offset_for_qos(int qos)
320 {
321 return (wq_thactive_t)1 << (qos * WQ_THACTIVE_BUCKET_WIDTH);
322 }
323
324 static inline wq_thactive_t
325 _wq_thactive_inc(struct workqueue *wq, int qos)
326 {
327 return _wq_thactive_fetch_and_add(wq, _wq_thactive_offset_for_qos(qos));
328 }
329
330 static inline wq_thactive_t
331 _wq_thactive_dec(struct workqueue *wq, int qos)
332 {
333 return _wq_thactive_fetch_and_add(wq, -_wq_thactive_offset_for_qos(qos));
334 }
335
336 static inline wq_thactive_t
337 _wq_thactive_move(struct workqueue *wq, int oldqos, int newqos)
338 {
339 return _wq_thactive_fetch_and_add(wq, _wq_thactive_offset_for_qos(newqos) -
340 _wq_thactive_offset_for_qos(oldqos));
341 }
342
343 static inline uint32_t
344 _wq_thactive_aggregate_downto_qos(struct workqueue *wq, wq_thactive_t v,
345 int qos, uint32_t *busycount, uint32_t *max_busycount)
346 {
347 uint32_t count = 0, active;
348 uint64_t curtime;
349
350 #ifndef __LP64__
351 /*
352 * on 32-bit, the manager bucket is a single bit and the 3 bits of best
353 * constrained request QoS sit where the 10 bits of a regular QoS bucket count
354 * would be. Mask them out.
355 */
356 v &= ~(~0ull << WQ_THACTIVE_QOS_SHIFT);
357 #endif
358 if (busycount) {
359 curtime = mach_absolute_time();
360 *busycount = 0;
361 }
362 if (max_busycount) {
363 *max_busycount = qos + 1;
364 }
365 for (int i = 0; i <= qos; i++, v >>= WQ_THACTIVE_BUCKET_WIDTH) {
366 active = v & WQ_THACTIVE_BUCKET_MASK;
367 count += active;
368 if (busycount && wq->wq_thscheduled_count[i] > active) {
369 if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i])) {
370 /*
371 * We only consider the last blocked thread for a given bucket
372 * as busy because we don't want to take the list lock in each
373 * sched callback. However this is an approximation that could
374 * contribute to thread creation storms.
375 */
376 (*busycount)++;
377 }
378 }
379 }
380 return count;
381 }
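/*
 * Example (illustrative, LP64 layout): a snapshot with one active
 * user-initiated thread (bucket 1) and two active default-QoS threads
 * (bucket 2) is
 *
 *   v = ((wq_thactive_t)1 << (1 * WQ_THACTIVE_BUCKET_WIDTH)) |
 *       ((wq_thactive_t)2 << (2 * WQ_THACTIVE_BUCKET_WIDTH));
 *
 * and _wq_thactive_aggregate_downto_qos(wq, v, 2, NULL, NULL) sums
 * buckets 0 through 2 and returns 3.
 */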
382
383 #pragma mark - Process/Thread Setup/Teardown syscalls
384
385 static mach_vm_offset_t
386 stack_addr_hint(proc_t p, vm_map_t vmap)
387 {
388 mach_vm_offset_t stackaddr;
389 mach_vm_offset_t aslr_offset;
390 bool proc64bit = proc_is64bit(p);
391
392 // We can't safely take random values % something unless it's a power of two
393 _Static_assert(powerof2(PTH_DEFAULT_STACKSIZE), "PTH_DEFAULT_STACKSIZE is a power-of-two");
394
395 #if defined(__i386__) || defined(__x86_64__)
396 if (proc64bit) {
397 // Matches vm_map_get_max_aslr_slide_pages's image shift in xnu
398 aslr_offset = random() % (1 << 28); // about 512 stacks
399 } else {
400 // Actually bigger than the image shift, we've got ~256MB to work with
401 aslr_offset = random() % (16 * PTH_DEFAULT_STACKSIZE);
402 }
403 aslr_offset = vm_map_trunc_page_mask(aslr_offset, vm_map_page_mask(vmap));
404 if (proc64bit) {
405 // Above nanomalloc range (see NANOZONE_SIGNATURE)
406 stackaddr = 0x700000000000 + aslr_offset;
407 } else {
408 stackaddr = SHARED_REGION_BASE_I386 + SHARED_REGION_SIZE_I386 + aslr_offset;
409 }
410 #elif defined(__arm__) || defined(__arm64__)
411 user_addr_t main_thread_stack_top = 0;
412 if (pthread_kern->proc_get_user_stack) {
413 main_thread_stack_top = pthread_kern->proc_get_user_stack(p);
414 }
415 if (proc64bit && main_thread_stack_top) {
416 // The main thread stack position is randomly slid by xnu (cf.
417 // load_main() in mach_loader.c), so basing pthread stack allocations
418 // where the main thread stack ends is already ASLR'd, and doing so
419 // avoids creating a gap in the process address space that may cause
420 // extra PTE memory usage. rdar://problem/33328206
421 stackaddr = vm_map_trunc_page_mask((vm_map_offset_t)main_thread_stack_top,
422 vm_map_page_mask(vmap));
423 } else {
424 // vm_map_get_max_aslr_slide_pages ensures 1MB of slide; we do better
425 aslr_offset = random() % ((proc64bit ? 4 : 2) * PTH_DEFAULT_STACKSIZE);
426 aslr_offset = vm_map_trunc_page_mask((vm_map_offset_t)aslr_offset,
427 vm_map_page_mask(vmap));
428 if (proc64bit) {
429 // 64 stacks below shared region
430 stackaddr = SHARED_REGION_BASE_ARM64 - 64 * PTH_DEFAULT_STACKSIZE - aslr_offset;
431 } else {
432 // If you try to slide down from this point, you risk ending up in memory consumed by malloc
433 stackaddr = SHARED_REGION_BASE_ARM - 32 * PTH_DEFAULT_STACKSIZE + aslr_offset;
434 }
435 }
436 #else
437 #error Need to define a stack address hint for this architecture
438 #endif
439 return stackaddr;
440 }
441
442 /**
443 * bsdthread_create system call. Used by pthread_create.
444 */
445 int
446 _bsdthread_create(struct proc *p, user_addr_t user_func, user_addr_t user_funcarg, user_addr_t user_stack, user_addr_t user_pthread, uint32_t flags, user_addr_t *retval)
447 {
448 kern_return_t kret;
449 void * sright;
450 int error = 0;
451 int allocated = 0;
452 mach_vm_offset_t stackaddr;
453 mach_vm_size_t th_allocsize = 0;
454 mach_vm_size_t th_guardsize;
455 mach_vm_offset_t th_stack;
456 mach_vm_offset_t th_pthread;
457 mach_vm_offset_t th_tsd_base;
458 mach_port_name_t th_thport;
459 thread_t th;
460 vm_map_t vmap = pthread_kern->current_map();
461 task_t ctask = current_task();
462 unsigned int policy, importance;
463 uint32_t tsd_offset;
464
465 int isLP64 = 0;
466
467 if (pthread_kern->proc_get_register(p) == 0) {
468 return EINVAL;
469 }
470
471 PTHREAD_TRACE(TRACE_pthread_thread_create | DBG_FUNC_START, flags, 0, 0, 0, 0);
472
473 isLP64 = proc_is64bit(p);
474 th_guardsize = vm_map_page_size(vmap);
475
476 stackaddr = pthread_kern->proc_get_stack_addr_hint(p);
477 kret = pthread_kern->thread_create(ctask, &th);
478 if (kret != KERN_SUCCESS)
479 return(ENOMEM);
480 thread_reference(th);
481
482 pthread_kern->thread_set_tag(th, THREAD_TAG_PTHREAD);
483
484 sright = (void *)pthread_kern->convert_thread_to_port(th);
485 th_thport = pthread_kern->ipc_port_copyout_send(sright, pthread_kern->task_get_ipcspace(ctask));
486 if (!MACH_PORT_VALID(th_thport)) {
487 error = EMFILE; // userland will convert this into a crash
488 goto out;
489 }
490
491 if ((flags & PTHREAD_START_CUSTOM) == 0) {
492 mach_vm_size_t pthread_size =
493 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(vmap));
494 th_allocsize = th_guardsize + user_stack + pthread_size;
495 user_stack += PTHREAD_T_OFFSET;
496
497 kret = mach_vm_map(vmap, &stackaddr,
498 th_allocsize,
499 page_size-1,
500 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE , NULL,
501 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
502 VM_INHERIT_DEFAULT);
503 if (kret != KERN_SUCCESS){
504 kret = mach_vm_allocate(vmap,
505 &stackaddr, th_allocsize,
506 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE);
507 }
508 if (kret != KERN_SUCCESS) {
509 error = ENOMEM;
510 goto out;
511 }
512
513 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, th_allocsize, stackaddr, 0, 2, 0);
514
515 allocated = 1;
516 /*
517 * The guard page is at the lowest address
518 * The stack base is the highest address
519 */
520 kret = mach_vm_protect(vmap, stackaddr, th_guardsize, FALSE, VM_PROT_NONE);
521
522 if (kret != KERN_SUCCESS) {
523 error = ENOMEM;
524 goto out1;
525 }
526
527 th_pthread = stackaddr + th_guardsize + user_stack;
528 th_stack = th_pthread;
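/*
 * Resulting layout of the fresh allocation, low to high addresses
 * (sketch of the values just computed):
 *
 *   stackaddr                              guard page, th_guardsize bytes,
 *                                          protected VM_PROT_NONE above
 *   stackaddr + th_guardsize               bottom of the stack; the stack
 *                                          grows down from th_stack
 *   stackaddr + th_guardsize + user_stack  th_stack == th_pthread, followed
 *                                          by pthread_size bytes for the
 *                                          pthread_t and TSD, ending at
 *                                          stackaddr + th_allocsize
 */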
529
530 /*
531 * Pre-fault the first page of the new thread's stack and the page that will
532 * contain the pthread_t structure.
533 */
534 if (vm_map_trunc_page_mask((vm_map_offset_t)(th_stack - C_64_REDZONE_LEN), vm_map_page_mask(vmap)) !=
535 vm_map_trunc_page_mask((vm_map_offset_t)th_pthread, vm_map_page_mask(vmap))){
536 vm_fault( vmap,
537 vm_map_trunc_page_mask((vm_map_offset_t)(th_stack - C_64_REDZONE_LEN), vm_map_page_mask(vmap)),
538 VM_PROT_READ | VM_PROT_WRITE,
539 FALSE,
540 THREAD_UNINT, NULL, 0);
541 }
542
543 vm_fault( vmap,
544 vm_map_trunc_page_mask((vm_map_offset_t)th_pthread, vm_map_page_mask(vmap)),
545 VM_PROT_READ | VM_PROT_WRITE,
546 FALSE,
547 THREAD_UNINT, NULL, 0);
548
549 } else {
550 th_stack = user_stack;
551 th_pthread = user_pthread;
552
553 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, 0, 0, 0, 3, 0);
554 }
555
556 tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
557 if (tsd_offset) {
558 th_tsd_base = th_pthread + tsd_offset;
559 kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
560 if (kret == KERN_SUCCESS) {
561 flags |= PTHREAD_START_TSD_BASE_SET;
562 }
563 }
564
565 #if defined(__i386__) || defined(__x86_64__)
566 /*
567 * Set up i386 registers & function call.
568 */
569 if (isLP64 == 0) {
570 x86_thread_state32_t state = {
571 .eip = (unsigned int)pthread_kern->proc_get_threadstart(p),
572 .eax = (unsigned int)th_pthread,
573 .ebx = (unsigned int)th_thport,
574 .ecx = (unsigned int)user_func,
575 .edx = (unsigned int)user_funcarg,
576 .edi = (unsigned int)user_stack,
577 .esi = (unsigned int)flags,
578 /*
579 * set stack pointer
580 */
581 .esp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN))
582 };
583
584 error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
585 if (error != KERN_SUCCESS) {
586 error = EINVAL;
587 goto out;
588 }
589 } else {
590 x86_thread_state64_t state64 = {
591 .rip = (uint64_t)pthread_kern->proc_get_threadstart(p),
592 .rdi = (uint64_t)th_pthread,
593 .rsi = (uint64_t)(th_thport),
594 .rdx = (uint64_t)user_func,
595 .rcx = (uint64_t)user_funcarg,
596 .r8 = (uint64_t)user_stack,
597 .r9 = (uint64_t)flags,
598 /*
599 * set stack pointer aligned to 16 byte boundary
600 */
601 .rsp = (uint64_t)(th_stack - C_64_REDZONE_LEN)
602 };
603
604 error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
605 if (error != KERN_SUCCESS) {
606 error = EINVAL;
607 goto out;
608 }
609
610 }
611 #elif defined(__arm__)
612 arm_thread_state_t state = {
613 .pc = (int)pthread_kern->proc_get_threadstart(p),
614 .r[0] = (unsigned int)th_pthread,
615 .r[1] = (unsigned int)th_thport,
616 .r[2] = (unsigned int)user_func,
617 .r[3] = (unsigned int)user_funcarg,
618 .r[4] = (unsigned int)user_stack,
619 .r[5] = (unsigned int)flags,
620
621 /* Set r7 & lr to 0 for better back tracing */
622 .r[7] = 0,
623 .lr = 0,
624
625 /*
626 * set stack pointer
627 */
628 .sp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN))
629 };
630
631 (void) pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
632
633 #else
634 #error bsdthread_create not defined for this architecture
635 #endif
636
637 if ((flags & PTHREAD_START_SETSCHED) != 0) {
638 /* Set scheduling parameters if needed */
639 thread_extended_policy_data_t extinfo;
640 thread_precedence_policy_data_t precedinfo;
641
642 importance = (flags & PTHREAD_START_IMPORTANCE_MASK);
643 policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;
644
645 if (policy == SCHED_OTHER) {
646 extinfo.timeshare = 1;
647 } else {
648 extinfo.timeshare = 0;
649 }
650
651 thread_policy_set(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT);
652
653 precedinfo.importance = (importance - BASEPRI_DEFAULT);
654 thread_policy_set(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT);
655 } else if ((flags & PTHREAD_START_QOSCLASS) != 0) {
656 /* Set thread QoS class if requested. */
657 pthread_priority_t priority = (pthread_priority_t)(flags & PTHREAD_START_QOSCLASS_MASK);
658
659 thread_qos_policy_data_t qos;
660 qos.qos_tier = pthread_priority_get_thread_qos(priority);
661 qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 :
662 _pthread_priority_get_relpri(priority);
663
664 pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
665 }
666
667 if (pthread_kern->proc_get_mach_thread_self_tsd_offset) {
668 uint64_t mach_thread_self_offset =
669 pthread_kern->proc_get_mach_thread_self_tsd_offset(p);
670 if (mach_thread_self_offset && tsd_offset) {
671 bool proc64bit = proc_is64bit(p);
672 if (proc64bit) {
673 uint64_t th_thport_tsd = (uint64_t)th_thport;
674 error = copyout(&th_thport_tsd, th_pthread + tsd_offset +
675 mach_thread_self_offset, sizeof(th_thport_tsd));
676 } else {
677 uint32_t th_thport_tsd = (uint32_t)th_thport;
678 error = copyout(&th_thport_tsd, th_pthread + tsd_offset +
679 mach_thread_self_offset, sizeof(th_thport_tsd));
680 }
681 if (error) {
682 goto out1;
683 }
684 }
685 }
686
687 kret = pthread_kern->thread_resume(th);
688 if (kret != KERN_SUCCESS) {
689 error = EINVAL;
690 goto out1;
691 }
692 thread_deallocate(th); /* drop the creator reference */
693
694 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_END, error, th_pthread, 0, 0, 0);
695
696 // cast required as mach_vm_offset_t is always 64 bits even on 32-bit platforms
697 *retval = (user_addr_t)th_pthread;
698
699 return(0);
700
701 out1:
702 if (allocated != 0) {
703 (void)mach_vm_deallocate(vmap, stackaddr, th_allocsize);
704 }
705 out:
706 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(ctask), th_thport);
707 if (pthread_kern->thread_will_park_or_terminate) {
708 pthread_kern->thread_will_park_or_terminate(th);
709 }
710 (void)thread_terminate(th);
711 (void)thread_deallocate(th);
712 return(error);
713 }
714
715 /**
716 * bsdthread_terminate system call. Used by pthread_terminate
717 */
718 int
719 _bsdthread_terminate(__unused struct proc *p,
720 user_addr_t stackaddr,
721 size_t size,
722 uint32_t kthport,
723 uint32_t sem,
724 __unused int32_t *retval)
725 {
726 mach_vm_offset_t freeaddr;
727 mach_vm_size_t freesize;
728 kern_return_t kret;
729 thread_t th = current_thread();
730
731 freeaddr = (mach_vm_offset_t)stackaddr;
732 freesize = size;
733
734 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_START, freeaddr, freesize, kthport, 0xff, 0);
735
736 if ((freesize != (mach_vm_size_t)0) && (freeaddr != (mach_vm_offset_t)0)) {
737 if (pthread_kern->thread_get_tag(th) & THREAD_TAG_MAINTHREAD){
738 vm_map_t user_map = pthread_kern->current_map();
739 freesize = vm_map_trunc_page_mask((vm_map_offset_t)freesize - 1, vm_map_page_mask(user_map));
740 kret = mach_vm_behavior_set(user_map, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
741 assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
742 kret = kret ? kret : mach_vm_protect(user_map, freeaddr, freesize, FALSE, VM_PROT_NONE);
743 assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
744 } else {
745 kret = mach_vm_deallocate(pthread_kern->current_map(), freeaddr, freesize);
746 if (kret != KERN_SUCCESS) {
747 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);
748 return(EINVAL);
749 }
750 }
751 }
752
753 if (pthread_kern->thread_will_park_or_terminate) {
754 pthread_kern->thread_will_park_or_terminate(th);
755 }
756 (void)thread_terminate(th);
757 if (sem != MACH_PORT_NULL) {
758 kret = pthread_kern->semaphore_signal_internal_trap(sem);
759 if (kret != KERN_SUCCESS) {
760 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);
761 return(EINVAL);
762 }
763 }
764
765 if (kthport != MACH_PORT_NULL) {
766 pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(current_task()), kthport);
767 }
768
769 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0, 0, 0, 0);
770
771 pthread_kern->thread_exception_return();
772 panic("bsdthread_terminate: still running\n");
773
774 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0xff, 0, 0, 0);
775
776 return(0);
777 }
778
779 /**
780 * bsdthread_register system call. Performs per-process setup. Responsible for
781 * returning capability bits to userspace and receiving userspace function addresses.
782 */
783 int
784 _bsdthread_register(struct proc *p,
785 user_addr_t threadstart,
786 user_addr_t wqthread,
787 int pthsize,
788 user_addr_t pthread_init_data,
789 user_addr_t pthread_init_data_size,
790 uint64_t dispatchqueue_offset,
791 int32_t *retval)
792 {
793 struct _pthread_registration_data data = {};
794 uint32_t max_tsd_offset;
795 kern_return_t kr;
796 size_t pthread_init_sz = 0;
797
798 /* syscall randomizer test can pass bogus values */
799 if (pthsize < 0 || pthsize > MAX_PTHREAD_SIZE) {
800 return(EINVAL);
801 }
802 /*
803 * If we have pthread_init_data, then we use that and target_concptr
804 * (which is an offset) to get the data.
805 */
806 if (pthread_init_data != 0) {
807 if (pthread_init_data_size < sizeof(data.version)) {
808 return EINVAL;
809 }
810 pthread_init_sz = MIN(sizeof(data), (size_t)pthread_init_data_size);
811 int ret = copyin(pthread_init_data, &data, pthread_init_sz);
812 if (ret) {
813 return ret;
814 }
815 if (data.version != (size_t)pthread_init_data_size) {
816 return EINVAL;
817 }
818 } else {
819 data.dispatch_queue_offset = dispatchqueue_offset;
820 }
821
822 /* We have to do this before proc_get_register so that it resets after fork */
823 mach_vm_offset_t stackaddr = stack_addr_hint(p, pthread_kern->current_map());
824 pthread_kern->proc_set_stack_addr_hint(p, (user_addr_t)stackaddr);
825
826 /* prevent multiple registrations */
827 if (pthread_kern->proc_get_register(p) != 0) {
828 return(EINVAL);
829 }
830
831 pthread_kern->proc_set_threadstart(p, threadstart);
832 pthread_kern->proc_set_wqthread(p, wqthread);
833 pthread_kern->proc_set_pthsize(p, pthsize);
834 pthread_kern->proc_set_register(p);
835
836 uint32_t tsd_slot_sz = proc_is64bit(p) ? sizeof(uint64_t) : sizeof(uint32_t);
837 if ((uint32_t)pthsize >= tsd_slot_sz &&
838 data.tsd_offset <= (uint32_t)(pthsize - tsd_slot_sz)) {
839 max_tsd_offset = ((uint32_t)pthsize - data.tsd_offset - tsd_slot_sz);
840 } else {
841 data.tsd_offset = 0;
842 max_tsd_offset = 0;
843 }
844 pthread_kern->proc_set_pthread_tsd_offset(p, data.tsd_offset);
845
846 if (data.dispatch_queue_offset > max_tsd_offset) {
847 data.dispatch_queue_offset = 0;
848 }
849 pthread_kern->proc_set_dispatchqueue_offset(p, data.dispatch_queue_offset);
850
851 if (pthread_kern->proc_set_return_to_kernel_offset) {
852 if (data.return_to_kernel_offset > max_tsd_offset) {
853 data.return_to_kernel_offset = 0;
854 }
855 pthread_kern->proc_set_return_to_kernel_offset(p,
856 data.return_to_kernel_offset);
857 }
858
859 if (pthread_kern->proc_set_mach_thread_self_tsd_offset) {
860 if (data.mach_thread_self_offset > max_tsd_offset) {
861 data.mach_thread_self_offset = 0;
862 }
863 pthread_kern->proc_set_mach_thread_self_tsd_offset(p,
864 data.mach_thread_self_offset);
865 }
866
867 if (pthread_init_data != 0) {
868 /* Outgoing data that userspace expects as a reply */
869 data.version = sizeof(struct _pthread_registration_data);
870 if (pthread_kern->qos_main_thread_active()) {
871 mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
872 thread_qos_policy_data_t qos;
873 boolean_t gd = FALSE;
874
875 kr = pthread_kern->thread_policy_get(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
876 if (kr != KERN_SUCCESS || qos.qos_tier == THREAD_QOS_UNSPECIFIED) {
877 /* An unspecified QoS means the kernel wants us to impose legacy QoS upon the thread. */
878 qos.qos_tier = THREAD_QOS_LEGACY;
879 qos.tier_importance = 0;
880
881 kr = pthread_kern->thread_policy_set_internal(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
882 }
883
884 if (kr == KERN_SUCCESS) {
885 data.main_qos = thread_qos_get_pthread_priority(qos.qos_tier);
886 } else {
887 data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);
888 }
889 } else {
890 data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);
891 }
892
893 kr = copyout(&data, pthread_init_data, pthread_init_sz);
894 if (kr != KERN_SUCCESS) {
895 return EINVAL;
896 }
897 }
898
899 /* return the supported feature set as the return value. */
900 *retval = PTHREAD_FEATURE_SUPPORTED;
901
902 return(0);
903 }
904
905 #pragma mark - QoS Manipulation
906
907 int
908 _bsdthread_ctl_set_qos(struct proc *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t tsd_priority_addr, user_addr_t arg3, int *retval)
909 {
910 int rv;
911 thread_t th;
912
913 pthread_priority_t priority;
914
915 /* Unused parameters must be zero. */
916 if (arg3 != 0) {
917 return EINVAL;
918 }
919
920 /* QoS is stored in a given slot in the pthread TSD. We need to copy that in and set our QoS based on it. */
921 if (proc_is64bit(p)) {
922 uint64_t v;
923 rv = copyin(tsd_priority_addr, &v, sizeof(v));
924 if (rv) goto out;
925 priority = (int)(v & 0xffffffff);
926 } else {
927 uint32_t v;
928 rv = copyin(tsd_priority_addr, &v, sizeof(v));
929 if (rv) goto out;
930 priority = v;
931 }
932
933 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
934 return ESRCH;
935 }
936
937 /* <rdar://problem/16211829> Disable pthread_set_qos_class_np() on threads other than pthread_self */
938 if (th != current_thread()) {
939 thread_deallocate(th);
940 return EPERM;
941 }
942
943 rv = _bsdthread_ctl_set_self(p, 0, priority, 0, _PTHREAD_SET_SELF_QOS_FLAG, retval);
944
945 /* Static param the thread: we just set QoS on it, so it's stuck in QoS land now. */
946 /* pthread_kern->thread_static_param(th, TRUE); */ // see <rdar://problem/16433744> for details
947
948 thread_deallocate(th);
949
950 out:
951 return rv;
952 }
953
954 static inline struct threadlist *
955 util_get_thread_threadlist_entry(thread_t th)
956 {
957 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
958 if (uth) {
959 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
960 return tl;
961 }
962 return NULL;
963 }
964
965 boolean_t
966 _workq_thread_has_been_unbound(thread_t th, int qos_class)
967 {
968 struct threadlist *tl = util_get_thread_threadlist_entry(th);
969 if (!tl) {
970 return FALSE;
971 }
972
973 struct workqueue *wq = tl->th_workq;
974 workqueue_lock_spin(wq);
975
976 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
977 goto failure;
978 } else if (qos_class != class_index_get_thread_qos(tl->th_priority)) {
979 goto failure;
980 }
981
982 if ((tl->th_flags & TH_LIST_KEVENT_BOUND)){
983 goto failure;
984 }
985 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
986
987 workqueue_unlock(wq);
988 return TRUE;
989
990 failure:
991 workqueue_unlock(wq);
992 return FALSE;
993 }
994
995 int
996 _bsdthread_ctl_set_self(struct proc *p, user_addr_t __unused cmd, pthread_priority_t priority, mach_port_name_t voucher, _pthread_set_flags_t flags, int __unused *retval)
997 {
998 thread_qos_policy_data_t qos;
999 mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
1000 boolean_t gd = FALSE;
1001 thread_t th = current_thread();
1002 struct workqueue *wq = NULL;
1003 struct threadlist *tl = NULL;
1004
1005 kern_return_t kr;
1006 int qos_rv = 0, voucher_rv = 0, fixedpri_rv = 0;
1007
1008 if ((flags & _PTHREAD_SET_SELF_WQ_KEVENT_UNBIND) != 0) {
1009 tl = util_get_thread_threadlist_entry(th);
1010 if (tl) {
1011 wq = tl->th_workq;
1012 } else {
1013 goto qos;
1014 }
1015
1016 workqueue_lock_spin(wq);
1017 if (tl->th_flags & TH_LIST_KEVENT_BOUND) {
1018 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
1019 unsigned int kevent_flags = KEVENT_FLAG_WORKQ | KEVENT_FLAG_UNBIND_CHECK_FLAGS;
1020 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1021 kevent_flags |= KEVENT_FLAG_WORKQ_MANAGER;
1022 }
1023
1024 workqueue_unlock(wq);
1025 __assert_only int ret = kevent_qos_internal_unbind(p, class_index_get_thread_qos(tl->th_priority), th, kevent_flags);
1026 assert(ret == 0);
1027 } else {
1028 workqueue_unlock(wq);
1029 }
1030 }
1031
1032 qos:
1033 if ((flags & _PTHREAD_SET_SELF_QOS_FLAG) != 0) {
1034 kr = pthread_kern->thread_policy_get(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
1035 if (kr != KERN_SUCCESS) {
1036 qos_rv = EINVAL;
1037 goto voucher;
1038 }
1039
1040 /*
1041 * If we have main-thread QoS then we don't allow a thread to come out
1042 * of QOS_CLASS_UNSPECIFIED.
1043 */
1044 if (pthread_kern->qos_main_thread_active() && qos.qos_tier ==
1045 THREAD_QOS_UNSPECIFIED) {
1046 qos_rv = EPERM;
1047 goto voucher;
1048 }
1049
1050 if (!tl) {
1051 tl = util_get_thread_threadlist_entry(th);
1052 if (tl) wq = tl->th_workq;
1053 }
1054
1055 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self | DBG_FUNC_START, wq, qos.qos_tier, qos.tier_importance, 0, 0);
1056
1057 qos.qos_tier = pthread_priority_get_thread_qos(priority);
1058 qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 : _pthread_priority_get_relpri(priority);
1059
1060 if (qos.qos_tier == QOS_CLASS_UNSPECIFIED ||
1061 qos.tier_importance > 0 || qos.tier_importance < THREAD_QOS_MIN_TIER_IMPORTANCE) {
1062 qos_rv = EINVAL;
1063 goto voucher;
1064 }
1065
1066 /*
1067 * If we're a workqueue thread, the threadlist item priority needs adjusting,
1068 * along with the bucket we were running in.
1069 */
1070 if (tl) {
1071 bool try_run_threadreq = false;
1072
1073 workqueue_lock_spin(wq);
1074 kr = pthread_kern->thread_set_workq_qos(th, qos.qos_tier, qos.tier_importance);
1075 assert(kr == KERN_SUCCESS || kr == KERN_TERMINATED);
1076
1077 /* Fix up counters. */
1078 uint8_t old_bucket = tl->th_priority;
1079 uint8_t new_bucket = pthread_priority_get_class_index(priority);
1080
1081 if (old_bucket != new_bucket) {
1082 _wq_thactive_move(wq, old_bucket, new_bucket);
1083 wq->wq_thscheduled_count[old_bucket]--;
1084 wq->wq_thscheduled_count[new_bucket]++;
1085 if (old_bucket == WORKQUEUE_EVENT_MANAGER_BUCKET ||
1086 old_bucket < new_bucket) {
1087 /*
1088 * if the QoS of the thread was lowered, then this could
1089 * allow for a higher QoS thread request to run, so we need
1090 * to reevaluate.
1091 */
1092 try_run_threadreq = true;
1093 }
1094 tl->th_priority = new_bucket;
1095 }
1096
1097 bool old_overcommit = !(tl->th_flags & TH_LIST_CONSTRAINED);
1098 bool new_overcommit = priority & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
1099 if (!old_overcommit && new_overcommit) {
1100 if (wq->wq_constrained_threads_scheduled-- ==
1101 wq_max_constrained_threads) {
1102 try_run_threadreq = true;
1103 }
1104 tl->th_flags &= ~TH_LIST_CONSTRAINED;
1105 } else if (old_overcommit && !new_overcommit) {
1106 wq->wq_constrained_threads_scheduled++;
1107 tl->th_flags |= TH_LIST_CONSTRAINED;
1108 }
1109
1110 if (try_run_threadreq) {
1111 workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
1112 } else {
1113 workqueue_unlock(wq);
1114 }
1115 } else {
1116 kr = pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
1117 if (kr != KERN_SUCCESS) {
1118 qos_rv = EINVAL;
1119 }
1120 }
1121
1122 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self | DBG_FUNC_END, wq, qos.qos_tier, qos.tier_importance, 0, 0);
1123 }
1124
1125 voucher:
1126 if ((flags & _PTHREAD_SET_SELF_VOUCHER_FLAG) != 0) {
1127 kr = pthread_kern->thread_set_voucher_name(voucher);
1128 if (kr != KERN_SUCCESS) {
1129 voucher_rv = ENOENT;
1130 goto fixedpri;
1131 }
1132 }
1133
1134 fixedpri:
1135 if (qos_rv) goto done;
1136 if ((flags & _PTHREAD_SET_SELF_FIXEDPRIORITY_FLAG) != 0) {
1137 thread_extended_policy_data_t extpol = {.timeshare = 0};
1138
1139 if (!tl) tl = util_get_thread_threadlist_entry(th);
1140 if (tl) {
1141 /* Not allowed on workqueue threads */
1142 fixedpri_rv = ENOTSUP;
1143 goto done;
1144 }
1145
1146 kr = pthread_kern->thread_policy_set_internal(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
1147 if (kr != KERN_SUCCESS) {
1148 fixedpri_rv = EINVAL;
1149 goto done;
1150 }
1151 } else if ((flags & _PTHREAD_SET_SELF_TIMESHARE_FLAG) != 0) {
1152 thread_extended_policy_data_t extpol = {.timeshare = 1};
1153
1154 if (!tl) tl = util_get_thread_threadlist_entry(th);
1155 if (tl) {
1156 /* Not allowed on workqueue threads */
1157 fixedpri_rv = ENOTSUP;
1158 goto done;
1159 }
1160
1161 kr = pthread_kern->thread_policy_set_internal(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
1162 if (kr != KERN_SUCCESS) {
1163 fixedpri_rv = EINVAL;
1164 goto done;
1165 }
1166 }
1167
1168 done:
1169 if (qos_rv && voucher_rv) {
1170 /* Both failed; give that a unique error. */
1171 return EBADMSG;
1172 }
1173
1174 if (qos_rv) {
1175 return qos_rv;
1176 }
1177
1178 if (voucher_rv) {
1179 return voucher_rv;
1180 }
1181
1182 if (fixedpri_rv) {
1183 return fixedpri_rv;
1184 }
1185
1186 return 0;
1187 }
1188
1189 int
1190 _bsdthread_ctl_qos_override_start(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval)
1191 {
1192 thread_t th;
1193 int rv = 0;
1194
1195 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
1196 return ESRCH;
1197 }
1198
1199 int override_qos = pthread_priority_get_thread_qos(priority);
1200
1201 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1202 if (tl) {
1203 PTHREAD_TRACE_WQ(TRACE_wq_override_start | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0);
1204 }
1205
1206 /* The only failure case here is if we pass a tid and have it look up the thread; we pass the uthread, so this always succeeds. */
1207 pthread_kern->proc_usynch_thread_qos_add_override_for_resource_check_owner(th, override_qos, TRUE,
1208 resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE, USER_ADDR_NULL, MACH_PORT_NULL);
1209 thread_deallocate(th);
1210 return rv;
1211 }
1212
1213 int
1214 _bsdthread_ctl_qos_override_end(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t resource, user_addr_t arg3, int __unused *retval)
1215 {
1216 thread_t th;
1217 int rv = 0;
1218
1219 if (arg3 != 0) {
1220 return EINVAL;
1221 }
1222
1223 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
1224 return ESRCH;
1225 }
1226
1227 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
1228
1229 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1230 if (tl) {
1231 PTHREAD_TRACE_WQ(TRACE_wq_override_end | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 0, 0, 0);
1232 }
1233
1234 pthread_kern->proc_usynch_thread_qos_remove_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);
1235
1236 thread_deallocate(th);
1237 return rv;
1238 }
1239
1240 static int
1241 _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, user_addr_t ulock_addr)
1242 {
1243 thread_t th;
1244 int rv = 0;
1245
1246 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
1247 return ESRCH;
1248 }
1249
1250 int override_qos = pthread_priority_get_thread_qos(priority);
1251
1252 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1253 if (!tl) {
1254 thread_deallocate(th);
1255 return EPERM;
1256 }
1257
1258 PTHREAD_TRACE_WQ(TRACE_wq_override_dispatch | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0);
1259
1260 rv = pthread_kern->proc_usynch_thread_qos_add_override_for_resource_check_owner(th, override_qos, TRUE,
1261 resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE, ulock_addr, kport);
1262
1263 thread_deallocate(th);
1264 return rv;
1265 }
1266
1267 int _bsdthread_ctl_qos_dispatch_asynchronous_override_add(struct proc __unused *p, user_addr_t __unused cmd,
1268 mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval)
1269 {
1270 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport, priority, resource, USER_ADDR_NULL);
1271 }
1272
1273 int
1274 _bsdthread_ctl_qos_override_dispatch(struct proc *p __unused, user_addr_t cmd __unused, mach_port_name_t kport, pthread_priority_t priority, user_addr_t ulock_addr, int __unused *retval)
1275 {
1276 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport, priority, USER_ADDR_NULL, ulock_addr);
1277 }
1278
1279 int
1280 _bsdthread_ctl_qos_override_reset(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval)
1281 {
1282 if (arg1 != 0 || arg2 != 0 || arg3 != 0) {
1283 return EINVAL;
1284 }
1285
1286 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, 1 /* reset_all */, 0, 0, retval);
1287 }
1288
1289 int
1290 _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(struct proc __unused *p, user_addr_t __unused cmd, int reset_all, user_addr_t resource, user_addr_t arg3, int __unused *retval)
1291 {
1292 if ((reset_all && (resource != 0)) || arg3 != 0) {
1293 return EINVAL;
1294 }
1295
1296 thread_t th = current_thread();
1297 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
1298 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
1299
1300 if (!tl) {
1301 return EPERM;
1302 }
1303
1304 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_NONE, tl->th_workq, 0, 0, 0, 0);
1305
1306 resource = reset_all ? THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD : resource;
1307 pthread_kern->proc_usynch_thread_qos_reset_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE);
1308
1309 return 0;
1310 }
1311
1312 static int
1313 _bsdthread_ctl_max_parallelism(struct proc __unused *p, user_addr_t __unused cmd,
1314 int qos, unsigned long flags, int *retval)
1315 {
1316 _Static_assert(QOS_PARALLELISM_COUNT_LOGICAL ==
1317 _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL, "logical");
1318 _Static_assert(QOS_PARALLELISM_REALTIME ==
1319 _PTHREAD_QOS_PARALLELISM_REALTIME, "realtime");
1320
1321 if (flags & ~(QOS_PARALLELISM_REALTIME | QOS_PARALLELISM_COUNT_LOGICAL)) {
1322 return EINVAL;
1323 }
1324
1325 if (flags & QOS_PARALLELISM_REALTIME) {
1326 if (qos) {
1327 return EINVAL;
1328 }
1329 } else if (qos == THREAD_QOS_UNSPECIFIED || qos >= THREAD_QOS_LAST) {
1330 return EINVAL;
1331 }
1332
1333 *retval = pthread_kern->qos_max_parallelism(qos, flags);
1334 return 0;
1335 }
1336
1337 int
1338 _bsdthread_ctl(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval)
1339 {
1340 switch (cmd) {
1341 case BSDTHREAD_CTL_SET_QOS:
1342 return _bsdthread_ctl_set_qos(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
1343 case BSDTHREAD_CTL_QOS_OVERRIDE_START:
1344 return _bsdthread_ctl_qos_override_start(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1345 case BSDTHREAD_CTL_QOS_OVERRIDE_END:
1346 return _bsdthread_ctl_qos_override_end(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
1347 case BSDTHREAD_CTL_QOS_OVERRIDE_RESET:
1348 return _bsdthread_ctl_qos_override_reset(p, cmd, arg1, arg2, arg3, retval);
1349 case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH:
1350 return _bsdthread_ctl_qos_override_dispatch(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1351 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD:
1352 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1353 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET:
1354 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, (int)arg1, arg2, arg3, retval);
1355 case BSDTHREAD_CTL_SET_SELF:
1356 return _bsdthread_ctl_set_self(p, cmd, (pthread_priority_t)arg1, (mach_port_name_t)arg2, (_pthread_set_flags_t)arg3, retval);
1357 case BSDTHREAD_CTL_QOS_MAX_PARALLELISM:
1358 return _bsdthread_ctl_max_parallelism(p, cmd, (int)arg1, (unsigned long)arg2, retval);
1359 default:
1360 return EINVAL;
1361 }
1362 }
1363
1364 #pragma mark - Workqueue Implementation
1365
1366 #pragma mark wq_flags
1367
1368 static inline uint32_t
1369 _wq_flags(struct workqueue *wq)
1370 {
1371 return atomic_load_explicit(&wq->wq_flags, memory_order_relaxed);
1372 }
1373
1374 static inline bool
1375 _wq_exiting(struct workqueue *wq)
1376 {
1377 return _wq_flags(wq) & WQ_EXITING;
1378 }
1379
1380 static inline uint32_t
1381 _wq_flags_or_orig(struct workqueue *wq, uint32_t v)
1382 {
1383 #if PTHREAD_INLINE_RMW_ATOMICS
1384 uint32_t state;
1385 do {
1386 state = _wq_flags(wq);
1387 } while (!OSCompareAndSwap(state, state | v, &wq->wq_flags));
1388 return state;
1389 #else
1390 return atomic_fetch_or_explicit(&wq->wq_flags, v, memory_order_relaxed);
1391 #endif
1392 }
1393
1394 static inline uint32_t
1395 _wq_flags_and_orig(struct workqueue *wq, uint32_t v)
1396 {
1397 #if PTHREAD_INLINE_RMW_ATOMICS
1398 uint32_t state;
1399 do {
1400 state = _wq_flags(wq);
1401 } while (!OSCompareAndSwap(state, state & v, &wq->wq_flags));
1402 return state;
1403 #else
1404 return atomic_fetch_and_explicit(&wq->wq_flags, v, memory_order_relaxed);
1405 #endif
1406 }
1407
1408 static inline bool
1409 WQ_TIMER_DELAYED_NEEDED(struct workqueue *wq)
1410 {
1411 uint32_t oldflags, newflags;
1412 do {
1413 oldflags = _wq_flags(wq);
1414 if (oldflags & (WQ_EXITING | WQ_ATIMER_DELAYED_RUNNING)) {
1415 return false;
1416 }
1417 newflags = oldflags | WQ_ATIMER_DELAYED_RUNNING;
1418 } while (!OSCompareAndSwap(oldflags, newflags, &wq->wq_flags));
1419 return true;
1420 }
1421
1422 static inline bool
1423 WQ_TIMER_IMMEDIATE_NEEDED(struct workqueue *wq)
1424 {
1425 uint32_t oldflags, newflags;
1426 do {
1427 oldflags = _wq_flags(wq);
1428 if (oldflags & (WQ_EXITING | WQ_ATIMER_IMMEDIATE_RUNNING)) {
1429 return false;
1430 }
1431 newflags = oldflags | WQ_ATIMER_IMMEDIATE_RUNNING;
1432 } while (!OSCompareAndSwap(oldflags, newflags, &wq->wq_flags));
1433 return true;
1434 }
1435
1436 #pragma mark thread requests pacing
1437
1438 static inline uint32_t
1439 _wq_pacing_shift_for_pri(int pri)
1440 {
1441 return _wq_bucket_to_thread_qos(pri) - 1;
1442 }
1443
1444 static inline int
1445 _wq_highest_paced_priority(struct workqueue *wq)
1446 {
1447 uint8_t paced = wq->wq_paced;
1448 int msb = paced ? 32 - __builtin_clz(paced) : 0; // fls(paced) == bit + 1
1449 return WORKQUEUE_EVENT_MANAGER_BUCKET - msb;
1450 }
1451
1452 static inline uint8_t
1453 _wq_pacing_bit_for_pri(int pri)
1454 {
1455 return 1u << _wq_pacing_shift_for_pri(pri);
1456 }
1457
1458 static inline bool
1459 _wq_should_pace_priority(struct workqueue *wq, int pri)
1460 {
1461 return wq->wq_paced >= _wq_pacing_bit_for_pri(pri);
1462 }
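/*
 * Worked example (illustrative, event manager being bucket 6 as in the
 * bucket/QoS table near the top of this file): bucket 3 (utility) maps to
 * thread QoS 3, so its pacing bit is 1 << 2 == 0x04.  With
 * wq_paced == 0x06, _wq_highest_paced_priority() computes msb == 3 and
 * returns bucket 6 - 3 == 3, and _wq_should_pace_priority() is true for
 * buckets 3, 4 and 5 but false for the higher-QoS buckets and the event
 * manager.
 */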
1463
1464 static inline void
1465 _wq_pacing_start(struct workqueue *wq, struct threadlist *tl)
1466 {
1467 uint8_t bit = _wq_pacing_bit_for_pri(tl->th_priority);
1468 assert((tl->th_flags & TH_LIST_PACING) == 0);
1469 assert((wq->wq_paced & bit) == 0);
1470 wq->wq_paced |= bit;
1471 tl->th_flags |= TH_LIST_PACING;
1472 }
1473
1474 static inline bool
1475 _wq_pacing_end(struct workqueue *wq, struct threadlist *tl)
1476 {
1477 if (tl->th_flags & TH_LIST_PACING) {
1478 uint8_t bit = _wq_pacing_bit_for_pri(tl->th_priority);
1479 assert((wq->wq_paced & bit) != 0);
1480 wq->wq_paced ^= bit;
1481 tl->th_flags &= ~TH_LIST_PACING;
1482 return wq->wq_paced < bit; // !_wq_should_pace_priority
1483 }
1484 return false;
1485 }
1486
1487 #pragma mark thread requests
1488
1489 static void
1490 _threadreq_init_alloced(struct threadreq *req, int priority, int flags)
1491 {
1492 assert((flags & TR_FLAG_ONSTACK) == 0);
1493 req->tr_state = TR_STATE_NEW;
1494 req->tr_priority = priority;
1495 req->tr_flags = flags;
1496 }
1497
1498 static void
1499 _threadreq_init_stack(struct threadreq *req, int priority, int flags)
1500 {
1501 req->tr_state = TR_STATE_NEW;
1502 req->tr_priority = priority;
1503 req->tr_flags = flags | TR_FLAG_ONSTACK;
1504 }
1505
1506 static void
1507 _threadreq_copy_prepare(struct workqueue *wq)
1508 {
1509 again:
1510 if (wq->wq_cached_threadreq) {
1511 return;
1512 }
1513
1514 workqueue_unlock(wq);
1515 struct threadreq *req = zalloc(pthread_zone_threadreq);
1516 workqueue_lock_spin(wq);
1517
1518 if (wq->wq_cached_threadreq) {
1519 /*
1520 * We lost the race and someone left behind an extra threadreq for us
1521 * to use. Throw away our request and retry.
1522 */
1523 workqueue_unlock(wq);
1524 zfree(pthread_zone_threadreq, req);
1525 workqueue_lock_spin(wq);
1526 goto again;
1527 } else {
1528 wq->wq_cached_threadreq = req;
1529 }
1530
1531 assert(wq->wq_cached_threadreq);
1532 }
1533
1534 static bool
1535 _threadreq_copy_prepare_noblock(struct workqueue *wq)
1536 {
1537 if (wq->wq_cached_threadreq) {
1538 return true;
1539 }
1540
1541 wq->wq_cached_threadreq = zalloc_noblock(pthread_zone_threadreq);
1542
1543 return wq->wq_cached_threadreq != NULL;
1544 }
1545
1546 static inline struct threadreq_head *
1547 _threadreq_list_for_req(struct workqueue *wq, const struct threadreq *req)
1548 {
1549 if (req->tr_flags & TR_FLAG_OVERCOMMIT) {
1550 return &wq->wq_overcommit_reqlist[req->tr_priority];
1551 } else {
1552 return &wq->wq_reqlist[req->tr_priority];
1553 }
1554 }
1555
1556 static void
1557 _threadreq_enqueue(struct workqueue *wq, struct threadreq *req)
1558 {
1559 assert(req && req->tr_state == TR_STATE_NEW);
1560 if (req->tr_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1561 assert(wq->wq_event_manager_threadreq.tr_state != TR_STATE_WAITING);
1562 memcpy(&wq->wq_event_manager_threadreq, req, sizeof(struct threadreq));
1563 req = &wq->wq_event_manager_threadreq;
1564 req->tr_flags &= ~(TR_FLAG_ONSTACK | TR_FLAG_NO_PACING);
1565 } else {
1566 if (req->tr_flags & TR_FLAG_ONSTACK) {
1567 assert(wq->wq_cached_threadreq);
1568 struct threadreq *newreq = wq->wq_cached_threadreq;
1569 wq->wq_cached_threadreq = NULL;
1570
1571 memcpy(newreq, req, sizeof(struct threadreq));
1572 newreq->tr_flags &= ~(TR_FLAG_ONSTACK | TR_FLAG_NO_PACING);
1573 req->tr_state = TR_STATE_DEAD;
1574 req = newreq;
1575 }
1576 TAILQ_INSERT_TAIL(_threadreq_list_for_req(wq, req), req, tr_entry);
1577 }
1578 req->tr_state = TR_STATE_WAITING;
1579 wq->wq_reqcount++;
1580 }
1581
1582 static void
1583 _threadreq_dequeue(struct workqueue *wq, struct threadreq *req)
1584 {
1585 if (req->tr_priority != WORKQUEUE_EVENT_MANAGER_BUCKET) {
1586 struct threadreq_head *req_list = _threadreq_list_for_req(wq, req);
1587 #if DEBUG
1588 struct threadreq *cursor = NULL;
1589 TAILQ_FOREACH(cursor, req_list, tr_entry) {
1590 if (cursor == req) break;
1591 }
1592 assert(cursor == req);
1593 #endif
1594 TAILQ_REMOVE(req_list, req, tr_entry);
1595 }
1596 wq->wq_reqcount--;
1597 }
1598
1599 /*
1600 * Mark a thread request as complete. At this point, it is treated as owned by
1601 * the submitting subsystem and you should assume it could be freed.
1602 *
1603 * Called with the workqueue lock held.
1604 */
1605 static int
1606 _threadreq_complete_and_unlock(proc_t p, struct workqueue *wq,
1607 struct threadreq *req, struct threadlist *tl)
1608 {
1609 struct threadreq *req_tofree = NULL;
1610 bool sync = (req->tr_state == TR_STATE_NEW);
1611 bool workloop = req->tr_flags & TR_FLAG_WORKLOOP;
1612 bool onstack = req->tr_flags & TR_FLAG_ONSTACK;
1613 bool kevent = req->tr_flags & TR_FLAG_KEVENT;
1614 bool unbinding = tl->th_flags & TH_LIST_UNBINDING;
1615 bool locked = true;
1616 bool waking_parked_thread = (tl->th_flags & TH_LIST_BUSY);
1617 int ret;
1618
1619 req->tr_state = TR_STATE_COMPLETE;
1620
1621 if (!workloop && !onstack && req != &wq->wq_event_manager_threadreq) {
1622 if (wq->wq_cached_threadreq) {
1623 req_tofree = req;
1624 } else {
1625 wq->wq_cached_threadreq = req;
1626 }
1627 }
1628
1629 if (tl->th_flags & TH_LIST_UNBINDING) {
1630 tl->th_flags &= ~TH_LIST_UNBINDING;
1631 assert((tl->th_flags & TH_LIST_KEVENT_BOUND));
1632 } else if (workloop || kevent) {
1633 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
1634 tl->th_flags |= TH_LIST_KEVENT_BOUND;
1635 }
1636
1637 if (workloop) {
1638 workqueue_unlock(wq);
1639 ret = pthread_kern->workloop_fulfill_threadreq(wq->wq_proc, (void*)req,
1640 tl->th_thread, sync ? WORKLOOP_FULFILL_THREADREQ_SYNC : 0);
1641 assert(ret == 0);
1642 locked = false;
1643 } else if (kevent) {
1644 unsigned int kevent_flags = KEVENT_FLAG_WORKQ;
1645 if (sync) {
1646 kevent_flags |= KEVENT_FLAG_SYNCHRONOUS_BIND;
1647 }
1648 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1649 kevent_flags |= KEVENT_FLAG_WORKQ_MANAGER;
1650 }
1651 workqueue_unlock(wq);
1652 ret = kevent_qos_internal_bind(wq->wq_proc,
1653 class_index_get_thread_qos(tl->th_priority), tl->th_thread,
1654 kevent_flags);
1655 if (ret != 0) {
1656 workqueue_lock_spin(wq);
1657 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
1658 locked = true;
1659 } else {
1660 locked = false;
1661 }
1662 }
1663
1664 /*
1665 * Run Thread, Run!
1666 */
1667 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 0, 0, 0, 0);
1668 PTHREAD_TRACE_WQ_REQ(TRACE_wq_runitem | DBG_FUNC_START, wq, req, tl->th_priority,
1669 thread_tid(current_thread()), thread_tid(tl->th_thread));
1670
1671 if (waking_parked_thread) {
1672 if (!locked) {
1673 workqueue_lock_spin(wq);
1674 }
1675 tl->th_flags &= ~(TH_LIST_BUSY);
1676 if ((tl->th_flags & TH_LIST_REMOVING_VOUCHER) == 0) {
1677 /*
1678 * If the thread is in the process of removing its voucher, then it
1679 * isn't actually in the wait event yet and we don't need to wake
1680 * it up. Save the trouble (and potential lock-ordering issues
1681 * (see 30617015)).
1682 */
1683 thread_wakeup_thread(tl, tl->th_thread);
1684 }
1685 workqueue_unlock(wq);
1686
1687 if (req_tofree) zfree(pthread_zone_threadreq, req_tofree);
1688 return WQ_RUN_TR_THREAD_STARTED;
1689 }
1690
1691 assert ((tl->th_flags & TH_LIST_PACING) == 0);
1692 if (locked) {
1693 workqueue_unlock(wq);
1694 }
1695 if (req_tofree) zfree(pthread_zone_threadreq, req_tofree);
1696 if (unbinding) {
1697 return WQ_RUN_TR_THREAD_STARTED;
1698 }
1699 _setup_wqthread(p, tl->th_thread, wq, tl, WQ_SETUP_CLEAR_VOUCHER);
1700 pthread_kern->unix_syscall_return(EJUSTRETURN);
1701 __builtin_unreachable();
1702 }
1703
1704 /*
1705 * Mark a thread request as cancelled. Has similar ownership semantics to the
1706 * complete call above.
1707 */
1708 static void
1709 _threadreq_cancel(struct workqueue *wq, struct threadreq *req)
1710 {
1711 assert(req->tr_state == TR_STATE_WAITING);
1712 req->tr_state = TR_STATE_DEAD;
1713
1714 assert((req->tr_flags & TR_FLAG_ONSTACK) == 0);
1715 if (req->tr_flags & TR_FLAG_WORKLOOP) {
1716 __assert_only int ret;
1717 ret = pthread_kern->workloop_fulfill_threadreq(wq->wq_proc, (void*)req,
1718 THREAD_NULL, WORKLOOP_FULFILL_THREADREQ_CANCEL);
1719 assert(ret == 0 || ret == ECANCELED);
1720 } else if (req != &wq->wq_event_manager_threadreq) {
1721 zfree(pthread_zone_threadreq, req);
1722 }
1723 }
1724
1725 #pragma mark workqueue lock
1726
1727 static boolean_t workqueue_lock_spin_is_acquired_kdp(struct workqueue *wq) {
1728 return kdp_lck_spin_is_acquired(&wq->wq_lock);
1729 }
1730
1731 static void
1732 workqueue_lock_spin(struct workqueue *wq)
1733 {
1734 assert(ml_get_interrupts_enabled() == TRUE);
1735 lck_spin_lock(&wq->wq_lock);
1736 }
1737
1738 static bool
1739 workqueue_lock_try(struct workqueue *wq)
1740 {
1741 return lck_spin_try_lock(&wq->wq_lock);
1742 }
1743
1744 static void
1745 workqueue_unlock(struct workqueue *wq)
1746 {
1747 lck_spin_unlock(&wq->wq_lock);
1748 }
1749
1750 #pragma mark workqueue add timer
1751
1752 /**
1753 * Sets up the timer which will call out to workqueue_add_timer
1754 */
1755 static void
1756 workqueue_interval_timer_start(struct workqueue *wq)
1757 {
1758 uint64_t deadline;
1759
1760 /* n.b. wq_timer_interval is reset to 0 in workqueue_add_timer if the
1761 WQ_ATIMER_DELAYED_RUNNING flag is not present. The net effect here is that if a
1762 sequence of threads is required, we'll double the time before we give out
1763 the next one. */
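/*
 * For illustration, with hypothetical tunable values of
 * wq_stalled_window_usecs = 200 and wq_max_timer_interval_usecs = 10000,
 * successive arms of the delayed timer without an intervening reset would
 * use intervals of 200, 400, 800, ... microseconds, capped at 10000.
 */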
1764 if (wq->wq_timer_interval == 0) {
1765 wq->wq_timer_interval = wq_stalled_window_usecs;
1766
1767 } else {
1768 wq->wq_timer_interval = wq->wq_timer_interval * 2;
1769
1770 if (wq->wq_timer_interval > wq_max_timer_interval_usecs) {
1771 wq->wq_timer_interval = wq_max_timer_interval_usecs;
1772 }
1773 }
1774 clock_interval_to_deadline(wq->wq_timer_interval, 1000, &deadline);
1775
1776 PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1777 _wq_flags(wq), wq->wq_timer_interval, 0);
1778
1779 thread_call_t call = wq->wq_atimer_delayed_call;
1780 if (thread_call_enter1_delayed(call, call, deadline)) {
1781 panic("delayed_call was already enqueued");
1782 }
1783 }
1784
1785 /**
1786 * Immediately trigger the workqueue_add_timer
1787 */
1788 static void
1789 workqueue_interval_timer_trigger(struct workqueue *wq)
1790 {
1791 PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1792 _wq_flags(wq), 0, 0);
1793
1794 thread_call_t call = wq->wq_atimer_immediate_call;
1795 if (thread_call_enter1(call, call)) {
1796 panic("immediate_call was already enqueued");
1797 }
1798 }
1799
1800 /**
1801 * returns whether lastblocked_tsp is within wq_stalled_window_usecs of cur_ts
1802 */
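/*
 * For example (hypothetical numbers): if wq_stalled_window_usecs were 200
 * and the thread last blocked 150us before cur_ts, it is still reported as
 * busy; once more than 200us have elapsed it no longer counts as busy.
 */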
1803 static boolean_t
1804 wq_thread_is_busy(uint64_t cur_ts, _Atomic uint64_t *lastblocked_tsp)
1805 {
1806 clock_sec_t secs;
1807 clock_usec_t usecs;
1808 uint64_t lastblocked_ts;
1809 uint64_t elapsed;
1810
1811 lastblocked_ts = atomic_load_explicit(lastblocked_tsp, memory_order_relaxed);
1812 if (lastblocked_ts >= cur_ts) {
1813 /*
1814 * because the update of the timestamp when a thread blocks isn't
1815 * serialized against us looking at it (i.e. we don't hold the workq lock)
1816 * it's possible to have a timestamp that matches the current time or
1817 * that even looks to be in the future relative to when we grabbed the current
1818 * time... just treat this as a busy thread since it must have just blocked.
1819 */
1820 return (TRUE);
1821 }
1822 elapsed = cur_ts - lastblocked_ts;
1823
1824 pthread_kern->absolutetime_to_microtime(elapsed, &secs, &usecs);
1825
1826 return (secs == 0 && usecs < wq_stalled_window_usecs);
1827 }
1828
1829 /**
1830 * handler function for the timer
1831 */
1832 static void
1833 workqueue_add_timer(struct workqueue *wq, thread_call_t thread_call_self)
1834 {
1835 proc_t p = wq->wq_proc;
1836
1837 workqueue_lock_spin(wq);
1838
1839 PTHREAD_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_START, wq,
1840 _wq_flags(wq), wq->wq_nthreads, wq->wq_thidlecount, 0);
1841
1842 /*
1843 * There are two tricky issues here.
1844 *
1845 * First issue: we start the thread_calls that invoke this routine without
1846 * the workqueue lock held. The scheduler callback needs to trigger
1847 * reevaluation of the number of running threads but shouldn't take that
1848 * lock, so we can't use it to synchronize state around the thread_call.
1849 * As a result, it might re-enter the thread_call while this routine is
1850 * already running. This could cause it to fire a second time and we'll
1851 * have two add_timers running at once. Obviously, we don't want that to
1852 * keep stacking, so we need to keep it at two timers.
1853 *
1854 * Solution: use wq_flags (accessed via atomic CAS) to synchronize the
1855 * enqueue of the thread_call itself. When a thread needs to trigger the
1856 * add_timer, it checks for ATIMER_DELAYED_RUNNING and, when not set, sets
1857 * the flag then does a thread_call_enter. We'll then remove that flag
1858 * only once we've got the lock and it's safe for the thread_call to be
1859 * entered again.
1860 *
1861 * Second issue: we need to make sure that the two timers don't execute this
1862 * routine concurrently. We can't use the workqueue lock for this because
1863 * we'll need to drop it during our execution.
1864 *
1865 * Solution: use WQL_ATIMER_BUSY as a condition variable to indicate that
1866 * we are currently executing the routine and the next thread should wait.
1867 *
1868 * After all that, we arrive at the following four possible states:
1869 * !WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY: no pending timer, no active timer
1870 * !WQ_ATIMER_DELAYED_RUNNING && WQL_ATIMER_BUSY: no pending timer, 1 active timer
1871 * WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY: 1 pending timer, no active timer
1872 * WQ_ATIMER_DELAYED_RUNNING && WQL_ATIMER_BUSY: 1 pending timer, 1 active timer
1873 *
1874 * A further complication: sometimes we need to trigger this function to run
1875 * without delay. Because we aren't under a lock between setting
1876 * WQ_ATIMER_DELAYED_RUNNING and calling thread_call_enter, we can't simply
1877 * re-enter the thread call: if thread_call_enter() returned false, we
1878 * wouldn't be able to distinguish the case where the thread_call had
1879 * already fired from the case where it hadn't been entered yet from the
1880 * other thread. So, we use a separate thread_call for immediate
1881 * invocations, and a separate RUNNING flag, WQ_ATIMER_IMMEDIATE_RUNNING.
1882 */
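/*
 * For reference, the arming side of this protocol looks roughly like the
 * pattern used elsewhere in this file (see workqueue_callback and
 * may_start_constrained_thread):
 *
 *	if (WQ_TIMER_DELAYED_NEEDED(wq)) {
 *		workqueue_interval_timer_start(wq);
 *	}
 *
 * where WQ_TIMER_DELAYED_NEEDED() is expected to set WQ_ATIMER_DELAYED_RUNNING
 * via atomic CAS only when it isn't already set (and WQ_EXITING isn't set),
 * so at most one delayed thread_call is ever enqueued.
 */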
1883
1884 while (wq->wq_lflags & WQL_ATIMER_BUSY) {
1885 wq->wq_lflags |= WQL_ATIMER_WAITING;
1886
1887 assert_wait((caddr_t)wq, (THREAD_UNINT));
1888 workqueue_unlock(wq);
1889
1890 thread_block(THREAD_CONTINUE_NULL);
1891
1892 workqueue_lock_spin(wq);
1893 }
1894 /*
1895 * Prevent _workqueue_mark_exiting() from tearing the workqueue down underneath us
1896 */
1897 wq->wq_lflags |= WQL_ATIMER_BUSY;
1898
1899 /*
1900 * Decide which timer we are and remove the RUNNING flag.
1901 */
1902 if (thread_call_self == wq->wq_atimer_delayed_call) {
1903 uint64_t wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_DELAYED_RUNNING);
1904 if ((wq_flags & WQ_ATIMER_DELAYED_RUNNING) == 0) {
1905 panic("workqueue_add_timer(delayed) w/o WQ_ATIMER_DELAYED_RUNNING");
1906 }
1907 } else if (thread_call_self == wq->wq_atimer_immediate_call) {
1908 uint64_t wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_IMMEDIATE_RUNNING);
1909 if ((wq_flags & WQ_ATIMER_IMMEDIATE_RUNNING) == 0) {
1910 panic("workqueue_add_timer(immediate) w/o WQ_ATIMER_IMMEDIATE_RUNNING");
1911 }
1912 } else {
1913 panic("workqueue_add_timer can't figure out which timer it is");
1914 }
1915
1916 int ret = WQ_RUN_TR_THREAD_STARTED;
1917 while (ret == WQ_RUN_TR_THREAD_STARTED && wq->wq_reqcount) {
1918 ret = workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
1919
1920 workqueue_lock_spin(wq);
1921 }
1922 _threadreq_copy_prepare(wq);
1923
1924 /*
1925 * If WQ_TIMER_DELAYED_NEEDED was used above to re-arm the delayed timer, then
1926 * WQ_ATIMER_DELAYED_RUNNING will be set again. If so, we let the timer interval grow.
1927 * Otherwise, we reset it back to 0.
1928 */
1929 uint32_t wq_flags = _wq_flags(wq);
1930 if (!(wq_flags & WQ_ATIMER_DELAYED_RUNNING)) {
1931 wq->wq_timer_interval = 0;
1932 }
1933
1934 wq->wq_lflags &= ~WQL_ATIMER_BUSY;
1935
1936 if ((wq_flags & WQ_EXITING) || (wq->wq_lflags & WQL_ATIMER_WAITING)) {
1937 /*
1938 * wakeup the thread hung up in _workqueue_mark_exiting or
1939 * workqueue_add_timer waiting for this timer to finish getting out of
1940 * the way
1941 */
1942 wq->wq_lflags &= ~WQL_ATIMER_WAITING;
1943 wakeup(wq);
1944 }
1945
1946 PTHREAD_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_END, wq, 0, wq->wq_nthreads, wq->wq_thidlecount, 0);
1947
1948 workqueue_unlock(wq);
1949 }
1950
1951 #pragma mark thread state tracking
1952
1953 // called by spinlock code when trying to yield to lock owner
1954 void
1955 _workqueue_thread_yielded(void)
1956 {
1957 }
1958
1959 static void
1960 workqueue_callback(int type, thread_t thread)
1961 {
1962 struct uthread *uth = pthread_kern->get_bsdthread_info(thread);
1963 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
1964 struct workqueue *wq = tl->th_workq;
1965 uint32_t old_count, req_qos, qos = tl->th_priority;
1966 wq_thactive_t old_thactive;
1967
1968 switch (type) {
1969 case SCHED_CALL_BLOCK: {
1970 bool start_timer = false;
1971
1972 old_thactive = _wq_thactive_dec(wq, tl->th_priority);
1973 req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
1974 old_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
1975 qos, NULL, NULL);
1976
1977 if (old_count == wq_max_concurrency[tl->th_priority]) {
1978 /*
1979 * The number of active threads at this priority has fallen below
1980 * the maximum number of concurrent threads that are allowed to run
1981 *
1982 * if we collide with another thread trying to update the
1983 * last_blocked (really unlikely since another thread would have to
1984 * get scheduled and then block after we start down this path), it's
1985 * not a problem. Either timestamp is adequate, so no need to retry
1986 */
1987 atomic_store_explicit(&wq->wq_lastblocked_ts[qos],
1988 mach_absolute_time(), memory_order_relaxed);
1989 }
1990
1991 if (req_qos == WORKQUEUE_EVENT_MANAGER_BUCKET || qos > req_qos) {
1992 /*
1993 * The blocking thread is at a lower QoS than the highest currently
1994 * pending constrained request, so nothing has to be redriven.
1995 */
1996 } else {
1997 uint32_t max_busycount, old_req_count;
1998 old_req_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
1999 req_qos, NULL, &max_busycount);
2000 /*
2001 * It is possible that may_start_constrained_thread refused admission
2002 * because we were over the max concurrency, in which case we may need
2003 * to spin up a new thread.
2004 *
2005 * We take into account the maximum number of busy threads that could
2006 * have affected may_start_constrained_thread, because looking at the
2007 * actual number it will see is racy.
2008 *
2009 * IOW at NCPU = 4, for IN (req_qos = 1), if the old req count is
2010 * between NCPU (4) and NCPU - 2 (2) we need to redrive.
2011 */
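/*
 * Worked example with hypothetical numbers: if wq_max_concurrency[req_qos]
 * is 4 and max_busycount is 2, we redrive when old_req_count is 2, 3 or 4,
 * i.e. whenever 4 <= old_req_count + 2 and old_req_count <= 4.
 */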
2012 if (wq_max_concurrency[req_qos] <= old_req_count + max_busycount &&
2013 old_req_count <= wq_max_concurrency[req_qos]) {
2014 if (WQ_TIMER_DELAYED_NEEDED(wq)) {
2015 start_timer = true;
2016 workqueue_interval_timer_start(wq);
2017 }
2018 }
2019 }
2020
2021 PTHREAD_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_START, wq,
2022 old_count - 1, qos | (req_qos << 8),
2023 wq->wq_reqcount << 1 | start_timer, 0);
2024 break;
2025 }
2026 case SCHED_CALL_UNBLOCK: {
2027 /*
2028 * we cannot take the workqueue_lock here...
2029 * an UNBLOCK can occur from a timer event which
2030 * is run from an interrupt context... if the workqueue_lock
2031 * is already held by this processor, we'll deadlock...
2032 * the thread lock for the thread being UNBLOCKED
2033 * is also held
2034 */
2035 old_thactive = _wq_thactive_inc(wq, qos);
2036 if (pthread_debug_tracing) {
2037 req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
2038 old_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
2039 qos, NULL, NULL);
2040 PTHREAD_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_END, wq,
2041 old_count + 1, qos | (req_qos << 8),
2042 wq->wq_threads_scheduled, 0);
2043 }
2044 break;
2045 }
2046 }
2047 }
2048
2049 sched_call_t
2050 _workqueue_get_sched_callback(void)
2051 {
2052 return workqueue_callback;
2053 }
2054
2055 #pragma mark thread addition/removal
2056
2057 static mach_vm_size_t
2058 _workqueue_allocsize(struct workqueue *wq)
2059 {
2060 proc_t p = wq->wq_proc;
2061 mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);
2062 mach_vm_size_t pthread_size =
2063 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(wq->wq_map));
2064 return guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
2065 }
2066
2067 /**
2068 * pop goes the thread
2069 *
2070 * If fromexit is set, the call is from workqueue_exit(),
2071 * so some cleanups are to be avoided.
2072 */
2073 static void
2074 workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use)
2075 {
2076 struct uthread * uth;
2077 struct workqueue * wq = tl->th_workq;
2078
2079 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET){
2080 TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
2081 } else {
2082 TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
2083 }
2084
2085 if (fromexit == 0) {
2086 assert(wq->wq_nthreads && wq->wq_thidlecount);
2087 wq->wq_nthreads--;
2088 wq->wq_thidlecount--;
2089 }
2090
2091 /*
2092 * Clear the threadlist pointer in uthread so that
2093 * a blocked thread, on wakeup for termination, will
2094 * not access the thread list as it is going to be
2095 * freed.
2096 */
2097 pthread_kern->thread_sched_call(tl->th_thread, NULL);
2098
2099 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
2100 if (uth != (struct uthread *)0) {
2101 pthread_kern->uthread_set_threadlist(uth, NULL);
2102 }
2103 if (fromexit == 0) {
2104 /* during exit the lock is not held */
2105 workqueue_unlock(wq);
2106 }
2107
2108 if ((tl->th_flags & TH_LIST_NEW) || first_use) {
2109 /*
2110 * thread was created, but never used...
2111 * need to clean up the stack and port ourselves
2112 * since we're not going to spin up through the
2113 * normal exit path triggered from Libc
2114 */
2115 if (fromexit == 0) {
2116 /* vm map is already deallocated when this is called from exit */
2117 (void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, _workqueue_allocsize(wq));
2118 }
2119 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(wq->wq_task), tl->th_thport);
2120 }
2121 /*
2122 * drop our ref on the thread
2123 */
2124 thread_deallocate(tl->th_thread);
2125
2126 zfree(pthread_zone_threadlist, tl);
2127 }
2128
2129
2130 /**
2131 * Try to add a new workqueue thread.
2132 *
2133 * - called with workq lock held
2134 * - dropped and retaken around thread creation
2135 * - return with workq lock held
2136 */
2137 static bool
2138 workqueue_addnewthread(proc_t p, struct workqueue *wq)
2139 {
2140 kern_return_t kret;
2141
2142 wq->wq_nthreads++;
2143
2144 workqueue_unlock(wq);
2145
2146 struct threadlist *tl = zalloc(pthread_zone_threadlist);
2147 bzero(tl, sizeof(struct threadlist));
2148
2149 thread_t th;
2150 kret = pthread_kern->thread_create_workq_waiting(wq->wq_task, wq_unpark_continue, tl, &th);
2151 if (kret != KERN_SUCCESS) {
2152 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 0, 0, 0);
2153 goto fail_free;
2154 }
2155
2156 mach_vm_offset_t stackaddr = pthread_kern->proc_get_stack_addr_hint(p);
2157
2158 mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);
2159 mach_vm_size_t pthread_size =
2160 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(wq->wq_map));
2161 mach_vm_size_t th_allocsize = guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
2162
2163 kret = mach_vm_map(wq->wq_map, &stackaddr,
2164 th_allocsize, page_size-1,
2165 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE, NULL,
2166 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
2167 VM_INHERIT_DEFAULT);
2168
2169 if (kret != KERN_SUCCESS) {
2170 kret = mach_vm_allocate(wq->wq_map,
2171 &stackaddr, th_allocsize,
2172 VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE);
2173 }
2174
2175 if (kret != KERN_SUCCESS) {
2176 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 1, 0, 0);
2177 goto fail_terminate;
2178 }
2179
2180 /*
2181 * The guard page is at the lowest address
2182 * The stack base is the highest address
2183 */
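/*
 * Roughly, the region allocated above is laid out from lowest to highest
 * address as: a VM_PROT_NONE guard page of guardsize bytes at stackaddr,
 * then the PTH_DEFAULT_STACKSIZE stack (growing down toward the guard
 * page), then the pthread_size area for the pthread_t.
 */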
2184 kret = mach_vm_protect(wq->wq_map, stackaddr, guardsize, FALSE, VM_PROT_NONE);
2185 if (kret != KERN_SUCCESS) {
2186 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 2, 0, 0);
2187 goto fail_vm_deallocate;
2188 }
2189
2190
2191 pthread_kern->thread_set_tag(th, THREAD_TAG_PTHREAD | THREAD_TAG_WORKQUEUE);
2192 pthread_kern->thread_static_param(th, TRUE);
2193
2194 /*
2195 * convert_thread_to_port() consumes a reference
2196 */
2197 thread_reference(th);
2198 void *sright = (void *)pthread_kern->convert_thread_to_port(th);
2199 tl->th_thport = pthread_kern->ipc_port_copyout_send(sright,
2200 pthread_kern->task_get_ipcspace(wq->wq_task));
2201
2202 tl->th_flags = TH_LIST_INITED | TH_LIST_NEW;
2203 tl->th_thread = th;
2204 tl->th_workq = wq;
2205 tl->th_stackaddr = stackaddr;
2206 tl->th_priority = WORKQUEUE_NUM_BUCKETS;
2207
2208 struct uthread *uth;
2209 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
2210
2211 workqueue_lock_spin(wq);
2212
2213 void *current_tl = pthread_kern->uthread_get_threadlist(uth);
2214 if (current_tl == NULL) {
2215 pthread_kern->uthread_set_threadlist(uth, tl);
2216 TAILQ_INSERT_TAIL(&wq->wq_thidlelist, tl, th_entry);
2217 wq->wq_thidlecount++;
2218 } else if (current_tl == WQ_THREADLIST_EXITING_POISON) {
2219 /*
2220 * Failed thread creation race: The thread already woke up and has exited.
2221 */
2222 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 3, 0, 0);
2223 goto fail_unlock;
2224 } else {
2225 panic("Unexpected initial threadlist value");
2226 }
2227
2228 PTHREAD_TRACE_WQ(TRACE_wq_thread_create | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
2229
2230 return (TRUE);
2231
2232 fail_unlock:
2233 workqueue_unlock(wq);
2234 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(wq->wq_task),
2235 tl->th_thport);
2236
2237 fail_vm_deallocate:
2238 (void) mach_vm_deallocate(wq->wq_map, stackaddr, th_allocsize);
2239
2240 fail_terminate:
2241 if (pthread_kern->thread_will_park_or_terminate) {
2242 pthread_kern->thread_will_park_or_terminate(th);
2243 }
2244 (void)thread_terminate(th);
2245 thread_deallocate(th);
2246
2247 fail_free:
2248 zfree(pthread_zone_threadlist, tl);
2249
2250 workqueue_lock_spin(wq);
2251 wq->wq_nthreads--;
2252
2253 return (FALSE);
2254 }
2255
2256 /**
2257 * Setup per-process state for the workqueue.
2258 */
2259 int
2260 _workq_open(struct proc *p, __unused int32_t *retval)
2261 {
2262 struct workqueue * wq;
2263 char * ptr;
2264 uint32_t num_cpus;
2265 int error = 0;
2266
2267 if (pthread_kern->proc_get_register(p) == 0) {
2268 return EINVAL;
2269 }
2270
2271 num_cpus = pthread_kern->ml_get_max_cpus();
2272
2273 if (wq_init_constrained_limit) {
2274 uint32_t limit;
2275 /*
2276 * Set up the limit for the constrained pool.
2277 * This is a virtual pool in that we don't
2278 * maintain it on a separate idle and run list.
2279 */
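/*
 * For example, on a machine reporting 8 CPUs (a hypothetical value), the
 * limit below becomes 8 * WORKQUEUE_CONSTRAINED_FACTOR, and
 * wq_max_constrained_threads is only raised to that limit if its current
 * value is smaller.
 */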
2280 limit = num_cpus * WORKQUEUE_CONSTRAINED_FACTOR;
2281
2282 if (limit > wq_max_constrained_threads)
2283 wq_max_constrained_threads = limit;
2284
2285 wq_init_constrained_limit = 0;
2286
2287 if (wq_max_threads > WQ_THACTIVE_BUCKET_HALF) {
2288 wq_max_threads = WQ_THACTIVE_BUCKET_HALF;
2289 }
2290 if (wq_max_threads > pthread_kern->config_thread_max - 20) {
2291 wq_max_threads = pthread_kern->config_thread_max - 20;
2292 }
2293 }
2294
2295 if (pthread_kern->proc_get_wqptr(p) == NULL) {
2296 if (pthread_kern->proc_init_wqptr_or_wait(p) == FALSE) {
2297 assert(pthread_kern->proc_get_wqptr(p) != NULL);
2298 goto out;
2299 }
2300
2301 ptr = (char *)zalloc(pthread_zone_workqueue);
2302 bzero(ptr, sizeof(struct workqueue));
2303
2304 wq = (struct workqueue *)ptr;
2305 wq->wq_proc = p;
2306 wq->wq_task = current_task();
2307 wq->wq_map = pthread_kern->current_map();
2308
2309 // Start the event manager at the priority hinted at by the policy engine
2310 int mgr_priority_hint = pthread_kern->task_get_default_manager_qos(current_task());
2311 wq->wq_event_manager_priority = (uint32_t)thread_qos_get_pthread_priority(mgr_priority_hint) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2312
2313 TAILQ_INIT(&wq->wq_thrunlist);
2314 TAILQ_INIT(&wq->wq_thidlelist);
2315 for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
2316 TAILQ_INIT(&wq->wq_overcommit_reqlist[i]);
2317 TAILQ_INIT(&wq->wq_reqlist[i]);
2318 }
2319
2320 wq->wq_atimer_delayed_call =
2321 thread_call_allocate_with_priority((thread_call_func_t)workqueue_add_timer,
2322 (thread_call_param_t)wq, THREAD_CALL_PRIORITY_KERNEL);
2323 wq->wq_atimer_immediate_call =
2324 thread_call_allocate_with_priority((thread_call_func_t)workqueue_add_timer,
2325 (thread_call_param_t)wq, THREAD_CALL_PRIORITY_KERNEL);
2326
2327 lck_spin_init(&wq->wq_lock, pthread_lck_grp, pthread_lck_attr);
2328
2329 wq->wq_cached_threadreq = zalloc(pthread_zone_threadreq);
2330 *(wq_thactive_t *)&wq->wq_thactive =
2331 (wq_thactive_t)WQ_THACTIVE_NO_PENDING_REQUEST <<
2332 WQ_THACTIVE_QOS_SHIFT;
2333
2334 pthread_kern->proc_set_wqptr(p, wq);
2335
2336 }
2337 out:
2338
2339 return(error);
2340 }
2341
2342 /*
2343 * Routine: workqueue_mark_exiting
2344 *
2345 * Function: Mark the work queue such that new threads will not be added to the
2346 * work queue after we return.
2347 *
2348 * Conditions: Called against the current process.
2349 */
2350 void
2351 _workqueue_mark_exiting(struct proc *p)
2352 {
2353 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
2354 if (!wq) return;
2355
2356 PTHREAD_TRACE_WQ(TRACE_wq_pthread_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);
2357
2358 workqueue_lock_spin(wq);
2359
2360 /*
2361 * We arm the add timer without holding the workqueue lock so we need
2362 * to synchronize with any running or soon to be running timers.
2363 *
2364 * Threads that intend to arm the timer atomically OR
2365 * WQ_ATIMER_{DELAYED,IMMEDIATE}_RUNNING into the wq_flags, only if
2366 * WQ_EXITING is not present. So, once we have set WQ_EXITING, we can
2367 * be sure that no new RUNNING flags will be set, but still need to
2368 * wait for the already running timers to complete.
2369 *
2370 * We always hold the workq lock when dropping the WQ_ATIMER_*_RUNNING
2371 * flags, so the check-and-sleep-until-clear sequence below is protected.
2372 */
2373 uint64_t wq_flags = _wq_flags_or_orig(wq, WQ_EXITING);
2374
2375 if (wq_flags & WQ_ATIMER_DELAYED_RUNNING) {
2376 if (thread_call_cancel(wq->wq_atimer_delayed_call) == TRUE) {
2377 wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_DELAYED_RUNNING);
2378 }
2379 }
2380 if (wq_flags & WQ_ATIMER_IMMEDIATE_RUNNING) {
2381 if (thread_call_cancel(wq->wq_atimer_immediate_call) == TRUE) {
2382 wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_IMMEDIATE_RUNNING);
2383 }
2384 }
2385 while ((_wq_flags(wq) & (WQ_ATIMER_DELAYED_RUNNING | WQ_ATIMER_IMMEDIATE_RUNNING)) ||
2386 (wq->wq_lflags & WQL_ATIMER_BUSY)) {
2387 assert_wait((caddr_t)wq, (THREAD_UNINT));
2388 workqueue_unlock(wq);
2389
2390 thread_block(THREAD_CONTINUE_NULL);
2391
2392 workqueue_lock_spin(wq);
2393 }
2394
2395 /*
2396 * Save off pending requests, will complete/free them below after unlocking
2397 */
2398 TAILQ_HEAD(, threadreq) local_list = TAILQ_HEAD_INITIALIZER(local_list);
2399
2400 for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
2401 TAILQ_CONCAT(&local_list, &wq->wq_overcommit_reqlist[i], tr_entry);
2402 TAILQ_CONCAT(&local_list, &wq->wq_reqlist[i], tr_entry);
2403 }
2404
2405 /*
2406 * XXX: We can't do a deferred cancel of the event manager request, so just smash it.
2407 */
2408 assert((wq->wq_event_manager_threadreq.tr_flags & TR_FLAG_WORKLOOP) == 0);
2409 wq->wq_event_manager_threadreq.tr_state = TR_STATE_DEAD;
2410
2411 workqueue_unlock(wq);
2412
2413 struct threadreq *tr, *tr_temp;
2414 TAILQ_FOREACH_SAFE(tr, &local_list, tr_entry, tr_temp) {
2415 _threadreq_cancel(wq, tr);
2416 }
2417 PTHREAD_TRACE(TRACE_wq_pthread_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
2418 }
2419
2420 /*
2421 * Routine: workqueue_exit
2422 *
2423 * Function: clean up the work queue structure(s) now that there are no threads
2424 * left running inside the work queue (except possibly current_thread).
2425 *
2426 * Conditions: Called by the last thread in the process.
2427 * Called against current process.
2428 */
2429 void
2430 _workqueue_exit(struct proc *p)
2431 {
2432 struct workqueue * wq;
2433 struct threadlist * tl, *tlist;
2434 struct uthread *uth;
2435
2436 wq = pthread_kern->proc_get_wqptr(p);
2437 if (wq != NULL) {
2438
2439 PTHREAD_TRACE_WQ(TRACE_wq_workqueue_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);
2440
2441 pthread_kern->proc_set_wqptr(p, NULL);
2442
2443 /*
2444 * Clean up workqueue data structures for threads that exited and
2445 * didn't get a chance to clean up after themselves.
2446 */
2447 TAILQ_FOREACH_SAFE(tl, &wq->wq_thrunlist, th_entry, tlist) {
2448 assert((tl->th_flags & TH_LIST_RUNNING) != 0);
2449
2450 pthread_kern->thread_sched_call(tl->th_thread, NULL);
2451
2452 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
2453 if (uth != (struct uthread *)0) {
2454 pthread_kern->uthread_set_threadlist(uth, NULL);
2455 }
2456 TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
2457
2458 /*
2459 * drop our last ref on the thread
2460 */
2461 thread_deallocate(tl->th_thread);
2462
2463 zfree(pthread_zone_threadlist, tl);
2464 }
2465 TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlelist, th_entry, tlist) {
2466 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
2467 assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
2468 workqueue_removethread(tl, true, false);
2469 }
2470 TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlemgrlist, th_entry, tlist) {
2471 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
2472 assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
2473 workqueue_removethread(tl, true, false);
2474 }
2475 if (wq->wq_cached_threadreq) {
2476 zfree(pthread_zone_threadreq, wq->wq_cached_threadreq);
2477 }
2478 thread_call_free(wq->wq_atimer_delayed_call);
2479 thread_call_free(wq->wq_atimer_immediate_call);
2480 lck_spin_destroy(&wq->wq_lock, pthread_lck_grp);
2481
2482 for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
2483 assert(TAILQ_EMPTY(&wq->wq_overcommit_reqlist[i]));
2484 assert(TAILQ_EMPTY(&wq->wq_reqlist[i]));
2485 }
2486
2487 zfree(pthread_zone_workqueue, wq);
2488
2489 PTHREAD_TRACE(TRACE_wq_workqueue_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
2490 }
2491 }
2492
2493
2494 #pragma mark workqueue thread manipulation
2495
2496
2497 /**
2498 * Entry point for libdispatch to ask for threads
2499 */
2500 static int
2501 wqops_queue_reqthreads(struct proc *p, int reqcount,
2502 pthread_priority_t priority)
2503 {
2504 bool overcommit = _pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
2505 bool event_manager = _pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2506 int class = event_manager ? WORKQUEUE_EVENT_MANAGER_BUCKET :
2507 pthread_priority_get_class_index(priority);
2508
2509 if ((reqcount <= 0) || (class < 0) || (class >= WORKQUEUE_NUM_BUCKETS) ||
2510 (overcommit && event_manager)) {
2511 return EINVAL;
2512 }
2513
2514 struct workqueue *wq;
2515 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2516 return EINVAL;
2517 }
2518
2519 workqueue_lock_spin(wq);
2520 _threadreq_copy_prepare(wq);
2521
2522 PTHREAD_TRACE_WQ(TRACE_wq_wqops_reqthreads | DBG_FUNC_NONE, wq, reqcount, priority, 0, 0);
2523
2524 int tr_flags = 0;
2525 if (overcommit) tr_flags |= TR_FLAG_OVERCOMMIT;
2526 if (reqcount > 1) {
2527 /*
2528 * When libdispatch asks for more than one thread, it wants to achieve
2529 * parallelism. Pacing would be detrimental to that goal, so treat
2530 * these requests specially and skip the pacing admission check.
2531 */
2532 tr_flags |= TR_FLAG_NO_PACING;
2533 }
2534
2535 while (reqcount-- && !_wq_exiting(wq)) {
2536 struct threadreq req;
2537 _threadreq_init_stack(&req, class, tr_flags);
2538
2539 workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, true);
2540
2541 workqueue_lock_spin(wq); /* reacquire */
2542 _threadreq_copy_prepare(wq);
2543 }
2544
2545 workqueue_unlock(wq);
2546
2547 return 0;
2548 }
2549
2550 /*
2551 * Used by the kevent system to request threads.
2552 *
2553 * Currently count is ignored and we always return one thread per invocation.
2554 */
2555 static thread_t
2556 _workq_kevent_reqthreads(struct proc *p, pthread_priority_t priority,
2557 bool no_emergency)
2558 {
2559 int wq_run_tr = WQ_RUN_TR_THROTTLED;
2560 bool emergency_thread = false;
2561 struct threadreq req;
2562
2563
2564 struct workqueue *wq;
2565 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2566 return THREAD_NULL;
2567 }
2568
2569 int class = pthread_priority_get_class_index(priority);
2570
2571 workqueue_lock_spin(wq);
2572 bool has_threadreq = _threadreq_copy_prepare_noblock(wq);
2573
2574 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, NULL, priority, 0, 0);
2575
2576 /*
2577 * Skip straight to event manager if that's what was requested
2578 */
2579 if ((_pthread_priority_get_qos_newest(priority) == QOS_CLASS_UNSPECIFIED) ||
2580 (_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)){
2581 goto event_manager;
2582 }
2583
2584 bool will_pace = _wq_should_pace_priority(wq, class);
2585 if ((wq->wq_thidlecount == 0 || will_pace) && has_threadreq == false) {
2586 /*
2587 * We'll need to persist the request and can't, so return the emergency
2588 * thread instead, which has a persistent request object.
2589 */
2590 emergency_thread = true;
2591 goto event_manager;
2592 }
2593
2594 /*
2595 * Handle overcommit requests
2596 */
2597 if ((_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0){
2598 _threadreq_init_stack(&req, class, TR_FLAG_KEVENT | TR_FLAG_OVERCOMMIT);
2599 wq_run_tr = workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, false);
2600 goto done;
2601 }
2602
2603 /*
2604 * Handle constrained requests
2605 */
2606 boolean_t may_start = may_start_constrained_thread(wq, class, NULL, false);
2607 if (may_start || no_emergency) {
2608 _threadreq_init_stack(&req, class, TR_FLAG_KEVENT);
2609 wq_run_tr = workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, false);
2610 goto done;
2611 } else {
2612 emergency_thread = true;
2613 }
2614
2615
2616 event_manager:
2617 _threadreq_init_stack(&req, WORKQUEUE_EVENT_MANAGER_BUCKET, TR_FLAG_KEVENT);
2618 wq_run_tr = workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, false);
2619
2620 done:
2621 if (wq_run_tr == WQ_RUN_TR_THREAD_NEEDED && WQ_TIMER_IMMEDIATE_NEEDED(wq)) {
2622 workqueue_interval_timer_trigger(wq);
2623 }
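/*
 * We return (thread_t)-1 when we fell back to the persistent event-manager
 * ("emergency") request above, and 0 otherwise.
 */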
2624 return emergency_thread ? (void*)-1 : 0;
2625 }
2626
2627 thread_t
2628 _workq_reqthreads(struct proc *p, __assert_only int requests_count,
2629 workq_reqthreads_req_t request)
2630 {
2631 assert(requests_count == 1);
2632
2633 pthread_priority_t priority = request->priority;
2634 bool no_emergency = request->count & WORKQ_REQTHREADS_NOEMERGENCY;
2635
2636 return _workq_kevent_reqthreads(p, priority, no_emergency);
2637 }
2638
2639
2640 int
2641 workq_kern_threadreq(struct proc *p, workq_threadreq_t _req,
2642 enum workq_threadreq_type type, unsigned long priority, int flags)
2643 {
2644 struct workqueue *wq;
2645 int ret;
2646
2647 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2648 return EINVAL;
2649 }
2650
2651 switch (type) {
2652 case WORKQ_THREADREQ_KEVENT: {
2653 bool no_emergency = flags & WORKQ_THREADREQ_FLAG_NOEMERGENCY;
2654 (void)_workq_kevent_reqthreads(p, priority, no_emergency);
2655 return 0;
2656 }
2657 case WORKQ_THREADREQ_WORKLOOP:
2658 case WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL: {
2659 struct threadreq *req = (struct threadreq *)_req;
2660 int req_class = pthread_priority_get_class_index(priority);
2661 int req_flags = TR_FLAG_WORKLOOP;
2662 if ((_pthread_priority_get_flags(priority) &
2663 _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0){
2664 req_flags |= TR_FLAG_OVERCOMMIT;
2665 }
2666
2667 thread_t thread = current_thread();
2668 struct threadlist *tl = util_get_thread_threadlist_entry(thread);
2669
2670 if (tl && tl != WQ_THREADLIST_EXITING_POISON &&
2671 (tl->th_flags & TH_LIST_UNBINDING)) {
2672 /*
2673 * we're called back synchronously from the context of
2674 * kevent_qos_internal_unbind from within wqops_thread_return()
2675 * we can try to match up this thread with this request !
2676 */
2677 } else {
2678 tl = NULL;
2679 }
2680
2681 _threadreq_init_alloced(req, req_class, req_flags);
2682 workqueue_lock_spin(wq);
2683 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, req, priority, 1, 0);
2684 ret = workqueue_run_threadreq_and_unlock(p, wq, tl, req, false);
2685 if (ret == WQ_RUN_TR_EXITING) {
2686 return ECANCELED;
2687 }
2688 if (ret == WQ_RUN_TR_THREAD_NEEDED) {
2689 if (type == WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL) {
2690 return EAGAIN;
2691 }
2692 if (WQ_TIMER_IMMEDIATE_NEEDED(wq)) {
2693 workqueue_interval_timer_trigger(wq);
2694 }
2695 }
2696 return 0;
2697 }
2698 case WORKQ_THREADREQ_REDRIVE:
2699 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, 0, 0, 4, 0);
2700 workqueue_lock_spin(wq);
2701 ret = workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
2702 if (ret == WQ_RUN_TR_EXITING) {
2703 return ECANCELED;
2704 }
2705 return 0;
2706 default:
2707 return ENOTSUP;
2708 }
2709 }
2710
2711 int
2712 workq_kern_threadreq_modify(struct proc *p, workq_threadreq_t _req,
2713 enum workq_threadreq_op operation, unsigned long arg1,
2714 unsigned long __unused arg2)
2715 {
2716 struct threadreq *req = (struct threadreq *)_req;
2717 struct workqueue *wq;
2718 int priclass, ret = 0, wq_tr_rc = WQ_RUN_TR_THROTTLED;
2719
2720 if (req == NULL || (wq = pthread_kern->proc_get_wqptr(p)) == NULL) {
2721 return EINVAL;
2722 }
2723
2724 workqueue_lock_spin(wq);
2725
2726 if (_wq_exiting(wq)) {
2727 ret = ECANCELED;
2728 goto out_unlock;
2729 }
2730
2731 /*
2732 * Find/validate the referenced request structure
2733 */
2734 if (req->tr_state != TR_STATE_WAITING) {
2735 ret = EINVAL;
2736 goto out_unlock;
2737 }
2738 assert(req->tr_priority < WORKQUEUE_EVENT_MANAGER_BUCKET);
2739 assert(req->tr_flags & TR_FLAG_WORKLOOP);
2740
2741 switch (operation) {
2742 case WORKQ_THREADREQ_CHANGE_PRI:
2743 case WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL:
2744 priclass = pthread_priority_get_class_index(arg1);
2745 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, req, arg1, 2, 0);
2746 if (req->tr_priority == priclass) {
2747 goto out_unlock;
2748 }
2749 _threadreq_dequeue(wq, req);
2750 req->tr_priority = priclass;
2751 req->tr_state = TR_STATE_NEW; // what was old is new again
2752 wq_tr_rc = workqueue_run_threadreq_and_unlock(p, wq, NULL, req, false);
2753 goto out;
2754
2755 case WORKQ_THREADREQ_CANCEL:
2756 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, req, 0, 3, 0);
2757 _threadreq_dequeue(wq, req);
2758 req->tr_state = TR_STATE_DEAD;
2759 break;
2760
2761 default:
2762 ret = ENOTSUP;
2763 break;
2764 }
2765
2766 out_unlock:
2767 workqueue_unlock(wq);
2768 out:
2769 if (wq_tr_rc == WQ_RUN_TR_THREAD_NEEDED) {
2770 if (operation == WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL) {
2771 ret = EAGAIN;
2772 } else if (WQ_TIMER_IMMEDIATE_NEEDED(wq)) {
2773 workqueue_interval_timer_trigger(wq);
2774 }
2775 }
2776 return ret;
2777 }
2778
2779
2780 static int
2781 wqops_thread_return(struct proc *p, struct workqueue *wq)
2782 {
2783 thread_t th = current_thread();
2784 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
2785 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
2786
2787 /* reset signal mask on the workqueue thread to default state */
2788 if (pthread_kern->uthread_get_sigmask(uth) != (sigset_t)(~workq_threadmask)) {
2789 pthread_kern->proc_lock(p);
2790 pthread_kern->uthread_set_sigmask(uth, ~workq_threadmask);
2791 pthread_kern->proc_unlock(p);
2792 }
2793
2794 if (wq == NULL || !tl) {
2795 return EINVAL;
2796 }
2797
2798 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_START, tl->th_workq, 0, 0, 0, 0);
2799
2800 /*
2801 * This squash call has neat semantics: it removes the specified overrides,
2802 * replacing the current requested QoS with the previous effective QoS from
2803 * those overrides. This means we won't be preempted due to having our QoS
2804 * lowered. Of course, now our understanding of the thread's QoS is wrong,
2805 * so we'll adjust below.
2806 */
2807 bool was_manager = (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
2808 int new_qos;
2809
2810 if (!was_manager) {
2811 new_qos = pthread_kern->proc_usynch_thread_qos_squash_override_for_resource(th,
2812 THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD,
2813 THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE);
2814 }
2815
2816 PTHREAD_TRACE_WQ(TRACE_wq_runitem | DBG_FUNC_END, wq, tl->th_priority, 0, 0, 0);
2817
2818 workqueue_lock_spin(wq);
2819
2820 if (tl->th_flags & TH_LIST_KEVENT_BOUND) {
2821 unsigned int flags = KEVENT_FLAG_WORKQ;
2822 if (was_manager) {
2823 flags |= KEVENT_FLAG_WORKQ_MANAGER;
2824 }
2825
2826 tl->th_flags |= TH_LIST_UNBINDING;
2827 workqueue_unlock(wq);
2828 kevent_qos_internal_unbind(p, class_index_get_thread_qos(tl->th_priority), th, flags);
2829 if (!(tl->th_flags & TH_LIST_UNBINDING)) {
2830 _setup_wqthread(p, th, wq, tl, WQ_SETUP_CLEAR_VOUCHER);
2831 pthread_kern->unix_syscall_return(EJUSTRETURN);
2832 __builtin_unreachable();
2833 }
2834 workqueue_lock_spin(wq);
2835 tl->th_flags &= ~(TH_LIST_KEVENT_BOUND | TH_LIST_UNBINDING);
2836 }
2837
2838 if (!was_manager) {
2839 /* Fix up counters from the squash operation. */
2840 uint8_t old_bucket = tl->th_priority;
2841 uint8_t new_bucket = thread_qos_get_class_index(new_qos);
2842
2843 if (old_bucket != new_bucket) {
2844 _wq_thactive_move(wq, old_bucket, new_bucket);
2845 wq->wq_thscheduled_count[old_bucket]--;
2846 wq->wq_thscheduled_count[new_bucket]++;
2847
2848 PTHREAD_TRACE_WQ(TRACE_wq_thread_squash | DBG_FUNC_NONE, wq, tl->th_priority, new_bucket, 0, 0);
2849 tl->th_priority = new_bucket;
2850 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_END, tl->th_workq, new_qos, 0, 0, 0);
2851 }
2852 }
2853
2854 workqueue_run_threadreq_and_unlock(p, wq, tl, NULL, false);
2855 return 0;
2856 }
2857
2858 /**
2859 * Multiplexed call to interact with the workqueue mechanism
2860 */
2861 int
2862 _workq_kernreturn(struct proc *p,
2863 int options,
2864 user_addr_t item,
2865 int arg2,
2866 int arg3,
2867 int32_t *retval)
2868 {
2869 struct workqueue *wq;
2870 int error = 0;
2871
2872 if (pthread_kern->proc_get_register(p) == 0) {
2873 return EINVAL;
2874 }
2875
2876 switch (options) {
2877 case WQOPS_QUEUE_NEWSPISUPP: {
2878 /*
2879 * arg2 = offset of serialno into dispatch queue
2880 * arg3 = kevent support
2881 */
2882 int offset = arg2;
2883 if (arg3 & 0x01){
2884 // If we get here, then userspace has indicated support for kevent delivery.
2885 }
2886
2887 pthread_kern->proc_set_dispatchqueue_serialno_offset(p, (uint64_t)offset);
2888 break;
2889 }
2890 case WQOPS_QUEUE_REQTHREADS: {
2891 /*
2892 * arg2 = number of threads to start
2893 * arg3 = priority
2894 */
2895 error = wqops_queue_reqthreads(p, arg2, arg3);
2896 break;
2897 }
2898 case WQOPS_SET_EVENT_MANAGER_PRIORITY: {
2899 /*
2900 * arg2 = priority for the manager thread
2901 *
2902 * if _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG is set, the
2903 * bits outside _PTHREAD_PRIORITY_FLAGS_MASK contain a scheduling priority
2904 * instead of a QoS value
2905 */
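/*
 * For example (hypothetical values): userspace can pass
 * (47 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)
 * to request raw scheduling priority 47 for the manager, or an ordinary
 * QoS-encoded pthread_priority_t, in which case only the manager's QoS is
 * (possibly) raised below.
 */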
2906 pthread_priority_t pri = arg2;
2907
2908 wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2909 if (wq == NULL) {
2910 error = EINVAL;
2911 break;
2912 }
2913 workqueue_lock_spin(wq);
2914 if (pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){
2915 /*
2916 * If userspace passes a scheduling priority, that takes precedence
2917 * over any QoS. (So, userspace should take care not to accidentally
2918 * lower the priority this way.)
2919 */
2920 uint32_t sched_pri = pri & _PTHREAD_PRIORITY_SCHED_PRI_MASK;
2921 if (wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){
2922 wq->wq_event_manager_priority = MAX(sched_pri, wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_MASK)
2923 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2924 } else {
2925 wq->wq_event_manager_priority = sched_pri
2926 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2927 }
2928 } else if ((wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){
2929 int cur_qos = pthread_priority_get_thread_qos(wq->wq_event_manager_priority);
2930 int new_qos = pthread_priority_get_thread_qos(pri);
2931 wq->wq_event_manager_priority = (uint32_t)thread_qos_get_pthread_priority(MAX(cur_qos, new_qos)) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2932 }
2933 workqueue_unlock(wq);
2934 break;
2935 }
2936 case WQOPS_THREAD_KEVENT_RETURN:
2937 case WQOPS_THREAD_WORKLOOP_RETURN:
2938 wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2939 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_END, wq, options, 0, 0, 0);
2940 if (item != 0 && arg2 != 0) {
2941 int32_t kevent_retval;
2942 int ret;
2943 if (options == WQOPS_THREAD_KEVENT_RETURN) {
2944 ret = kevent_qos_internal(p, -1, item, arg2, item, arg2, NULL, NULL,
2945 KEVENT_FLAG_WORKQ | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS,
2946 &kevent_retval);
2947 } else /* options == WQOPS_THREAD_WORKLOOP_RETURN */ {
2948 kqueue_id_t kevent_id = -1;
2949 ret = kevent_id_internal(p, &kevent_id, item, arg2, item, arg2,
2950 NULL, NULL,
2951 KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS,
2952 &kevent_retval);
2953 }
2954 /*
2955 * We shouldn't be getting more errors out than events we put in, so
2956 * reusing the input buffer should always provide enough space. But,
2957 * the assert is commented out since we get errors in edge cases in the
2958 * process lifecycle.
2959 */
2960 //assert(ret == KERN_SUCCESS && kevent_retval >= 0);
2961 if (ret != KERN_SUCCESS){
2962 error = ret;
2963 break;
2964 } else if (kevent_retval > 0){
2965 assert(kevent_retval <= arg2);
2966 *retval = kevent_retval;
2967 error = 0;
2968 break;
2969 }
2970 }
2971 goto thread_return;
2972
2973 case WQOPS_THREAD_RETURN:
2974 wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2975 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_END, wq, options, 0, 0, 0);
2976 thread_return:
2977 error = wqops_thread_return(p, wq);
2978 // NOT REACHED except in case of error
2979 assert(error);
2980 break;
2981
2982 case WQOPS_SHOULD_NARROW: {
2983 /*
2984 * arg2 = priority to test
2985 * arg3 = unused
2986 */
2987 pthread_priority_t priority = arg2;
2988 thread_t th = current_thread();
2989 struct threadlist *tl = util_get_thread_threadlist_entry(th);
2990
2991 if (tl == NULL || (tl->th_flags & TH_LIST_CONSTRAINED) == 0) {
2992 error = EINVAL;
2993 break;
2994 }
2995
2996 int class = pthread_priority_get_class_index(priority);
2997 wq = tl->th_workq;
2998 workqueue_lock_spin(wq);
2999 bool should_narrow = !may_start_constrained_thread(wq, class, tl, false);
3000 workqueue_unlock(wq);
3001
3002 *retval = should_narrow;
3003 break;
3004 }
3005 default:
3006 error = EINVAL;
3007 break;
3008 }
3009
3010 switch (options) {
3011 case WQOPS_THREAD_KEVENT_RETURN:
3012 case WQOPS_THREAD_WORKLOOP_RETURN:
3013 case WQOPS_THREAD_RETURN:
3014 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START, wq, options, 0, 0, 0);
3015 break;
3016 }
3017 return (error);
3018 }
3019
3020 /*
3021 * We have no work to do, so park ourselves on the idle list.
3022 *
3023 * Consumes the workqueue lock and does not return.
3024 */
3025 static void __dead2
3026 parkit(struct workqueue *wq, struct threadlist *tl, thread_t thread)
3027 {
3028 assert(thread == tl->th_thread);
3029 assert(thread == current_thread());
3030
3031 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_START, wq, 0, 0, 0, 0);
3032
3033 uint32_t us_to_wait = 0;
3034
3035 TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
3036
3037 tl->th_flags &= ~TH_LIST_RUNNING;
3038 tl->th_flags &= ~TH_LIST_KEVENT;
3039 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
3040
3041 if (tl->th_flags & TH_LIST_CONSTRAINED) {
3042 wq->wq_constrained_threads_scheduled--;
3043 tl->th_flags &= ~TH_LIST_CONSTRAINED;
3044 }
3045
3046 _wq_thactive_dec(wq, tl->th_priority);
3047 wq->wq_thscheduled_count[tl->th_priority]--;
3048 wq->wq_threads_scheduled--;
3049 uint32_t thidlecount = ++wq->wq_thidlecount;
3050
3051 pthread_kern->thread_sched_call(thread, NULL);
3052
3053 /*
3054 * We'd like to always have one manager thread parked so that we can have
3055 * low latency when we need to bring a manager thread up. If that idle
3056 * thread list is empty, make this thread a manager thread.
3057 *
3058 * XXX: This doesn't check that there's not a manager thread outstanding,
3059 * so it's based on the assumption that most manager callouts will change
3060 * their QoS before parking. If that stops being true, this may end up
3061 * costing us more than we gain.
3062 */
3063 if (TAILQ_EMPTY(&wq->wq_thidlemgrlist) &&
3064 tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET){
3065 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority | DBG_FUNC_NONE,
3066 wq, thread_tid(thread),
3067 (tl->th_priority << 16) | WORKQUEUE_EVENT_MANAGER_BUCKET, 2, 0);
3068 reset_priority(tl, pthread_priority_from_wq_class_index(wq, WORKQUEUE_EVENT_MANAGER_BUCKET));
3069 tl->th_priority = WORKQUEUE_EVENT_MANAGER_BUCKET;
3070 }
3071
3072 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET){
3073 TAILQ_INSERT_HEAD(&wq->wq_thidlemgrlist, tl, th_entry);
3074 } else {
3075 TAILQ_INSERT_HEAD(&wq->wq_thidlelist, tl, th_entry);
3076 }
3077
3078 /*
3079 * When we remove the voucher from the thread, we may lose our importance
3080 * causing us to get preempted, so we do this after putting the thread on
3081 * the idle list. That way, when we get our importance back we'll be able
3082 * to use this thread from e.g. the kevent call out to deliver a boosting
3083 * message.
3084 */
3085 tl->th_flags |= TH_LIST_REMOVING_VOUCHER;
3086 workqueue_unlock(wq);
3087 if (pthread_kern->thread_will_park_or_terminate) {
3088 pthread_kern->thread_will_park_or_terminate(tl->th_thread);
3089 }
3090 __assert_only kern_return_t kr;
3091 kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
3092 assert(kr == KERN_SUCCESS);
3093 workqueue_lock_spin(wq);
3094 tl->th_flags &= ~(TH_LIST_REMOVING_VOUCHER);
3095
3096 if ((tl->th_flags & TH_LIST_RUNNING) == 0) {
3097 if (thidlecount < 101) {
3098 us_to_wait = wq_reduce_pool_window_usecs - ((thidlecount-2) * (wq_reduce_pool_window_usecs / 100));
3099 } else {
3100 us_to_wait = wq_reduce_pool_window_usecs / 100;
3101 }
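/*
 * The park timeout shrinks linearly as the idle pool grows. For example,
 * if wq_reduce_pool_window_usecs were (hypothetically) 5000000: with 2 idle
 * threads we wait the full 5s, with 52 about 2.5s, and with 101 or more
 * only 1% of the window (50ms).
 */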
3102
3103 thread_set_pending_block_hint(thread, kThreadWaitParkedWorkQueue);
3104 assert_wait_timeout_with_leeway((caddr_t)tl, (THREAD_INTERRUPTIBLE),
3105 TIMEOUT_URGENCY_SYS_BACKGROUND|TIMEOUT_URGENCY_LEEWAY, us_to_wait,
3106 wq_reduce_pool_window_usecs/10, NSEC_PER_USEC);
3107
3108 workqueue_unlock(wq);
3109
3110 thread_block(wq_unpark_continue);
3111 panic("thread_block(wq_unpark_continue) returned!");
3112 } else {
3113 workqueue_unlock(wq);
3114
3115 /*
3116 * While we had the lock dropped to unset our voucher, someone came
3117 * around and made us runnable. But because we weren't waiting on the
3118 * event their wakeup() was ineffectual. To correct for that, we just
3119 * run the continuation ourselves.
3120 */
3121 wq_unpark_continue(NULL, THREAD_AWAKENED);
3122 }
3123 }
3124
3125 static bool
3126 may_start_constrained_thread(struct workqueue *wq, uint32_t at_priclass,
3127 struct threadlist *tl, bool may_start_timer)
3128 {
3129 uint32_t req_qos = _wq_thactive_best_constrained_req_qos(wq);
3130 wq_thactive_t thactive;
3131
3132 if (may_start_timer && at_priclass < req_qos) {
3133 /*
3134 * When called from workqueue_run_threadreq_and_unlock(), pre-post the
3135 * new, higher priority into the thactive state so that
3136 * workqueue_callback() makes the right decision.
3137 *
3138 * If the admission check passes, workqueue_run_threadreq_and_unlock
3139 * will reset this value before running the request.
3140 */
3141 thactive = _wq_thactive_set_best_constrained_req_qos(wq, req_qos,
3142 at_priclass);
3143 #ifdef __LP64__
3144 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update, 1, (uint64_t)thactive,
3145 (uint64_t)(thactive >> 64), 0, 0);
3146 #endif
3147 } else {
3148 thactive = _wq_thactive(wq);
3149 }
3150
3151 uint32_t constrained_threads = wq->wq_constrained_threads_scheduled;
3152 if (tl && (tl->th_flags & TH_LIST_CONSTRAINED)) {
3153 /*
3154 * don't count the current thread as scheduled
3155 */
3156 constrained_threads--;
3157 }
3158 if (constrained_threads >= wq_max_constrained_threads) {
3159 PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 1,
3160 wq->wq_constrained_threads_scheduled,
3161 wq_max_constrained_threads, 0);
3162 /*
3163 * we need 1 or more constrained threads to return to the kernel before
3164 * we can dispatch additional work
3165 */
3166 return false;
3167 }
3168
3169 /*
3170 * Compute a metric for how many threads are active. We find the
3171 * highest priority request outstanding and then add up the number of
3172 * active threads in that and all higher-priority buckets. We'll also add
3173 * any "busy" threads which are not active but blocked recently enough that
3174 * we can't be sure they've gone idle yet. We'll then compare this metric
3175 * to our max concurrency to decide whether to add a new thread.
3176 */
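/*
 * For example (hypothetical numbers): if wq_max_concurrency[at_priclass] is
 * 4, then 3 active threads at this QoS and above plus 1 recently-blocked
 * "busy" thread add up to 4, which is not < 4, so the request is refused
 * (and the delayed timer may be re-armed below to retry once the busy
 * thread is known to be idle or running again).
 */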
3177
3178 uint32_t busycount, thactive_count;
3179
3180 thactive_count = _wq_thactive_aggregate_downto_qos(wq, thactive,
3181 at_priclass, &busycount, NULL);
3182
3183 if (tl && tl->th_priority <= at_priclass) {
3184 /*
3185 * don't count this thread as currently active
3186 */
3187 assert(thactive_count > 0);
3188 thactive_count--;
3189 }
3190
3191 if (thactive_count + busycount < wq_max_concurrency[at_priclass]) {
3192 PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 2,
3193 thactive_count, busycount, 0);
3194 return true;
3195 } else {
3196 PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 3,
3197 thactive_count, busycount, 0);
3198 }
3199
3200 if (busycount && may_start_timer) {
3201 /*
3202 * If this is called from the add timer, we won't have another timer
3203 * fire when the thread exits the "busy" state, so rearm the timer.
3204 */
3205 if (WQ_TIMER_DELAYED_NEEDED(wq)) {
3206 workqueue_interval_timer_start(wq);
3207 }
3208 }
3209
3210 return false;
3211 }
3212
3213 static struct threadlist *
3214 pop_from_thidlelist(struct workqueue *wq, uint32_t priclass)
3215 {
3216 assert(wq->wq_thidlecount);
3217
3218 struct threadlist *tl = NULL;
3219
3220 if (!TAILQ_EMPTY(&wq->wq_thidlemgrlist) &&
3221 (priclass == WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlelist))){
3222 tl = TAILQ_FIRST(&wq->wq_thidlemgrlist);
3223 TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
3224 assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
3225 } else if (!TAILQ_EMPTY(&wq->wq_thidlelist) &&
3226 (priclass != WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlemgrlist))){
3227 tl = TAILQ_FIRST(&wq->wq_thidlelist);
3228 TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
3229 assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
3230 } else {
3231 panic("pop_from_thidlelist called with no threads available");
3232 }
3233 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
3234
3235 assert(wq->wq_thidlecount);
3236 wq->wq_thidlecount--;
3237
3238 TAILQ_INSERT_TAIL(&wq->wq_thrunlist, tl, th_entry);
3239
3240 tl->th_flags |= TH_LIST_RUNNING | TH_LIST_BUSY;
3241
3242 wq->wq_threads_scheduled++;
3243 wq->wq_thscheduled_count[priclass]++;
3244 _wq_thactive_inc(wq, priclass);
3245 return tl;
3246 }
3247
3248 static pthread_priority_t
3249 pthread_priority_from_wq_class_index(struct workqueue *wq, int index)
3250 {
3251 if (index == WORKQUEUE_EVENT_MANAGER_BUCKET){
3252 return wq->wq_event_manager_priority;
3253 } else {
3254 return class_index_get_pthread_priority(index);
3255 }
3256 }
3257
3258 static void
3259 reset_priority(struct threadlist *tl, pthread_priority_t pri)
3260 {
3261 kern_return_t ret;
3262 thread_t th = tl->th_thread;
3263
3264 if ((pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){
3265 ret = pthread_kern->thread_set_workq_qos(th, pthread_priority_get_thread_qos(pri), 0);
3266 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3267
3268 if (tl->th_flags & TH_LIST_EVENT_MGR_SCHED_PRI) {
3269
3270 /* Reset priority to default (masked by QoS) */
3271
3272 ret = pthread_kern->thread_set_workq_pri(th, 31, POLICY_TIMESHARE);
3273 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3274
3275 tl->th_flags &= ~TH_LIST_EVENT_MGR_SCHED_PRI;
3276 }
3277 } else {
3278 ret = pthread_kern->thread_set_workq_qos(th, THREAD_QOS_UNSPECIFIED, 0);
3279 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3280 ret = pthread_kern->thread_set_workq_pri(th, (pri & (~_PTHREAD_PRIORITY_FLAGS_MASK)), POLICY_TIMESHARE);
3281 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3282
3283 tl->th_flags |= TH_LIST_EVENT_MGR_SCHED_PRI;
3284 }
3285 }
3286
3287 /*
3288 * Picks the best request to run, and returns the best overcommit fallback
3289 * if the best pick is non overcommit and risks failing its admission check.
3290 */
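/*
 * The scan below starts with the event manager request, then walks the
 * buckets from index 0 upward: the first overcommit request found ends the
 * scan, either as the pick itself or as *fallback when a higher-priority
 * constrained request was already picked.
 */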
3291 static struct threadreq *
3292 workqueue_best_threadreqs(struct workqueue *wq, struct threadlist *tl,
3293 struct threadreq **fallback)
3294 {
3295 struct threadreq *req, *best_req = NULL;
3296 int priclass, prilimit;
3297
3298 if ((wq->wq_event_manager_threadreq.tr_state == TR_STATE_WAITING) &&
3299 ((wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0) ||
3300 (tl && tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET))) {
3301 /*
3302 * There's an event manager request and either:
3303 * - no event manager currently running
3304 * - we are re-using the event manager
3305 */
3306 req = &wq->wq_event_manager_threadreq;
3307 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select | DBG_FUNC_NONE, wq, req, 1, 0, 0);
3308 return req;
3309 }
3310
3311 if (tl) {
3312 prilimit = WORKQUEUE_EVENT_MANAGER_BUCKET;
3313 } else {
3314 prilimit = _wq_highest_paced_priority(wq);
3315 }
3316 for (priclass = 0; priclass < prilimit; priclass++) {
3317 req = TAILQ_FIRST(&wq->wq_overcommit_reqlist[priclass]);
3318 if (req) {
3319 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select | DBG_FUNC_NONE, wq, req, 2, 0, 0);
3320 if (best_req) {
3321 *fallback = req;
3322 } else {
3323 best_req = req;
3324 }
3325 break;
3326 }
3327 if (!best_req) {
3328 best_req = TAILQ_FIRST(&wq->wq_reqlist[priclass]);
3329 if (best_req) {
3330 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select | DBG_FUNC_NONE, wq, best_req, 3, 0, 0);
3331 }
3332 }
3333 }
3334 return best_req;
3335 }
3336
3337 /**
3338 * Runs a thread request on a thread
3339 *
3340 * - if thread is THREAD_NULL, will find a thread and run the request there.
3341 * Otherwise, the thread must be the current thread.
3342 *
3343 * - if req is NULL, will find the highest priority request and run that. If
3344 * it is not NULL, it must be a threadreq object in state NEW. If it can not
3345 * be run immediately, it will be enqueued and moved to state WAITING.
3346 *
3347 * Either way, the thread request object serviced will be moved to state
3348 * PENDING and attached to the threadlist.
3349 *
3350 * Should be called with the workqueue lock held. Will drop it.
3351 *
3352 * WARNING: _workq_kevent_reqthreads needs to be able to preflight any
3353 * admission checks in this function. If you are changing this function,
3354 * keep that one up-to-date.
3355 *
3356 * - if parking_tl is non-NULL, then the current thread is parking. This will
3357 * try to reuse that thread for a request. If no match is found, it will be
3358 * parked.
3359 */
3360 static int
3361 workqueue_run_threadreq_and_unlock(proc_t p, struct workqueue *wq,
3362 struct threadlist *parking_tl, struct threadreq *req,
3363 bool may_add_new_thread)
3364 {
3365 struct threadreq *incoming_req = req;
3366
3367 struct threadlist *tl = parking_tl;
3368 int rc = WQ_RUN_TR_THROTTLED;
3369
3370 assert(tl == NULL || tl->th_thread == current_thread());
3371 assert(req == NULL || req->tr_state == TR_STATE_NEW);
3372 assert(!may_add_new_thread || !tl);
3373
3374 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq | DBG_FUNC_START, wq, req,
3375 tl ? thread_tid(tl->th_thread) : 0,
3376 req ? (req->tr_priority << 16 | req->tr_flags) : 0, 0);
3377
3378 /*
3379 * Special cases when provided an event manager request
3380 */
3381 if (req && req->tr_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
3382 // Clients must not rely on identity of event manager requests
3383 assert(req->tr_flags & TR_FLAG_ONSTACK);
3384 // You can't be both overcommit and event manager
3385 assert((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0);
3386
3387 /*
3388 * We can only ever have one event manager request, so coalesce them if
3389 * there's already one outstanding.
3390 */
3391 if (wq->wq_event_manager_threadreq.tr_state == TR_STATE_WAITING) {
3392 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_mgr_merge | DBG_FUNC_NONE, wq, req, 0, 0, 0);
3393
3394 struct threadreq *existing_req = &wq->wq_event_manager_threadreq;
3395 if (req->tr_flags & TR_FLAG_KEVENT) {
3396 existing_req->tr_flags |= TR_FLAG_KEVENT;
3397 }
3398
3399 req = existing_req;
3400 incoming_req = NULL;
3401 }
3402
3403 if (wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] &&
3404 (!tl || tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET)){
3405 /*
3406 * There can only be one event manager running at a time.
3407 */
3408 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 1, 0, 0, 0);
3409 goto done;
3410 }
3411 }
3412
3413 again: // Start again after creating a thread
3414
3415 if (_wq_exiting(wq)) {
3416 rc = WQ_RUN_TR_EXITING;
3417 goto exiting;
3418 }
3419
3420 /*
3421 * Thread request selection and admission control
3422 */
3423 struct threadreq *fallback = NULL;
3424 if (req) {
3425 if ((req->tr_flags & TR_FLAG_NO_PACING) == 0 &&
3426 _wq_should_pace_priority(wq, req->tr_priority)) {
3427 /*
3428 * If a request fails the pacing admission check, then thread
3429 * requests are redriven when the pacing thread is finally scheduled
3430 * and calls _wq_pacing_end() in wq_unpark_continue().
3431 */
3432 goto done;
3433 }
3434 } else if (wq->wq_reqcount == 0) {
3435 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 2, 0, 0, 0);
3436 goto done;
3437 } else if ((req = workqueue_best_threadreqs(wq, tl, &fallback)) == NULL) {
3438 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 3, 0, 0, 0);
3439 goto done;
3440 }
3441
3442 if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0 &&
3443 (req->tr_priority < WORKQUEUE_EVENT_MANAGER_BUCKET)) {
3444 if (!may_start_constrained_thread(wq, req->tr_priority, parking_tl, true)) {
3445 if (!fallback) {
3446 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 4, 0, 0, 0);
3447 goto done;
3448 }
3449 assert(req->tr_state == TR_STATE_WAITING);
3450 req = fallback;
3451 }
3452 }
3453
3454 /*
3455 * Thread selection.
3456 */
3457 if (parking_tl) {
3458 if (tl->th_priority != req->tr_priority) {
3459 _wq_thactive_move(wq, tl->th_priority, req->tr_priority);
3460 wq->wq_thscheduled_count[tl->th_priority]--;
3461 wq->wq_thscheduled_count[req->tr_priority]++;
3462 }
3463 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select | DBG_FUNC_NONE,
3464 wq, 1, thread_tid(tl->th_thread), 0, 0);
3465 } else if (wq->wq_thidlecount) {
3466 tl = pop_from_thidlelist(wq, req->tr_priority);
3467 /*
3468 * This call will update wq_thscheduled_count and wq_thactive_count for
3469 * the provided priority. It will not set the returned thread to that
3470 * priority. This matches the behavior of the parking_tl clause above.
3471 */
3472 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select | DBG_FUNC_NONE,
3473 wq, 2, thread_tid(tl->th_thread), 0, 0);
3474 } else /* no idle threads */ {
3475 if (!may_add_new_thread || wq->wq_nthreads >= wq_max_threads) {
3476 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 5,
3477 may_add_new_thread, wq->wq_nthreads, 0);
3478 if (wq->wq_nthreads < wq_max_threads) {
3479 rc = WQ_RUN_TR_THREAD_NEEDED;
3480 }
3481 goto done;
3482 }
3483
3484 bool added_thread = workqueue_addnewthread(p, wq);
3485 /*
3486 * workqueue_addnewthread will drop and re-take the lock, so we
3487 * need to ensure we still have a cached request.
3488 *
3489 * It also means we have to pick a new request, since our old pick may
3490 * not be valid anymore.
3491 */
3492 req = incoming_req;
3493 if (req && (req->tr_flags & TR_FLAG_ONSTACK)) {
3494 _threadreq_copy_prepare(wq);
3495 }
3496
3497 if (added_thread) {
3498 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select | DBG_FUNC_NONE,
3499 wq, 3, 0, 0, 0);
3500 goto again;
3501 } else if (_wq_exiting(wq)) {
3502 rc = WQ_RUN_TR_EXITING;
3503 goto exiting;
3504 } else {
3505 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 6, 0, 0, 0);
3506 /*
3507 * Something caused thread creation to fail. Kick off the timer in
3508 * the hope that it'll succeed next time.
3509 */
3510 if (WQ_TIMER_DELAYED_NEEDED(wq)) {
3511 workqueue_interval_timer_start(wq);
3512 }
3513 goto done;
3514 }
3515 }
3516
3517 /*
3518 * Setup thread, mark request as complete and run with it.
3519 */
3520 if (req->tr_state == TR_STATE_WAITING) {
3521 _threadreq_dequeue(wq, req);
3522 }
3523 if (tl->th_priority != req->tr_priority) {
3524 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority | DBG_FUNC_NONE,
3525 wq, thread_tid(tl->th_thread),
3526 (tl->th_priority << 16) | req->tr_priority, 1, 0);
3527 reset_priority(tl, pthread_priority_from_wq_class_index(wq, req->tr_priority));
3528 tl->th_priority = (uint8_t)req->tr_priority;
3529 }
3530 if (req->tr_flags & TR_FLAG_OVERCOMMIT) {
3531 if ((tl->th_flags & TH_LIST_CONSTRAINED) != 0) {
3532 tl->th_flags &= ~TH_LIST_CONSTRAINED;
3533 wq->wq_constrained_threads_scheduled--;
3534 }
3535 } else {
3536 if ((tl->th_flags & TH_LIST_CONSTRAINED) == 0) {
3537 tl->th_flags |= TH_LIST_CONSTRAINED;
3538 wq->wq_constrained_threads_scheduled++;
3539 }
3540 }
3541
3542 if (!parking_tl && !(req->tr_flags & TR_FLAG_NO_PACING)) {
3543 _wq_pacing_start(wq, tl);
3544 }
3545 if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) {
3546 uint32_t old_qos, new_qos;
3547
3548 /*
3549 * If we are scheduling a constrained thread request, we may need to
3550 * update the best constrained qos in the thactive atomic state.
3551 */
3552 for (new_qos = 0; new_qos < WQ_THACTIVE_NO_PENDING_REQUEST; new_qos++) {
3553 if (TAILQ_FIRST(&wq->wq_reqlist[new_qos]))
3554 break;
3555 }
3556 old_qos = _wq_thactive_best_constrained_req_qos(wq);
3557 if (old_qos != new_qos) {
3558 wq_thactive_t v = _wq_thactive_set_best_constrained_req_qos(wq,
3559 old_qos, new_qos);
3560 #ifdef __LP64__
3561 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update, 2, (uint64_t)v,
3562 (uint64_t)(v >> 64), 0, 0);
3563 #else
3564 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update, 2, v, 0, 0, 0);
3565 #endif
3566 }
3567 }
3568 {
3569 uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI;
3570 if (req->tr_flags & TR_FLAG_OVERCOMMIT)
3571 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
3572 if (req->tr_flags & TR_FLAG_KEVENT)
3573 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
3574 if (req->tr_flags & TR_FLAG_WORKLOOP)
3575 upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
3576 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET)
3577 upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
3578 tl->th_upcall_flags = upcall_flags >> WQ_FLAG_THREAD_PRIOSHIFT;
3579 }
3580 if (req->tr_flags & TR_FLAG_KEVENT) {
3581 tl->th_flags |= TH_LIST_KEVENT;
3582 } else {
3583 tl->th_flags &= ~TH_LIST_KEVENT;
3584 }
3585 return _threadreq_complete_and_unlock(p, wq, req, tl);
3586
3587 done:
3588 if (incoming_req) {
3589 _threadreq_enqueue(wq, incoming_req);
3590 }
3591
3592 exiting:
3593
3594 if (parking_tl && !(parking_tl->th_flags & TH_LIST_UNBINDING)) {
3595 parkit(wq, parking_tl, parking_tl->th_thread);
3596 __builtin_unreachable();
3597 }
3598
3599 workqueue_unlock(wq);
3600
3601 return rc;
3602 }
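/*
 * Minimal caller sketch (hypothetical; `req` stands for a threadreq already
 * prepared in state NEW). The workqueue lock must be held on entry and is
 * always dropped by the call, whatever the return code:
 *
 *	workqueue_lock_spin(wq);
 *	int rc = workqueue_run_threadreq_and_unlock(p, wq, NULL, req, false);
 *	// the lock is no longer held here
 *	if (rc == WQ_RUN_TR_EXITING) {
 *		// the workqueue is being torn down; the request was not enqueued
 *	} else if (rc == WQ_RUN_TR_THREAD_NEEDED) {
 *		// no idle thread and this call was not allowed to create one
 *	} else {
 *		// WQ_RUN_TR_THROTTLED (request parked as WAITING) or the
 *		// request was handed off to a thread
 *	}
 */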
3603
3604 /**
3605 * parked thread wakes up
3606 */
3607 static void __dead2
3608 wq_unpark_continue(void* __unused ptr, wait_result_t wait_result)
3609 {
3610 boolean_t first_use = false;
3611 thread_t th = current_thread();
3612 proc_t p = current_proc();
3613
3614 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
3615 if (uth == NULL) goto done;
3616
3617 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
3618 if (wq == NULL) goto done;
3619
3620 workqueue_lock_spin(wq);
3621
3622 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
3623 assert(tl != WQ_THREADLIST_EXITING_POISON);
3624 if (tl == NULL) {
3625 /*
3626 * We woke up before addnewthread() was finished setting us up. Go
3627 * ahead and exit, but before we do, poison the threadlist variable so
3628 * that addnewthread() doesn't still think we are valid.
3629 */
3630 pthread_kern->uthread_set_threadlist(uth, WQ_THREADLIST_EXITING_POISON);
3631 workqueue_unlock(wq);
3632 goto done;
3633 }
3634
3635 assert(tl->th_flags & TH_LIST_INITED);
3636
3637 if ((tl->th_flags & TH_LIST_NEW)){
3638 tl->th_flags &= ~(TH_LIST_NEW);
3639 first_use = true;
3640 }
3641
3642 if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) {
3643 /*
3644 * The normal wakeup path.
3645 */
3646 goto return_to_user;
3647 }
3648
3649 if ((tl->th_flags & TH_LIST_RUNNING) == 0 &&
3650 wait_result == THREAD_TIMED_OUT &&
3651 tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET &&
3652 TAILQ_FIRST(&wq->wq_thidlemgrlist) == tl &&
3653 TAILQ_NEXT(tl, th_entry) == NULL){
3654 /*
3655 * If we are the only idle manager and we popped for self-destruction,
3656 * then don't actually exit. Instead, free our stack to save some
3657 * memory and re-park.
3658 */
3659
3660 workqueue_unlock(wq);
3661
3662 vm_map_t vmap = wq->wq_map;
3663
3664 // Keep this in sync with _setup_wqthread()
3665 const vm_size_t guardsize = vm_map_page_size(vmap);
3666 const user_addr_t freeaddr = (user_addr_t)tl->th_stackaddr + guardsize;
3667 const vm_map_offset_t freesize = vm_map_trunc_page_mask((PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET) - 1, vm_map_page_mask(vmap)) - guardsize;
3668
3669 int kr;
3670 kr = mach_vm_behavior_set(vmap, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
3671 assert(kr == KERN_SUCCESS || kr == KERN_INVALID_ADDRESS);
3672
3673 workqueue_lock_spin(wq);
3674
3675 if ( !(tl->th_flags & TH_LIST_RUNNING)) {
3676 thread_set_pending_block_hint(th, kThreadWaitParkedWorkQueue);
3677 assert_wait((caddr_t)tl, (THREAD_INTERRUPTIBLE));
3678
3679 workqueue_unlock(wq);
3680
3681 thread_block(wq_unpark_continue);
3682 __builtin_unreachable();
3683 }
3684 }
3685
3686 if ((tl->th_flags & TH_LIST_RUNNING) == 0) {
3687 assert((tl->th_flags & TH_LIST_BUSY) == 0);
3688 if (!first_use) {
3689 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_END, wq, 0, 0, 0, 0);
3690 }
3691 /*
3692 * We were set running, but not for the purposes of actually running.
3693 * This could be because the timer elapsed. Or it could be because the
3694 * thread aborted. Either way, we need to return to userspace to exit.
3695 *
3696 * The call to workqueue_removethread will consume the lock.
3697 */
3698
3699 if (!first_use &&
3700 (tl->th_priority < qos_class_get_class_index(WQ_THREAD_CLEANUP_QOS) ||
3701 (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET))) {
3702 // Reset the QoS to something low for the pthread cleanup
3703 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority | DBG_FUNC_NONE,
3704 wq, thread_tid(th),
3705 (tl->th_priority << 16) | qos_class_get_class_index(WQ_THREAD_CLEANUP_QOS), 3, 0);
3706 pthread_priority_t cleanup_pri = _pthread_priority_make_newest(WQ_THREAD_CLEANUP_QOS, 0, 0);
3707 reset_priority(tl, cleanup_pri);
3708 }
3709
3710 workqueue_removethread(tl, 0, first_use);
3711
3712 if (first_use){
3713 pthread_kern->thread_bootstrap_return();
3714 } else {
3715 pthread_kern->unix_syscall_return(0);
3716 }
3717 __builtin_unreachable();
3718 }
3719
3720 /*
3721 * The timer woke us up or the thread was aborted. However, we have
3722 * already started to make this a runnable thread. Wait for that to
3723 * finish, then continue to userspace.
3724 */
3725 while ((tl->th_flags & TH_LIST_BUSY)) {
3726 assert_wait((caddr_t)tl, (THREAD_UNINT));
3727
3728 workqueue_unlock(wq);
3729
3730 thread_block(THREAD_CONTINUE_NULL);
3731
3732 workqueue_lock_spin(wq);
3733 }
3734
3735 return_to_user:
3736 if (!first_use) {
3737 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_END, wq, 0, 0, 0, 0);
3738 }
3739 if (_wq_pacing_end(wq, tl) && wq->wq_reqcount) {
3740 workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
3741 } else {
3742 workqueue_unlock(wq);
3743 }
3744 _setup_wqthread(p, th, wq, tl, first_use ? WQ_SETUP_FIRST_USE : 0);
3745 pthread_kern->thread_sched_call(th, workqueue_callback);
3746 done:
3747 if (first_use){
3748 pthread_kern->thread_bootstrap_return();
3749 } else {
3750 pthread_kern->unix_syscall_return(EJUSTRETURN);
3751 }
3752 panic("Our attempt to return to userspace failed...");
3753 }
3754
3755 /**
3756 * Configures the initial thread stack/registers to jump into:
3757 * _pthread_wqthread(pthread_t self, mach_port_t kport, void *stackaddr, void *keventlist, int upcall_flags, int nkevents);
3758 * To get there we jump through assembly stubs in pthread_asm.s. Those
3759 * routines set up a stack frame, using the current stack pointer, and marshal
3760 * arguments from registers to the stack as required by the ABI.
3761 *
3762 * One odd thing we do here is to start the pthread_t 4k below what would be the
3763 * top of the stack otherwise. This is because usually only the first 4k of the
3764 * pthread_t will be used and so we want to put it on the same 16k page as the
3765 * top of the stack to save memory.
3766 *
3767 * When we are done the stack will look like:
3768 * |-----------| th_stackaddr + th_allocsize
3769 * |pthread_t | th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET
3770 * |kevent list| optionally - at most WQ_KEVENT_LIST_LEN events
3771 * |kevent data| optionally - at most WQ_KEVENT_DATA_SIZE bytes
3772 * |stack gap | bottom aligned to 16 bytes, and at least as big as stack_gap_min
3773 * | STACK |
3774 * | ⇓ |
3775 * | |
3776 * |guard page | guardsize
3777 * |-----------| th_stackaddr
3778 */
3779 void
3780 _setup_wqthread(proc_t p, thread_t th, struct workqueue *wq,
3781 struct threadlist *tl, int setup_flags)
3782 {
3783 int error;
3784 if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) {
3785 /*
3786 * For preemption reasons, we want to reset the voucher as late as
3787 * possible, so we do it in two places:
3788 * - Just before parking (i.e. in parkit())
3789 * - Prior to doing the setup for the next workitem (i.e. here)
3790 *
3791 * Those two places are sufficient to ensure we always reset it before
3792 * it goes back out to user space, but be careful to not break that
3793 * guarantee.
3794 */
3795 __assert_only kern_return_t kr;
3796 kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
3797 assert(kr == KERN_SUCCESS);
3798 }
3799
3800 uint32_t upcall_flags = tl->th_upcall_flags << WQ_FLAG_THREAD_PRIOSHIFT;
3801 if (!(setup_flags & WQ_SETUP_FIRST_USE)) {
3802 upcall_flags |= WQ_FLAG_THREAD_REUSE;
3803 }
3804
3805 /*
3806 * Put the QoS class value into the lower bits of the reuse_thread register; this is where
3807 * the thread priority used to be stored anyway.
3808 */
3809 pthread_priority_t priority = pthread_priority_from_wq_class_index(wq, tl->th_priority);
3810 upcall_flags |= (_pthread_priority_get_qos_newest(priority) & WQ_FLAG_THREAD_PRIOMASK);
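/*
 * For illustration (assuming the userspace side of this contract): the
 * thread recovers its QoS with (upcall_flags & WQ_FLAG_THREAD_PRIOMASK),
 * while the WQ_FLAG_THREAD_* bits proper all sit at or above
 * WQ_FLAG_THREAD_PRIOSHIFT, which is why th_upcall_flags could be stored
 * shifted down in workqueue_run_threadreq_and_unlock().
 */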
3811
3812 const vm_size_t guardsize = vm_map_page_size(tl->th_workq->wq_map);
3813 const vm_size_t stack_gap_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_REDZONE_LEN;
3814 const vm_size_t stack_align_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_STK_ALIGN;
3815
3816 user_addr_t pthread_self_addr = (user_addr_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET);
3817 user_addr_t stack_top_addr = (user_addr_t)((pthread_self_addr - stack_gap_min) & -stack_align_min);
3818 user_addr_t stack_bottom_addr = (user_addr_t)(tl->th_stackaddr + guardsize);
3819
3820 user_addr_t wqstart_fnptr = pthread_kern->proc_get_wqthread(p);
3821 if (!wqstart_fnptr) {
3822 panic("workqueue thread start function pointer is NULL");
3823 }
3824
3825 if (setup_flags & WQ_SETUP_FIRST_USE) {
3826 uint32_t tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
3827 if (tsd_offset) {
3828 mach_vm_offset_t th_tsd_base = (mach_vm_offset_t)pthread_self_addr + tsd_offset;
3829 kern_return_t kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
3830 if (kret == KERN_SUCCESS) {
3831 upcall_flags |= WQ_FLAG_THREAD_TSD_BASE_SET;
3832 }
3833 }
3834
3835 /*
3836 * Pre-fault the first page of the new thread's stack and the page that will
3837 * contain the pthread_t structure.
3838 */
3839 vm_map_t vmap = pthread_kern->current_map();
3840 if (vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)) !=
3841 vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap))){
3842 vm_fault( vmap,
3843 vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)),
3844 VM_PROT_READ | VM_PROT_WRITE,
3845 FALSE,
3846 THREAD_UNINT, NULL, 0);
3847 }
3848 vm_fault( vmap,
3849 vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap)),
3850 VM_PROT_READ | VM_PROT_WRITE,
3851 FALSE,
3852 THREAD_UNINT, NULL, 0);
3853 }
3854
3855 user_addr_t kevent_list = NULL;
3856 int kevent_count = 0;
3857 if (upcall_flags & WQ_FLAG_THREAD_KEVENT){
3858 bool workloop = upcall_flags & WQ_FLAG_THREAD_WORKLOOP;
3859
3860 kevent_list = pthread_self_addr - WQ_KEVENT_LIST_LEN * sizeof(struct kevent_qos_s);
3861 kevent_count = WQ_KEVENT_LIST_LEN;
3862
3863 user_addr_t kevent_id_addr = kevent_list;
3864 if (workloop) {
3865 /*
3866 * The kevent ID goes just below the kevent list. Sufficiently new
3867 * userspace will know to look there. Old userspace will just
3868 * ignore it.
3869 */
3870 kevent_id_addr -= sizeof(kqueue_id_t);
3871 }
3872
3873 user_addr_t kevent_data_buf = kevent_id_addr - WQ_KEVENT_DATA_SIZE;
3874 user_size_t kevent_data_available = WQ_KEVENT_DATA_SIZE;
3875
3876 int32_t events_out = 0;
3877
3878 assert(tl->th_flags & TH_LIST_KEVENT_BOUND);
3879 unsigned int flags = KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE;
3880 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
3881 flags |= KEVENT_FLAG_WORKQ_MANAGER;
3882 }
3883 int ret = 0;
3884 if (workloop) {
3885 flags |= KEVENT_FLAG_WORKLOOP;
3886 kqueue_id_t kevent_id = -1;
3887 ret = kevent_id_internal(p, &kevent_id,
3888 NULL, 0, kevent_list, kevent_count,
3889 kevent_data_buf, &kevent_data_available,
3890 flags, &events_out);
3891 copyout(&kevent_id, kevent_id_addr, sizeof(kevent_id));
3892 } else {
3893 flags |= KEVENT_FLAG_WORKQ;
3894 ret = kevent_qos_internal(p,
3895 class_index_get_thread_qos(tl->th_priority),
3896 NULL, 0, kevent_list, kevent_count,
3897 kevent_data_buf, &kevent_data_available,
3898 flags, &events_out);
3899 }
3900
3901 // squash any errors into just empty output
3902 if (ret != KERN_SUCCESS || events_out == -1){
3903 events_out = 0;
3904 kevent_data_available = WQ_KEVENT_DATA_SIZE;
3905 }
3906
3907 // We shouldn't get data out if there aren't events available
3908 assert(events_out != 0 || kevent_data_available == WQ_KEVENT_DATA_SIZE);
3909
3910 if (events_out > 0){
3911 if (kevent_data_available == WQ_KEVENT_DATA_SIZE){
3912 stack_top_addr = (kevent_id_addr - stack_gap_min) & -stack_align_min;
3913 } else {
3914 stack_top_addr = (kevent_data_buf + kevent_data_available - stack_gap_min) & -stack_align_min;
3915 }
3916
3917 kevent_count = events_out;
3918 } else {
3919 kevent_list = NULL;
3920 kevent_count = 0;
3921 }
3922 }
3923
3924 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START, wq, 0, 0, 0, 0);
3925
3926 #if defined(__i386__) || defined(__x86_64__)
3927 if (proc_is64bit(p) == 0) {
3928 x86_thread_state32_t state = {
3929 .eip = (unsigned int)wqstart_fnptr,
3930 .eax = /* arg0 */ (unsigned int)pthread_self_addr,
3931 .ebx = /* arg1 */ (unsigned int)tl->th_thport,
3932 .ecx = /* arg2 */ (unsigned int)stack_bottom_addr,
3933 .edx = /* arg3 */ (unsigned int)kevent_list,
3934 .edi = /* arg4 */ (unsigned int)upcall_flags,
3935 .esi = /* arg5 */ (unsigned int)kevent_count,
3936
3937 .esp = (int)((vm_offset_t)stack_top_addr),
3938 };
3939
3940 error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
3941 if (error != KERN_SUCCESS) {
3942 panic("%s: thread_set_wq_state failed: %d", __func__, error);
3943 }
3944 } else {
3945 x86_thread_state64_t state64 = {
3946 // x86-64 already passes all the arguments in registers, so we just put them in their final place here
3947 .rip = (uint64_t)wqstart_fnptr,
3948 .rdi = (uint64_t)pthread_self_addr,
3949 .rsi = (uint64_t)tl->th_thport,
3950 .rdx = (uint64_t)stack_bottom_addr,
3951 .rcx = (uint64_t)kevent_list,
3952 .r8 = (uint64_t)upcall_flags,
3953 .r9 = (uint64_t)kevent_count,
3954
3955 .rsp = (uint64_t)(stack_top_addr)
3956 };
3957
3958 error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
3959 if (error != KERN_SUCCESS) {
3960 panic("%s: thread_set_wq_state failed: %d", __func__, error);
3961 }
3962 }
3963 #else
3964 #error setup_wqthread not defined for this architecture
3965 #endif
3966 }
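/*
 * Worked example of the address arithmetic above (a sketch only; the
 * concrete guardsize, PTH_DEFAULT_STACKSIZE and PTHREAD_T_OFFSET values are
 * platform dependent):
 *
 *	stack_bottom_addr = th_stackaddr + guardsize
 *	pthread_self_addr = th_stackaddr + PTH_DEFAULT_STACKSIZE
 *	                                 + guardsize + PTHREAD_T_OFFSET
 *	kevent_list       = pthread_self_addr
 *	                    - WQ_KEVENT_LIST_LEN * sizeof(struct kevent_qos_s)
 *	kevent_data_buf   = kevent_list [- sizeof(kqueue_id_t)] - WQ_KEVENT_DATA_SIZE
 *	stack_top_addr    = (pthread_self_addr - stack_gap_min) & -stack_align_min,
 *	                    lowered below the kevent area when events are returned
 *
 * i.e. the pthread_t sits just above the initial stack pointer, and the
 * kevent list/data are carved out of the same region immediately below it.
 */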
3967
3968 #if DEBUG
3969 static int wq_kevent_test SYSCTL_HANDLER_ARGS {
3970 //(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
3971 #pragma unused(oidp, arg1, arg2)
3972 int error;
3973 struct workq_reqthreads_req_s requests[64] = {};
3974
3975 if (req->newlen > sizeof(requests) || req->newlen < sizeof(struct workq_reqthreads_req_s))
3976 return EINVAL;
3977
3978 error = copyin(req->newptr, requests, req->newlen);
3979 if (error) return error;
3980
3981 _workq_reqthreads(req->p, (int)(req->newlen / sizeof(struct workq_reqthreads_req_s)), requests);
3982
3983 return 0;
3984 }
3985 #endif // DEBUG
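/*
 * Hedged usage sketch for the DEBUG-only handler above (the
 * "debug.wq_kevent_test" name is assumed from the sysctl__debug_wq_kevent_test
 * OID registered in _pthread_init() below):
 *
 *	struct workq_reqthreads_req_s reqs[2] = { };  // fill in the desired requests
 *	(void)sysctlbyname("debug.wq_kevent_test", NULL, NULL, reqs, sizeof(reqs));
 *	// newlen must be between one and 64 requests' worth of bytes
 */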
3986
3987 #pragma mark - Misc
3988
3989 int
3990 _fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
3991 {
3992 struct workqueue * wq;
3993 int error = 0;
3994 int activecount;
3995
3996 if ((wq = pthread_kern->proc_get_wqptr(p)) == NULL) {
3997 return EINVAL;
3998 }
3999
4000 /*
4001 * This is sometimes called from interrupt context by the kperf sampler.
4002 * In that case, it's not safe to spin trying to take the lock since we
4003 * might already hold it. So, we just try-lock it and error out if it's
4004 * already held. Since this is just a debugging aid, and all our callers
4005 * are able to handle an error, that's fine.
4006 */
4007 bool locked = workqueue_lock_try(wq);
4008 if (!locked) {
4009 return EBUSY;
4010 }
4011
4012 activecount = _wq_thactive_aggregate_downto_qos(wq, _wq_thactive(wq),
4013 WORKQUEUE_NUM_BUCKETS - 1, NULL, NULL);
4014 pwqinfo->pwq_nthreads = wq->wq_nthreads;
4015 pwqinfo->pwq_runthreads = activecount;
4016 pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
4017 pwqinfo->pwq_state = 0;
4018
4019 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
4020 pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
4021 }
4022
4023 if (wq->wq_nthreads >= wq_max_threads) {
4024 pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
4025 }
4026
4027 workqueue_unlock(wq);
4028 return(error);
4029 }
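/*
 * Illustrative caller sketch (hypothetical): because of the try-lock above,
 * callers must treat EBUSY as "no data this time" rather than retrying.
 *
 *	struct proc_workqueueinfo pwqinfo;
 *	bzero(&pwqinfo, sizeof(pwqinfo));
 *	if (_fill_procworkqueue(p, &pwqinfo) == 0) {
 *		// pwq_nthreads / pwq_runthreads / pwq_blockedthreads / pwq_state
 *		// form a consistent snapshot taken under the workqueue lock
 *	}
 */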
4030
4031 uint32_t
4032 _get_pwq_state_kdp(proc_t p)
4033 {
4034 if (p == NULL) {
4035 return 0;
4036 }
4037
4038 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
4039
4040 if (wq == NULL || workqueue_lock_spin_is_acquired_kdp(wq)) {
4041 return 0;
4042 }
4043
4044 uint32_t pwq_state = WQ_FLAGS_AVAILABLE;
4045
4046 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
4047 pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
4048 }
4049
4050 if (wq->wq_nthreads >= wq_max_threads) {
4051 pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
4052 }
4053
4054 return pwq_state;
4055 }
4056
4057 int
4058 _thread_selfid(__unused struct proc *p, uint64_t *retval)
4059 {
4060 thread_t thread = current_thread();
4061 *retval = thread_tid(thread);
4062 return KERN_SUCCESS;
4063 }
4064
4065 void
4066 _pthread_init(void)
4067 {
4068 pthread_lck_grp_attr = lck_grp_attr_alloc_init();
4069 pthread_lck_grp = lck_grp_alloc_init("pthread", pthread_lck_grp_attr);
4070
4071 /*
4072 * allocate the lock attribute for pthread synchronizers
4073 */
4074 pthread_lck_attr = lck_attr_alloc_init();
4075
4076 pthread_list_mlock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);
4077
4078 pth_global_hashinit();
4079 psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL);
4080 psynch_zoneinit();
4081
4082 pthread_zone_workqueue = zinit(sizeof(struct workqueue),
4083 1024 * sizeof(struct workqueue), 8192, "pthread.workqueue");
4084 pthread_zone_threadlist = zinit(sizeof(struct threadlist),
4085 1024 * sizeof(struct threadlist), 8192, "pthread.threadlist");
4086 pthread_zone_threadreq = zinit(sizeof(struct threadreq),
4087 1024 * sizeof(struct threadreq), 8192, "pthread.threadreq");
4088
4089 /*
4090 * register sysctls
4091 */
4092 sysctl_register_oid(&sysctl__kern_wq_stalled_window_usecs);
4093 sysctl_register_oid(&sysctl__kern_wq_reduce_pool_window_usecs);
4094 sysctl_register_oid(&sysctl__kern_wq_max_timer_interval_usecs);
4095 sysctl_register_oid(&sysctl__kern_wq_max_threads);
4096 sysctl_register_oid(&sysctl__kern_wq_max_constrained_threads);
4097 sysctl_register_oid(&sysctl__kern_pthread_debug_tracing);
4098
4099 #if DEBUG
4100 sysctl_register_oid(&sysctl__debug_wq_kevent_test);
4101 #endif
4102
4103 for (int i = 0; i < WORKQUEUE_NUM_BUCKETS; i++) {
4104 uint32_t thread_qos = _wq_bucket_to_thread_qos(i);
4105 wq_max_concurrency[i] = pthread_kern->qos_max_parallelism(thread_qos,
4106 QOS_PARALLELISM_COUNT_LOGICAL);
4107 }
4108 wq_max_concurrency[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1;
4109 }