[apple/libpthread.git] / kern / kern_support.c (libpthread-301.20.1)
1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995-2005 Apple Computer, Inc. All Rights Reserved */
29 /*
30 * pthread_synch.c
31 */
32
33 #pragma mark - Front Matter
34
35 #define _PTHREAD_CONDATTR_T
36 #define _PTHREAD_COND_T
37 #define _PTHREAD_MUTEXATTR_T
38 #define _PTHREAD_MUTEX_T
39 #define _PTHREAD_RWLOCKATTR_T
40 #define _PTHREAD_RWLOCK_T
41
42 #undef pthread_mutexattr_t
43 #undef pthread_mutex_t
44 #undef pthread_condattr_t
45 #undef pthread_cond_t
46 #undef pthread_rwlockattr_t
47 #undef pthread_rwlock_t
48
49 #include <sys/cdefs.h>
50 #include <os/log.h>
51
52 // <rdar://problem/26158937> panic() should be marked noreturn
53 extern void panic(const char *string, ...) __printflike(1,2) __dead2;
54
55 #include <sys/param.h>
56 #include <sys/queue.h>
57 #include <sys/resourcevar.h>
58 //#include <sys/proc_internal.h>
59 #include <sys/kauth.h>
60 #include <sys/systm.h>
61 #include <sys/timeb.h>
62 #include <sys/times.h>
63 #include <sys/acct.h>
64 #include <sys/kernel.h>
65 #include <sys/wait.h>
66 #include <sys/signalvar.h>
67 #include <sys/sysctl.h>
68 #include <sys/syslog.h>
69 #include <sys/stat.h>
70 #include <sys/lock.h>
71 #include <sys/kdebug.h>
72 //#include <sys/sysproto.h>
73 #include <sys/vm.h>
74 #include <sys/user.h> /* for coredump */
75 #include <sys/proc_info.h> /* for fill_procworkqueue */
76
77 #include <mach/mach_port.h>
78 #include <mach/mach_types.h>
79 #include <mach/semaphore.h>
80 #include <mach/sync_policy.h>
81 #include <mach/task.h>
82 #include <mach/vm_prot.h>
83 #include <kern/kern_types.h>
84 #include <kern/task.h>
85 #include <kern/clock.h>
86 #include <mach/kern_return.h>
87 #include <kern/thread.h>
88 #include <kern/zalloc.h>
89 #include <kern/sched_prim.h> /* for thread_exception_return */
90 #include <kern/processor.h>
91 #include <kern/assert.h>
92 #include <mach/mach_vm.h>
93 #include <mach/mach_param.h>
94 #include <mach/thread_status.h>
95 #include <mach/thread_policy.h>
96 #include <mach/message.h>
97 #include <mach/port.h>
98 //#include <vm/vm_protos.h>
99 #include <vm/vm_fault.h>
100 #include <vm/vm_map.h>
101 #include <mach/thread_act.h> /* for thread_resume */
102 #include <machine/machine_routines.h>
103 #include <mach/shared_region.h>
104
105 #include <libkern/OSAtomic.h>
106 #include <libkern/libkern.h>
107
108 #include <sys/pthread_shims.h>
109 #include "kern_internal.h"
110
111 // XXX: Dirty import for sys/signalvar.h that's wrapped in BSD_KERNEL_PRIVATE
112 #define sigcantmask (sigmask(SIGKILL) | sigmask(SIGSTOP))
113
114 // XXX: Ditto for thread tags from kern/thread.h
115 #define THREAD_TAG_MAINTHREAD 0x1
116 #define THREAD_TAG_PTHREAD 0x10
117 #define THREAD_TAG_WORKQUEUE 0x20
118
119 lck_grp_attr_t *pthread_lck_grp_attr;
120 lck_grp_t *pthread_lck_grp;
121 lck_attr_t *pthread_lck_attr;
122
123 zone_t pthread_zone_workqueue;
124 zone_t pthread_zone_threadlist;
125 zone_t pthread_zone_threadreq;
126
127 extern void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64);
128 extern void workqueue_thread_yielded(void);
129
130 #define WQ_SETUP_FIRST_USE 1
131 #define WQ_SETUP_CLEAR_VOUCHER 2
132 static void _setup_wqthread(proc_t p, thread_t th, struct workqueue *wq,
133 struct threadlist *tl, int flags);
134
135 static void reset_priority(struct threadlist *tl, pthread_priority_t pri);
136 static pthread_priority_t pthread_priority_from_wq_class_index(struct workqueue *wq, int index);
137
138 static void wq_unpark_continue(void* ptr, wait_result_t wait_result) __dead2;
139
140 static bool workqueue_addnewthread(proc_t p, struct workqueue *wq);
141 static void workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use);
142 static void workqueue_lock_spin(struct workqueue *);
143 static void workqueue_unlock(struct workqueue *);
144
145 #define WQ_RUN_TR_THROTTLED 0
146 #define WQ_RUN_TR_THREAD_NEEDED 1
147 #define WQ_RUN_TR_THREAD_STARTED 2
148 #define WQ_RUN_TR_EXITING 3
149 static int workqueue_run_threadreq_and_unlock(proc_t p, struct workqueue *wq,
150 struct threadlist *tl, struct threadreq *req, bool may_add_new_thread);
151
152 static bool may_start_constrained_thread(struct workqueue *wq,
153 uint32_t at_priclass, struct threadlist *tl, bool may_start_timer);
154
155 static mach_vm_offset_t stack_addr_hint(proc_t p, vm_map_t vmap);
156 static boolean_t wq_thread_is_busy(uint64_t cur_ts,
157 _Atomic uint64_t *lastblocked_tsp);
158
159 int proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc);
160 int proc_setalltargetconc(pid_t pid, int32_t * targetconcp);
161
162 #define WQ_MAXPRI_MIN 0 /* low prio queue num */
163 #define WQ_MAXPRI_MAX 2 /* max prio queuenum */
164 #define WQ_PRI_NUM 3 /* number of prio work queues */
165
166 #define C_32_STK_ALIGN 16
167 #define C_64_STK_ALIGN 16
168 #define C_64_REDZONE_LEN 128
169
170 #define PTHREAD_T_OFFSET 0
171
172 /*
173 * Flags field passed to bsdthread_create and back in pthread_start
174 31 <---------------------------------> 0
175 _________________________________________
176 | flags(8) | policy(8) | importance(16) |
177 -----------------------------------------
178 */
179
180 #define PTHREAD_START_CUSTOM 0x01000000
181 #define PTHREAD_START_SETSCHED 0x02000000
182 #define PTHREAD_START_DETACHED 0x04000000
183 #define PTHREAD_START_QOSCLASS 0x08000000
184 #define PTHREAD_START_TSD_BASE_SET 0x10000000
185 #define PTHREAD_START_QOSCLASS_MASK 0x00ffffff
186 #define PTHREAD_START_POLICY_BITSHIFT 16
187 #define PTHREAD_START_POLICY_MASK 0xff
188 #define PTHREAD_START_IMPORTANCE_MASK 0xffff
189
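/*
 * Editor's note: illustrative sketch, not part of the original source. It
 * shows how the packed flags word described above is decoded; the helper name
 * is hypothetical, but the same extraction is performed by _bsdthread_create()
 * further down.
 */
static inline void
_pthread_start_flags_decode_sketch(uint32_t flags, unsigned int *policy,
		unsigned int *importance)
{
	*policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;
	*importance = flags & PTHREAD_START_IMPORTANCE_MASK;
}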
190 #define SCHED_OTHER POLICY_TIMESHARE
191 #define SCHED_FIFO POLICY_FIFO
192 #define SCHED_RR POLICY_RR
193
194 #define BASEPRI_DEFAULT 31
195
196 #pragma mark sysctls
197
198 static uint32_t wq_stalled_window_usecs = WQ_STALLED_WINDOW_USECS;
199 static uint32_t wq_reduce_pool_window_usecs = WQ_REDUCE_POOL_WINDOW_USECS;
200 static uint32_t wq_max_timer_interval_usecs = WQ_MAX_TIMER_INTERVAL_USECS;
201 static uint32_t wq_max_threads = WORKQUEUE_MAXTHREADS;
202 static uint32_t wq_max_constrained_threads = WORKQUEUE_MAXTHREADS / 8;
203 static uint32_t wq_max_concurrency[WORKQUEUE_NUM_BUCKETS + 1]; // set to ncpus on load
204
205 SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
206 &wq_stalled_window_usecs, 0, "");
207
208 SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
209 &wq_reduce_pool_window_usecs, 0, "");
210
211 SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
212 &wq_max_timer_interval_usecs, 0, "");
213
214 SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
215 &wq_max_threads, 0, "");
216
217 SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
218 &wq_max_constrained_threads, 0, "");
219
220 #ifdef DEBUG
221 static int wq_kevent_test SYSCTL_HANDLER_ARGS;
222 SYSCTL_PROC(_debug, OID_AUTO, wq_kevent_test, CTLFLAG_MASKED | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLTYPE_OPAQUE, NULL, 0, wq_kevent_test, 0, "-");
223 #endif
224
225 static uint32_t wq_init_constrained_limit = 1;
226
227 uint32_t pthread_debug_tracing = 1;
228
229 SYSCTL_INT(_kern, OID_AUTO, pthread_debug_tracing, CTLFLAG_RW | CTLFLAG_LOCKED,
230 &pthread_debug_tracing, 0, "");
231
232 /*
233 * +-----+-----+-----+-----+-----+-----+-----+-----+
234 * |     | MT  | BG  | UT  | DE  | IN  | UN  | mgr |
235 * +-----+-----+-----+-----+-----+-----+-----+-----+
236 * | pri |  5  |  4  |  3  |  2  |  1  |  0  |  6  |
237 * | qos |  1  |  2  |  3  |  4  |  5  |  6  |  7  |
238 * +-----+-----+-----+-----+-----+-----+-----+-----+
239 */
240 static inline uint32_t
241 _wq_bucket_to_thread_qos(int pri)
242 {
243 if (pri == WORKQUEUE_EVENT_MANAGER_BUCKET) {
244 return WORKQUEUE_EVENT_MANAGER_BUCKET + 1;
245 }
246 return WORKQUEUE_EVENT_MANAGER_BUCKET - pri;
247 }
248
249 #pragma mark wq_thactive
250
251 #if defined(__LP64__)
252 // Layout is:
253 // 7 * 16 bits for each QoS bucket request count (including manager)
254 // 3 bits of best QoS among all pending constrained requests
255 // 13 bits of zeroes
256 #define WQ_THACTIVE_BUCKET_WIDTH 16
257 #define WQ_THACTIVE_QOS_SHIFT (7 * WQ_THACTIVE_BUCKET_WIDTH)
258 #else
259 // Layout is:
260 // 6 * 10 bits for each QoS bucket request count (except manager)
261 // 1 bit for the manager bucket
262 // 3 bits of best QoS among all pending constrained requests
263 #define WQ_THACTIVE_BUCKET_WIDTH 10
264 #define WQ_THACTIVE_QOS_SHIFT (6 * WQ_THACTIVE_BUCKET_WIDTH + 1)
265 #endif
266 #define WQ_THACTIVE_BUCKET_MASK ((1U << WQ_THACTIVE_BUCKET_WIDTH) - 1)
267 #define WQ_THACTIVE_BUCKET_HALF (1U << (WQ_THACTIVE_BUCKET_WIDTH - 1))
268 #define WQ_THACTIVE_NO_PENDING_REQUEST 6
269
270 _Static_assert(sizeof(wq_thactive_t) * CHAR_BIT - WQ_THACTIVE_QOS_SHIFT >= 3,
271 "Make sure we have space to encode a QoS");
272
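/*
 * Editor's note: illustrative example, not part of the original source. On
 * LP64 (WQ_THACTIVE_BUCKET_WIDTH == 16, 128-bit wq_thactive_t) the active
 * count for bucket i lives in bits [16*i, 16*i + 15] and the best constrained
 * request QoS in the 3 bits starting at WQ_THACTIVE_QOS_SHIFT (7 * 16 == 112).
 * Pulling one bucket's count out of a snapshot therefore looks like:
 *
 *   uint32_t active = (uint32_t)((v >> (i * WQ_THACTIVE_BUCKET_WIDTH)) &
 *           WQ_THACTIVE_BUCKET_MASK);
 *
 * which is what _wq_thactive_aggregate_downto_qos() below does in a loop.
 */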
273 static inline wq_thactive_t
274 _wq_thactive_fetch_and_add(struct workqueue *wq, wq_thactive_t offset)
275 {
276 #if PTHREAD_INLINE_RMW_ATOMICS || !defined(__LP64__)
277 return atomic_fetch_add_explicit(&wq->wq_thactive, offset,
278 memory_order_relaxed);
279 #else
280 return pthread_kern->atomic_fetch_add_128_relaxed(&wq->wq_thactive, offset);
281 #endif
282 }
283
284 static inline wq_thactive_t
285 _wq_thactive(struct workqueue *wq)
286 {
287 #if PTHREAD_INLINE_RMW_ATOMICS || !defined(__LP64__)
288 return atomic_load_explicit(&wq->wq_thactive, memory_order_relaxed);
289 #else
290 return pthread_kern->atomic_load_128_relaxed(&wq->wq_thactive);
291 #endif
292 }
293
294 #define WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(tha) \
295 ((tha) >> WQ_THACTIVE_QOS_SHIFT)
296
297 static inline uint32_t
298 _wq_thactive_best_constrained_req_qos(struct workqueue *wq)
299 {
300 // Avoid expensive atomic operations: the three bits we're loading are in
301 // a single byte, and always updated under the workqueue lock
302 wq_thactive_t v = *(wq_thactive_t *)&wq->wq_thactive;
303 return WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(v);
304 }
305
306 static inline wq_thactive_t
307 _wq_thactive_set_best_constrained_req_qos(struct workqueue *wq,
308 uint32_t orig_qos, uint32_t new_qos)
309 {
310 wq_thactive_t v;
311 v = (wq_thactive_t)(new_qos - orig_qos) << WQ_THACTIVE_QOS_SHIFT;
312 /*
313 * We can do an atomic add relative to the initial load because updates
314 * to this qos are always serialized under the workqueue lock.
315 */
316 return _wq_thactive_fetch_and_add(wq, v) + v;
317 }
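/*
 * Editor's note: illustrative example, not part of the original source.
 * Raising the best constrained request QoS from 2 to 5 adds
 * (5 - 2) << WQ_THACTIVE_QOS_SHIFT to wq_thactive; only the top QoS field
 * changes, and because that field is written solely under the workqueue lock,
 * the relative add cannot race with another writer.
 */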
318
319 static inline wq_thactive_t
320 _wq_thactive_offset_for_qos(int qos)
321 {
322 return (wq_thactive_t)1 << (qos * WQ_THACTIVE_BUCKET_WIDTH);
323 }
324
325 static inline wq_thactive_t
326 _wq_thactive_inc(struct workqueue *wq, int qos)
327 {
328 return _wq_thactive_fetch_and_add(wq, _wq_thactive_offset_for_qos(qos));
329 }
330
331 static inline wq_thactive_t
332 _wq_thactive_dec(struct workqueue *wq, int qos)
333 {
334 return _wq_thactive_fetch_and_add(wq, -_wq_thactive_offset_for_qos(qos));
335 }
336
337 static inline wq_thactive_t
338 _wq_thactive_move(struct workqueue *wq, int oldqos, int newqos)
339 {
340 return _wq_thactive_fetch_and_add(wq, _wq_thactive_offset_for_qos(newqos) -
341 _wq_thactive_offset_for_qos(oldqos));
342 }
343
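/*
 * Editor's note: descriptive comment, not part of the original source.
 * Sums the active-thread counts of buckets 0..qos (the given bucket and every
 * higher-QoS bucket) out of the wq_thactive_t snapshot v, and optionally
 * reports how many of those buckets look busy, i.e. have more scheduled than
 * active threads and blocked recently per wq_thread_is_busy().
 */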
344 static inline uint32_t
345 _wq_thactive_aggregate_downto_qos(struct workqueue *wq, wq_thactive_t v,
346 int qos, uint32_t *busycount, uint32_t *max_busycount)
347 {
348 uint32_t count = 0, active;
349 uint64_t curtime;
350
351 #ifndef __LP64__
352 /*
353 * On 32-bit, the manager bucket is a single bit and the best constrained
354 * request QoS's 3 bits sit where the 10 bits of a regular QoS bucket count
355 * would be. Mask them out.
356 */
357 v &= ~(~0ull << WQ_THACTIVE_QOS_SHIFT);
358 #endif
359 if (busycount) {
360 curtime = mach_absolute_time();
361 *busycount = 0;
362 }
363 if (max_busycount) {
364 *max_busycount = qos + 1;
365 }
366 for (int i = 0; i <= qos; i++, v >>= WQ_THACTIVE_BUCKET_WIDTH) {
367 active = v & WQ_THACTIVE_BUCKET_MASK;
368 count += active;
369 if (busycount && wq->wq_thscheduled_count[i] > active) {
370 if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i])) {
371 /*
372 * We only consider the last blocked thread for a given bucket
373 * as busy because we don't want to take the list lock in each
374 * sched callback. However this is an approximation that could
375 * contribute to thread creation storms.
376 */
377 (*busycount)++;
378 }
379 }
380 }
381 return count;
382 }
383
384 #pragma mark - Process/Thread Setup/Teardown syscalls
385
386 static mach_vm_offset_t
387 stack_addr_hint(proc_t p, vm_map_t vmap)
388 {
389 mach_vm_offset_t stackaddr;
390 mach_vm_offset_t aslr_offset;
391 bool proc64bit = proc_is64bit(p);
392
393 // We can't safely take random values % something unless it's a power of two
394 _Static_assert(powerof2(PTH_DEFAULT_STACKSIZE), "PTH_DEFAULT_STACKSIZE is a power-of-two");
395
396 #if defined(__i386__) || defined(__x86_64__)
397 if (proc64bit) {
398 // Matches vm_map_get_max_aslr_slide_pages's image shift in xnu
399 aslr_offset = random() % (1 << 28); // about 512 stacks
400 } else {
401 // Actually bigger than the image shift, we've got ~256MB to work with
402 aslr_offset = random() % (16 * PTH_DEFAULT_STACKSIZE);
403 }
404 aslr_offset = vm_map_trunc_page_mask(aslr_offset, vm_map_page_mask(vmap));
405 if (proc64bit) {
406 // Above nanomalloc range (see NANOZONE_SIGNATURE)
407 stackaddr = 0x700000000000 + aslr_offset;
408 } else {
409 stackaddr = SHARED_REGION_BASE_I386 + SHARED_REGION_SIZE_I386 + aslr_offset;
410 }
411 #elif defined(__arm__) || defined(__arm64__)
412 user_addr_t main_thread_stack_top = 0;
413 if (pthread_kern->proc_get_user_stack) {
414 main_thread_stack_top = pthread_kern->proc_get_user_stack(p);
415 }
416 if (proc64bit && main_thread_stack_top) {
417 // The main thread stack position is randomly slid by xnu (c.f.
418 // load_main() in mach_loader.c), so basing pthread stack allocations
419 // where the main thread stack ends is already ASLRd and doing so
420 // avoids creating a gap in the process address space that may cause
421 // extra PTE memory usage. rdar://problem/33328206
422 stackaddr = vm_map_trunc_page_mask((vm_map_offset_t)main_thread_stack_top,
423 vm_map_page_mask(vmap));
424 } else {
425 // vm_map_get_max_aslr_slide_pages ensures 1MB of slide, we do better
426 aslr_offset = random() % ((proc64bit ? 4 : 2) * PTH_DEFAULT_STACKSIZE);
427 aslr_offset = vm_map_trunc_page_mask((vm_map_offset_t)aslr_offset,
428 vm_map_page_mask(vmap));
429 if (proc64bit) {
430 // 64 stacks below shared region
431 stackaddr = SHARED_REGION_BASE_ARM64 - 64 * PTH_DEFAULT_STACKSIZE - aslr_offset;
432 } else {
433 // If you try to slide down from this point, you risk ending up in memory consumed by malloc
434 stackaddr = SHARED_REGION_BASE_ARM - 32 * PTH_DEFAULT_STACKSIZE + aslr_offset;
435 }
436 }
437 #else
438 #error Need to define a stack address hint for this architecture
439 #endif
440 return stackaddr;
441 }
442
443 /**
444 * bsdthread_create system call. Used by pthread_create.
445 */
446 int
447 _bsdthread_create(struct proc *p, user_addr_t user_func, user_addr_t user_funcarg, user_addr_t user_stack, user_addr_t user_pthread, uint32_t flags, user_addr_t *retval)
448 {
449 kern_return_t kret;
450 void * sright;
451 int error = 0;
452 int allocated = 0;
453 mach_vm_offset_t stackaddr;
454 mach_vm_size_t th_allocsize = 0;
455 mach_vm_size_t th_guardsize;
456 mach_vm_offset_t th_stack;
457 mach_vm_offset_t th_pthread;
458 mach_vm_offset_t th_tsd_base;
459 mach_port_name_t th_thport;
460 thread_t th;
461 vm_map_t vmap = pthread_kern->current_map();
462 task_t ctask = current_task();
463 unsigned int policy, importance;
464 uint32_t tsd_offset;
465
466 int isLP64 = 0;
467
468 if (pthread_kern->proc_get_register(p) == 0) {
469 return EINVAL;
470 }
471
472 PTHREAD_TRACE(TRACE_pthread_thread_create | DBG_FUNC_START, flags, 0, 0, 0, 0);
473
474 isLP64 = proc_is64bit(p);
475 th_guardsize = vm_map_page_size(vmap);
476
477 stackaddr = pthread_kern->proc_get_stack_addr_hint(p);
478 kret = pthread_kern->thread_create(ctask, &th);
479 if (kret != KERN_SUCCESS)
480 return(ENOMEM);
481 thread_reference(th);
482
483 pthread_kern->thread_set_tag(th, THREAD_TAG_PTHREAD);
484
485 sright = (void *)pthread_kern->convert_thread_to_port(th);
486 th_thport = pthread_kern->ipc_port_copyout_send(sright, pthread_kern->task_get_ipcspace(ctask));
487 if (!MACH_PORT_VALID(th_thport)) {
488 error = EMFILE; // userland will convert this into a crash
489 goto out;
490 }
491
492 if ((flags & PTHREAD_START_CUSTOM) == 0) {
493 mach_vm_size_t pthread_size =
494 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(vmap));
495 th_allocsize = th_guardsize + user_stack + pthread_size;
496 user_stack += PTHREAD_T_OFFSET;
497
498 kret = mach_vm_map(vmap, &stackaddr,
499 th_allocsize,
500 page_size-1,
501 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE , NULL,
502 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
503 VM_INHERIT_DEFAULT);
504 if (kret != KERN_SUCCESS){
505 kret = mach_vm_allocate(vmap,
506 &stackaddr, th_allocsize,
507 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE);
508 }
509 if (kret != KERN_SUCCESS) {
510 error = ENOMEM;
511 goto out;
512 }
513
514 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, th_allocsize, stackaddr, 0, 2, 0);
515
516 allocated = 1;
517 /*
518 * The guard page is at the lowest address
519 * The stack base is the highest address
520 */
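/*
 * Editor's note: illustrative layout of the allocation above, not part of the
 * original source (th_pthread and th_stack are assigned a few lines below):
 *
 *   stackaddr                                        stackaddr + th_allocsize
 *   |<- th_guardsize ->|<------ user_stack ------>|<---- pthread_size ---->|
 *     VM_PROT_NONE       (stack grows downward)   ^ th_stack == th_pthread
 */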
521 kret = mach_vm_protect(vmap, stackaddr, th_guardsize, FALSE, VM_PROT_NONE);
522
523 if (kret != KERN_SUCCESS) {
524 error = ENOMEM;
525 goto out1;
526 }
527
528 th_pthread = stackaddr + th_guardsize + user_stack;
529 th_stack = th_pthread;
530
531 /*
532 * Pre-fault the first page of the new thread's stack and the page that will
533 * contain the pthread_t structure.
534 */
535 if (vm_map_trunc_page_mask((vm_map_offset_t)(th_stack - C_64_REDZONE_LEN), vm_map_page_mask(vmap)) !=
536 vm_map_trunc_page_mask((vm_map_offset_t)th_pthread, vm_map_page_mask(vmap))){
537 vm_fault( vmap,
538 vm_map_trunc_page_mask((vm_map_offset_t)(th_stack - C_64_REDZONE_LEN), vm_map_page_mask(vmap)),
539 VM_PROT_READ | VM_PROT_WRITE,
540 FALSE,
541 THREAD_UNINT, NULL, 0);
542 }
543
544 vm_fault( vmap,
545 vm_map_trunc_page_mask((vm_map_offset_t)th_pthread, vm_map_page_mask(vmap)),
546 VM_PROT_READ | VM_PROT_WRITE,
547 FALSE,
548 THREAD_UNINT, NULL, 0);
549
550 } else {
551 th_stack = user_stack;
552 th_pthread = user_pthread;
553
554 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, 0, 0, 0, 3, 0);
555 }
556
557 tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
558 if (tsd_offset) {
559 th_tsd_base = th_pthread + tsd_offset;
560 kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
561 if (kret == KERN_SUCCESS) {
562 flags |= PTHREAD_START_TSD_BASE_SET;
563 }
564 }
565
566 #if defined(__i386__) || defined(__x86_64__)
567 /*
568 * Set up i386 registers & function call.
569 */
570 if (isLP64 == 0) {
571 x86_thread_state32_t state = {
572 .eip = (unsigned int)pthread_kern->proc_get_threadstart(p),
573 .eax = (unsigned int)th_pthread,
574 .ebx = (unsigned int)th_thport,
575 .ecx = (unsigned int)user_func,
576 .edx = (unsigned int)user_funcarg,
577 .edi = (unsigned int)user_stack,
578 .esi = (unsigned int)flags,
579 /*
580 * set stack pointer
581 */
582 .esp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN))
583 };
584
585 error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
586 if (error != KERN_SUCCESS) {
587 error = EINVAL;
588 goto out;
589 }
590 } else {
591 x86_thread_state64_t state64 = {
592 .rip = (uint64_t)pthread_kern->proc_get_threadstart(p),
593 .rdi = (uint64_t)th_pthread,
594 .rsi = (uint64_t)(th_thport),
595 .rdx = (uint64_t)user_func,
596 .rcx = (uint64_t)user_funcarg,
597 .r8 = (uint64_t)user_stack,
598 .r9 = (uint64_t)flags,
599 /*
600 * set stack pointer aligned to 16 byte boundary
601 */
602 .rsp = (uint64_t)(th_stack - C_64_REDZONE_LEN)
603 };
604
605 error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
606 if (error != KERN_SUCCESS) {
607 error = EINVAL;
608 goto out;
609 }
610
611 }
612 #elif defined(__arm__)
613 arm_thread_state_t state = {
614 .pc = (int)pthread_kern->proc_get_threadstart(p),
615 .r[0] = (unsigned int)th_pthread,
616 .r[1] = (unsigned int)th_thport,
617 .r[2] = (unsigned int)user_func,
618 .r[3] = (unsigned int)user_funcarg,
619 .r[4] = (unsigned int)user_stack,
620 .r[5] = (unsigned int)flags,
621
622 /* Set r7 & lr to 0 for better back tracing */
623 .r[7] = 0,
624 .lr = 0,
625
626 /*
627 * set stack pointer
628 */
629 .sp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN))
630 };
631
632 (void) pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
633
634 #else
635 #error bsdthread_create not defined for this architecture
636 #endif
637
638 if ((flags & PTHREAD_START_SETSCHED) != 0) {
639 /* Set scheduling parameters if needed */
640 thread_extended_policy_data_t extinfo;
641 thread_precedence_policy_data_t precedinfo;
642
643 importance = (flags & PTHREAD_START_IMPORTANCE_MASK);
644 policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;
645
646 if (policy == SCHED_OTHER) {
647 extinfo.timeshare = 1;
648 } else {
649 extinfo.timeshare = 0;
650 }
651
652 thread_policy_set(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT);
653
654 precedinfo.importance = (importance - BASEPRI_DEFAULT);
655 thread_policy_set(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT);
656 } else if ((flags & PTHREAD_START_QOSCLASS) != 0) {
657 /* Set thread QoS class if requested. */
658 pthread_priority_t priority = (pthread_priority_t)(flags & PTHREAD_START_QOSCLASS_MASK);
659
660 thread_qos_policy_data_t qos;
661 qos.qos_tier = pthread_priority_get_thread_qos(priority);
662 qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 :
663 _pthread_priority_get_relpri(priority);
664
665 pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
666 }
667
668 if (pthread_kern->proc_get_mach_thread_self_tsd_offset) {
669 uint64_t mach_thread_self_offset =
670 pthread_kern->proc_get_mach_thread_self_tsd_offset(p);
671 if (mach_thread_self_offset && tsd_offset) {
672 bool proc64bit = proc_is64bit(p);
673 if (proc64bit) {
674 uint64_t th_thport_tsd = (uint64_t)th_thport;
675 error = copyout(&th_thport_tsd, th_pthread + tsd_offset +
676 mach_thread_self_offset, sizeof(th_thport_tsd));
677 } else {
678 uint32_t th_thport_tsd = (uint32_t)th_thport;
679 error = copyout(&th_thport_tsd, th_pthread + tsd_offset +
680 mach_thread_self_offset, sizeof(th_thport_tsd));
681 }
682 if (error) {
683 goto out1;
684 }
685 }
686 }
687
688 kret = pthread_kern->thread_resume(th);
689 if (kret != KERN_SUCCESS) {
690 error = EINVAL;
691 goto out1;
692 }
693 thread_deallocate(th); /* drop the creator reference */
694
695 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_END, error, th_pthread, 0, 0, 0);
696
697 // cast required as mach_vm_offset_t is always 64 bits even on 32-bit platforms
698 *retval = (user_addr_t)th_pthread;
699
700 return(0);
701
702 out1:
703 if (allocated != 0) {
704 (void)mach_vm_deallocate(vmap, stackaddr, th_allocsize);
705 }
706 out:
707 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(ctask), th_thport);
708 if (pthread_kern->thread_will_park_or_terminate) {
709 pthread_kern->thread_will_park_or_terminate(th);
710 }
711 (void)thread_terminate(th);
712 (void)thread_deallocate(th);
713 return(error);
714 }
715
716 /**
717 * bsdthread_terminate system call. Used by pthread_terminate.
718 */
719 int
720 _bsdthread_terminate(__unused struct proc *p,
721 user_addr_t stackaddr,
722 size_t size,
723 uint32_t kthport,
724 uint32_t sem,
725 __unused int32_t *retval)
726 {
727 mach_vm_offset_t freeaddr;
728 mach_vm_size_t freesize;
729 kern_return_t kret;
730 thread_t th = current_thread();
731
732 freeaddr = (mach_vm_offset_t)stackaddr;
733 freesize = size;
734
735 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_START, freeaddr, freesize, kthport, 0xff, 0);
736
737 if ((freesize != (mach_vm_size_t)0) && (freeaddr != (mach_vm_offset_t)0)) {
738 if (pthread_kern->thread_get_tag(th) & THREAD_TAG_MAINTHREAD){
739 vm_map_t user_map = pthread_kern->current_map();
740 freesize = vm_map_trunc_page_mask((vm_map_offset_t)freesize - 1, vm_map_page_mask(user_map));
741 kret = mach_vm_behavior_set(user_map, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
742 assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
743 kret = kret ? kret : mach_vm_protect(user_map, freeaddr, freesize, FALSE, VM_PROT_NONE);
744 assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
745 } else {
746 kret = mach_vm_deallocate(pthread_kern->current_map(), freeaddr, freesize);
747 if (kret != KERN_SUCCESS) {
748 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);
749 return(EINVAL);
750 }
751 }
752 }
753
754 if (pthread_kern->thread_will_park_or_terminate) {
755 pthread_kern->thread_will_park_or_terminate(th);
756 }
757 (void)thread_terminate(th);
758 if (sem != MACH_PORT_NULL) {
759 kret = pthread_kern->semaphore_signal_internal_trap(sem);
760 if (kret != KERN_SUCCESS) {
761 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);
762 return(EINVAL);
763 }
764 }
765
766 if (kthport != MACH_PORT_NULL) {
767 pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(current_task()), kthport);
768 }
769
770 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0, 0, 0, 0);
771
772 pthread_kern->thread_exception_return();
773 panic("bsdthread_terminate: still running\n");
774
775 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0xff, 0, 0, 0);
776
777 return(0);
778 }
779
780 /**
781 * bsdthread_register system call. Performs per-process setup. Responsible for
782 * returning capability bits to userspace and receiving userspace function addresses.
783 */
784 int
785 _bsdthread_register(struct proc *p,
786 user_addr_t threadstart,
787 user_addr_t wqthread,
788 int pthsize,
789 user_addr_t pthread_init_data,
790 user_addr_t pthread_init_data_size,
791 uint64_t dispatchqueue_offset,
792 int32_t *retval)
793 {
794 struct _pthread_registration_data data = {};
795 uint32_t max_tsd_offset;
796 kern_return_t kr;
797 size_t pthread_init_sz = 0;
798
799 /* syscall randomizer test can pass bogus values */
800 if (pthsize < 0 || pthsize > MAX_PTHREAD_SIZE) {
801 return(EINVAL);
802 }
803 /*
804 * If we have pthread_init_data, then we use that and target_concptr
805 * (which is an offset) to get the data.
806 */
807 if (pthread_init_data != 0) {
808 if (pthread_init_data_size < sizeof(data.version)) {
809 return EINVAL;
810 }
811 pthread_init_sz = MIN(sizeof(data), (size_t)pthread_init_data_size);
812 int ret = copyin(pthread_init_data, &data, pthread_init_sz);
813 if (ret) {
814 return ret;
815 }
816 if (data.version != (size_t)pthread_init_data_size) {
817 return EINVAL;
818 }
819 } else {
820 data.dispatch_queue_offset = dispatchqueue_offset;
821 }
822
823 /* We have to do this before proc_get_register so that it resets after fork */
824 mach_vm_offset_t stackaddr = stack_addr_hint(p, pthread_kern->current_map());
825 pthread_kern->proc_set_stack_addr_hint(p, (user_addr_t)stackaddr);
826
827 /* prevent multiple registrations */
828 if (pthread_kern->proc_get_register(p) != 0) {
829 return(EINVAL);
830 }
831
832 pthread_kern->proc_set_threadstart(p, threadstart);
833 pthread_kern->proc_set_wqthread(p, wqthread);
834 pthread_kern->proc_set_pthsize(p, pthsize);
835 pthread_kern->proc_set_register(p);
836
837 uint32_t tsd_slot_sz = proc_is64bit(p) ? sizeof(uint64_t) : sizeof(uint32_t);
838 if ((uint32_t)pthsize >= tsd_slot_sz &&
839 data.tsd_offset <= (uint32_t)(pthsize - tsd_slot_sz)) {
840 max_tsd_offset = ((uint32_t)pthsize - data.tsd_offset - tsd_slot_sz);
841 } else {
842 data.tsd_offset = 0;
843 max_tsd_offset = 0;
844 }
845 pthread_kern->proc_set_pthread_tsd_offset(p, data.tsd_offset);
846
847 if (data.dispatch_queue_offset > max_tsd_offset) {
848 data.dispatch_queue_offset = 0;
849 }
850 pthread_kern->proc_set_dispatchqueue_offset(p, data.dispatch_queue_offset);
851
852 if (pthread_kern->proc_set_return_to_kernel_offset) {
853 if (data.return_to_kernel_offset > max_tsd_offset) {
854 data.return_to_kernel_offset = 0;
855 }
856 pthread_kern->proc_set_return_to_kernel_offset(p,
857 data.return_to_kernel_offset);
858 }
859
860 if (pthread_kern->proc_set_mach_thread_self_tsd_offset) {
861 if (data.mach_thread_self_offset > max_tsd_offset) {
862 data.mach_thread_self_offset = 0;
863 }
864 pthread_kern->proc_set_mach_thread_self_tsd_offset(p,
865 data.mach_thread_self_offset);
866 }
867
868 if (pthread_init_data != 0) {
869 /* Outgoing data that userspace expects as a reply */
870 data.version = sizeof(struct _pthread_registration_data);
871 if (pthread_kern->qos_main_thread_active()) {
872 mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
873 thread_qos_policy_data_t qos;
874 boolean_t gd = FALSE;
875
876 kr = pthread_kern->thread_policy_get(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
877 if (kr != KERN_SUCCESS || qos.qos_tier == THREAD_QOS_UNSPECIFIED) {
878 /* An unspecified QoS means the kernel wants us to impose legacy QoS upon the thread. */
879 qos.qos_tier = THREAD_QOS_LEGACY;
880 qos.tier_importance = 0;
881
882 kr = pthread_kern->thread_policy_set_internal(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
883 }
884
885 if (kr == KERN_SUCCESS) {
886 data.main_qos = thread_qos_get_pthread_priority(qos.qos_tier);
887 } else {
888 data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);
889 }
890 } else {
891 data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);
892 }
893
894 kr = copyout(&data, pthread_init_data, pthread_init_sz);
895 if (kr != KERN_SUCCESS) {
896 return EINVAL;
897 }
898 }
899
900 /* return the supported feature set as the return value. */
901 *retval = PTHREAD_FEATURE_SUPPORTED;
902
903 return(0);
904 }
905
906 #pragma mark - QoS Manipulation
907
908 int
909 _bsdthread_ctl_set_qos(struct proc *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t tsd_priority_addr, user_addr_t arg3, int *retval)
910 {
911 int rv;
912 thread_t th;
913
914 pthread_priority_t priority;
915
916 /* Unused parameters must be zero. */
917 if (arg3 != 0) {
918 return EINVAL;
919 }
920
921 /* QoS is stored in a given slot in the pthread TSD. We need to copy that in and set our QoS based on it. */
922 if (proc_is64bit(p)) {
923 uint64_t v;
924 rv = copyin(tsd_priority_addr, &v, sizeof(v));
925 if (rv) goto out;
926 priority = (int)(v & 0xffffffff);
927 } else {
928 uint32_t v;
929 rv = copyin(tsd_priority_addr, &v, sizeof(v));
930 if (rv) goto out;
931 priority = v;
932 }
933
934 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
935 return ESRCH;
936 }
937
938 /* <rdar://problem/16211829> Disable pthread_set_qos_class_np() on threads other than pthread_self */
939 if (th != current_thread()) {
940 thread_deallocate(th);
941 return EPERM;
942 }
943
944 rv = _bsdthread_ctl_set_self(p, 0, priority, 0, _PTHREAD_SET_SELF_QOS_FLAG, retval);
945
946 /* Static param the thread: we just set QoS on it, so it's stuck in QoS land now. */
947 /* pthread_kern->thread_static_param(th, TRUE); */ // see <rdar://problem/16433744>, for details
948
949 thread_deallocate(th);
950
951 out:
952 return rv;
953 }
954
955 static inline struct threadlist *
956 util_get_thread_threadlist_entry(thread_t th)
957 {
958 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
959 if (uth) {
960 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
961 return tl;
962 }
963 return NULL;
964 }
965
966 boolean_t
967 _workq_thread_has_been_unbound(thread_t th, int qos_class)
968 {
969 struct threadlist *tl = util_get_thread_threadlist_entry(th);
970 if (!tl) {
971 return FALSE;
972 }
973
974 struct workqueue *wq = tl->th_workq;
975 workqueue_lock_spin(wq);
976
977 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
978 goto failure;
979 } else if (qos_class != class_index_get_thread_qos(tl->th_priority)) {
980 goto failure;
981 }
982
983 if ((tl->th_flags & TH_LIST_KEVENT_BOUND)){
984 goto failure;
985 }
986 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
987
988 workqueue_unlock(wq);
989 return TRUE;
990
991 failure:
992 workqueue_unlock(wq);
993 return FALSE;
994 }
995
996 int
997 _bsdthread_ctl_set_self(struct proc *p, user_addr_t __unused cmd, pthread_priority_t priority, mach_port_name_t voucher, _pthread_set_flags_t flags, int __unused *retval)
998 {
999 thread_qos_policy_data_t qos;
1000 mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
1001 boolean_t gd = FALSE;
1002 thread_t th = current_thread();
1003 struct workqueue *wq = NULL;
1004 struct threadlist *tl = NULL;
1005
1006 kern_return_t kr;
1007 int qos_rv = 0, voucher_rv = 0, fixedpri_rv = 0;
1008
1009 if ((flags & _PTHREAD_SET_SELF_WQ_KEVENT_UNBIND) != 0) {
1010 tl = util_get_thread_threadlist_entry(th);
1011 if (tl) {
1012 wq = tl->th_workq;
1013 } else {
1014 goto qos;
1015 }
1016
1017 workqueue_lock_spin(wq);
1018 if (tl->th_flags & TH_LIST_KEVENT_BOUND) {
1019 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
1020 unsigned int kevent_flags = KEVENT_FLAG_WORKQ | KEVENT_FLAG_UNBIND_CHECK_FLAGS;
1021 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1022 kevent_flags |= KEVENT_FLAG_WORKQ_MANAGER;
1023 }
1024
1025 workqueue_unlock(wq);
1026 __assert_only int ret = kevent_qos_internal_unbind(p, class_index_get_thread_qos(tl->th_priority), th, kevent_flags);
1027 assert(ret == 0);
1028 } else {
1029 workqueue_unlock(wq);
1030 }
1031 }
1032
1033 qos:
1034 if ((flags & _PTHREAD_SET_SELF_QOS_FLAG) != 0) {
1035 kr = pthread_kern->thread_policy_get(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
1036 if (kr != KERN_SUCCESS) {
1037 qos_rv = EINVAL;
1038 goto voucher;
1039 }
1040
1041 /*
1042 * If we have main-thread QoS then we don't allow a thread to come out
1043 * of QOS_CLASS_UNSPECIFIED.
1044 */
1045 if (pthread_kern->qos_main_thread_active() && qos.qos_tier ==
1046 THREAD_QOS_UNSPECIFIED) {
1047 qos_rv = EPERM;
1048 goto voucher;
1049 }
1050
1051 if (!tl) {
1052 tl = util_get_thread_threadlist_entry(th);
1053 if (tl) wq = tl->th_workq;
1054 }
1055
1056 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self | DBG_FUNC_START, wq, qos.qos_tier, qos.tier_importance, 0, 0);
1057
1058 qos.qos_tier = pthread_priority_get_thread_qos(priority);
1059 qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 : _pthread_priority_get_relpri(priority);
1060
1061 if (qos.qos_tier == QOS_CLASS_UNSPECIFIED ||
1062 qos.tier_importance > 0 || qos.tier_importance < THREAD_QOS_MIN_TIER_IMPORTANCE) {
1063 qos_rv = EINVAL;
1064 goto voucher;
1065 }
1066
1067 /*
1068 * If we're a workqueue, the threadlist item priority needs adjusting,
1069 * along with the bucket we were running in.
1070 */
1071 if (tl) {
1072 bool try_run_threadreq = false;
1073
1074 workqueue_lock_spin(wq);
1075 kr = pthread_kern->thread_set_workq_qos(th, qos.qos_tier, qos.tier_importance);
1076 assert(kr == KERN_SUCCESS || kr == KERN_TERMINATED);
1077
1078 /* Fix up counters. */
1079 uint8_t old_bucket = tl->th_priority;
1080 uint8_t new_bucket = pthread_priority_get_class_index(priority);
1081
1082 if (old_bucket != new_bucket) {
1083 _wq_thactive_move(wq, old_bucket, new_bucket);
1084 wq->wq_thscheduled_count[old_bucket]--;
1085 wq->wq_thscheduled_count[new_bucket]++;
1086 if (old_bucket == WORKQUEUE_EVENT_MANAGER_BUCKET ||
1087 old_bucket < new_bucket) {
1088 /*
1089 * if the QoS of the thread was lowered, then this could
1090 * allow for a higher QoS thread request to run, so we need
1091 * to reevaluate.
1092 */
1093 try_run_threadreq = true;
1094 }
1095 tl->th_priority = new_bucket;
1096 }
1097
1098 bool old_overcommit = !(tl->th_flags & TH_LIST_CONSTRAINED);
1099 bool new_overcommit = priority & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
1100 if (!old_overcommit && new_overcommit) {
1101 if (wq->wq_constrained_threads_scheduled-- ==
1102 wq_max_constrained_threads) {
1103 try_run_threadreq = true;
1104 }
1105 tl->th_flags &= ~TH_LIST_CONSTRAINED;
1106 } else if (old_overcommit && !new_overcommit) {
1107 wq->wq_constrained_threads_scheduled++;
1108 tl->th_flags |= TH_LIST_CONSTRAINED;
1109 }
1110
1111 if (try_run_threadreq) {
1112 workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
1113 } else {
1114 workqueue_unlock(wq);
1115 }
1116 } else {
1117 kr = pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
1118 if (kr != KERN_SUCCESS) {
1119 qos_rv = EINVAL;
1120 }
1121 }
1122
1123 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self | DBG_FUNC_END, wq, qos.qos_tier, qos.tier_importance, 0, 0);
1124 }
1125
1126 voucher:
1127 if ((flags & _PTHREAD_SET_SELF_VOUCHER_FLAG) != 0) {
1128 kr = pthread_kern->thread_set_voucher_name(voucher);
1129 if (kr != KERN_SUCCESS) {
1130 voucher_rv = ENOENT;
1131 goto fixedpri;
1132 }
1133 }
1134
1135 fixedpri:
1136 if (qos_rv) goto done;
1137 if ((flags & _PTHREAD_SET_SELF_FIXEDPRIORITY_FLAG) != 0) {
1138 thread_extended_policy_data_t extpol = {.timeshare = 0};
1139
1140 if (!tl) tl = util_get_thread_threadlist_entry(th);
1141 if (tl) {
1142 /* Not allowed on workqueue threads */
1143 fixedpri_rv = ENOTSUP;
1144 goto done;
1145 }
1146
1147 kr = pthread_kern->thread_policy_set_internal(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
1148 if (kr != KERN_SUCCESS) {
1149 fixedpri_rv = EINVAL;
1150 goto done;
1151 }
1152 } else if ((flags & _PTHREAD_SET_SELF_TIMESHARE_FLAG) != 0) {
1153 thread_extended_policy_data_t extpol = {.timeshare = 1};
1154
1155 if (!tl) tl = util_get_thread_threadlist_entry(th);
1156 if (tl) {
1157 /* Not allowed on workqueue threads */
1158 fixedpri_rv = ENOTSUP;
1159 goto done;
1160 }
1161
1162 kr = pthread_kern->thread_policy_set_internal(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
1163 if (kr != KERN_SUCCESS) {
1164 fixedpri_rv = EINVAL;
1165 goto done;
1166 }
1167 }
1168
1169 done:
1170 if (qos_rv && voucher_rv) {
1171 /* Both failed, give that a unique error. */
1172 return EBADMSG;
1173 }
1174
1175 if (qos_rv) {
1176 return qos_rv;
1177 }
1178
1179 if (voucher_rv) {
1180 return voucher_rv;
1181 }
1182
1183 if (fixedpri_rv) {
1184 return fixedpri_rv;
1185 }
1186
1187 return 0;
1188 }
1189
1190 int
1191 _bsdthread_ctl_qos_override_start(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval)
1192 {
1193 thread_t th;
1194 int rv = 0;
1195
1196 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
1197 return ESRCH;
1198 }
1199
1200 int override_qos = pthread_priority_get_thread_qos(priority);
1201
1202 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1203 if (tl) {
1204 PTHREAD_TRACE_WQ(TRACE_wq_override_start | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0);
1205 }
1206
1207 /* The only failure case here is if we pass a tid and have it look up the thread; since we pass the uthread, this always succeeds. */
1208 pthread_kern->proc_usynch_thread_qos_add_override_for_resource_check_owner(th, override_qos, TRUE,
1209 resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE, USER_ADDR_NULL, MACH_PORT_NULL);
1210 thread_deallocate(th);
1211 return rv;
1212 }
1213
1214 int
1215 _bsdthread_ctl_qos_override_end(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t resource, user_addr_t arg3, int __unused *retval)
1216 {
1217 thread_t th;
1218 int rv = 0;
1219
1220 if (arg3 != 0) {
1221 return EINVAL;
1222 }
1223
1224 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
1225 return ESRCH;
1226 }
1227
1228 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
1229
1230 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1231 if (tl) {
1232 PTHREAD_TRACE_WQ(TRACE_wq_override_end | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 0, 0, 0);
1233 }
1234
1235 pthread_kern->proc_usynch_thread_qos_remove_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);
1236
1237 thread_deallocate(th);
1238 return rv;
1239 }
1240
1241 static int
1242 _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, user_addr_t ulock_addr)
1243 {
1244 thread_t th;
1245 int rv = 0;
1246
1247 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
1248 return ESRCH;
1249 }
1250
1251 int override_qos = pthread_priority_get_thread_qos(priority);
1252
1253 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1254 if (!tl) {
1255 thread_deallocate(th);
1256 return EPERM;
1257 }
1258
1259 PTHREAD_TRACE_WQ(TRACE_wq_override_dispatch | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0);
1260
1261 rv = pthread_kern->proc_usynch_thread_qos_add_override_for_resource_check_owner(th, override_qos, TRUE,
1262 resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE, ulock_addr, kport);
1263
1264 thread_deallocate(th);
1265 return rv;
1266 }
1267
1268 int _bsdthread_ctl_qos_dispatch_asynchronous_override_add(struct proc __unused *p, user_addr_t __unused cmd,
1269 mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval)
1270 {
1271 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport, priority, resource, USER_ADDR_NULL);
1272 }
1273
1274 int
1275 _bsdthread_ctl_qos_override_dispatch(struct proc *p __unused, user_addr_t cmd __unused, mach_port_name_t kport, pthread_priority_t priority, user_addr_t ulock_addr, int __unused *retval)
1276 {
1277 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport, priority, USER_ADDR_NULL, ulock_addr);
1278 }
1279
1280 int
1281 _bsdthread_ctl_qos_override_reset(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval)
1282 {
1283 if (arg1 != 0 || arg2 != 0 || arg3 != 0) {
1284 return EINVAL;
1285 }
1286
1287 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, 1 /* reset_all */, 0, 0, retval);
1288 }
1289
1290 int
1291 _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(struct proc __unused *p, user_addr_t __unused cmd, int reset_all, user_addr_t resource, user_addr_t arg3, int __unused *retval)
1292 {
1293 if ((reset_all && (resource != 0)) || arg3 != 0) {
1294 return EINVAL;
1295 }
1296
1297 thread_t th = current_thread();
1298 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
1299 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
1300
1301 if (!tl) {
1302 return EPERM;
1303 }
1304
1305 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_NONE, tl->th_workq, 0, 0, 0, 0);
1306
1307 resource = reset_all ? THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD : resource;
1308 pthread_kern->proc_usynch_thread_qos_reset_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE);
1309
1310 return 0;
1311 }
1312
1313 static int
1314 _bsdthread_ctl_max_parallelism(struct proc __unused *p, user_addr_t __unused cmd,
1315 int qos, unsigned long flags, int *retval)
1316 {
1317 _Static_assert(QOS_PARALLELISM_COUNT_LOGICAL ==
1318 _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL, "logical");
1319 _Static_assert(QOS_PARALLELISM_REALTIME ==
1320 _PTHREAD_QOS_PARALLELISM_REALTIME, "realtime");
1321
1322 if (flags & ~(QOS_PARALLELISM_REALTIME | QOS_PARALLELISM_COUNT_LOGICAL)) {
1323 return EINVAL;
1324 }
1325
1326 if (flags & QOS_PARALLELISM_REALTIME) {
1327 if (qos) {
1328 return EINVAL;
1329 }
1330 } else if (qos == THREAD_QOS_UNSPECIFIED || qos >= THREAD_QOS_LAST) {
1331 return EINVAL;
1332 }
1333
1334 *retval = pthread_kern->qos_max_parallelism(qos, flags);
1335 return 0;
1336 }
1337
1338 int
1339 _bsdthread_ctl(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval)
1340 {
1341 switch (cmd) {
1342 case BSDTHREAD_CTL_SET_QOS:
1343 return _bsdthread_ctl_set_qos(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
1344 case BSDTHREAD_CTL_QOS_OVERRIDE_START:
1345 return _bsdthread_ctl_qos_override_start(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1346 case BSDTHREAD_CTL_QOS_OVERRIDE_END:
1347 return _bsdthread_ctl_qos_override_end(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
1348 case BSDTHREAD_CTL_QOS_OVERRIDE_RESET:
1349 return _bsdthread_ctl_qos_override_reset(p, cmd, arg1, arg2, arg3, retval);
1350 case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH:
1351 return _bsdthread_ctl_qos_override_dispatch(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1352 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD:
1353 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1354 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET:
1355 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, (int)arg1, arg2, arg3, retval);
1356 case BSDTHREAD_CTL_SET_SELF:
1357 return _bsdthread_ctl_set_self(p, cmd, (pthread_priority_t)arg1, (mach_port_name_t)arg2, (_pthread_set_flags_t)arg3, retval);
1358 case BSDTHREAD_CTL_QOS_MAX_PARALLELISM:
1359 return _bsdthread_ctl_max_parallelism(p, cmd, (int)arg1, (unsigned long)arg2, retval);
1360 default:
1361 return EINVAL;
1362 }
1363 }
1364
1365 #pragma mark - Workqueue Implementation
1366
1367 #pragma mark wq_flags
1368
1369 static inline uint32_t
1370 _wq_flags(struct workqueue *wq)
1371 {
1372 return atomic_load_explicit(&wq->wq_flags, memory_order_relaxed);
1373 }
1374
1375 static inline bool
1376 _wq_exiting(struct workqueue *wq)
1377 {
1378 return _wq_flags(wq) & WQ_EXITING;
1379 }
1380
1381 static inline uint32_t
1382 _wq_flags_or_orig(struct workqueue *wq, uint32_t v)
1383 {
1384 #if PTHREAD_INLINE_RMW_ATOMICS
1385 uint32_t state;
1386 do {
1387 state = _wq_flags(wq);
1388 } while (!OSCompareAndSwap(state, state | v, &wq->wq_flags));
1389 return state;
1390 #else
1391 return atomic_fetch_or_explicit(&wq->wq_flags, v, memory_order_relaxed);
1392 #endif
1393 }
1394
1395 static inline uint32_t
1396 _wq_flags_and_orig(struct workqueue *wq, uint32_t v)
1397 {
1398 #if PTHREAD_INLINE_RMW_ATOMICS
1399 uint32_t state;
1400 do {
1401 state = _wq_flags(wq);
1402 } while (!OSCompareAndSwap(state, state & v, &wq->wq_flags));
1403 return state;
1404 #else
1405 return atomic_fetch_and_explicit(&wq->wq_flags, v, memory_order_relaxed);
1406 #endif
1407 }
1408
1409 static inline bool
1410 WQ_TIMER_DELAYED_NEEDED(struct workqueue *wq)
1411 {
1412 uint32_t oldflags, newflags;
1413 do {
1414 oldflags = _wq_flags(wq);
1415 if (oldflags & (WQ_EXITING | WQ_ATIMER_DELAYED_RUNNING)) {
1416 return false;
1417 }
1418 newflags = oldflags | WQ_ATIMER_DELAYED_RUNNING;
1419 } while (!OSCompareAndSwap(oldflags, newflags, &wq->wq_flags));
1420 return true;
1421 }
1422
1423 static inline bool
1424 WQ_TIMER_IMMEDIATE_NEEDED(struct workqueue *wq)
1425 {
1426 uint32_t oldflags, newflags;
1427 do {
1428 oldflags = _wq_flags(wq);
1429 if (oldflags & (WQ_EXITING | WQ_ATIMER_IMMEDIATE_RUNNING)) {
1430 return false;
1431 }
1432 newflags = oldflags | WQ_ATIMER_IMMEDIATE_RUNNING;
1433 } while (!OSCompareAndSwap(oldflags, newflags, &wq->wq_flags));
1434 return true;
1435 }
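/*
 * Editor's note: descriptive comment, not part of the original source. Both
 * helpers above atomically set their WQ_ATIMER_*_RUNNING flag and return true
 * only when the caller performed the clear-to-set transition (and the
 * workqueue is not exiting), i.e. only that caller should arm the timer.
 */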
1436
1437 #pragma mark thread requests pacing
1438
1439 static inline uint32_t
1440 _wq_pacing_shift_for_pri(int pri)
1441 {
1442 return _wq_bucket_to_thread_qos(pri) - 1;
1443 }
1444
1445 static inline int
1446 _wq_highest_paced_priority(struct workqueue *wq)
1447 {
1448 uint8_t paced = wq->wq_paced;
1449 int msb = paced ? 32 - __builtin_clz(paced) : 0; // fls(paced) == bit + 1
1450 return WORKQUEUE_EVENT_MANAGER_BUCKET - msb;
1451 }
1452
1453 static inline uint8_t
1454 _wq_pacing_bit_for_pri(int pri)
1455 {
1456 return 1u << _wq_pacing_shift_for_pri(pri);
1457 }
1458
1459 static inline bool
1460 _wq_should_pace_priority(struct workqueue *wq, int pri)
1461 {
1462 return wq->wq_paced >= _wq_pacing_bit_for_pri(pri);
1463 }
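/*
 * Editor's note: illustrative example, not part of the original source.
 * wq_paced keeps one bit per bucket at position (thread QoS - 1), so higher
 * QoS buckets use higher bits. E.g. wq_paced == 0x04 means the bucket with
 * shift 2 (priority bucket 3) is paced; fls(0x04) == 3, so
 * _wq_highest_paced_priority() returns WORKQUEUE_EVENT_MANAGER_BUCKET - 3 == 3.
 * In _wq_should_pace_priority(), wq_paced >= bit is true exactly when some
 * bucket at this QoS or higher is paced, without scanning individual bits.
 */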
1464
1465 static inline void
1466 _wq_pacing_start(struct workqueue *wq, struct threadlist *tl)
1467 {
1468 uint8_t bit = _wq_pacing_bit_for_pri(tl->th_priority);
1469 assert((tl->th_flags & TH_LIST_PACING) == 0);
1470 assert((wq->wq_paced & bit) == 0);
1471 wq->wq_paced |= bit;
1472 tl->th_flags |= TH_LIST_PACING;
1473 }
1474
1475 static inline bool
1476 _wq_pacing_end(struct workqueue *wq, struct threadlist *tl)
1477 {
1478 if (tl->th_flags & TH_LIST_PACING) {
1479 uint8_t bit = _wq_pacing_bit_for_pri(tl->th_priority);
1480 assert((wq->wq_paced & bit) != 0);
1481 wq->wq_paced ^= bit;
1482 tl->th_flags &= ~TH_LIST_PACING;
1483 return wq->wq_paced < bit; // !_wq_should_pace_priority
1484 }
1485 return false;
1486 }
1487
1488 #pragma mark thread requests
1489
1490 static void
1491 _threadreq_init_alloced(struct threadreq *req, int priority, int flags)
1492 {
1493 assert((flags & TR_FLAG_ONSTACK) == 0);
1494 req->tr_state = TR_STATE_NEW;
1495 req->tr_priority = priority;
1496 req->tr_flags = flags;
1497 }
1498
1499 static void
1500 _threadreq_init_stack(struct threadreq *req, int priority, int flags)
1501 {
1502 req->tr_state = TR_STATE_NEW;
1503 req->tr_priority = priority;
1504 req->tr_flags = flags | TR_FLAG_ONSTACK;
1505 }
1506
1507 static void
1508 _threadreq_copy_prepare(struct workqueue *wq)
1509 {
1510 again:
1511 if (wq->wq_cached_threadreq) {
1512 return;
1513 }
1514
1515 workqueue_unlock(wq);
1516 struct threadreq *req = zalloc(pthread_zone_threadreq);
1517 workqueue_lock_spin(wq);
1518
1519 if (wq->wq_cached_threadreq) {
1520 /*
1521 * We lost the race and someone left behind an extra threadreq for us
1522 * to use. Throw away our request and retry.
1523 */
1524 workqueue_unlock(wq);
1525 zfree(pthread_zone_threadreq, req);
1526 workqueue_lock_spin(wq);
1527 goto again;
1528 } else {
1529 wq->wq_cached_threadreq = req;
1530 }
1531
1532 assert(wq->wq_cached_threadreq);
1533 }
1534
1535 static bool
1536 _threadreq_copy_prepare_noblock(struct workqueue *wq)
1537 {
1538 if (wq->wq_cached_threadreq) {
1539 return true;
1540 }
1541
1542 wq->wq_cached_threadreq = zalloc_noblock(pthread_zone_threadreq);
1543
1544 return wq->wq_cached_threadreq != NULL;
1545 }
1546
1547 static inline struct threadreq_head *
1548 _threadreq_list_for_req(struct workqueue *wq, const struct threadreq *req)
1549 {
1550 if (req->tr_flags & TR_FLAG_OVERCOMMIT) {
1551 return &wq->wq_overcommit_reqlist[req->tr_priority];
1552 } else {
1553 return &wq->wq_reqlist[req->tr_priority];
1554 }
1555 }
1556
1557 static void
1558 _threadreq_enqueue(struct workqueue *wq, struct threadreq *req)
1559 {
1560 assert(req && req->tr_state == TR_STATE_NEW);
1561 if (req->tr_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1562 assert(wq->wq_event_manager_threadreq.tr_state != TR_STATE_WAITING);
1563 memcpy(&wq->wq_event_manager_threadreq, req, sizeof(struct threadreq));
1564 req = &wq->wq_event_manager_threadreq;
1565 req->tr_flags &= ~(TR_FLAG_ONSTACK | TR_FLAG_NO_PACING);
1566 } else {
1567 if (req->tr_flags & TR_FLAG_ONSTACK) {
1568 assert(wq->wq_cached_threadreq);
1569 struct threadreq *newreq = wq->wq_cached_threadreq;
1570 wq->wq_cached_threadreq = NULL;
1571
1572 memcpy(newreq, req, sizeof(struct threadreq));
1573 newreq->tr_flags &= ~(TR_FLAG_ONSTACK | TR_FLAG_NO_PACING);
1574 req->tr_state = TR_STATE_DEAD;
1575 req = newreq;
1576 }
1577 TAILQ_INSERT_TAIL(_threadreq_list_for_req(wq, req), req, tr_entry);
1578 }
1579 req->tr_state = TR_STATE_WAITING;
1580 wq->wq_reqcount++;
1581 }
1582
1583 static void
1584 _threadreq_dequeue(struct workqueue *wq, struct threadreq *req)
1585 {
1586 if (req->tr_priority != WORKQUEUE_EVENT_MANAGER_BUCKET) {
1587 struct threadreq_head *req_list = _threadreq_list_for_req(wq, req);
1588 #if DEBUG
1589 struct threadreq *cursor = NULL;
1590 TAILQ_FOREACH(cursor, req_list, tr_entry) {
1591 if (cursor == req) break;
1592 }
1593 assert(cursor == req);
1594 #endif
1595 TAILQ_REMOVE(req_list, req, tr_entry);
1596 }
1597 wq->wq_reqcount--;
1598 }
1599
1600 /*
1601 * Mark a thread request as complete. At this point, it is treated as owned by
1602 * the submitting subsystem and you should assume it could be freed.
1603 *
1604 * Called with the workqueue lock held.
1605 */
1606 static int
1607 _threadreq_complete_and_unlock(proc_t p, struct workqueue *wq,
1608 struct threadreq *req, struct threadlist *tl)
1609 {
1610 struct threadreq *req_tofree = NULL;
1611 bool sync = (req->tr_state == TR_STATE_NEW);
1612 bool workloop = req->tr_flags & TR_FLAG_WORKLOOP;
1613 bool onstack = req->tr_flags & TR_FLAG_ONSTACK;
1614 bool kevent = req->tr_flags & TR_FLAG_KEVENT;
1615 bool unbinding = tl->th_flags & TH_LIST_UNBINDING;
1616 bool locked = true;
1617 bool waking_parked_thread = (tl->th_flags & TH_LIST_BUSY);
1618 int ret;
1619
1620 req->tr_state = TR_STATE_COMPLETE;
1621
1622 if (!workloop && !onstack && req != &wq->wq_event_manager_threadreq) {
1623 if (wq->wq_cached_threadreq) {
1624 req_tofree = req;
1625 } else {
1626 wq->wq_cached_threadreq = req;
1627 }
1628 }
1629
1630 if (tl->th_flags & TH_LIST_UNBINDING) {
1631 tl->th_flags &= ~TH_LIST_UNBINDING;
1632 assert((tl->th_flags & TH_LIST_KEVENT_BOUND));
1633 } else if (workloop || kevent) {
1634 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
1635 tl->th_flags |= TH_LIST_KEVENT_BOUND;
1636 }
1637
1638 if (workloop) {
1639 workqueue_unlock(wq);
1640 ret = pthread_kern->workloop_fulfill_threadreq(wq->wq_proc, (void*)req,
1641 tl->th_thread, sync ? WORKLOOP_FULFILL_THREADREQ_SYNC : 0);
1642 assert(ret == 0);
1643 locked = false;
1644 } else if (kevent) {
1645 unsigned int kevent_flags = KEVENT_FLAG_WORKQ;
1646 if (sync) {
1647 kevent_flags |= KEVENT_FLAG_SYNCHRONOUS_BIND;
1648 }
1649 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1650 kevent_flags |= KEVENT_FLAG_WORKQ_MANAGER;
1651 }
1652 workqueue_unlock(wq);
1653 ret = kevent_qos_internal_bind(wq->wq_proc,
1654 class_index_get_thread_qos(tl->th_priority), tl->th_thread,
1655 kevent_flags);
1656 if (ret != 0) {
1657 workqueue_lock_spin(wq);
1658 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
1659 locked = true;
1660 } else {
1661 locked = false;
1662 }
1663 }
1664
1665 /*
1666 * Run Thread, Run!
1667 */
1668 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 0, 0, 0, 0);
1669 PTHREAD_TRACE_WQ_REQ(TRACE_wq_runitem | DBG_FUNC_START, wq, req, tl->th_priority,
1670 thread_tid(current_thread()), thread_tid(tl->th_thread));
1671
1672 if (waking_parked_thread) {
1673 if (!locked) {
1674 workqueue_lock_spin(wq);
1675 }
1676 tl->th_flags &= ~(TH_LIST_BUSY);
1677 if ((tl->th_flags & TH_LIST_REMOVING_VOUCHER) == 0) {
1678 /*
1679 * If the thread is in the process of removing its voucher, then it
1680 * isn't actually in the wait event yet and we don't need to wake
1681 * it up. Save the trouble (and potential lock-ordering issues
1682 * (see 30617015)).
1683 */
1684 thread_wakeup_thread(tl, tl->th_thread);
1685 }
1686 workqueue_unlock(wq);
1687
1688 if (req_tofree) zfree(pthread_zone_threadreq, req_tofree);
1689 return WQ_RUN_TR_THREAD_STARTED;
1690 }
1691
1692 assert ((tl->th_flags & TH_LIST_PACING) == 0);
1693 if (locked) {
1694 workqueue_unlock(wq);
1695 }
1696 if (req_tofree) zfree(pthread_zone_threadreq, req_tofree);
1697 if (unbinding) {
1698 return WQ_RUN_TR_THREAD_STARTED;
1699 }
1700 _setup_wqthread(p, tl->th_thread, wq, tl, WQ_SETUP_CLEAR_VOUCHER);
1701 pthread_kern->unix_syscall_return(EJUSTRETURN);
1702 __builtin_unreachable();
1703 }
1704
1705 /*
1706 * Mark a thread request as cancelled. Has similar ownership semantics to the
1707 * complete call above.
1708 */
1709 static void
1710 _threadreq_cancel(struct workqueue *wq, struct threadreq *req)
1711 {
1712 assert(req->tr_state == TR_STATE_WAITING);
1713 req->tr_state = TR_STATE_DEAD;
1714
1715 assert((req->tr_flags & TR_FLAG_ONSTACK) == 0);
1716 if (req->tr_flags & TR_FLAG_WORKLOOP) {
1717 __assert_only int ret;
1718 ret = pthread_kern->workloop_fulfill_threadreq(wq->wq_proc, (void*)req,
1719 THREAD_NULL, WORKLOOP_FULFILL_THREADREQ_CANCEL);
1720 assert(ret == 0 || ret == ECANCELED);
1721 } else if (req != &wq->wq_event_manager_threadreq) {
1722 zfree(pthread_zone_threadreq, req);
1723 }
1724 }
1725
1726 #pragma mark workqueue lock
1727
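/*
 * Thin wrappers around the per-workqueue spin lock. The _kdp variant only
 * reports whether the lock is held and appears intended for debugger (kdp)
 * context, where taking the lock is not an option.
 */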
1728 static boolean_t workqueue_lock_spin_is_acquired_kdp(struct workqueue *wq) {
1729 return kdp_lck_spin_is_acquired(&wq->wq_lock);
1730 }
1731
1732 static void
1733 workqueue_lock_spin(struct workqueue *wq)
1734 {
1735 assert(ml_get_interrupts_enabled() == TRUE);
1736 lck_spin_lock(&wq->wq_lock);
1737 }
1738
1739 static bool
1740 workqueue_lock_try(struct workqueue *wq)
1741 {
1742 return lck_spin_try_lock(&wq->wq_lock);
1743 }
1744
1745 static void
1746 workqueue_unlock(struct workqueue *wq)
1747 {
1748 lck_spin_unlock(&wq->wq_lock);
1749 }
1750
1751 #pragma mark workqueue add timer
1752
1753 /**
1754 * Sets up the timer which will call out to workqueue_add_timer
1755 */
1756 static void
1757 workqueue_interval_timer_start(struct workqueue *wq)
1758 {
1759 uint64_t deadline;
1760
1761 /* n.b. wq_timer_interval is reset to 0 in workqueue_add_timer if the
1762 ATIMER_RUNNING flag is not present. The net effect here is that if a
1763 sequence of threads is required, we'll double the time before we give out
1764 the next one. */
1765 if (wq->wq_timer_interval == 0) {
1766 wq->wq_timer_interval = wq_stalled_window_usecs;
1767
1768 } else {
1769 wq->wq_timer_interval = wq->wq_timer_interval * 2;
1770
1771 if (wq->wq_timer_interval > wq_max_timer_interval_usecs) {
1772 wq->wq_timer_interval = wq_max_timer_interval_usecs;
1773 }
1774 }
1775 clock_interval_to_deadline(wq->wq_timer_interval, 1000, &deadline);
1776
1777 PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1778 _wq_flags(wq), wq->wq_timer_interval, 0);
1779
1780 thread_call_t call = wq->wq_atimer_delayed_call;
1781 if (thread_call_enter1_delayed(call, call, deadline)) {
1782 panic("delayed_call was already enqueued");
1783 }
1784 }
1785
1786 /**
1787 * Immediately trigger the workqueue_add_timer
1788 */
1789 static void
1790 workqueue_interval_timer_trigger(struct workqueue *wq)
1791 {
1792 PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1793 _wq_flags(wq), 0, 0);
1794
1795 thread_call_t call = wq->wq_atimer_immediate_call;
1796 if (thread_call_enter1(call, call)) {
1797 panic("immediate_call was already enqueued");
1798 }
1799 }
1800
1801 /**
1802 * returns whether lastblocked_tsp is within wq_stalled_window_usecs of cur_ts
1803 */
1804 static boolean_t
1805 wq_thread_is_busy(uint64_t cur_ts, _Atomic uint64_t *lastblocked_tsp)
1806 {
1807 clock_sec_t secs;
1808 clock_usec_t usecs;
1809 uint64_t lastblocked_ts;
1810 uint64_t elapsed;
1811
1812 lastblocked_ts = atomic_load_explicit(lastblocked_tsp, memory_order_relaxed);
1813 if (lastblocked_ts >= cur_ts) {
1814 /*
1815 * because the update of the timestamp when a thread blocks isn't
1816 * serialized against us looking at it (i.e. we don't hold the workq lock)
1817 * it's possible to have a timestamp that matches the current time or
1818 * that even looks to be in the future relative to when we grabbed the current
1819 * time... just treat this as a busy thread since it must have just blocked.
1820 */
1821 return (TRUE);
1822 }
1823 elapsed = cur_ts - lastblocked_ts;
1824
1825 pthread_kern->absolutetime_to_microtime(elapsed, &secs, &usecs);
1826
1827 return (secs == 0 && usecs < wq_stalled_window_usecs);
1828 }
1829
1830 /**
1831 * handler function for the timer
1832 */
1833 static void
1834 workqueue_add_timer(struct workqueue *wq, thread_call_t thread_call_self)
1835 {
1836 proc_t p = wq->wq_proc;
1837
1838 workqueue_lock_spin(wq);
1839
1840 PTHREAD_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_START, wq,
1841 _wq_flags(wq), wq->wq_nthreads, wq->wq_thidlecount, 0);
1842
1843 /*
1844 * There are two tricky issues here.
1845 *
1846 * First issue: we start the thread_calls that invoke this routine without
1847 * the workqueue lock held. The scheduler callback needs to trigger
1848 * reevaluation of the number of running threads but shouldn't take that
1849 * lock, so we can't use it to synchronize state around the thread_call.
1850 * As a result, it might re-enter the thread_call while this routine is
1851 * already running. This could cause it to fire a second time and we'll
1852 * have two add_timers running at once. Obviously, we don't want that to
1853 * keep stacking, so we need to keep it at two timers.
1854 *
1855 * Solution: use wq_flags (accessed via atomic CAS) to synchronize the
1856 * enqueue of the thread_call itself. When a thread needs to trigger the
1857 * add_timer, it checks for ATIMER_DELAYED_RUNNING and, when not set, sets
1858 * the flag then does a thread_call_enter. We'll then remove that flag
1859 * only once we've got the lock and it's safe for the thread_call to be
1860 * entered again.
1861 *
1862 * Second issue: we need to make sure that the two timers don't execute this
1863 * routine concurrently. We can't use the workqueue lock for this because
1864 * we'll need to drop it during our execution.
1865 *
1866 * Solution: use WQL_ATIMER_BUSY as a condition variable to indicate that
1867 * we are currently executing the routine and the next thread should wait.
1868 *
1869 * After all that, we arrive at the following four possible states:
1870 * !WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY no pending timer, no active timer
1871 * !WQ_ATIMER_DELAYED_RUNNING && WQL_ATIMER_BUSY no pending timer, 1 active timer
1872 * WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY 1 pending timer, no active timer
1873 * WQ_ATIMER_DELAYED_RUNNING && WQL_ATIMER_BUSY 1 pending timer, 1 active timer
1874 *
1875 * A further complication: sometimes we need to trigger this function to run
1876 * without delay. Because we aren't under a lock between setting
1877 * WQ_ATIMER_DELAYED_RUNNING and calling thread_call_enter, we can't simply
1878 * re-enter the thread call: if thread_call_enter() returned false, we
1879 * wouldn't be able to distinguish the case where the thread_call had
1880 * already fired from the case where it hadn't been entered yet from the
1881 * other thread. So, we use a separate thread_call for immediate
1882 * invocations, and a separate RUNNING flag, WQ_ATIMER_IMMEDIATE_RUNNING.
1883 */
1884
1885 while (wq->wq_lflags & WQL_ATIMER_BUSY) {
1886 wq->wq_lflags |= WQL_ATIMER_WAITING;
1887
1888 assert_wait((caddr_t)wq, (THREAD_UNINT));
1889 workqueue_unlock(wq);
1890
1891 thread_block(THREAD_CONTINUE_NULL);
1892
1893 workqueue_lock_spin(wq);
1894 }
1895 /*
1896 * Prevent _workqueue_mark_exiting() from going away
1897 */
1898 wq->wq_lflags |= WQL_ATIMER_BUSY;
1899
1900 /*
1901 * Decide which timer we are and remove the RUNNING flag.
1902 */
1903 if (thread_call_self == wq->wq_atimer_delayed_call) {
1904 uint64_t wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_DELAYED_RUNNING);
1905 if ((wq_flags & WQ_ATIMER_DELAYED_RUNNING) == 0) {
1906 panic("workqueue_add_timer(delayed) w/o WQ_ATIMER_DELAYED_RUNNING");
1907 }
1908 } else if (thread_call_self == wq->wq_atimer_immediate_call) {
1909 uint64_t wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_IMMEDIATE_RUNNING);
1910 if ((wq_flags & WQ_ATIMER_IMMEDIATE_RUNNING) == 0) {
1911 panic("workqueue_add_timer(immediate) w/o WQ_ATIMER_IMMEDIATE_RUNNING");
1912 }
1913 } else {
1914 panic("workqueue_add_timer can't figure out which timer it is");
1915 }
1916
1917 int ret = WQ_RUN_TR_THREAD_STARTED;
1918 while (ret == WQ_RUN_TR_THREAD_STARTED && wq->wq_reqcount) {
1919 ret = workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
1920
1921 workqueue_lock_spin(wq);
1922 }
1923 _threadreq_copy_prepare(wq);
1924
1925 /*
1926 * If we called WQ_TIMER_NEEDED above, then this flag will be set if that
1927 * call marked the timer running. If so, we let the timer interval grow.
1928 * Otherwise, we reset it back to 0.
1929 */
1930 uint32_t wq_flags = _wq_flags(wq);
1931 if (!(wq_flags & WQ_ATIMER_DELAYED_RUNNING)) {
1932 wq->wq_timer_interval = 0;
1933 }
1934
1935 wq->wq_lflags &= ~WQL_ATIMER_BUSY;
1936
1937 if ((wq_flags & WQ_EXITING) || (wq->wq_lflags & WQL_ATIMER_WAITING)) {
1938 /*
1939 * wakeup the thread hung up in _workqueue_mark_exiting or
1940 * workqueue_add_timer waiting for this timer to finish getting out of
1941 * the way
1942 */
1943 wq->wq_lflags &= ~WQL_ATIMER_WAITING;
1944 wakeup(wq);
1945 }
1946
1947 PTHREAD_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_END, wq, 0, wq->wq_nthreads, wq->wq_thidlecount, 0);
1948
1949 workqueue_unlock(wq);
1950 }
1951
1952 #pragma mark thread state tracking
1953
1954 // called by spinlock code when trying to yield to lock owner
1955 void
1956 _workqueue_thread_yielded(void)
1957 {
1958 }
1959
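/*
 * Scheduler callback invoked when a workqueue thread blocks or unblocks.
 *
 * This runs without the workqueue lock (and may run from interrupt context
 * on unblock), so it only touches the atomically maintained wq_thactive
 * counts. On block, if falling below the concurrency limit might allow a
 * pending constrained request to be admitted, it arms the delayed add
 * timer to redrive thread requests.
 */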
1960 static void
1961 workqueue_callback(int type, thread_t thread)
1962 {
1963 struct uthread *uth = pthread_kern->get_bsdthread_info(thread);
1964 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
1965 struct workqueue *wq = tl->th_workq;
1966 uint32_t old_count, req_qos, qos = tl->th_priority;
1967 wq_thactive_t old_thactive;
1968
1969 switch (type) {
1970 case SCHED_CALL_BLOCK: {
1971 bool start_timer = false;
1972
1973 old_thactive = _wq_thactive_dec(wq, tl->th_priority);
1974 req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
1975 old_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
1976 qos, NULL, NULL);
1977
1978 if (old_count == wq_max_concurrency[tl->th_priority]) {
1979 /*
1980 * The number of active threads at this priority has fallen below
1981 * the maximum number of concurrent threads that are allowed to run
1982 *
1983 * if we collide with another thread trying to update the
1984 * last_blocked (really unlikely since another thread would have to
1985 * get scheduled and then block after we start down this path), it's
1986 * not a problem. Either timestamp is adequate, so no need to retry
1987 */
1988 atomic_store_explicit(&wq->wq_lastblocked_ts[qos],
1989 mach_absolute_time(), memory_order_relaxed);
1990 }
1991
1992 if (req_qos == WORKQUEUE_EVENT_MANAGER_BUCKET || qos > req_qos) {
1993 /*
1994 * The blocking thread is at a lower QoS than the highest currently
1995 * pending constrained request, nothing has to be redriven
1996 */
1997 } else {
1998 uint32_t max_busycount, old_req_count;
1999 old_req_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
2000 req_qos, NULL, &max_busycount);
2001 /*
2002 * If it is possible that may_start_constrained_thread had refused
2003 * admission due to being over the max concurrency, we may need to
2004 * spin up a new thread.
2005 *
2006 * We take into account the maximum number of busy threads
2007 * that can affect may_start_constrained_thread, since looking at the
2008 * actual number may_start_constrained_thread will see is racy.
2009 *
2010 * IOW at NCPU = 4, for IN (req_qos = 1), if the old req count is
2011 * between NCPU (4) and NCPU - 2 (2) we need to redrive.
2012 */
2013 if (wq_max_concurrency[req_qos] <= old_req_count + max_busycount &&
2014 old_req_count <= wq_max_concurrency[req_qos]) {
2015 if (WQ_TIMER_DELAYED_NEEDED(wq)) {
2016 start_timer = true;
2017 workqueue_interval_timer_start(wq);
2018 }
2019 }
2020 }
2021
2022 PTHREAD_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_START, wq,
2023 old_count - 1, qos | (req_qos << 8),
2024 wq->wq_reqcount << 1 | start_timer, 0);
2025 break;
2026 }
2027 case SCHED_CALL_UNBLOCK: {
2028 /*
2029 * we cannot take the workqueue_lock here...
2030 * an UNBLOCK can occur from a timer event which
2031 * is run from an interrupt context... if the workqueue_lock
2032 * is already held by this processor, we'll deadlock...
2033 * the thread lock for the thread being UNBLOCKED
2034 * is also held
2035 */
2036 old_thactive = _wq_thactive_inc(wq, qos);
2037 if (pthread_debug_tracing) {
2038 req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
2039 old_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
2040 qos, NULL, NULL);
2041 PTHREAD_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_END, wq,
2042 old_count + 1, qos | (req_qos << 8),
2043 wq->wq_threads_scheduled, 0);
2044 }
2045 break;
2046 }
2047 }
2048 }
2049
2050 sched_call_t
2051 _workqueue_get_sched_callback(void)
2052 {
2053 return workqueue_callback;
2054 }
2055
2056 #pragma mark thread addition/removal
2057
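/*
 * Total VM reservation for one workqueue thread: a guard page, the default
 * stack, and the page-rounded pthread_t area.
 */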
2058 static mach_vm_size_t
2059 _workqueue_allocsize(struct workqueue *wq)
2060 {
2061 proc_t p = wq->wq_proc;
2062 mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);
2063 mach_vm_size_t pthread_size =
2064 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(wq->wq_map));
2065 return guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
2066 }
2067
2068 /**
2069 * pop goes the thread
2070 *
2071 * If fromexit is set, the call is from workqueue_exit(),
2072 * so some cleanups are to be avoided.
2073 */
2074 static void
2075 workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use)
2076 {
2077 struct uthread * uth;
2078 struct workqueue * wq = tl->th_workq;
2079
2080 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET){
2081 TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
2082 } else {
2083 TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
2084 }
2085
2086 if (fromexit == 0) {
2087 assert(wq->wq_nthreads && wq->wq_thidlecount);
2088 wq->wq_nthreads--;
2089 wq->wq_thidlecount--;
2090 }
2091
2092 /*
2093 * Clear the threadlist pointer in the uthread so that a
2094 * thread blocked awaiting wakeup for termination will not
2095 * access the thread list, as it is about to be
2096 * freed.
2097 */
2098 pthread_kern->thread_sched_call(tl->th_thread, NULL);
2099
2100 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
2101 if (uth != (struct uthread *)0) {
2102 pthread_kern->uthread_set_threadlist(uth, NULL);
2103 }
2104 if (fromexit == 0) {
2105 /* during exit the lock is not held */
2106 workqueue_unlock(wq);
2107 }
2108
2109 if ( (tl->th_flags & TH_LIST_NEW) || first_use ) {
2110 /*
2111 * thread was created, but never used...
2112 * need to clean up the stack and port ourselves
2113 * since we're not going to spin up through the
2114 * normal exit path triggered from Libc
2115 */
2116 if (fromexit == 0) {
2117 /* vm map is already deallocated when this is called from exit */
2118 (void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, _workqueue_allocsize(wq));
2119 }
2120 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(wq->wq_task), tl->th_thport);
2121 }
2122 /*
2123 * drop our ref on the thread
2124 */
2125 thread_deallocate(tl->th_thread);
2126
2127 zfree(pthread_zone_threadlist, tl);
2128 }
2129
2130
2131 /**
2132 * Try to add a new workqueue thread.
2133 *
2134 * - called with workq lock held
2135 * - dropped and retaken around thread creation
2136 * - return with workq lock held
2137 */
2138 static bool
2139 workqueue_addnewthread(proc_t p, struct workqueue *wq)
2140 {
2141 kern_return_t kret;
2142
2143 wq->wq_nthreads++;
2144
2145 workqueue_unlock(wq);
2146
2147 struct threadlist *tl = zalloc(pthread_zone_threadlist);
2148 bzero(tl, sizeof(struct threadlist));
2149
2150 thread_t th;
2151 kret = pthread_kern->thread_create_workq_waiting(wq->wq_task, wq_unpark_continue, tl, &th);
2152 if (kret != KERN_SUCCESS) {
2153 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 0, 0, 0);
2154 goto fail_free;
2155 }
2156
2157 mach_vm_offset_t stackaddr = pthread_kern->proc_get_stack_addr_hint(p);
2158
2159 mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);
2160 mach_vm_size_t pthread_size =
2161 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(wq->wq_map));
2162 mach_vm_size_t th_allocsize = guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
2163
2164 kret = mach_vm_map(wq->wq_map, &stackaddr,
2165 th_allocsize, page_size-1,
2166 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE, NULL,
2167 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
2168 VM_INHERIT_DEFAULT);
2169
2170 if (kret != KERN_SUCCESS) {
2171 kret = mach_vm_allocate(wq->wq_map,
2172 &stackaddr, th_allocsize,
2173 VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE);
2174 }
2175
2176 if (kret != KERN_SUCCESS) {
2177 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 1, 0, 0);
2178 goto fail_terminate;
2179 }
2180
2181 /*
2182 * The guard page is at the lowest address
2183 * The stack base is the highest address
2184 */
2185 kret = mach_vm_protect(wq->wq_map, stackaddr, guardsize, FALSE, VM_PROT_NONE);
2186 if (kret != KERN_SUCCESS) {
2187 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 2, 0, 0);
2188 goto fail_vm_deallocate;
2189 }
2190
2191
2192 pthread_kern->thread_set_tag(th, THREAD_TAG_PTHREAD | THREAD_TAG_WORKQUEUE);
2193 pthread_kern->thread_static_param(th, TRUE);
2194
2195 /*
2196 * convert_thread_to_port() consumes a reference
2197 */
2198 thread_reference(th);
2199 void *sright = (void *)pthread_kern->convert_thread_to_port(th);
2200 tl->th_thport = pthread_kern->ipc_port_copyout_send(sright,
2201 pthread_kern->task_get_ipcspace(wq->wq_task));
2202
2203 tl->th_flags = TH_LIST_INITED | TH_LIST_NEW;
2204 tl->th_thread = th;
2205 tl->th_workq = wq;
2206 tl->th_stackaddr = stackaddr;
2207 tl->th_priority = WORKQUEUE_NUM_BUCKETS;
2208
2209 struct uthread *uth;
2210 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
2211
2212 workqueue_lock_spin(wq);
2213
2214 void *current_tl = pthread_kern->uthread_get_threadlist(uth);
2215 if (current_tl == NULL) {
2216 pthread_kern->uthread_set_threadlist(uth, tl);
2217 TAILQ_INSERT_TAIL(&wq->wq_thidlelist, tl, th_entry);
2218 wq->wq_thidlecount++;
2219 } else if (current_tl == WQ_THREADLIST_EXITING_POISON) {
2220 /*
2221 * Failed thread creation race: The thread already woke up and has exited.
2222 */
2223 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 3, 0, 0);
2224 goto fail_unlock;
2225 } else {
2226 panic("Unexpected initial threadlist value");
2227 }
2228
2229 PTHREAD_TRACE_WQ(TRACE_wq_thread_create | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
2230
2231 return (TRUE);
2232
2233 fail_unlock:
2234 workqueue_unlock(wq);
2235 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(wq->wq_task),
2236 tl->th_thport);
2237
2238 fail_vm_deallocate:
2239 (void) mach_vm_deallocate(wq->wq_map, stackaddr, th_allocsize);
2240
2241 fail_terminate:
2242 if (pthread_kern->thread_will_park_or_terminate) {
2243 pthread_kern->thread_will_park_or_terminate(th);
2244 }
2245 (void)thread_terminate(th);
2246 thread_deallocate(th);
2247
2248 fail_free:
2249 zfree(pthread_zone_threadlist, tl);
2250
2251 workqueue_lock_spin(wq);
2252 wq->wq_nthreads--;
2253
2254 return (FALSE);
2255 }
2256
2257 /**
2258 * Setup per-process state for the workqueue.
2259 */
2260 int
2261 _workq_open(struct proc *p, __unused int32_t *retval)
2262 {
2263 struct workqueue * wq;
2264 char * ptr;
2265 uint32_t num_cpus;
2266 int error = 0;
2267
2268 if (pthread_kern->proc_get_register(p) == 0) {
2269 return EINVAL;
2270 }
2271
2272 num_cpus = pthread_kern->ml_get_max_cpus();
2273
2274 if (wq_init_constrained_limit) {
2275 uint32_t limit;
2276 /*
2277 * Set up the limit for the constrained pool. This is a
2278 * virtual pool in that we don't maintain it on a separate
2279 * idle and run list.
2280 */
2281 limit = num_cpus * WORKQUEUE_CONSTRAINED_FACTOR;
2282
2283 if (limit > wq_max_constrained_threads)
2284 wq_max_constrained_threads = limit;
2285
2286 wq_init_constrained_limit = 0;
2287
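/*
 * Clamp the global thread limit: wq_thactive packs per-bucket counts, so
 * staying at or below WQ_THACTIVE_BUCKET_HALF presumably keeps those
 * counters from overflowing, and we also leave headroom below the
 * system-wide thread limit.
 */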
2288 if (wq_max_threads > WQ_THACTIVE_BUCKET_HALF) {
2289 wq_max_threads = WQ_THACTIVE_BUCKET_HALF;
2290 }
2291 if (wq_max_threads > pthread_kern->config_thread_max - 20) {
2292 wq_max_threads = pthread_kern->config_thread_max - 20;
2293 }
2294 }
2295
2296 if (pthread_kern->proc_get_wqptr(p) == NULL) {
2297 if (pthread_kern->proc_init_wqptr_or_wait(p) == FALSE) {
2298 assert(pthread_kern->proc_get_wqptr(p) != NULL);
2299 goto out;
2300 }
2301
2302 ptr = (char *)zalloc(pthread_zone_workqueue);
2303 bzero(ptr, sizeof(struct workqueue));
2304
2305 wq = (struct workqueue *)ptr;
2306 wq->wq_proc = p;
2307 wq->wq_task = current_task();
2308 wq->wq_map = pthread_kern->current_map();
2309
2310 // Start the event manager at the priority hinted at by the policy engine
2311 int mgr_priority_hint = pthread_kern->task_get_default_manager_qos(current_task());
2312 wq->wq_event_manager_priority = (uint32_t)thread_qos_get_pthread_priority(mgr_priority_hint) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2313
2314 TAILQ_INIT(&wq->wq_thrunlist);
2315 TAILQ_INIT(&wq->wq_thidlelist);
2316 for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
2317 TAILQ_INIT(&wq->wq_overcommit_reqlist[i]);
2318 TAILQ_INIT(&wq->wq_reqlist[i]);
2319 }
2320
2321 wq->wq_atimer_delayed_call =
2322 thread_call_allocate_with_priority((thread_call_func_t)workqueue_add_timer,
2323 (thread_call_param_t)wq, THREAD_CALL_PRIORITY_KERNEL);
2324 wq->wq_atimer_immediate_call =
2325 thread_call_allocate_with_priority((thread_call_func_t)workqueue_add_timer,
2326 (thread_call_param_t)wq, THREAD_CALL_PRIORITY_KERNEL);
2327
2328 lck_spin_init(&wq->wq_lock, pthread_lck_grp, pthread_lck_attr);
2329
2330 wq->wq_cached_threadreq = zalloc(pthread_zone_threadreq);
2331 *(wq_thactive_t *)&wq->wq_thactive =
2332 (wq_thactive_t)WQ_THACTIVE_NO_PENDING_REQUEST <<
2333 WQ_THACTIVE_QOS_SHIFT;
2334
2335 pthread_kern->proc_set_wqptr(p, wq);
2336
2337 }
2338 out:
2339
2340 return(error);
2341 }
2342
2343 /*
2344 * Routine: workqueue_mark_exiting
2345 *
2346 * Function: Mark the work queue such that new threads will not be added to the
2347 * work queue after we return.
2348 *
2349 * Conditions: Called against the current process.
2350 */
2351 void
2352 _workqueue_mark_exiting(struct proc *p)
2353 {
2354 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
2355 if (!wq) return;
2356
2357 PTHREAD_TRACE_WQ(TRACE_wq_pthread_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);
2358
2359 workqueue_lock_spin(wq);
2360
2361 /*
2362 * We arm the add timer without holding the workqueue lock so we need
2363 * to synchronize with any running or soon to be running timers.
2364 *
2365 * Threads that intend to arm the timer atomically OR
2366 * WQ_ATIMER_{DELAYED,IMMEDIATE}_RUNNING into the wq_flags, only if
2367 * WQ_EXITING is not present. So, once we have set WQ_EXITING, we can
2368 * be sure that no new RUNNING flags will be set, but still need to
2369 * wait for the already running timers to complete.
2370 *
2371 * We always hold the workq lock when dropping WQ_ATIMER_RUNNING, so
2372 * the check for and sleep until clear is protected.
2373 */
2374 uint64_t wq_flags = _wq_flags_or_orig(wq, WQ_EXITING);
2375
2376 if (wq_flags & WQ_ATIMER_DELAYED_RUNNING) {
2377 if (thread_call_cancel(wq->wq_atimer_delayed_call) == TRUE) {
2378 wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_DELAYED_RUNNING);
2379 }
2380 }
2381 if (wq_flags & WQ_ATIMER_IMMEDIATE_RUNNING) {
2382 if (thread_call_cancel(wq->wq_atimer_immediate_call) == TRUE) {
2383 wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_IMMEDIATE_RUNNING);
2384 }
2385 }
2386 while ((_wq_flags(wq) & (WQ_ATIMER_DELAYED_RUNNING | WQ_ATIMER_IMMEDIATE_RUNNING)) ||
2387 (wq->wq_lflags & WQL_ATIMER_BUSY)) {
2388 assert_wait((caddr_t)wq, (THREAD_UNINT));
2389 workqueue_unlock(wq);
2390
2391 thread_block(THREAD_CONTINUE_NULL);
2392
2393 workqueue_lock_spin(wq);
2394 }
2395
2396 /*
2397 * Save off pending requests, will complete/free them below after unlocking
2398 */
2399 TAILQ_HEAD(, threadreq) local_list = TAILQ_HEAD_INITIALIZER(local_list);
2400
2401 for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
2402 TAILQ_CONCAT(&local_list, &wq->wq_overcommit_reqlist[i], tr_entry);
2403 TAILQ_CONCAT(&local_list, &wq->wq_reqlist[i], tr_entry);
2404 }
2405
2406 /*
2407 * XXX: Can't do a deferred cancel of the event manager request, so just smash it.
2408 */
2409 assert((wq->wq_event_manager_threadreq.tr_flags & TR_FLAG_WORKLOOP) == 0);
2410 wq->wq_event_manager_threadreq.tr_state = TR_STATE_DEAD;
2411
2412 workqueue_unlock(wq);
2413
2414 struct threadreq *tr, *tr_temp;
2415 TAILQ_FOREACH_SAFE(tr, &local_list, tr_entry, tr_temp) {
2416 _threadreq_cancel(wq, tr);
2417 }
2418 PTHREAD_TRACE(TRACE_wq_pthread_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
2419 }
2420
2421 /*
2422 * Routine: workqueue_exit
2423 *
2424 * Function: clean up the work queue structure(s) now that there are no threads
2425 * left running inside the work queue (except possibly current_thread).
2426 *
2427 * Conditions: Called by the last thread in the process.
2428 * Called against current process.
2429 */
2430 void
2431 _workqueue_exit(struct proc *p)
2432 {
2433 struct workqueue * wq;
2434 struct threadlist * tl, *tlist;
2435 struct uthread *uth;
2436
2437 wq = pthread_kern->proc_get_wqptr(p);
2438 if (wq != NULL) {
2439
2440 PTHREAD_TRACE_WQ(TRACE_wq_workqueue_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);
2441
2442 pthread_kern->proc_set_wqptr(p, NULL);
2443
2444 /*
2445 * Clean up workqueue data structures for threads that exited and
2446 * didn't get a chance to clean up after themselves.
2447 */
2448 TAILQ_FOREACH_SAFE(tl, &wq->wq_thrunlist, th_entry, tlist) {
2449 assert((tl->th_flags & TH_LIST_RUNNING) != 0);
2450
2451 pthread_kern->thread_sched_call(tl->th_thread, NULL);
2452
2453 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
2454 if (uth != (struct uthread *)0) {
2455 pthread_kern->uthread_set_threadlist(uth, NULL);
2456 }
2457 TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
2458
2459 /*
2460 * drop our last ref on the thread
2461 */
2462 thread_deallocate(tl->th_thread);
2463
2464 zfree(pthread_zone_threadlist, tl);
2465 }
2466 TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlelist, th_entry, tlist) {
2467 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
2468 assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
2469 workqueue_removethread(tl, true, false);
2470 }
2471 TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlemgrlist, th_entry, tlist) {
2472 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
2473 assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
2474 workqueue_removethread(tl, true, false);
2475 }
2476 if (wq->wq_cached_threadreq) {
2477 zfree(pthread_zone_threadreq, wq->wq_cached_threadreq);
2478 }
2479 thread_call_free(wq->wq_atimer_delayed_call);
2480 thread_call_free(wq->wq_atimer_immediate_call);
2481 lck_spin_destroy(&wq->wq_lock, pthread_lck_grp);
2482
2483 for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
2484 assert(TAILQ_EMPTY(&wq->wq_overcommit_reqlist[i]));
2485 assert(TAILQ_EMPTY(&wq->wq_reqlist[i]));
2486 }
2487
2488 zfree(pthread_zone_workqueue, wq);
2489
2490 PTHREAD_TRACE(TRACE_wq_workqueue_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
2491 }
2492 }
2493
2494
2495 #pragma mark workqueue thread manipulation
2496
2497
2498 /**
2499 * Entry point for libdispatch to ask for threads
2500 */
2501 static int
2502 wqops_queue_reqthreads(struct proc *p, int reqcount,
2503 pthread_priority_t priority)
2504 {
2505 bool overcommit = _pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
2506 bool event_manager = _pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2507 int class = event_manager ? WORKQUEUE_EVENT_MANAGER_BUCKET :
2508 pthread_priority_get_class_index(priority);
2509
2510 if ((reqcount <= 0) || (class < 0) || (class >= WORKQUEUE_NUM_BUCKETS) ||
2511 (overcommit && event_manager)) {
2512 return EINVAL;
2513 }
2514
2515 struct workqueue *wq;
2516 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2517 return EINVAL;
2518 }
2519
2520 workqueue_lock_spin(wq);
2521 _threadreq_copy_prepare(wq);
2522
2523 PTHREAD_TRACE_WQ(TRACE_wq_wqops_reqthreads | DBG_FUNC_NONE, wq, reqcount, priority, 0, 0);
2524
2525 int tr_flags = 0;
2526 if (overcommit) tr_flags |= TR_FLAG_OVERCOMMIT;
2527 if (reqcount > 1) {
2528 /*
2529 * When libdispatch asks for more than one thread, it wants to achieve
2530 * parallelism. Pacing would be detrimental to that goal, so treat
2531 * these requests specially and skip the pacing admission check.
2532 */
2533 tr_flags |= TR_FLAG_NO_PACING;
2534 }
2535
2536 while (reqcount-- && !_wq_exiting(wq)) {
2537 struct threadreq req;
2538 _threadreq_init_stack(&req, class, tr_flags);
2539
2540 workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, true);
2541
2542 workqueue_lock_spin(wq); /* reacquire */
2543 _threadreq_copy_prepare(wq);
2544 }
2545
2546 workqueue_unlock(wq);
2547
2548 return 0;
2549 }
2550
2551 /*
2552 * Used by the kevent system to request threads.
2553 *
2554 * Currently count is ignored and we always return one thread per invocation.
2555 */
2556 static thread_t
2557 _workq_kevent_reqthreads(struct proc *p, pthread_priority_t priority,
2558 bool no_emergency)
2559 {
2560 int wq_run_tr = WQ_RUN_TR_THROTTLED;
2561 bool emergency_thread = false;
2562 struct threadreq req;
2563
2564
2565 struct workqueue *wq;
2566 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2567 return THREAD_NULL;
2568 }
2569
2570 int class = pthread_priority_get_class_index(priority);
2571
2572 workqueue_lock_spin(wq);
2573 bool has_threadreq = _threadreq_copy_prepare_noblock(wq);
2574
2575 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, NULL, priority, 0, 0);
2576
2577 /*
2578 * Skip straight to event manager if that's what was requested
2579 */
2580 if ((_pthread_priority_get_qos_newest(priority) == QOS_CLASS_UNSPECIFIED) ||
2581 (_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)){
2582 goto event_manager;
2583 }
2584
2585 bool will_pace = _wq_should_pace_priority(wq, class);
2586 if ((wq->wq_thidlecount == 0 || will_pace) && has_threadreq == false) {
2587 /*
2588 * We'll need to persist the request and can't, so return the emergency
2589 * thread instead, which has a persistent request object.
2590 */
2591 emergency_thread = true;
2592 goto event_manager;
2593 }
2594
2595 /*
2596 * Handle overcommit requests
2597 */
2598 if ((_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0){
2599 _threadreq_init_stack(&req, class, TR_FLAG_KEVENT | TR_FLAG_OVERCOMMIT);
2600 wq_run_tr = workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, false);
2601 goto done;
2602 }
2603
2604 /*
2605 * Handle constrained requests
2606 */
2607 boolean_t may_start = may_start_constrained_thread(wq, class, NULL, false);
2608 if (may_start || no_emergency) {
2609 _threadreq_init_stack(&req, class, TR_FLAG_KEVENT);
2610 wq_run_tr = workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, false);
2611 goto done;
2612 } else {
2613 emergency_thread = true;
2614 }
2615
2616
2617 event_manager:
2618 _threadreq_init_stack(&req, WORKQUEUE_EVENT_MANAGER_BUCKET, TR_FLAG_KEVENT);
2619 wq_run_tr = workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, false);
2620
2621 done:
2622 if (wq_run_tr == WQ_RUN_TR_THREAD_NEEDED && WQ_TIMER_IMMEDIATE_NEEDED(wq)) {
2623 workqueue_interval_timer_trigger(wq);
2624 }
2625 return emergency_thread ? (void*)-1 : 0;
2626 }
2627
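/*
 * Legacy kevent entry point. Exactly one request per call is supported;
 * the priority and the no-emergency flag are extracted from the request
 * and forwarded to _workq_kevent_reqthreads.
 */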
2628 thread_t
2629 _workq_reqthreads(struct proc *p, __assert_only int requests_count,
2630 workq_reqthreads_req_t request)
2631 {
2632 assert(requests_count == 1);
2633
2634 pthread_priority_t priority = request->priority;
2635 bool no_emergency = request->count & WORKQ_REQTHREADS_NOEMERGENCY;
2636
2637 return _workq_kevent_reqthreads(p, priority, no_emergency);
2638 }
2639
2640
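/*
 * Kernel (kevent/workloop) entry point for submitting or redriving thread
 * requests. KEVENT requests are forwarded to _workq_kevent_reqthreads;
 * WORKLOOP requests use a caller-allocated threadreq and may be matched
 * directly with the current thread when it is unbinding; REDRIVE simply
 * re-runs the highest priority pending request.
 */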
2641 int
2642 workq_kern_threadreq(struct proc *p, workq_threadreq_t _req,
2643 enum workq_threadreq_type type, unsigned long priority, int flags)
2644 {
2645 struct workqueue *wq;
2646 int ret;
2647
2648 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2649 return EINVAL;
2650 }
2651
2652 switch (type) {
2653 case WORKQ_THREADREQ_KEVENT: {
2654 bool no_emergency = flags & WORKQ_THREADREQ_FLAG_NOEMERGENCY;
2655 (void)_workq_kevent_reqthreads(p, priority, no_emergency);
2656 return 0;
2657 }
2658 case WORKQ_THREADREQ_WORKLOOP:
2659 case WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL: {
2660 struct threadreq *req = (struct threadreq *)_req;
2661 int req_class = pthread_priority_get_class_index(priority);
2662 int req_flags = TR_FLAG_WORKLOOP;
2663 if ((_pthread_priority_get_flags(priority) &
2664 _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0){
2665 req_flags |= TR_FLAG_OVERCOMMIT;
2666 }
2667
2668 thread_t thread = current_thread();
2669 struct threadlist *tl = util_get_thread_threadlist_entry(thread);
2670
2671 if (tl && tl != WQ_THREADLIST_EXITING_POISON &&
2672 (tl->th_flags & TH_LIST_UNBINDING)) {
2673 /*
2674 * We're being called back synchronously from the context of
2675 * kevent_qos_internal_unbind from within wqops_thread_return(),
2676 * so we can try to match up this thread with this request!
2677 */
2678 } else {
2679 tl = NULL;
2680 }
2681
2682 _threadreq_init_alloced(req, req_class, req_flags);
2683 workqueue_lock_spin(wq);
2684 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, req, priority, 1, 0);
2685 ret = workqueue_run_threadreq_and_unlock(p, wq, tl, req, false);
2686 if (ret == WQ_RUN_TR_EXITING) {
2687 return ECANCELED;
2688 }
2689 if (ret == WQ_RUN_TR_THREAD_NEEDED) {
2690 if (type == WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL) {
2691 return EAGAIN;
2692 }
2693 if (WQ_TIMER_IMMEDIATE_NEEDED(wq)) {
2694 workqueue_interval_timer_trigger(wq);
2695 }
2696 }
2697 return 0;
2698 }
2699 case WORKQ_THREADREQ_REDRIVE:
2700 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, 0, 0, 4, 0);
2701 workqueue_lock_spin(wq);
2702 ret = workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
2703 if (ret == WQ_RUN_TR_EXITING) {
2704 return ECANCELED;
2705 }
2706 return 0;
2707 default:
2708 return ENOTSUP;
2709 }
2710 }
2711
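/*
 * Change the priority of, or cancel, a previously submitted workloop
 * thread request. The request must still be in the WAITING state (i.e.
 * enqueued and not yet bound to a thread).
 */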
2712 int
2713 workq_kern_threadreq_modify(struct proc *p, workq_threadreq_t _req,
2714 enum workq_threadreq_op operation, unsigned long arg1,
2715 unsigned long __unused arg2)
2716 {
2717 struct threadreq *req = (struct threadreq *)_req;
2718 struct workqueue *wq;
2719 int priclass, ret = 0, wq_tr_rc = WQ_RUN_TR_THROTTLED;
2720
2721 if (req == NULL || (wq = pthread_kern->proc_get_wqptr(p)) == NULL) {
2722 return EINVAL;
2723 }
2724
2725 workqueue_lock_spin(wq);
2726
2727 if (_wq_exiting(wq)) {
2728 ret = ECANCELED;
2729 goto out_unlock;
2730 }
2731
2732 /*
2733 * Find/validate the referenced request structure
2734 */
2735 if (req->tr_state != TR_STATE_WAITING) {
2736 ret = EINVAL;
2737 goto out_unlock;
2738 }
2739 assert(req->tr_priority < WORKQUEUE_EVENT_MANAGER_BUCKET);
2740 assert(req->tr_flags & TR_FLAG_WORKLOOP);
2741
2742 switch (operation) {
2743 case WORKQ_THREADREQ_CHANGE_PRI:
2744 case WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL:
2745 priclass = pthread_priority_get_class_index(arg1);
2746 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, req, arg1, 2, 0);
2747 if (req->tr_priority == priclass) {
2748 goto out_unlock;
2749 }
2750 _threadreq_dequeue(wq, req);
2751 req->tr_priority = priclass;
2752 req->tr_state = TR_STATE_NEW; // what was old is new again
2753 wq_tr_rc = workqueue_run_threadreq_and_unlock(p, wq, NULL, req, false);
2754 goto out;
2755
2756 case WORKQ_THREADREQ_CANCEL:
2757 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, req, 0, 3, 0);
2758 _threadreq_dequeue(wq, req);
2759 req->tr_state = TR_STATE_DEAD;
2760 break;
2761
2762 default:
2763 ret = ENOTSUP;
2764 break;
2765 }
2766
2767 out_unlock:
2768 workqueue_unlock(wq);
2769 out:
2770 if (wq_tr_rc == WQ_RUN_TR_THREAD_NEEDED) {
2771 if (operation == WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL) {
2772 ret = EAGAIN;
2773 } else if (WQ_TIMER_IMMEDIATE_NEEDED(wq)) {
2774 workqueue_interval_timer_trigger(wq);
2775 }
2776 }
2777 return ret;
2778 }
2779
2780
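/*
 * Handle a workqueue thread returning to the kernel after finishing its
 * work item: reset its signal mask, unbind it from kevent if needed,
 * squash any dispatch QoS overrides (fixing up the bucket accounting),
 * and then try to reuse the thread for the next request, parking it if
 * there is none.
 */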
2781 static int
2782 wqops_thread_return(struct proc *p, struct workqueue *wq)
2783 {
2784 thread_t th = current_thread();
2785 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
2786 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
2787
2788 /* reset signal mask on the workqueue thread to default state */
2789 if (pthread_kern->uthread_get_sigmask(uth) != (sigset_t)(~workq_threadmask)) {
2790 pthread_kern->proc_lock(p);
2791 pthread_kern->uthread_set_sigmask(uth, ~workq_threadmask);
2792 pthread_kern->proc_unlock(p);
2793 }
2794
2795 if (wq == NULL || !tl) {
2796 return EINVAL;
2797 }
2798
2799 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_START, tl->th_workq, 0, 0, 0, 0);
2800
2801 /*
2802 * This squash call has neat semantics: it removes the specified overrides,
2803 * replacing the current requested QoS with the previous effective QoS from
2804 * those overrides. This means we won't be preempted due to having our QoS
2805 * lowered. Of course, now our understanding of the thread's QoS is wrong,
2806 * so we'll adjust below.
2807 */
2808 bool was_manager = (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
2809 int new_qos;
2810
2811 if (!was_manager) {
2812 new_qos = pthread_kern->proc_usynch_thread_qos_squash_override_for_resource(th,
2813 THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD,
2814 THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE);
2815 }
2816
2817 PTHREAD_TRACE_WQ(TRACE_wq_runitem | DBG_FUNC_END, wq, tl->th_priority, 0, 0, 0);
2818
2819 workqueue_lock_spin(wq);
2820
2821 if (tl->th_flags & TH_LIST_KEVENT_BOUND) {
2822 unsigned int flags = KEVENT_FLAG_WORKQ;
2823 if (was_manager) {
2824 flags |= KEVENT_FLAG_WORKQ_MANAGER;
2825 }
2826
2827 tl->th_flags |= TH_LIST_UNBINDING;
2828 workqueue_unlock(wq);
2829 kevent_qos_internal_unbind(p, class_index_get_thread_qos(tl->th_priority), th, flags);
2830 if (!(tl->th_flags & TH_LIST_UNBINDING)) {
2831 _setup_wqthread(p, th, wq, tl, WQ_SETUP_CLEAR_VOUCHER);
2832 pthread_kern->unix_syscall_return(EJUSTRETURN);
2833 __builtin_unreachable();
2834 }
2835 workqueue_lock_spin(wq);
2836 tl->th_flags &= ~(TH_LIST_KEVENT_BOUND | TH_LIST_UNBINDING);
2837 }
2838
2839 if (!was_manager) {
2840 /* Fix up counters from the squash operation. */
2841 uint8_t old_bucket = tl->th_priority;
2842 uint8_t new_bucket = thread_qos_get_class_index(new_qos);
2843
2844 if (old_bucket != new_bucket) {
2845 _wq_thactive_move(wq, old_bucket, new_bucket);
2846 wq->wq_thscheduled_count[old_bucket]--;
2847 wq->wq_thscheduled_count[new_bucket]++;
2848
2849 PTHREAD_TRACE_WQ(TRACE_wq_thread_squash | DBG_FUNC_NONE, wq, tl->th_priority, new_bucket, 0, 0);
2850 tl->th_priority = new_bucket;
2851 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_END, tl->th_workq, new_qos, 0, 0, 0);
2852 }
2853 }
2854
2855 workqueue_run_threadreq_and_unlock(p, wq, tl, NULL, false);
2856 return 0;
2857 }
2858
2859 /**
2860 * Multiplexed call to interact with the workqueue mechanism
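 *
 * The options argument selects the operation (see the switch below).
 * Userspace reaches this through libpthread's __workq_kernreturn syscall
 * stub; an illustrative call (stub signature approximate) would be:
 *     __workq_kernreturn(WQOPS_QUEUE_REQTHREADS, NULL, nthreads, priority);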
2861 */
2862 int
2863 _workq_kernreturn(struct proc *p,
2864 int options,
2865 user_addr_t item,
2866 int arg2,
2867 int arg3,
2868 int32_t *retval)
2869 {
2870 struct workqueue *wq;
2871 int error = 0;
2872
2873 if (pthread_kern->proc_get_register(p) == 0) {
2874 return EINVAL;
2875 }
2876
2877 switch (options) {
2878 case WQOPS_QUEUE_NEWSPISUPP: {
2879 /*
2880 * arg2 = offset of serialno into dispatch queue
2881 * arg3 = kevent support
2882 */
2883 int offset = arg2;
2884 if (arg3 & 0x01){
2885 // If we get here, then userspace has indicated support for kevent delivery.
2886 }
2887
2888 pthread_kern->proc_set_dispatchqueue_serialno_offset(p, (uint64_t)offset);
2889 break;
2890 }
2891 case WQOPS_QUEUE_REQTHREADS: {
2892 /*
2893 * arg2 = number of threads to start
2894 * arg3 = priority
2895 */
2896 error = wqops_queue_reqthreads(p, arg2, arg3);
2897 break;
2898 }
2899 case WQOPS_SET_EVENT_MANAGER_PRIORITY: {
2900 /*
2901 * arg2 = priority for the manager thread
2902 *
2903 * if _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG is set, the
2904 * ~_PTHREAD_PRIORITY_FLAGS_MASK contains a scheduling priority instead
2905 * of a QOS value
2906 */
2907 pthread_priority_t pri = arg2;
2908
2909 wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2910 if (wq == NULL) {
2911 error = EINVAL;
2912 break;
2913 }
2914 workqueue_lock_spin(wq);
2915 if (pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){
2916 /*
2917 * If userspace passes a scheduling priority, that takes precedence
2918 * over any QoS. (So, userspace should take care not to accidentally
2919 * lower the priority this way.)
2920 */
2921 uint32_t sched_pri = pri & _PTHREAD_PRIORITY_SCHED_PRI_MASK;
2922 if (wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){
2923 wq->wq_event_manager_priority = MAX(sched_pri, wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_MASK)
2924 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2925 } else {
2926 wq->wq_event_manager_priority = sched_pri
2927 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2928 }
2929 } else if ((wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){
2930 int cur_qos = pthread_priority_get_thread_qos(wq->wq_event_manager_priority);
2931 int new_qos = pthread_priority_get_thread_qos(pri);
2932 wq->wq_event_manager_priority = (uint32_t)thread_qos_get_pthread_priority(MAX(cur_qos, new_qos)) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2933 }
2934 workqueue_unlock(wq);
2935 break;
2936 }
2937 case WQOPS_THREAD_KEVENT_RETURN:
2938 case WQOPS_THREAD_WORKLOOP_RETURN:
2939 wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2940 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_END, wq, options, 0, 0, 0);
2941 if (item != 0 && arg2 != 0) {
2942 int32_t kevent_retval;
2943 int ret;
2944 if (options == WQOPS_THREAD_KEVENT_RETURN) {
2945 ret = kevent_qos_internal(p, -1, item, arg2, item, arg2, NULL, NULL,
2946 KEVENT_FLAG_WORKQ | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS,
2947 &kevent_retval);
2948 } else /* options == WQOPS_THREAD_WORKLOOP_RETURN */ {
2949 kqueue_id_t kevent_id = -1;
2950 ret = kevent_id_internal(p, &kevent_id, item, arg2, item, arg2,
2951 NULL, NULL,
2952 KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS,
2953 &kevent_retval);
2954 }
2955 /*
2956 * We shouldn't be getting more errors out than events we put in, so
2957 * reusing the input buffer should always provide enough space. But,
2958 * the assert is commented out since we get errors in edge cases in the
2959 * process lifecycle.
2960 */
2961 //assert(ret == KERN_SUCCESS && kevent_retval >= 0);
2962 if (ret != KERN_SUCCESS){
2963 error = ret;
2964 break;
2965 } else if (kevent_retval > 0){
2966 assert(kevent_retval <= arg2);
2967 *retval = kevent_retval;
2968 error = 0;
2969 break;
2970 }
2971 }
2972 goto thread_return;
2973
2974 case WQOPS_THREAD_RETURN:
2975 wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2976 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_END, wq, options, 0, 0, 0);
2977 thread_return:
2978 error = wqops_thread_return(p, wq);
2979 // NOT REACHED except in case of error
2980 assert(error);
2981 break;
2982
2983 case WQOPS_SHOULD_NARROW: {
2984 /*
2985 * arg2 = priority to test
2986 * arg3 = unused
2987 */
2988 pthread_priority_t priority = arg2;
2989 thread_t th = current_thread();
2990 struct threadlist *tl = util_get_thread_threadlist_entry(th);
2991
2992 if (tl == NULL || (tl->th_flags & TH_LIST_CONSTRAINED) == 0) {
2993 error = EINVAL;
2994 break;
2995 }
2996
2997 int class = pthread_priority_get_class_index(priority);
2998 wq = tl->th_workq;
2999 workqueue_lock_spin(wq);
3000 bool should_narrow = !may_start_constrained_thread(wq, class, tl, false);
3001 workqueue_unlock(wq);
3002
3003 *retval = should_narrow;
3004 break;
3005 }
3006 default:
3007 error = EINVAL;
3008 break;
3009 }
3010
3011 switch (options) {
3012 case WQOPS_THREAD_KEVENT_RETURN:
3013 case WQOPS_THREAD_WORKLOOP_RETURN:
3014 case WQOPS_THREAD_RETURN:
3015 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START, wq, options, 0, 0, 0);
3016 break;
3017 }
3018 return (error);
3019 }
3020
3021 /*
3022 * We have no work to do, park ourselves on the idle list.
3023 *
3024 * Consumes the workqueue lock and does not return.
3025 */
3026 static void __dead2
3027 parkit(struct workqueue *wq, struct threadlist *tl, thread_t thread)
3028 {
3029 assert(thread == tl->th_thread);
3030 assert(thread == current_thread());
3031
3032 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_START, wq, 0, 0, 0, 0);
3033
3034 uint32_t us_to_wait = 0;
3035
3036 TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
3037
3038 tl->th_flags &= ~TH_LIST_RUNNING;
3039 tl->th_flags &= ~TH_LIST_KEVENT;
3040 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
3041
3042 if (tl->th_flags & TH_LIST_CONSTRAINED) {
3043 wq->wq_constrained_threads_scheduled--;
3044 tl->th_flags &= ~TH_LIST_CONSTRAINED;
3045 }
3046
3047 _wq_thactive_dec(wq, tl->th_priority);
3048 wq->wq_thscheduled_count[tl->th_priority]--;
3049 wq->wq_threads_scheduled--;
3050 uint32_t thidlecount = ++wq->wq_thidlecount;
3051
3052 pthread_kern->thread_sched_call(thread, NULL);
3053
3054 /*
3055 * We'd like to always have one manager thread parked so that we can have
3056 * low latency when we need to bring a manager thread up. If that idle
3057 * thread list is empty, make this thread a manager thread.
3058 *
3059 * XXX: This doesn't check that there's not a manager thread outstanding,
3060 * so it's based on the assumption that most manager callouts will change
3061 * their QoS before parking. If that stops being true, this may end up
3062 * costing us more than we gain.
3063 */
3064 if (TAILQ_EMPTY(&wq->wq_thidlemgrlist) &&
3065 tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET){
3066 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority | DBG_FUNC_NONE,
3067 wq, thread_tid(thread),
3068 (tl->th_priority << 16) | WORKQUEUE_EVENT_MANAGER_BUCKET, 2, 0);
3069 reset_priority(tl, pthread_priority_from_wq_class_index(wq, WORKQUEUE_EVENT_MANAGER_BUCKET));
3070 tl->th_priority = WORKQUEUE_EVENT_MANAGER_BUCKET;
3071 }
3072
3073 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET){
3074 TAILQ_INSERT_HEAD(&wq->wq_thidlemgrlist, tl, th_entry);
3075 } else {
3076 TAILQ_INSERT_HEAD(&wq->wq_thidlelist, tl, th_entry);
3077 }
3078
3079 /*
3080 * When we remove the voucher from the thread, we may lose our importance
3081 * causing us to get preempted, so we do this after putting the thread on
3082 * the idle list. That way, when we get our importance back we'll be able
3083 * to use this thread from e.g. the kevent call out to deliver a boosting
3084 * message.
3085 */
3086 tl->th_flags |= TH_LIST_REMOVING_VOUCHER;
3087 workqueue_unlock(wq);
3088 if (pthread_kern->thread_will_park_or_terminate) {
3089 pthread_kern->thread_will_park_or_terminate(tl->th_thread);
3090 }
3091 __assert_only kern_return_t kr;
3092 kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
3093 assert(kr == KERN_SUCCESS);
3094 workqueue_lock_spin(wq);
3095 tl->th_flags &= ~(TH_LIST_REMOVING_VOUCHER);
3096
3097 if ((tl->th_flags & TH_LIST_RUNNING) == 0) {
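/*
 * Scale the idle timeout with the size of the idle pool: the first
 * parked threads wait roughly the full reduce-pool window, each
 * additional idle thread trims about 1% off, and once more than 100
 * threads are idle we wait only 1% of the window.
 */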
3098 if (thidlecount < 101) {
3099 us_to_wait = wq_reduce_pool_window_usecs - ((thidlecount-2) * (wq_reduce_pool_window_usecs / 100));
3100 } else {
3101 us_to_wait = wq_reduce_pool_window_usecs / 100;
3102 }
3103
3104 thread_set_pending_block_hint(thread, kThreadWaitParkedWorkQueue);
3105 assert_wait_timeout_with_leeway((caddr_t)tl, (THREAD_INTERRUPTIBLE),
3106 TIMEOUT_URGENCY_SYS_BACKGROUND|TIMEOUT_URGENCY_LEEWAY, us_to_wait,
3107 wq_reduce_pool_window_usecs/10, NSEC_PER_USEC);
3108
3109 workqueue_unlock(wq);
3110
3111 thread_block(wq_unpark_continue);
3112 panic("thread_block(wq_unpark_continue) returned!");
3113 } else {
3114 workqueue_unlock(wq);
3115
3116 /*
3117 * While we had the lock dropped to unset our voucher, someone came
3118 * around and made us runnable. But because we weren't waiting on the
3119 * event, their wakeup() was ineffectual. To correct for that, we just
3120 * run the continuation ourselves.
3121 */
3122 wq_unpark_continue(NULL, THREAD_AWAKENED);
3123 }
3124 }
3125
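/*
 * Admission check for constrained (non-overcommit) requests: a new thread
 * may start only if the process-wide constrained thread cap has not been
 * reached and the active + recently-blocked ("busy") thread count at this
 * QoS and above is still below the configured max concurrency.
 */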
3126 static bool
3127 may_start_constrained_thread(struct workqueue *wq, uint32_t at_priclass,
3128 struct threadlist *tl, bool may_start_timer)
3129 {
3130 uint32_t req_qos = _wq_thactive_best_constrained_req_qos(wq);
3131 wq_thactive_t thactive;
3132
3133 if (may_start_timer && at_priclass < req_qos) {
3134 /*
3135 * When called from workqueue_run_threadreq_and_unlock(), pre-post the
3136 * new, higher priority into the thactive state so that
3137 * workqueue_callback() makes the right decision.
3138 *
3139 * If the admission check passes, workqueue_run_threadreq_and_unlock
3140 * will reset this value before running the request.
3141 */
3142 thactive = _wq_thactive_set_best_constrained_req_qos(wq, req_qos,
3143 at_priclass);
3144 #ifdef __LP64__
3145 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update, 1, (uint64_t)thactive,
3146 (uint64_t)(thactive >> 64), 0, 0);
3147 #endif
3148 } else {
3149 thactive = _wq_thactive(wq);
3150 }
3151
3152 uint32_t constrained_threads = wq->wq_constrained_threads_scheduled;
3153 if (tl && (tl->th_flags & TH_LIST_CONSTRAINED)) {
3154 /*
3155 * don't count the current thread as scheduled
3156 */
3157 constrained_threads--;
3158 }
3159 if (constrained_threads >= wq_max_constrained_threads) {
3160 PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 1,
3161 wq->wq_constrained_threads_scheduled,
3162 wq_max_constrained_threads, 0);
3163 /*
3164 * we need 1 or more constrained threads to return to the kernel before
3165 * we can dispatch additional work
3166 */
3167 return false;
3168 }
3169
3170 /*
3171 * Compute a metric for how many threads are active. We find the
3172 * highest priority request outstanding and then add up the number of
3173 * active threads in that and all higher-priority buckets. We'll also add
3174 * any "busy" threads which are not active but blocked recently enough that
3175 * we can't be sure they've gone idle yet. We'll then compare this metric
3176 * to our max concurrency to decide whether to add a new thread.
3177 */
3178
3179 uint32_t busycount, thactive_count;
3180
3181 thactive_count = _wq_thactive_aggregate_downto_qos(wq, thactive,
3182 at_priclass, &busycount, NULL);
3183
3184 if (tl && tl->th_priority <= at_priclass) {
3185 /*
3186 * don't count this thread as currently active
3187 */
3188 assert(thactive_count > 0);
3189 thactive_count--;
3190 }
3191
3192 if (thactive_count + busycount < wq_max_concurrency[at_priclass]) {
3193 PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 2,
3194 thactive_count, busycount, 0);
3195 return true;
3196 } else {
3197 PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 3,
3198 thactive_count, busycount, 0);
3199 }
3200
3201 if (busycount && may_start_timer) {
3202 /*
3203 * If this is called from the add timer, we won't have another timer
3204 * fire when the thread exits the "busy" state, so rearm the timer.
3205 */
3206 if (WQ_TIMER_DELAYED_NEEDED(wq)) {
3207 workqueue_interval_timer_start(wq);
3208 }
3209 }
3210
3211 return false;
3212 }
3213
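/*
 * Take a thread off the appropriate idle list (the manager idle list is
 * preferred for event manager requests), mark it running and busy, move
 * it to the run list, and update the scheduled/active counts.
 *
 * Called with the workqueue lock held; panics if no idle thread exists.
 */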
3214 static struct threadlist *
3215 pop_from_thidlelist(struct workqueue *wq, uint32_t priclass)
3216 {
3217 assert(wq->wq_thidlecount);
3218
3219 struct threadlist *tl = NULL;
3220
3221 if (!TAILQ_EMPTY(&wq->wq_thidlemgrlist) &&
3222 (priclass == WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlelist))){
3223 tl = TAILQ_FIRST(&wq->wq_thidlemgrlist);
3224 TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
3225 assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
3226 } else if (!TAILQ_EMPTY(&wq->wq_thidlelist) &&
3227 (priclass != WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlemgrlist))){
3228 tl = TAILQ_FIRST(&wq->wq_thidlelist);
3229 TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
3230 assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
3231 } else {
3232 panic("pop_from_thidlelist called with no threads available");
3233 }
3234 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
3235
3236 assert(wq->wq_thidlecount);
3237 wq->wq_thidlecount--;
3238
3239 TAILQ_INSERT_TAIL(&wq->wq_thrunlist, tl, th_entry);
3240
3241 tl->th_flags |= TH_LIST_RUNNING | TH_LIST_BUSY;
3242
3243 wq->wq_threads_scheduled++;
3244 wq->wq_thscheduled_count[priclass]++;
3245 _wq_thactive_inc(wq, priclass);
3246 return tl;
3247 }
3248
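/*
 * Map a bucket index back to a pthread priority; the event manager bucket
 * uses the priority configured on the workqueue.
 */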
3249 static pthread_priority_t
3250 pthread_priority_from_wq_class_index(struct workqueue *wq, int index)
3251 {
3252 if (index == WORKQUEUE_EVENT_MANAGER_BUCKET){
3253 return wq->wq_event_manager_priority;
3254 } else {
3255 return class_index_get_pthread_priority(index);
3256 }
3257 }
3258
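/*
 * Apply a pthread priority to a workqueue thread: either a QoS value
 * (undoing any raw scheduling priority previously set for the event
 * manager), or, when _PTHREAD_PRIORITY_SCHED_PRI_FLAG is set, a raw
 * timeshare scheduling priority with the QoS cleared.
 */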
3259 static void
3260 reset_priority(struct threadlist *tl, pthread_priority_t pri)
3261 {
3262 kern_return_t ret;
3263 thread_t th = tl->th_thread;
3264
3265 if ((pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){
3266 ret = pthread_kern->thread_set_workq_qos(th, pthread_priority_get_thread_qos(pri), 0);
3267 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3268
3269 if (tl->th_flags & TH_LIST_EVENT_MGR_SCHED_PRI) {
3270
3271 /* Reset priority to default (masked by QoS) */
3272
3273 ret = pthread_kern->thread_set_workq_pri(th, 31, POLICY_TIMESHARE);
3274 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3275
3276 tl->th_flags &= ~TH_LIST_EVENT_MGR_SCHED_PRI;
3277 }
3278 } else {
3279 ret = pthread_kern->thread_set_workq_qos(th, THREAD_QOS_UNSPECIFIED, 0);
3280 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3281 ret = pthread_kern->thread_set_workq_pri(th, (pri & (~_PTHREAD_PRIORITY_FLAGS_MASK)), POLICY_TIMESHARE);
3282 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3283
3284 tl->th_flags |= TH_LIST_EVENT_MGR_SCHED_PRI;
3285 }
3286 }
3287
3288 /*
3289 * Picks the best request to run, and returns the best overcommit fallback
3290 * if the best pick is non overcommit and risks failing its admission check.
3291 */
3292 static struct threadreq *
3293 workqueue_best_threadreqs(struct workqueue *wq, struct threadlist *tl,
3294 struct threadreq **fallback)
3295 {
3296 struct threadreq *req, *best_req = NULL;
3297 int priclass, prilimit;
3298
3299 if ((wq->wq_event_manager_threadreq.tr_state == TR_STATE_WAITING) &&
3300 ((wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0) ||
3301 (tl && tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET))) {
3302 /*
3303 * There's an event manager request and either:
3304 * - no event manager currently running
3305 * - we are re-using the event manager
3306 */
3307 req = &wq->wq_event_manager_threadreq;
3308 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select | DBG_FUNC_NONE, wq, req, 1, 0, 0);
3309 return req;
3310 }
3311
3312 if (tl) {
3313 prilimit = WORKQUEUE_EVENT_MANAGER_BUCKET;
3314 } else {
3315 prilimit = _wq_highest_paced_priority(wq);
3316 }
3317 for (priclass = 0; priclass < prilimit; priclass++) {
3318 req = TAILQ_FIRST(&wq->wq_overcommit_reqlist[priclass]);
3319 if (req) {
3320 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select | DBG_FUNC_NONE, wq, req, 2, 0, 0);
3321 if (best_req) {
3322 *fallback = req;
3323 } else {
3324 best_req = req;
3325 }
3326 break;
3327 }
3328 if (!best_req) {
3329 best_req = TAILQ_FIRST(&wq->wq_reqlist[priclass]);
3330 if (best_req) {
3331 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select | DBG_FUNC_NONE, wq, best_req, 3, 0, 0);
3332 }
3333 }
3334 }
3335 return best_req;
3336 }
3337
3338 /**
3339 * Runs a thread request on a thread
3340 *
3341 * - if thread is THREAD_NULL, will find a thread and run the request there.
3342 * Otherwise, the thread must be the current thread.
3343 *
3344 * - if req is NULL, will find the highest priority request and run that. If
3345 * it is not NULL, it must be a threadreq object in state NEW. If it can not
3346 * be run immediately, it will be enqueued and moved to state WAITING.
3347 *
3348 * Either way, the thread request object serviced will be moved to state
3349 * PENDING and attached to the threadlist.
3350 *
3351 * Should be called with the workqueue lock held. Will drop it.
3352 *
3353 * WARNING: _workq_kevent_reqthreads needs to be able to preflight any
3354 * admission checks in this function. If you are changing this function,
3355 * keep that one up-to-date.
3356 *
3357 * - if parking_tl is non NULL, then the current thread is parking. This will
3358 * try to reuse this thread for a request. If no match is found, it will be
3359 * parked.
3360 */
3361 static int
3362 workqueue_run_threadreq_and_unlock(proc_t p, struct workqueue *wq,
3363 struct threadlist *parking_tl, struct threadreq *req,
3364 bool may_add_new_thread)
3365 {
3366 struct threadreq *incoming_req = req;
3367
3368 struct threadlist *tl = parking_tl;
3369 int rc = WQ_RUN_TR_THROTTLED;
3370
3371 assert(tl == NULL || tl->th_thread == current_thread());
3372 assert(req == NULL || req->tr_state == TR_STATE_NEW);
3373 assert(!may_add_new_thread || !tl);
3374
3375 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq | DBG_FUNC_START, wq, req,
3376 tl ? thread_tid(tl->th_thread) : 0,
3377 req ? (req->tr_priority << 16 | req->tr_flags) : 0, 0);
3378
3379 /*
3380 * Special cases when provided an event manager request
3381 */
3382 if (req && req->tr_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
3383 // Clients must not rely on identity of event manager requests
3384 assert(req->tr_flags & TR_FLAG_ONSTACK);
3385 // You can't be both overcommit and event manager
3386 assert((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0);
3387
3388 /*
3389 * We can only ever have one event manager request, so coalesce them if
3390 * there's already one outstanding.
3391 */
3392 if (wq->wq_event_manager_threadreq.tr_state == TR_STATE_WAITING) {
3393 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_mgr_merge | DBG_FUNC_NONE, wq, req, 0, 0, 0);
3394
3395 struct threadreq *existing_req = &wq->wq_event_manager_threadreq;
3396 if (req->tr_flags & TR_FLAG_KEVENT) {
3397 existing_req->tr_flags |= TR_FLAG_KEVENT;
3398 }
3399
3400 req = existing_req;
3401 incoming_req = NULL;
3402 }
3403
3404 if (wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] &&
3405 (!tl || tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET)){
3406 /*
3407 * There can only be one event manager running at a time.
3408 */
3409 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 1, 0, 0, 0);
3410 goto done;
3411 }
3412 }
3413
3414 again: // Start again after creating a thread
3415
3416 if (_wq_exiting(wq)) {
3417 rc = WQ_RUN_TR_EXITING;
3418 goto exiting;
3419 }
3420
3421 /*
3422 * Thread request selection and admission control
3423 */
3424 struct threadreq *fallback = NULL;
3425 if (req) {
3426 if ((req->tr_flags & TR_FLAG_NO_PACING) == 0 &&
3427 _wq_should_pace_priority(wq, req->tr_priority)) {
3428 /*
3429 * If a request fails the pacing admission check, then thread
3430 * requests are redriven when the pacing thread is finally scheduled
3431 * when it calls _wq_pacing_end() in wq_unpark_continue().
3432 */
3433 goto done;
3434 }
3435 } else if (wq->wq_reqcount == 0) {
3436 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 2, 0, 0, 0);
3437 goto done;
3438 } else if ((req = workqueue_best_threadreqs(wq, tl, &fallback)) == NULL) {
3439 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 3, 0, 0, 0);
3440 goto done;
3441 }
3442
3443 if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0 &&
3444 (req->tr_priority < WORKQUEUE_EVENT_MANAGER_BUCKET)) {
3445 if (!may_start_constrained_thread(wq, req->tr_priority, parking_tl, true)) {
3446 if (!fallback) {
3447 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 4, 0, 0, 0);
3448 goto done;
3449 }
3450 assert(req->tr_state == TR_STATE_WAITING);
3451 req = fallback;
3452 }
3453 }
3454
3455 /*
3456 * Thread selection.
3457 */
3458 if (parking_tl) {
3459 if (tl->th_priority != req->tr_priority) {
3460 _wq_thactive_move(wq, tl->th_priority, req->tr_priority);
3461 wq->wq_thscheduled_count[tl->th_priority]--;
3462 wq->wq_thscheduled_count[req->tr_priority]++;
3463 }
3464 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select | DBG_FUNC_NONE,
3465 wq, 1, thread_tid(tl->th_thread), 0, 0);
3466 } else if (wq->wq_thidlecount) {
3467 tl = pop_from_thidlelist(wq, req->tr_priority);
3468 /*
3469 * This call will update wq_thscheduled_count and wq_thactive_count for
3470 * the provided priority. It will not set the returned thread to that
3471 * priority. This matches the behavior of the parking_tl clause above.
3472 */
3473 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select | DBG_FUNC_NONE,
3474 wq, 2, thread_tid(tl->th_thread), 0, 0);
3475 } else /* no idle threads */ {
3476 if (!may_add_new_thread || wq->wq_nthreads >= wq_max_threads) {
3477 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 5,
3478 may_add_new_thread, wq->wq_nthreads, 0);
3479 if (wq->wq_nthreads < wq_max_threads) {
3480 rc = WQ_RUN_TR_THREAD_NEEDED;
3481 }
3482 goto done;
3483 }
3484
3485 bool added_thread = workqueue_addnewthread(p, wq);
3486 /*
3487 * workqueue_addnewthread will drop and re-take the lock, so we
3488 * need to ensure we still have a cached request.
3489 *
3490 * It also means we have to pick a new request, since our old pick may
3491 * not be valid anymore.
3492 */
3493 req = incoming_req;
3494 if (req && (req->tr_flags & TR_FLAG_ONSTACK)) {
3495 _threadreq_copy_prepare(wq);
3496 }
3497
3498 if (added_thread) {
3499 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select | DBG_FUNC_NONE,
3500 wq, 3, 0, 0, 0);
3501 goto again;
3502 } else if (_wq_exiting(wq)) {
3503 rc = WQ_RUN_TR_EXITING;
3504 goto exiting;
3505 } else {
3506 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 6, 0, 0, 0);
3507 /*
3508 * Something caused thread creation to fail. Kick off the timer in
3509 * the hope that it'll succeed next time.
3510 */
3511 if (WQ_TIMER_DELAYED_NEEDED(wq)) {
3512 workqueue_interval_timer_start(wq);
3513 }
3514 goto done;
3515 }
3516 }
3517
3518 /*
3519 * Setup thread, mark request as complete and run with it.
3520 */
3521 if (req->tr_state == TR_STATE_WAITING) {
3522 _threadreq_dequeue(wq, req);
3523 }
3524 if (tl->th_priority != req->tr_priority) {
3525 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority | DBG_FUNC_NONE,
3526 wq, thread_tid(tl->th_thread),
3527 (tl->th_priority << 16) | req->tr_priority, 1, 0);
3528 reset_priority(tl, pthread_priority_from_wq_class_index(wq, req->tr_priority));
3529 tl->th_priority = (uint8_t)req->tr_priority;
3530 }
3531 if (req->tr_flags & TR_FLAG_OVERCOMMIT) {
3532 if ((tl->th_flags & TH_LIST_CONSTRAINED) != 0) {
3533 tl->th_flags &= ~TH_LIST_CONSTRAINED;
3534 wq->wq_constrained_threads_scheduled--;
3535 }
3536 } else {
3537 if ((tl->th_flags & TH_LIST_CONSTRAINED) == 0) {
3538 tl->th_flags |= TH_LIST_CONSTRAINED;
3539 wq->wq_constrained_threads_scheduled++;
3540 }
3541 }
3542
3543 if (!parking_tl && !(req->tr_flags & TR_FLAG_NO_PACING)) {
3544 _wq_pacing_start(wq, tl);
3545 }
3546 if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) {
3547 uint32_t old_qos, new_qos;
3548
3549 /*
3550 * If we are scheduling a constrained thread request, we may need to
3551 * update the best constrained qos in the thactive atomic state.
3552 */
3553 for (new_qos = 0; new_qos < WQ_THACTIVE_NO_PENDING_REQUEST; new_qos++) {
3554 if (TAILQ_FIRST(&wq->wq_reqlist[new_qos]))
3555 break;
3556 }
3557 old_qos = _wq_thactive_best_constrained_req_qos(wq);
3558 if (old_qos != new_qos) {
3559 wq_thactive_t v = _wq_thactive_set_best_constrained_req_qos(wq,
3560 old_qos, new_qos);
3561 #ifdef __LP64__
3562 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update, 2, (uint64_t)v,
3563 (uint64_t)(v >> 64), 0, 0);
3564 #else
3565 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update, 2, v, 0, 0, 0);
3566 #endif
3567 }
3568 }
3569 {
3570 uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI;
3571 if (req->tr_flags & TR_FLAG_OVERCOMMIT)
3572 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
3573 if (req->tr_flags & TR_FLAG_KEVENT)
3574 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
3575 if (req->tr_flags & TR_FLAG_WORKLOOP)
3576 upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
3577 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET)
3578 upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
3579 tl->th_upcall_flags = upcall_flags >> WQ_FLAG_THREAD_PRIOSHIFT;
3580 }
3581 if (req->tr_flags & TR_FLAG_KEVENT) {
3582 tl->th_flags |= TH_LIST_KEVENT;
3583 } else {
3584 tl->th_flags &= ~TH_LIST_KEVENT;
3585 }
3586 return _threadreq_complete_and_unlock(p, wq, req, tl);
3587
3588 done:
3589 if (incoming_req) {
3590 _threadreq_enqueue(wq, incoming_req);
3591 }
3592
3593 exiting:
3594
3595 if (parking_tl && !(parking_tl->th_flags & TH_LIST_UNBINDING)) {
3596 parkit(wq, parking_tl, parking_tl->th_thread);
3597 __builtin_unreachable();
3598 }
3599
3600 workqueue_unlock(wq);
3601
3602 return rc;
3603 }
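
/*
 * Minimal caller sketch (never compiled): per the contract documented above,
 * the workqueue lock is taken by the caller and always consumed, and the
 * WQ_RUN_TR_* return code describes the outcome. The function name and the
 * handling shown for each code are illustrative assumptions, not code from
 * this file.
 */
#if 0
static void
run_threadreq_example(proc_t p, struct workqueue *wq, struct threadreq *req)
{
	workqueue_lock_spin(wq);
	/* req must be in state NEW; the lock is dropped before returning */
	int rc = workqueue_run_threadreq_and_unlock(p, wq, NULL, req, false);
	switch (rc) {
	case WQ_RUN_TR_EXITING:
		/* the workqueue is being torn down */
		break;
	case WQ_RUN_TR_THREAD_NEEDED:
		/* no idle thread and creating one was not allowed here; a caller
		 * would redrive from a context where may_add_new_thread is true */
		break;
	default:
		/* the request was either run on a thread or enqueued */
		break;
	}
}
#endif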
3604
3605 /**
3606 * parked thread wakes up
3607 */
3608 static void __dead2
3609 wq_unpark_continue(void* __unused ptr, wait_result_t wait_result)
3610 {
3611 boolean_t first_use = false;
3612 thread_t th = current_thread();
3613 proc_t p = current_proc();
3614
3615 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
3616 if (uth == NULL) goto done;
3617
3618 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
3619 if (wq == NULL) goto done;
3620
3621 workqueue_lock_spin(wq);
3622
3623 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
3624 assert(tl != WQ_THREADLIST_EXITING_POISON);
3625 if (tl == NULL) {
3626 /*
3627 * We woke up before addnewthread() was finished setting us up. Go
3628 * ahead and exit, but before we do, poison the threadlist variable so
3629 * that addnewthread() doesn't still think we are valid.
3630 */
3631 pthread_kern->uthread_set_threadlist(uth, WQ_THREADLIST_EXITING_POISON);
3632 workqueue_unlock(wq);
3633 goto done;
3634 }
3635
3636 assert(tl->th_flags & TH_LIST_INITED);
3637
3638 if ((tl->th_flags & TH_LIST_NEW)){
3639 tl->th_flags &= ~(TH_LIST_NEW);
3640 first_use = true;
3641 }
3642
3643 if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) {
3644 /*
3645 * The normal wakeup path.
3646 */
3647 goto return_to_user;
3648 }
3649
3650 if ((tl->th_flags & TH_LIST_RUNNING) == 0 &&
3651 wait_result == THREAD_TIMED_OUT &&
3652 tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET &&
3653 TAILQ_FIRST(&wq->wq_thidlemgrlist) == tl &&
3654 TAILQ_NEXT(tl, th_entry) == NULL){
3655 /*
3656 * If we are the only idle manager and we popped for self-destruction,
3657 * then don't actually exit. Instead, free our stack to save some
3658 * memory and re-park.
3659 */
3660
3661 workqueue_unlock(wq);
3662
3663 vm_map_t vmap = wq->wq_map;
3664
3665 // Keep this in sync with _setup_wqthread()
3666 const vm_size_t guardsize = vm_map_page_size(vmap);
3667 const user_addr_t freeaddr = (user_addr_t)tl->th_stackaddr + guardsize;
3668 const vm_map_offset_t freesize = vm_map_trunc_page_mask((PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET) - 1, vm_map_page_mask(vmap)) - guardsize;
3669
3670 __assert_only int kr = mach_vm_behavior_set(vmap, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
3671 #if MACH_ASSERT
3672 if (kr != KERN_SUCCESS && kr != KERN_INVALID_ADDRESS) {
3673 os_log_error(OS_LOG_DEFAULT, "unable to make thread stack reusable (kr: %d)", kr);
3674 }
3675 #endif
3676
3677 workqueue_lock_spin(wq);
3678
3679 if ( !(tl->th_flags & TH_LIST_RUNNING)) {
3680 thread_set_pending_block_hint(th, kThreadWaitParkedWorkQueue);
3681 assert_wait((caddr_t)tl, (THREAD_INTERRUPTIBLE));
3682
3683 workqueue_unlock(wq);
3684
3685 thread_block(wq_unpark_continue);
3686 __builtin_unreachable();
3687 }
3688 }
3689
3690 if ((tl->th_flags & TH_LIST_RUNNING) == 0) {
3691 assert((tl->th_flags & TH_LIST_BUSY) == 0);
3692 if (!first_use) {
3693 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_END, wq, 0, 0, 0, 0);
3694 }
3695 /*
3696 * We were set running, but not for the purpose of actually running a
3697 * work item. This could be because the timer elapsed or because the
3698 * thread was aborted. Either way, we need to return to userspace to exit.
3699 *
3700 * The call to workqueue_removethread will consume the lock.
3701 */
3702
3703 if (!first_use &&
3704 (tl->th_priority < qos_class_get_class_index(WQ_THREAD_CLEANUP_QOS) ||
3705 (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET))) {
3706 // Reset the QoS to something low for the pthread cleanup
3707 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority | DBG_FUNC_NONE,
3708 wq, thread_tid(th),
3709 (tl->th_priority << 16) | qos_class_get_class_index(WQ_THREAD_CLEANUP_QOS), 3, 0);
3710 pthread_priority_t cleanup_pri = _pthread_priority_make_newest(WQ_THREAD_CLEANUP_QOS, 0, 0);
3711 reset_priority(tl, cleanup_pri);
3712 }
3713
3714 workqueue_removethread(tl, 0, first_use);
3715
3716 if (first_use){
3717 pthread_kern->thread_bootstrap_return();
3718 } else {
3719 pthread_kern->unix_syscall_return(0);
3720 }
3721 __builtin_unreachable();
3722 }
3723
3724 /*
3725 * The timer woke us up or the thread was aborted. However, we have
3726 * already started to make this a runnable thread. Wait for that to
3727 * finish, then continue to userspace.
3728 */
3729 while ((tl->th_flags & TH_LIST_BUSY)) {
3730 assert_wait((caddr_t)tl, (THREAD_UNINT));
3731
3732 workqueue_unlock(wq);
3733
3734 thread_block(THREAD_CONTINUE_NULL);
3735
3736 workqueue_lock_spin(wq);
3737 }
3738
3739 return_to_user:
3740 if (!first_use) {
3741 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_END, wq, 0, 0, 0, 0);
3742 }
3743 if (_wq_pacing_end(wq, tl) && wq->wq_reqcount) {
3744 workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
3745 } else {
3746 workqueue_unlock(wq);
3747 }
3748 _setup_wqthread(p, th, wq, tl, first_use ? WQ_SETUP_FIRST_USE : 0);
3749 pthread_kern->thread_sched_call(th, workqueue_callback);
3750 done:
3751 if (first_use){
3752 pthread_kern->thread_bootstrap_return();
3753 } else {
3754 pthread_kern->unix_syscall_return(EJUSTRETURN);
3755 }
3756 panic("Our attempt to return to userspace failed...");
3757 }
3758
3759 /**
3760 * configures initial thread stack/registers to jump into:
3761 * _pthread_wqthread(pthread_t self, mach_port_t kport, void *stackaddr, void *keventlist, int upcall_flags, int nkevents);
3762 * to get there we jump through assembly stubs in pthread_asm.s. Those
3763 * routines set up a stack frame, using the current stack pointer, and marshal
3764 * arguments from registers to the stack as required by the ABI.
3765 *
3766 * One odd thing we do here is to start the pthread_t 4k below what would be the
3767 * top of the stack otherwise. This is because usually only the first 4k of the
3768 * pthread_t will be used and so we want to put it on the same 16k page as the
3769 * top of the stack to save memory.
3770 *
3771 * When we are done the stack will look like:
3772 * |-----------| th_stackaddr + th_allocsize
3773 * |pthread_t | th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET
3774 * |kevent list| optionally - at most WQ_KEVENT_LIST_LEN events
3775 * |kevent data| optionally - at most WQ_KEVENT_DATA_SIZE bytes
3776 * |stack gap | bottom aligned to 16 bytes, and at least as big as stack_gap_min
3777 * | STACK |
3778 * | ⇓ |
3779 * | |
3780 * |guard page | guardsize
3781 * |-----------| th_stackaddr
3782 */
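
/*
 * Worked example of the layout above, assuming (for illustration only) a
 * 4 KB guard page, PTH_DEFAULT_STACKSIZE of 512 KB, PTHREAD_T_OFFSET of 0 and
 * C_64_REDZONE_LEN of 128, for a 64-bit process whose th_stackaddr is
 * 0x100000000:
 *
 *   guardsize         = 0x1000
 *   pthread_self_addr = 0x100000000 + 0x80000 + 0x1000 = 0x100081000
 *   stack_top_addr    = (0x100081000 - 0x80) & -16     = 0x100080f80
 *   stack_bottom_addr = 0x100000000 + 0x1000           = 0x100001000
 */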
3783 void
3784 _setup_wqthread(proc_t p, thread_t th, struct workqueue *wq,
3785 struct threadlist *tl, int setup_flags)
3786 {
3787 int error;
3788 if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) {
3789 /*
3790 * For preemption reasons, we want to reset the voucher as late as
3791 * possible, so we do it in two places:
3792 * - Just before parking (i.e. in parkit())
3793 * - Prior to doing the setup for the next workitem (i.e. here)
3794 *
3795 * Those two places are sufficient to ensure we always reset it before
3796 * it goes back out to user space, but be careful to not break that
3797 * guarantee.
3798 */
3799 __assert_only kern_return_t kr;
3800 kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
3801 assert(kr == KERN_SUCCESS);
3802 }
3803
3804 uint32_t upcall_flags = tl->th_upcall_flags << WQ_FLAG_THREAD_PRIOSHIFT;
3805 if (!(setup_flags & WQ_SETUP_FIRST_USE)) {
3806 upcall_flags |= WQ_FLAG_THREAD_REUSE;
3807 }
3808
3809 /*
3810 * Put the QoS class value into the lower bits of the reuse_thread register;
3811 * this is where the thread priority used to be stored anyway.
3812 */
3813 pthread_priority_t priority = pthread_priority_from_wq_class_index(wq, tl->th_priority);
3814 upcall_flags |= (_pthread_priority_get_qos_newest(priority) & WQ_FLAG_THREAD_PRIOMASK);
3815
3816 const vm_size_t guardsize = vm_map_page_size(tl->th_workq->wq_map);
3817 const vm_size_t stack_gap_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_REDZONE_LEN;
3818 const vm_size_t stack_align_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_STK_ALIGN;
3819
3820 user_addr_t pthread_self_addr = (user_addr_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET);
3821 user_addr_t stack_top_addr = (user_addr_t)((pthread_self_addr - stack_gap_min) & -stack_align_min);
3822 user_addr_t stack_bottom_addr = (user_addr_t)(tl->th_stackaddr + guardsize);
3823
3824 user_addr_t wqstart_fnptr = pthread_kern->proc_get_wqthread(p);
3825 if (!wqstart_fnptr) {
3826 panic("workqueue thread start function pointer is NULL");
3827 }
3828
3829 if (setup_flags & WQ_SETUP_FIRST_USE) {
3830 uint32_t tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
3831 if (tsd_offset) {
3832 mach_vm_offset_t th_tsd_base = (mach_vm_offset_t)pthread_self_addr + tsd_offset;
3833 kern_return_t kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
3834 if (kret == KERN_SUCCESS) {
3835 upcall_flags |= WQ_FLAG_THREAD_TSD_BASE_SET;
3836 }
3837 }
3838
3839 /*
3840 * Pre-fault the first page of the new thread's stack and the page that will
3841 * contain the pthread_t structure.
3842 */
3843 vm_map_t vmap = pthread_kern->current_map();
3844 if (vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)) !=
3845 vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap))){
3846 vm_fault( vmap,
3847 vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)),
3848 VM_PROT_READ | VM_PROT_WRITE,
3849 FALSE,
3850 THREAD_UNINT, NULL, 0);
3851 }
3852 vm_fault( vmap,
3853 vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap)),
3854 VM_PROT_READ | VM_PROT_WRITE,
3855 FALSE,
3856 THREAD_UNINT, NULL, 0);
3857 }
3858
3859 user_addr_t kevent_list = NULL;
3860 int kevent_count = 0;
3861 if (upcall_flags & WQ_FLAG_THREAD_KEVENT){
3862 bool workloop = upcall_flags & WQ_FLAG_THREAD_WORKLOOP;
3863
3864 kevent_list = pthread_self_addr - WQ_KEVENT_LIST_LEN * sizeof(struct kevent_qos_s);
3865 kevent_count = WQ_KEVENT_LIST_LEN;
3866
3867 user_addr_t kevent_id_addr = kevent_list;
3868 if (workloop) {
3869 /*
3870 * The kevent ID goes just below the kevent list. Sufficiently new
3871 * userspace will know to look there. Old userspace will just
3872 * ignore it.
3873 */
3874 kevent_id_addr -= sizeof(kqueue_id_t);
3875 }
3876
3877 user_addr_t kevent_data_buf = kevent_id_addr - WQ_KEVENT_DATA_SIZE;
3878 user_size_t kevent_data_available = WQ_KEVENT_DATA_SIZE;
3879
3880 int32_t events_out = 0;
3881
3882 assert(tl->th_flags & TH_LIST_KEVENT_BOUND);
3883 unsigned int flags = KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE;
3884 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
3885 flags |= KEVENT_FLAG_WORKQ_MANAGER;
3886 }
3887 int ret = 0;
3888 if (workloop) {
3889 flags |= KEVENT_FLAG_WORKLOOP;
3890 kqueue_id_t kevent_id = -1;
3891 ret = kevent_id_internal(p, &kevent_id,
3892 NULL, 0, kevent_list, kevent_count,
3893 kevent_data_buf, &kevent_data_available,
3894 flags, &events_out);
3895 copyout(&kevent_id, kevent_id_addr, sizeof(kevent_id));
3896 } else {
3897 flags |= KEVENT_FLAG_WORKQ;
3898 ret = kevent_qos_internal(p,
3899 class_index_get_thread_qos(tl->th_priority),
3900 NULL, 0, kevent_list, kevent_count,
3901 kevent_data_buf, &kevent_data_available,
3902 flags, &events_out);
3903 }
3904
3905 // squash any errors into just empty output
3906 if (ret != KERN_SUCCESS || events_out == -1){
3907 events_out = 0;
3908 kevent_data_available = WQ_KEVENT_DATA_SIZE;
3909 }
3910
3911 // We shouldn't get data out if there aren't events available
3912 assert(events_out != 0 || kevent_data_available == WQ_KEVENT_DATA_SIZE);
3913
3914 if (events_out > 0){
3915 if (kevent_data_available == WQ_KEVENT_DATA_SIZE){
3916 stack_top_addr = (kevent_id_addr - stack_gap_min) & -stack_align_min;
3917 } else {
3918 stack_top_addr = (kevent_data_buf + kevent_data_available - stack_gap_min) & -stack_align_min;
3919 }
3920
3921 kevent_count = events_out;
3922 } else {
3923 kevent_list = NULL;
3924 kevent_count = 0;
3925 }
3926 }
3927
3928 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START, wq, 0, 0, 0, 0);
3929
3930 #if defined(__i386__) || defined(__x86_64__)
3931 if (proc_is64bit(p) == 0) {
3932 x86_thread_state32_t state = {
3933 .eip = (unsigned int)wqstart_fnptr,
3934 .eax = /* arg0 */ (unsigned int)pthread_self_addr,
3935 .ebx = /* arg1 */ (unsigned int)tl->th_thport,
3936 .ecx = /* arg2 */ (unsigned int)stack_bottom_addr,
3937 .edx = /* arg3 */ (unsigned int)kevent_list,
3938 .edi = /* arg4 */ (unsigned int)upcall_flags,
3939 .esi = /* arg5 */ (unsigned int)kevent_count,
3940
3941 .esp = (int)((vm_offset_t)stack_top_addr),
3942 };
3943
3944 error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
3945 if (error != KERN_SUCCESS) {
3946 panic("%s: thread_set_wq_state failed: %d", __func__, error);
3947 }
3948 } else {
3949 x86_thread_state64_t state64 = {
3950 // x86-64 already passes all the arguments in registers, so we just put them in their final place here
3951 .rip = (uint64_t)wqstart_fnptr,
3952 .rdi = (uint64_t)pthread_self_addr,
3953 .rsi = (uint64_t)tl->th_thport,
3954 .rdx = (uint64_t)stack_bottom_addr,
3955 .rcx = (uint64_t)kevent_list,
3956 .r8 = (uint64_t)upcall_flags,
3957 .r9 = (uint64_t)kevent_count,
3958
3959 .rsp = (uint64_t)(stack_top_addr)
3960 };
3961
3962 error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
3963 if (error != KERN_SUCCESS) {
3964 panic("%s: thread_set_wq_state failed: %d", __func__, error);
3965 }
3966 }
3967 #else
3968 #error setup_wqthread not defined for this architecture
3969 #endif
3970 }
3971
3972 #if DEBUG
3973 static int wq_kevent_test SYSCTL_HANDLER_ARGS {
3974 //(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
3975 #pragma unused(oidp, arg1, arg2)
3976 int error;
3977 struct workq_reqthreads_req_s requests[64] = {};
3978
3979 if (req->newlen > sizeof(requests) || req->newlen < sizeof(struct workq_reqthreads_req_s))
3980 return EINVAL;
3981
3982 error = copyin(req->newptr, requests, req->newlen);
3983 if (error) return error;
3984
3985 _workq_reqthreads(req->p, (int)(req->newlen / sizeof(struct workq_reqthreads_req_s)), requests);
3986
3987 return 0;
3988 }
3989 #endif // DEBUG
3990
3991 #pragma mark - Misc
3992
3993 int
3994 _fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
3995 {
3996 struct workqueue * wq;
3997 int error = 0;
3998 int activecount;
3999
4000 if ((wq = pthread_kern->proc_get_wqptr(p)) == NULL) {
4001 return EINVAL;
4002 }
4003
4004 /*
4005 * This is sometimes called from interrupt context by the kperf sampler.
4006 * In that case, it's not safe to spin trying to take the lock since we
4007 * might already hold it. So, we just try-lock it and error out if it's
4008 * already held. Since this is just a debugging aid, and all our callers
4009 * are able to handle an error, that's fine.
4010 */
4011 bool locked = workqueue_lock_try(wq);
4012 if (!locked) {
4013 return EBUSY;
4014 }
4015
4016 activecount = _wq_thactive_aggregate_downto_qos(wq, _wq_thactive(wq),
4017 WORKQUEUE_NUM_BUCKETS - 1, NULL, NULL);
4018 pwqinfo->pwq_nthreads = wq->wq_nthreads;
4019 pwqinfo->pwq_runthreads = activecount;
4020 pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
4021 pwqinfo->pwq_state = 0;
4022
4023 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
4024 pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
4025 }
4026
4027 if (wq->wq_nthreads >= wq_max_threads) {
4028 pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
4029 }
4030
4031 workqueue_unlock(wq);
4032 return(error);
4033 }
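
/*
 * The fields filled in above are what userspace sees through libproc's
 * PROC_PIDWORKQUEUEINFO flavor. A minimal userspace sketch (illustrative
 * only, not part of this file):
 */
#if 0
#include <sys/types.h>
#include <sys/proc_info.h>
#include <libproc.h>
#include <stdio.h>

static void
print_workqueue_info(pid_t pid)
{
	struct proc_workqueueinfo pwqinfo;
	int ret = proc_pidinfo(pid, PROC_PIDWORKQUEUEINFO, 0,
	    &pwqinfo, sizeof(pwqinfo));
	if (ret == (int)sizeof(pwqinfo)) {
		printf("threads=%u running=%u blocked=%u state=0x%x\n",
		    pwqinfo.pwq_nthreads, pwqinfo.pwq_runthreads,
		    pwqinfo.pwq_blockedthreads, pwqinfo.pwq_state);
	}
}
#endif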
4034
4035 uint32_t
4036 _get_pwq_state_kdp(proc_t p)
4037 {
4038 if (p == NULL) {
4039 return 0;
4040 }
4041
4042 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
4043
4044 if (wq == NULL || workqueue_lock_spin_is_acquired_kdp(wq)) {
4045 return 0;
4046 }
4047
4048 uint32_t pwq_state = WQ_FLAGS_AVAILABLE;
4049
4050 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
4051 pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
4052 }
4053
4054 if (wq->wq_nthreads >= wq_max_threads) {
4055 pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
4056 }
4057
4058 return pwq_state;
4059 }
4060
4061 int
4062 _thread_selfid(__unused struct proc *p, uint64_t *retval)
4063 {
4064 thread_t thread = current_thread();
4065 *retval = thread_tid(thread);
4066 return KERN_SUCCESS;
4067 }
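
/*
 * Userspace typically reaches this id through pthread_threadid_np(), which
 * returns the same 64-bit value (falling back to this syscall when the id is
 * not already cached). A minimal sketch (illustrative only, not part of this
 * file):
 */
#if 0
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static void
print_own_thread_id(void)
{
	uint64_t tid = 0;
	if (pthread_threadid_np(NULL, &tid) == 0) {
		printf("thread id: %llu\n", (unsigned long long)tid);
	}
}
#endif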
4068
4069 void
4070 _pthread_init(void)
4071 {
4072 pthread_lck_grp_attr = lck_grp_attr_alloc_init();
4073 pthread_lck_grp = lck_grp_alloc_init("pthread", pthread_lck_grp_attr);
4074
4075 /*
4076 * allocate the lock attribute for pthread synchronizers
4077 */
4078 pthread_lck_attr = lck_attr_alloc_init();
4079
4080 pthread_list_mlock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);
4081
4082 pth_global_hashinit();
4083 psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL);
4084 psynch_zoneinit();
4085
4086 pthread_zone_workqueue = zinit(sizeof(struct workqueue),
4087 1024 * sizeof(struct workqueue), 8192, "pthread.workqueue");
4088 pthread_zone_threadlist = zinit(sizeof(struct threadlist),
4089 1024 * sizeof(struct threadlist), 8192, "pthread.threadlist");
4090 pthread_zone_threadreq = zinit(sizeof(struct threadreq),
4091 1024 * sizeof(struct threadreq), 8192, "pthread.threadreq");
4092
4093 /*
4094 * register sysctls
4095 */
4096 sysctl_register_oid(&sysctl__kern_wq_stalled_window_usecs);
4097 sysctl_register_oid(&sysctl__kern_wq_reduce_pool_window_usecs);
4098 sysctl_register_oid(&sysctl__kern_wq_max_timer_interval_usecs);
4099 sysctl_register_oid(&sysctl__kern_wq_max_threads);
4100 sysctl_register_oid(&sysctl__kern_wq_max_constrained_threads);
4101 sysctl_register_oid(&sysctl__kern_pthread_debug_tracing);
4102
4103 #if DEBUG
4104 sysctl_register_oid(&sysctl__debug_wq_kevent_test);
4105 #endif
4106
4107 for (int i = 0; i < WORKQUEUE_NUM_BUCKETS; i++) {
4108 uint32_t thread_qos = _wq_bucket_to_thread_qos(i);
4109 wq_max_concurrency[i] = pthread_kern->qos_max_parallelism(thread_qos,
4110 QOS_PARALLELISM_COUNT_LOGICAL);
4111 }
4112 wq_max_concurrency[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1;
4113 }