[apple/libpthread.git] / kern / kern_support.c (libpthread-301.20.1)
1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995-2005 Apple Computer, Inc. All Rights Reserved */
29 /*
30 * pthread_synch.c
31 */
32
33 #pragma mark - Front Matter
34
35 #define _PTHREAD_CONDATTR_T
36 #define _PTHREAD_COND_T
37 #define _PTHREAD_MUTEXATTR_T
38 #define _PTHREAD_MUTEX_T
39 #define _PTHREAD_RWLOCKATTR_T
40 #define _PTHREAD_RWLOCK_T
41
42 #undef pthread_mutexattr_t
43 #undef pthread_mutex_t
44 #undef pthread_condattr_t
45 #undef pthread_cond_t
46 #undef pthread_rwlockattr_t
47 #undef pthread_rwlock_t
48
49 #include <sys/cdefs.h>
50 #include <os/log.h>
51
52 // <rdar://problem/26158937> panic() should be marked noreturn
53 extern void panic(const char *string, ...) __printflike(1,2) __dead2;
54
55 #include <sys/param.h>
56 #include <sys/queue.h>
57 #include <sys/resourcevar.h>
58 //#include <sys/proc_internal.h>
59 #include <sys/kauth.h>
60 #include <sys/systm.h>
61 #include <sys/timeb.h>
62 #include <sys/times.h>
63 #include <sys/acct.h>
64 #include <sys/kernel.h>
65 #include <sys/wait.h>
66 #include <sys/signalvar.h>
67 #include <sys/sysctl.h>
68 #include <sys/syslog.h>
69 #include <sys/stat.h>
70 #include <sys/lock.h>
71 #include <sys/kdebug.h>
72 //#include <sys/sysproto.h>
73 #include <sys/vm.h>
74 #include <sys/user.h> /* for coredump */
75 #include <sys/proc_info.h> /* for fill_procworkqueue */
76
77 #include <mach/mach_port.h>
78 #include <mach/mach_types.h>
79 #include <mach/semaphore.h>
80 #include <mach/sync_policy.h>
81 #include <mach/task.h>
82 #include <mach/vm_prot.h>
83 #include <kern/kern_types.h>
84 #include <kern/task.h>
85 #include <kern/clock.h>
86 #include <mach/kern_return.h>
87 #include <kern/thread.h>
88 #include <kern/zalloc.h>
89 #include <kern/sched_prim.h> /* for thread_exception_return */
90 #include <kern/processor.h>
91 #include <kern/assert.h>
92 #include <mach/mach_vm.h>
93 #include <mach/mach_param.h>
94 #include <mach/thread_status.h>
95 #include <mach/thread_policy.h>
96 #include <mach/message.h>
97 #include <mach/port.h>
98 //#include <vm/vm_protos.h>
99 #include <vm/vm_fault.h>
100 #include <vm/vm_map.h>
101 #include <mach/thread_act.h> /* for thread_resume */
102 #include <machine/machine_routines.h>
103 #include <mach/shared_region.h>
104
105 #include <libkern/OSAtomic.h>
106 #include <libkern/libkern.h>
107
108 #include <sys/pthread_shims.h>
109 #include "kern_internal.h"
110
111 // XXX: Dirty import for sys/signalvar.h that's wrapped in BSD_KERNEL_PRIVATE
112 #define sigcantmask (sigmask(SIGKILL) | sigmask(SIGSTOP))
113
114 // XXX: Ditto for thread tags from kern/thread.h
115 #define THREAD_TAG_MAINTHREAD 0x1
116 #define THREAD_TAG_PTHREAD 0x10
117 #define THREAD_TAG_WORKQUEUE 0x20
118
119 lck_grp_attr_t *pthread_lck_grp_attr;
120 lck_grp_t *pthread_lck_grp;
121 lck_attr_t *pthread_lck_attr;
122
123 zone_t pthread_zone_workqueue;
124 zone_t pthread_zone_threadlist;
125 zone_t pthread_zone_threadreq;
126
127 extern void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64);
128 extern void workqueue_thread_yielded(void);
129
130 #define WQ_SETUP_FIRST_USE 1
131 #define WQ_SETUP_CLEAR_VOUCHER 2
132 static void _setup_wqthread(proc_t p, thread_t th, struct workqueue *wq,
133 struct threadlist *tl, int flags);
134
135 static void reset_priority(struct threadlist *tl, pthread_priority_t pri);
136 static pthread_priority_t pthread_priority_from_wq_class_index(struct workqueue *wq, int index);
137
138 static void wq_unpark_continue(void* ptr, wait_result_t wait_result) __dead2;
139
140 static bool workqueue_addnewthread(proc_t p, struct workqueue *wq);
141 static void workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use);
142 static void workqueue_lock_spin(struct workqueue *);
143 static void workqueue_unlock(struct workqueue *);
144
145 #define WQ_RUN_TR_THROTTLED 0
146 #define WQ_RUN_TR_THREAD_NEEDED 1
147 #define WQ_RUN_TR_THREAD_STARTED 2
148 #define WQ_RUN_TR_EXITING 3
149 static int workqueue_run_threadreq_and_unlock(proc_t p, struct workqueue *wq,
150 struct threadlist *tl, struct threadreq *req, bool may_add_new_thread);
151
152 static bool may_start_constrained_thread(struct workqueue *wq,
153 uint32_t at_priclass, struct threadlist *tl, bool may_start_timer);
154
155 static mach_vm_offset_t stack_addr_hint(proc_t p, vm_map_t vmap);
156 static boolean_t wq_thread_is_busy(uint64_t cur_ts,
157 _Atomic uint64_t *lastblocked_tsp);
158
159 int proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc);
160 int proc_setalltargetconc(pid_t pid, int32_t * targetconcp);
161
162 #define WQ_MAXPRI_MIN 0 /* low prio queue num */
163 #define WQ_MAXPRI_MAX 2 /* max prio queuenum */
164 #define WQ_PRI_NUM 3 /* number of prio work queues */
165
166 #define C_32_STK_ALIGN 16
167 #define C_64_STK_ALIGN 16
168 #define C_64_REDZONE_LEN 128
169
170 #define PTHREAD_T_OFFSET 0
171
172 /*
173 * Flags field passed to bsdthread_create and back in pthread_start
174 31 <---------------------------------> 0
175 _________________________________________
176 | flags(8) | policy(8) | importance(16) |
177 -----------------------------------------
178 */
179
180 #define PTHREAD_START_CUSTOM 0x01000000
181 #define PTHREAD_START_SETSCHED 0x02000000
182 #define PTHREAD_START_DETACHED 0x04000000
183 #define PTHREAD_START_QOSCLASS 0x08000000
184 #define PTHREAD_START_TSD_BASE_SET 0x10000000
185 #define PTHREAD_START_QOSCLASS_MASK 0x00ffffff
186 #define PTHREAD_START_POLICY_BITSHIFT 16
187 #define PTHREAD_START_POLICY_MASK 0xff
188 #define PTHREAD_START_IMPORTANCE_MASK 0xffff
189
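/*
 * Editor's note: illustrative sketch, not part of the original source. It
 * shows how the packed flags word described above is decoded; the helper name
 * is hypothetical, but the same extraction is performed by _bsdthread_create()
 * further down.
 */
static inline void
_pthread_start_flags_decode_sketch(uint32_t flags, unsigned int *policy,
		unsigned int *importance)
{
	*policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;
	*importance = flags & PTHREAD_START_IMPORTANCE_MASK;
}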
190 #define SCHED_OTHER POLICY_TIMESHARE
191 #define SCHED_FIFO POLICY_FIFO
192 #define SCHED_RR POLICY_RR
193
194 #define BASEPRI_DEFAULT 31
195
196 #pragma mark sysctls
197
198 static uint32_t wq_stalled_window_usecs = WQ_STALLED_WINDOW_USECS;
199 static uint32_t wq_reduce_pool_window_usecs = WQ_REDUCE_POOL_WINDOW_USECS;
200 static uint32_t wq_max_timer_interval_usecs = WQ_MAX_TIMER_INTERVAL_USECS;
201 static uint32_t wq_max_threads = WORKQUEUE_MAXTHREADS;
202 static uint32_t wq_max_constrained_threads = WORKQUEUE_MAXTHREADS / 8;
203 static uint32_t wq_max_concurrency[WORKQUEUE_NUM_BUCKETS + 1]; // set to ncpus on load
204
205 SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
206 &wq_stalled_window_usecs, 0, "");
207
208 SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
209 &wq_reduce_pool_window_usecs, 0, "");
210
211 SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
212 &wq_max_timer_interval_usecs, 0, "");
213
214 SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
215 &wq_max_threads, 0, "");
216
217 SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
218 &wq_max_constrained_threads, 0, "");
219
220 #ifdef DEBUG
221 static int wq_kevent_test SYSCTL_HANDLER_ARGS;
222 SYSCTL_PROC(_debug, OID_AUTO, wq_kevent_test, CTLFLAG_MASKED | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLTYPE_OPAQUE, NULL, 0, wq_kevent_test, 0, "-");
223 #endif
224
225 static uint32_t wq_init_constrained_limit = 1;
226
227 uint32_t pthread_debug_tracing = 1;
228
229 SYSCTL_INT(_kern, OID_AUTO, pthread_debug_tracing, CTLFLAG_RW | CTLFLAG_LOCKED,
230 &pthread_debug_tracing, 0, "");
231
232 /*
233 * +-----+-----+-----+-----+-----+-----+-----+-----+
234 * |     | MT  | BG  | UT  | DE  | IN  | UN  | mgr |
235 * +-----+-----+-----+-----+-----+-----+-----+-----+
236 * | pri |  5  |  4  |  3  |  2  |  1  |  0  |  6  |
237 * | qos |  1  |  2  |  3  |  4  |  5  |  6  |  7  |
238 * +-----+-----+-----+-----+-----+-----+-----+-----+
239 */
240 static inline uint32_t
241 _wq_bucket_to_thread_qos(int pri)
242 {
243 if (pri == WORKQUEUE_EVENT_MANAGER_BUCKET) {
244 return WORKQUEUE_EVENT_MANAGER_BUCKET + 1;
245 }
246 return WORKQUEUE_EVENT_MANAGER_BUCKET - pri;
247 }
248
249 #pragma mark wq_thactive
250
251 #if defined(__LP64__)
252 // Layout is:
253 // 7 * 16 bits for each QoS bucket request count (including manager)
254 // 3 bits of best QoS among all pending constrained requests
255 // 13 bits of zeroes
256 #define WQ_THACTIVE_BUCKET_WIDTH 16
257 #define WQ_THACTIVE_QOS_SHIFT (7 * WQ_THACTIVE_BUCKET_WIDTH)
258 #else
259 // Layout is:
260 // 6 * 10 bits for each QoS bucket request count (except manager)
261 // 1 bit for the manager bucket
262 // 3 bits of best QoS among all pending constrained requests
263 #define WQ_THACTIVE_BUCKET_WIDTH 10
264 #define WQ_THACTIVE_QOS_SHIFT (6 * WQ_THACTIVE_BUCKET_WIDTH + 1)
265 #endif
266 #define WQ_THACTIVE_BUCKET_MASK ((1U << WQ_THACTIVE_BUCKET_WIDTH) - 1)
267 #define WQ_THACTIVE_BUCKET_HALF (1U << (WQ_THACTIVE_BUCKET_WIDTH - 1))
268 #define WQ_THACTIVE_NO_PENDING_REQUEST 6
269
270 _Static_assert(sizeof(wq_thactive_t) * CHAR_BIT - WQ_THACTIVE_QOS_SHIFT >= 3,
271 "Make sure we have space to encode a QoS");
272
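/*
 * Editor's note: illustrative example, not part of the original source. On
 * LP64 (WQ_THACTIVE_BUCKET_WIDTH == 16, 128-bit wq_thactive_t) the active
 * count for bucket i lives in bits [16*i, 16*i + 15] and the best constrained
 * request QoS in the 3 bits starting at WQ_THACTIVE_QOS_SHIFT (7 * 16 == 112).
 * Pulling one bucket's count out of a snapshot therefore looks like:
 *
 *   uint32_t active = (uint32_t)((v >> (i * WQ_THACTIVE_BUCKET_WIDTH)) &
 *           WQ_THACTIVE_BUCKET_MASK);
 *
 * which is what _wq_thactive_aggregate_downto_qos() below does in a loop.
 */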
273 static inline wq_thactive_t
274 _wq_thactive_fetch_and_add(struct workqueue *wq, wq_thactive_t offset)
275 {
276 #if PTHREAD_INLINE_RMW_ATOMICS || !defined(__LP64__)
277 return atomic_fetch_add_explicit(&wq->wq_thactive, offset,
278 memory_order_relaxed);
279 #else
280 return pthread_kern->atomic_fetch_add_128_relaxed(&wq->wq_thactive, offset);
281 #endif
282 }
283
284 static inline wq_thactive_t
285 _wq_thactive(struct workqueue *wq)
286 {
287 #if PTHREAD_INLINE_RMW_ATOMICS || !defined(__LP64__)
288 return atomic_load_explicit(&wq->wq_thactive, memory_order_relaxed);
289 #else
290 return pthread_kern->atomic_load_128_relaxed(&wq->wq_thactive);
291 #endif
292 }
293
294 #define WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(tha) \
295 ((tha) >> WQ_THACTIVE_QOS_SHIFT)
296
297 static inline uint32_t
298 _wq_thactive_best_constrained_req_qos(struct workqueue *wq)
299 {
300 // Avoid expensive atomic operations: the three bits we're loading are in
301 // a single byte, and always updated under the workqueue lock
302 wq_thactive_t v = *(wq_thactive_t *)&wq->wq_thactive;
303 return WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(v);
304 }
305
306 static inline wq_thactive_t
307 _wq_thactive_set_best_constrained_req_qos(struct workqueue *wq,
308 uint32_t orig_qos, uint32_t new_qos)
309 {
310 wq_thactive_t v;
311 v = (wq_thactive_t)(new_qos - orig_qos) << WQ_THACTIVE_QOS_SHIFT;
312 /*
313 * We can do an atomic add relative to the initial load because updates
314 * to this qos are always serialized under the workqueue lock.
315 */
316 return _wq_thactive_fetch_and_add(wq, v) + v;
317 }
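/*
 * Editor's note: illustrative example, not part of the original source.
 * Raising the best constrained request QoS from 2 to 5 adds
 * (5 - 2) << WQ_THACTIVE_QOS_SHIFT to wq_thactive; only the top QoS field
 * changes, and because that field is written solely under the workqueue lock,
 * the relative add cannot race with another writer.
 */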
318
319 static inline wq_thactive_t
320 _wq_thactive_offset_for_qos(int qos)
321 {
322 return (wq_thactive_t)1 << (qos * WQ_THACTIVE_BUCKET_WIDTH);
323 }
324
325 static inline wq_thactive_t
326 _wq_thactive_inc(struct workqueue *wq, int qos)
327 {
328 return _wq_thactive_fetch_and_add(wq, _wq_thactive_offset_for_qos(qos));
329 }
330
331 static inline wq_thactive_t
332 _wq_thactive_dec(struct workqueue *wq, int qos)
333 {
334 return _wq_thactive_fetch_and_add(wq, -_wq_thactive_offset_for_qos(qos));
335 }
336
337 static inline wq_thactive_t
338 _wq_thactive_move(struct workqueue *wq, int oldqos, int newqos)
339 {
340 return _wq_thactive_fetch_and_add(wq, _wq_thactive_offset_for_qos(newqos) -
341 _wq_thactive_offset_for_qos(oldqos));
342 }
343
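/*
 * Editor's note: descriptive comment, not part of the original source.
 * Sums the active-thread counts of buckets 0..qos (the given bucket and every
 * higher-QoS bucket) out of the wq_thactive_t snapshot v, and optionally
 * reports how many of those buckets look busy, i.e. have more scheduled than
 * active threads and blocked recently per wq_thread_is_busy().
 */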
344 static inline uint32_t
345 _wq_thactive_aggregate_downto_qos(struct workqueue *wq, wq_thactive_t v,
346 int qos, uint32_t *busycount, uint32_t *max_busycount)
347 {
348 uint32_t count = 0, active;
349 uint64_t curtime;
350
351 #ifndef __LP64__
352 /*
353 * On 32-bit, the manager bucket is a single bit and the best constrained
354 * request QoS's 3 bits sit where the 10 bits of a regular QoS bucket count
355 * would be. Mask them out.
356 */
357 v &= ~(~0ull << WQ_THACTIVE_QOS_SHIFT);
358 #endif
359 if (busycount) {
360 curtime = mach_absolute_time();
361 *busycount = 0;
362 }
363 if (max_busycount) {
364 *max_busycount = qos + 1;
365 }
366 for (int i = 0; i <= qos; i++, v >>= WQ_THACTIVE_BUCKET_WIDTH) {
367 active = v & WQ_THACTIVE_BUCKET_MASK;
368 count += active;
369 if (busycount && wq->wq_thscheduled_count[i] > active) {
370 if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i])) {
371 /*
372 * We only consider the last blocked thread for a given bucket
373 * as busy because we don't want to take the list lock in each
374 * sched callback. However this is an approximation that could
375 * contribute to thread creation storms.
376 */
377 (*busycount)++;
378 }
379 }
380 }
381 return count;
382 }
383
384 #pragma mark - Process/Thread Setup/Teardown syscalls
385
386 static mach_vm_offset_t
387 stack_addr_hint(proc_t p, vm_map_t vmap)
388 {
389 mach_vm_offset_t stackaddr;
390 mach_vm_offset_t aslr_offset;
391 bool proc64bit = proc_is64bit(p);
392
393 // We can't safely take random values % something unless it's a power of two
394 _Static_assert(powerof2(PTH_DEFAULT_STACKSIZE), "PTH_DEFAULT_STACKSIZE is a power-of-two");
395
396 #if defined(__i386__) || defined(__x86_64__)
397 if (proc64bit) {
398 // Matches vm_map_get_max_aslr_slide_pages's image shift in xnu
399 aslr_offset = random() % (1 << 28); // about 512 stacks
400 } else {
401 // Actually bigger than the image shift, we've got ~256MB to work with
402 aslr_offset = random() % (16 * PTH_DEFAULT_STACKSIZE);
403 }
404 aslr_offset = vm_map_trunc_page_mask(aslr_offset, vm_map_page_mask(vmap));
405 if (proc64bit) {
406 // Above nanomalloc range (see NANOZONE_SIGNATURE)
407 stackaddr = 0x700000000000 + aslr_offset;
408 } else {
409 stackaddr = SHARED_REGION_BASE_I386 + SHARED_REGION_SIZE_I386 + aslr_offset;
410 }
411 #elif defined(__arm__) || defined(__arm64__)
412 user_addr_t main_thread_stack_top = 0;
413 if (pthread_kern->proc_get_user_stack) {
414 main_thread_stack_top = pthread_kern->proc_get_user_stack(p);
415 }
416 if (proc64bit && main_thread_stack_top) {
417 // The main thread stack position is randomly slid by xnu (c.f.
418 // load_main() in mach_loader.c), so basing pthread stack allocations
419 // where the main thread stack ends is already ASLRd and doing so
420 // avoids creating a gap in the process address space that may cause
421 // extra PTE memory usage. rdar://problem/33328206
422 stackaddr = vm_map_trunc_page_mask((vm_map_offset_t)main_thread_stack_top,
423 vm_map_page_mask(vmap));
424 } else {
425 // vm_map_get_max_aslr_slide_pages ensures 1MB of slide, we do better
426 aslr_offset = random() % ((proc64bit ? 4 : 2) * PTH_DEFAULT_STACKSIZE);
427 aslr_offset = vm_map_trunc_page_mask((vm_map_offset_t)aslr_offset,
428 vm_map_page_mask(vmap));
429 if (proc64bit) {
430 // 64 stacks below shared region
431 stackaddr = SHARED_REGION_BASE_ARM64 - 64 * PTH_DEFAULT_STACKSIZE - aslr_offset;
432 } else {
433 // If you try to slide down from this point, you risk ending up in memory consumed by malloc
434 stackaddr = SHARED_REGION_BASE_ARM - 32 * PTH_DEFAULT_STACKSIZE + aslr_offset;
435 }
436 }
437 #else
438 #error Need to define a stack address hint for this architecture
439 #endif
440 return stackaddr;
441 }
442
443 /**
444 * bsdthread_create system call. Used by pthread_create.
445 */
446 int
447 _bsdthread_create(struct proc *p, user_addr_t user_func, user_addr_t user_funcarg, user_addr_t user_stack, user_addr_t user_pthread, uint32_t flags, user_addr_t *retval)
448 {
449 kern_return_t kret;
450 void * sright;
451 int error = 0;
452 int allocated = 0;
453 mach_vm_offset_t stackaddr;
454 mach_vm_size_t th_allocsize = 0;
455 mach_vm_size_t th_guardsize;
456 mach_vm_offset_t th_stack;
457 mach_vm_offset_t th_pthread;
458 mach_vm_offset_t th_tsd_base;
459 mach_port_name_t th_thport;
460 thread_t th;
461 vm_map_t vmap = pthread_kern->current_map();
462 task_t ctask = current_task();
463 unsigned int policy, importance;
464 uint32_t tsd_offset;
465
466 int isLP64 = 0;
467
468 if (pthread_kern->proc_get_register(p) == 0) {
469 return EINVAL;
470 }
471
472 PTHREAD_TRACE(TRACE_pthread_thread_create | DBG_FUNC_START, flags, 0, 0, 0, 0);
473
474 isLP64 = proc_is64bit(p);
475 th_guardsize = vm_map_page_size(vmap);
476
477 stackaddr = pthread_kern->proc_get_stack_addr_hint(p);
478 kret = pthread_kern->thread_create(ctask, &th);
479 if (kret != KERN_SUCCESS)
480 return(ENOMEM);
481 thread_reference(th);
482
483 pthread_kern->thread_set_tag(th, THREAD_TAG_PTHREAD);
484
485 sright = (void *)pthread_kern->convert_thread_to_port(th);
486 th_thport = pthread_kern->ipc_port_copyout_send(sright, pthread_kern->task_get_ipcspace(ctask));
487 if (!MACH_PORT_VALID(th_thport)) {
488 error = EMFILE; // userland will convert this into a crash
489 goto out;
490 }
491
492 if ((flags & PTHREAD_START_CUSTOM) == 0) {
493 mach_vm_size_t pthread_size =
494 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(vmap));
495 th_allocsize = th_guardsize + user_stack + pthread_size;
496 user_stack += PTHREAD_T_OFFSET;
497
498 kret = mach_vm_map(vmap, &stackaddr,
499 th_allocsize,
500 page_size-1,
501 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE , NULL,
502 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
503 VM_INHERIT_DEFAULT);
504 if (kret != KERN_SUCCESS){
505 kret = mach_vm_allocate(vmap,
506 &stackaddr, th_allocsize,
507 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE);
508 }
509 if (kret != KERN_SUCCESS) {
510 error = ENOMEM;
511 goto out;
512 }
513
514 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, th_allocsize, stackaddr, 0, 2, 0);
515
516 allocated = 1;
517 /*
518 * The guard page is at the lowest address
519 * The stack base is the highest address
520 */
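/*
 * Editor's note: illustrative layout of the allocation above, not part of the
 * original source (th_pthread and th_stack are assigned a few lines below):
 *
 *   stackaddr                                        stackaddr + th_allocsize
 *   |<- th_guardsize ->|<------ user_stack ------>|<---- pthread_size ---->|
 *     VM_PROT_NONE       (stack grows downward)   ^ th_stack == th_pthread
 */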
521 kret = mach_vm_protect(vmap, stackaddr, th_guardsize, FALSE, VM_PROT_NONE);
522
523 if (kret != KERN_SUCCESS) {
524 error = ENOMEM;
525 goto out1;
526 }
527
528 th_pthread = stackaddr + th_guardsize + user_stack;
529 th_stack = th_pthread;
530
531 /*
532 * Pre-fault the first page of the new thread's stack and the page that will
533 * contain the pthread_t structure.
534 */
535 if (vm_map_trunc_page_mask((vm_map_offset_t)(th_stack - C_64_REDZONE_LEN), vm_map_page_mask(vmap)) !=
536 vm_map_trunc_page_mask((vm_map_offset_t)th_pthread, vm_map_page_mask(vmap))){
537 vm_fault( vmap,
538 vm_map_trunc_page_mask((vm_map_offset_t)(th_stack - C_64_REDZONE_LEN), vm_map_page_mask(vmap)),
539 VM_PROT_READ | VM_PROT_WRITE,
540 FALSE,
541 THREAD_UNINT, NULL, 0);
542 }
543
544 vm_fault( vmap,
545 vm_map_trunc_page_mask((vm_map_offset_t)th_pthread, vm_map_page_mask(vmap)),
546 VM_PROT_READ | VM_PROT_WRITE,
547 FALSE,
548 THREAD_UNINT, NULL, 0);
549
550 } else {
551 th_stack = user_stack;
552 th_pthread = user_pthread;
553
554 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, 0, 0, 0, 3, 0);
555 }
556
557 tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
558 if (tsd_offset) {
559 th_tsd_base = th_pthread + tsd_offset;
560 kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
561 if (kret == KERN_SUCCESS) {
562 flags |= PTHREAD_START_TSD_BASE_SET;
563 }
564 }
565
566 #if defined(__i386__) || defined(__x86_64__)
567 /*
568 * Set up i386 registers & function call.
569 */
570 if (isLP64 == 0) {
571 x86_thread_state32_t state = {
572 .eip = (unsigned int)pthread_kern->proc_get_threadstart(p),
573 .eax = (unsigned int)th_pthread,
574 .ebx = (unsigned int)th_thport,
575 .ecx = (unsigned int)user_func,
576 .edx = (unsigned int)user_funcarg,
577 .edi = (unsigned int)user_stack,
578 .esi = (unsigned int)flags,
579 /*
580 * set stack pointer
581 */
582 .esp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN))
583 };
584
585 error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
586 if (error != KERN_SUCCESS) {
587 error = EINVAL;
588 goto out;
589 }
590 } else {
591 x86_thread_state64_t state64 = {
592 .rip = (uint64_t)pthread_kern->proc_get_threadstart(p),
593 .rdi = (uint64_t)th_pthread,
594 .rsi = (uint64_t)(th_thport),
595 .rdx = (uint64_t)user_func,
596 .rcx = (uint64_t)user_funcarg,
597 .r8 = (uint64_t)user_stack,
598 .r9 = (uint64_t)flags,
599 /*
600 * set stack pointer aligned to 16 byte boundary
601 */
602 .rsp = (uint64_t)(th_stack - C_64_REDZONE_LEN)
603 };
604
605 error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
606 if (error != KERN_SUCCESS) {
607 error = EINVAL;
608 goto out;
609 }
610
611 }
612 #elif defined(__arm__)
613 arm_thread_state_t state = {
614 .pc = (int)pthread_kern->proc_get_threadstart(p),
615 .r[0] = (unsigned int)th_pthread,
616 .r[1] = (unsigned int)th_thport,
617 .r[2] = (unsigned int)user_func,
618 .r[3] = (unsigned int)user_funcarg,
619 .r[4] = (unsigned int)user_stack,
620 .r[5] = (unsigned int)flags,
621
622 /* Set r7 & lr to 0 for better back tracing */
623 .r[7] = 0,
624 .lr = 0,
625
626 /*
627 * set stack pointer
628 */
629 .sp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN))
630 };
631
632 (void) pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
633
634 #else
635 #error bsdthread_create not defined for this architecture
636 #endif
637
638 if ((flags & PTHREAD_START_SETSCHED) != 0) {
639 /* Set scheduling parameters if needed */
640 thread_extended_policy_data_t extinfo;
641 thread_precedence_policy_data_t precedinfo;
642
643 importance = (flags & PTHREAD_START_IMPORTANCE_MASK);
644 policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;
645
646 if (policy == SCHED_OTHER) {
647 extinfo.timeshare = 1;
648 } else {
649 extinfo.timeshare = 0;
650 }
651
652 thread_policy_set(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT);
653
654 precedinfo.importance = (importance - BASEPRI_DEFAULT);
655 thread_policy_set(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT);
656 } else if ((flags & PTHREAD_START_QOSCLASS) != 0) {
657 /* Set thread QoS class if requested. */
658 pthread_priority_t priority = (pthread_priority_t)(flags & PTHREAD_START_QOSCLASS_MASK);
659
660 thread_qos_policy_data_t qos;
661 qos.qos_tier = pthread_priority_get_thread_qos(priority);
662 qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 :
663 _pthread_priority_get_relpri(priority);
664
665 pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
666 }
667
668 if (pthread_kern->proc_get_mach_thread_self_tsd_offset) {
669 uint64_t mach_thread_self_offset =
670 pthread_kern->proc_get_mach_thread_self_tsd_offset(p);
671 if (mach_thread_self_offset && tsd_offset) {
672 bool proc64bit = proc_is64bit(p);
673 if (proc64bit) {
674 uint64_t th_thport_tsd = (uint64_t)th_thport;
675 error = copyout(&th_thport_tsd, th_pthread + tsd_offset +
676 mach_thread_self_offset, sizeof(th_thport_tsd));
677 } else {
678 uint32_t th_thport_tsd = (uint32_t)th_thport;
679 error = copyout(&th_thport_tsd, th_pthread + tsd_offset +
680 mach_thread_self_offset, sizeof(th_thport_tsd));
681 }
682 if (error) {
683 goto out1;
684 }
685 }
686 }
687
688 kret = pthread_kern->thread_resume(th);
689 if (kret != KERN_SUCCESS) {
690 error = EINVAL;
691 goto out1;
692 }
693 thread_deallocate(th); /* drop the creator reference */
694
695 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_END, error, th_pthread, 0, 0, 0);
696
697 // cast required as mach_vm_offset_t is always 64 bits even on 32-bit platforms
698 *retval = (user_addr_t)th_pthread;
699
700 return(0);
701
702 out1:
703 if (allocated != 0) {
704 (void)mach_vm_deallocate(vmap, stackaddr, th_allocsize);
705 }
706 out:
707 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(ctask), th_thport);
708 if (pthread_kern->thread_will_park_or_terminate) {
709 pthread_kern->thread_will_park_or_terminate(th);
710 }
711 (void)thread_terminate(th);
712 (void)thread_deallocate(th);
713 return(error);
714 }
715
716 /**
717 * bsdthread_terminate system call. Used by pthread_terminate.
718 */
719 int
720 _bsdthread_terminate(__unused struct proc *p,
721 user_addr_t stackaddr,
722 size_t size,
723 uint32_t kthport,
724 uint32_t sem,
725 __unused int32_t *retval)
726 {
727 mach_vm_offset_t freeaddr;
728 mach_vm_size_t freesize;
729 kern_return_t kret;
730 thread_t th = current_thread();
731
732 freeaddr = (mach_vm_offset_t)stackaddr;
733 freesize = size;
734
735 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_START, freeaddr, freesize, kthport, 0xff, 0);
736
737 if ((freesize != (mach_vm_size_t)0) && (freeaddr != (mach_vm_offset_t)0)) {
738 if (pthread_kern->thread_get_tag(th) & THREAD_TAG_MAINTHREAD){
739 vm_map_t user_map = pthread_kern->current_map();
740 freesize = vm_map_trunc_page_mask((vm_map_offset_t)freesize - 1, vm_map_page_mask(user_map));
741 kret = mach_vm_behavior_set(user_map, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
742 assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
743 kret = kret ? kret : mach_vm_protect(user_map, freeaddr, freesize, FALSE, VM_PROT_NONE);
744 assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
745 } else {
746 kret = mach_vm_deallocate(pthread_kern->current_map(), freeaddr, freesize);
747 if (kret != KERN_SUCCESS) {
748 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);
749 return(EINVAL);
750 }
751 }
752 }
753
754 if (pthread_kern->thread_will_park_or_terminate) {
755 pthread_kern->thread_will_park_or_terminate(th);
756 }
757 (void)thread_terminate(th);
758 if (sem != MACH_PORT_NULL) {
759 kret = pthread_kern->semaphore_signal_internal_trap(sem);
760 if (kret != KERN_SUCCESS) {
761 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);
762 return(EINVAL);
763 }
764 }
765
766 if (kthport != MACH_PORT_NULL) {
767 pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(current_task()), kthport);
768 }
769
770 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0, 0, 0, 0);
771
772 pthread_kern->thread_exception_return();
773 panic("bsdthread_terminate: still running\n");
774
775 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0xff, 0, 0, 0);
776
777 return(0);
778 }
779
780 /**
781 * bsdthread_register system call. Performs per-process setup. Responsible for
782 * returning capability bits to userspace and receiving userspace function addresses.
783 */
784 int
785 _bsdthread_register(struct proc *p,
786 user_addr_t threadstart,
787 user_addr_t wqthread,
788 int pthsize,
789 user_addr_t pthread_init_data,
790 user_addr_t pthread_init_data_size,
791 uint64_t dispatchqueue_offset,
792 int32_t *retval)
793 {
794 struct _pthread_registration_data data = {};
795 uint32_t max_tsd_offset;
796 kern_return_t kr;
797 size_t pthread_init_sz = 0;
798
799 /* syscall randomizer test can pass bogus values */
800 if (pthsize < 0 || pthsize > MAX_PTHREAD_SIZE) {
801 return(EINVAL);
802 }
803 /*
804 * If we have pthread_init_data, then we use that and target_concptr
805 * (which is an offset) to get the data.
806 */
807 if (pthread_init_data != 0) {
808 if (pthread_init_data_size < sizeof(data.version)) {
809 return EINVAL;
810 }
811 pthread_init_sz = MIN(sizeof(data), (size_t)pthread_init_data_size);
812 int ret = copyin(pthread_init_data, &data, pthread_init_sz);
813 if (ret) {
814 return ret;
815 }
816 if (data.version != (size_t)pthread_init_data_size) {
817 return EINVAL;
818 }
819 } else {
820 data.dispatch_queue_offset = dispatchqueue_offset;
821 }
822
823 /* We have to do this before proc_get_register so that it resets after fork */
824 mach_vm_offset_t stackaddr = stack_addr_hint(p, pthread_kern->current_map());
825 pthread_kern->proc_set_stack_addr_hint(p, (user_addr_t)stackaddr);
826
827 /* prevent multiple registrations */
828 if (pthread_kern->proc_get_register(p) != 0) {
829 return(EINVAL);
830 }
831
832 pthread_kern->proc_set_threadstart(p, threadstart);
833 pthread_kern->proc_set_wqthread(p, wqthread);
834 pthread_kern->proc_set_pthsize(p, pthsize);
835 pthread_kern->proc_set_register(p);
836
837 uint32_t tsd_slot_sz = proc_is64bit(p) ? sizeof(uint64_t) : sizeof(uint32_t);
838 if ((uint32_t)pthsize >= tsd_slot_sz &&
839 data.tsd_offset <= (uint32_t)(pthsize - tsd_slot_sz)) {
840 max_tsd_offset = ((uint32_t)pthsize - data.tsd_offset - tsd_slot_sz);
841 } else {
842 data.tsd_offset = 0;
843 max_tsd_offset = 0;
844 }
845 pthread_kern->proc_set_pthread_tsd_offset(p, data.tsd_offset);
846
847 if (data.dispatch_queue_offset > max_tsd_offset) {
848 data.dispatch_queue_offset = 0;
849 }
850 pthread_kern->proc_set_dispatchqueue_offset(p, data.dispatch_queue_offset);
851
852 if (pthread_kern->proc_set_return_to_kernel_offset) {
853 if (data.return_to_kernel_offset > max_tsd_offset) {
854 data.return_to_kernel_offset = 0;
855 }
856 pthread_kern->proc_set_return_to_kernel_offset(p,
857 data.return_to_kernel_offset);
858 }
859
860 if (pthread_kern->proc_set_mach_thread_self_tsd_offset) {
861 if (data.mach_thread_self_offset > max_tsd_offset) {
862 data.mach_thread_self_offset = 0;
863 }
864 pthread_kern->proc_set_mach_thread_self_tsd_offset(p,
865 data.mach_thread_self_offset);
866 }
867
868 if (pthread_init_data != 0) {
869 /* Outgoing data that userspace expects as a reply */
870 data.version = sizeof(struct _pthread_registration_data);
871 if (pthread_kern->qos_main_thread_active()) {
872 mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
873 thread_qos_policy_data_t qos;
874 boolean_t gd = FALSE;
875
876 kr = pthread_kern->thread_policy_get(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
877 if (kr != KERN_SUCCESS || qos.qos_tier == THREAD_QOS_UNSPECIFIED) {
878 /* An unspecified QoS means the kernel wants us to impose legacy QoS upon the thread. */
879 qos.qos_tier = THREAD_QOS_LEGACY;
880 qos.tier_importance = 0;
881
882 kr = pthread_kern->thread_policy_set_internal(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
883 }
884
885 if (kr == KERN_SUCCESS) {
886 data.main_qos = thread_qos_get_pthread_priority(qos.qos_tier);
887 } else {
888 data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);
889 }
890 } else {
891 data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);
892 }
893
894 kr = copyout(&data, pthread_init_data, pthread_init_sz);
895 if (kr != KERN_SUCCESS) {
896 return EINVAL;
897 }
898 }
899
900 /* return the supported feature set as the return value. */
901 *retval = PTHREAD_FEATURE_SUPPORTED;
902
903 return(0);
904 }
905
906 #pragma mark - QoS Manipulation
907
908 int
909 _bsdthread_ctl_set_qos(struct proc *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t tsd_priority_addr, user_addr_t arg3, int *retval)
910 {
911 int rv;
912 thread_t th;
913
914 pthread_priority_t priority;
915
916 /* Unused parameters must be zero. */
917 if (arg3 != 0) {
918 return EINVAL;
919 }
920
921 /* QoS is stored in a given slot in the pthread TSD. We need to copy that in and set our QoS based on it. */
922 if (proc_is64bit(p)) {
923 uint64_t v;
924 rv = copyin(tsd_priority_addr, &v, sizeof(v));
925 if (rv) goto out;
926 priority = (int)(v & 0xffffffff);
927 } else {
928 uint32_t v;
929 rv = copyin(tsd_priority_addr, &v, sizeof(v));
930 if (rv) goto out;
931 priority = v;
932 }
933
934 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
935 return ESRCH;
936 }
937
938 /* <rdar://problem/16211829> Disable pthread_set_qos_class_np() on threads other than pthread_self */
939 if (th != current_thread()) {
940 thread_deallocate(th);
941 return EPERM;
942 }
943
944 rv = _bsdthread_ctl_set_self(p, 0, priority, 0, _PTHREAD_SET_SELF_QOS_FLAG, retval);
945
946 /* Static param the thread: we just set QoS on it, so it's stuck in QoS land now. */
947 /* pthread_kern->thread_static_param(th, TRUE); */ // see <rdar://problem/16433744>, for details
948
949 thread_deallocate(th);
950
951 out:
952 return rv;
953 }
954
955 static inline struct threadlist *
956 util_get_thread_threadlist_entry(thread_t th)
957 {
958 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
959 if (uth) {
960 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
961 return tl;
962 }
963 return NULL;
964 }
965
966 boolean_t
967 _workq_thread_has_been_unbound(thread_t th, int qos_class)
968 {
969 struct threadlist *tl = util_get_thread_threadlist_entry(th);
970 if (!tl) {
971 return FALSE;
972 }
973
974 struct workqueue *wq = tl->th_workq;
975 workqueue_lock_spin(wq);
976
977 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
978 goto failure;
979 } else if (qos_class != class_index_get_thread_qos(tl->th_priority)) {
980 goto failure;
981 }
982
983 if ((tl->th_flags & TH_LIST_KEVENT_BOUND)){
984 goto failure;
985 }
986 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
987
988 workqueue_unlock(wq);
989 return TRUE;
990
991 failure:
992 workqueue_unlock(wq);
993 return FALSE;
994 }
995
996 int
997 _bsdthread_ctl_set_self(struct proc *p, user_addr_t __unused cmd, pthread_priority_t priority, mach_port_name_t voucher, _pthread_set_flags_t flags, int __unused *retval)
998 {
999 thread_qos_policy_data_t qos;
1000 mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
1001 boolean_t gd = FALSE;
1002 thread_t th = current_thread();
1003 struct workqueue *wq = NULL;
1004 struct threadlist *tl = NULL;
1005
1006 kern_return_t kr;
1007 int qos_rv = 0, voucher_rv = 0, fixedpri_rv = 0;
1008
1009 if ((flags & _PTHREAD_SET_SELF_WQ_KEVENT_UNBIND) != 0) {
1010 tl = util_get_thread_threadlist_entry(th);
1011 if (tl) {
1012 wq = tl->th_workq;
1013 } else {
1014 goto qos;
1015 }
1016
1017 workqueue_lock_spin(wq);
1018 if (tl->th_flags & TH_LIST_KEVENT_BOUND) {
1019 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
1020 unsigned int kevent_flags = KEVENT_FLAG_WORKQ | KEVENT_FLAG_UNBIND_CHECK_FLAGS;
1021 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1022 kevent_flags |= KEVENT_FLAG_WORKQ_MANAGER;
1023 }
1024
1025 workqueue_unlock(wq);
1026 __assert_only int ret = kevent_qos_internal_unbind(p, class_index_get_thread_qos(tl->th_priority), th, kevent_flags);
1027 assert(ret == 0);
1028 } else {
1029 workqueue_unlock(wq);
1030 }
1031 }
1032
1033 qos:
1034 if ((flags & _PTHREAD_SET_SELF_QOS_FLAG) != 0) {
1035 kr = pthread_kern->thread_policy_get(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
1036 if (kr != KERN_SUCCESS) {
1037 qos_rv = EINVAL;
1038 goto voucher;
1039 }
1040
1041 /*
1042 * If we have main-thread QoS then we don't allow a thread to come out
1043 * of QOS_CLASS_UNSPECIFIED.
1044 */
1045 if (pthread_kern->qos_main_thread_active() && qos.qos_tier ==
1046 THREAD_QOS_UNSPECIFIED) {
1047 qos_rv = EPERM;
1048 goto voucher;
1049 }
1050
1051 if (!tl) {
1052 tl = util_get_thread_threadlist_entry(th);
1053 if (tl) wq = tl->th_workq;
1054 }
1055
1056 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self | DBG_FUNC_START, wq, qos.qos_tier, qos.tier_importance, 0, 0);
1057
1058 qos.qos_tier = pthread_priority_get_thread_qos(priority);
1059 qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 : _pthread_priority_get_relpri(priority);
1060
1061 if (qos.qos_tier == QOS_CLASS_UNSPECIFIED ||
1062 qos.tier_importance > 0 || qos.tier_importance < THREAD_QOS_MIN_TIER_IMPORTANCE) {
1063 qos_rv = EINVAL;
1064 goto voucher;
1065 }
1066
1067 /*
1068 * If we're a workqueue, the threadlist item priority needs adjusting,
1069 * along with the bucket we were running in.
1070 */
1071 if (tl) {
1072 bool try_run_threadreq = false;
1073
1074 workqueue_lock_spin(wq);
1075 kr = pthread_kern->thread_set_workq_qos(th, qos.qos_tier, qos.tier_importance);
1076 assert(kr == KERN_SUCCESS || kr == KERN_TERMINATED);
1077
1078 /* Fix up counters. */
1079 uint8_t old_bucket = tl->th_priority;
1080 uint8_t new_bucket = pthread_priority_get_class_index(priority);
1081
1082 if (old_bucket != new_bucket) {
1083 _wq_thactive_move(wq, old_bucket, new_bucket);
1084 wq->wq_thscheduled_count[old_bucket]--;
1085 wq->wq_thscheduled_count[new_bucket]++;
1086 if (old_bucket == WORKQUEUE_EVENT_MANAGER_BUCKET ||
1087 old_bucket < new_bucket) {
1088 /*
1089 * if the QoS of the thread was lowered, then this could
1090 * allow for a higher QoS thread request to run, so we need
1091 * to reevaluate.
1092 */
1093 try_run_threadreq = true;
1094 }
1095 tl->th_priority = new_bucket;
1096 }
1097
1098 bool old_overcommit = !(tl->th_flags & TH_LIST_CONSTRAINED);
1099 bool new_overcommit = priority & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
1100 if (!old_overcommit && new_overcommit) {
1101 if (wq->wq_constrained_threads_scheduled-- ==
1102 wq_max_constrained_threads) {
1103 try_run_threadreq = true;
1104 }
1105 tl->th_flags &= ~TH_LIST_CONSTRAINED;
1106 } else if (old_overcommit && !new_overcommit) {
1107 wq->wq_constrained_threads_scheduled++;
1108 tl->th_flags |= TH_LIST_CONSTRAINED;
1109 }
1110
1111 if (try_run_threadreq) {
1112 workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
1113 } else {
1114 workqueue_unlock(wq);
1115 }
1116 } else {
1117 kr = pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
1118 if (kr != KERN_SUCCESS) {
1119 qos_rv = EINVAL;
1120 }
1121 }
1122
1123 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self | DBG_FUNC_END, wq, qos.qos_tier, qos.tier_importance, 0, 0);
1124 }
1125
1126 voucher:
1127 if ((flags & _PTHREAD_SET_SELF_VOUCHER_FLAG) != 0) {
1128 kr = pthread_kern->thread_set_voucher_name(voucher);
1129 if (kr != KERN_SUCCESS) {
1130 voucher_rv = ENOENT;
1131 goto fixedpri;
1132 }
1133 }
1134
1135 fixedpri:
1136 if (qos_rv) goto done;
1137 if ((flags & _PTHREAD_SET_SELF_FIXEDPRIORITY_FLAG) != 0) {
1138 thread_extended_policy_data_t extpol = {.timeshare = 0};
1139
1140 if (!tl) tl = util_get_thread_threadlist_entry(th);
1141 if (tl) {
1142 /* Not allowed on workqueue threads */
1143 fixedpri_rv = ENOTSUP;
1144 goto done;
1145 }
1146
1147 kr = pthread_kern->thread_policy_set_internal(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
1148 if (kr != KERN_SUCCESS) {
1149 fixedpri_rv = EINVAL;
1150 goto done;
1151 }
1152 } else if ((flags & _PTHREAD_SET_SELF_TIMESHARE_FLAG) != 0) {
1153 thread_extended_policy_data_t extpol = {.timeshare = 1};
1154
1155 if (!tl) tl = util_get_thread_threadlist_entry(th);
1156 if (tl) {
1157 /* Not allowed on workqueue threads */
1158 fixedpri_rv = ENOTSUP;
1159 goto done;
1160 }
1161
1162 kr = pthread_kern->thread_policy_set_internal(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
1163 if (kr != KERN_SUCCESS) {
1164 fixedpri_rv = EINVAL;
1165 goto done;
1166 }
1167 }
1168
1169 done:
1170 if (qos_rv && voucher_rv) {
1171 /* Both failed, give that a unique error. */
1172 return EBADMSG;
1173 }
1174
1175 if (qos_rv) {
1176 return qos_rv;
1177 }
1178
1179 if (voucher_rv) {
1180 return voucher_rv;
1181 }
1182
1183 if (fixedpri_rv) {
1184 return fixedpri_rv;
1185 }
1186
1187 return 0;
1188 }
1189
1190 int
1191 _bsdthread_ctl_qos_override_start(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval)
1192 {
1193 thread_t th;
1194 int rv = 0;
1195
1196 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
1197 return ESRCH;
1198 }
1199
1200 int override_qos = pthread_priority_get_thread_qos(priority);
1201
1202 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1203 if (tl) {
1204 PTHREAD_TRACE_WQ(TRACE_wq_override_start | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0);
1205 }
1206
1207 /* The only failure case here is if we pass a tid and have it look up the thread; since we pass the uthread, this always succeeds. */
1208 pthread_kern->proc_usynch_thread_qos_add_override_for_resource_check_owner(th, override_qos, TRUE,
1209 resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE, USER_ADDR_NULL, MACH_PORT_NULL);
1210 thread_deallocate(th);
1211 return rv;
1212 }
1213
1214 int
1215 _bsdthread_ctl_qos_override_end(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t resource, user_addr_t arg3, int __unused *retval)
1216 {
1217 thread_t th;
1218 int rv = 0;
1219
1220 if (arg3 != 0) {
1221 return EINVAL;
1222 }
1223
1224 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
1225 return ESRCH;
1226 }
1227
1228 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
1229
1230 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1231 if (tl) {
1232 PTHREAD_TRACE_WQ(TRACE_wq_override_end | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 0, 0, 0);
1233 }
1234
1235 pthread_kern->proc_usynch_thread_qos_remove_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);
1236
1237 thread_deallocate(th);
1238 return rv;
1239 }
1240
1241 static int
1242 _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, user_addr_t ulock_addr)
1243 {
1244 thread_t th;
1245 int rv = 0;
1246
1247 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
1248 return ESRCH;
1249 }
1250
1251 int override_qos = pthread_priority_get_thread_qos(priority);
1252
1253 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1254 if (!tl) {
1255 thread_deallocate(th);
1256 return EPERM;
1257 }
1258
1259 PTHREAD_TRACE_WQ(TRACE_wq_override_dispatch | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0);
1260
1261 rv = pthread_kern->proc_usynch_thread_qos_add_override_for_resource_check_owner(th, override_qos, TRUE,
1262 resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE, ulock_addr, kport);
1263
1264 thread_deallocate(th);
1265 return rv;
1266 }
1267
1268 int _bsdthread_ctl_qos_dispatch_asynchronous_override_add(struct proc __unused *p, user_addr_t __unused cmd,
1269 mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval)
1270 {
1271 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport, priority, resource, USER_ADDR_NULL);
1272 }
1273
1274 int
1275 _bsdthread_ctl_qos_override_dispatch(struct proc *p __unused, user_addr_t cmd __unused, mach_port_name_t kport, pthread_priority_t priority, user_addr_t ulock_addr, int __unused *retval)
1276 {
1277 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport, priority, USER_ADDR_NULL, ulock_addr);
1278 }
1279
1280 int
1281 _bsdthread_ctl_qos_override_reset(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval)
1282 {
1283 if (arg1 != 0 || arg2 != 0 || arg3 != 0) {
1284 return EINVAL;
1285 }
1286
1287 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, 1 /* reset_all */, 0, 0, retval);
1288 }
1289
1290 int
1291 _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(struct proc __unused *p, user_addr_t __unused cmd, int reset_all, user_addr_t resource, user_addr_t arg3, int __unused *retval)
1292 {
1293 if ((reset_all && (resource != 0)) || arg3 != 0) {
1294 return EINVAL;
1295 }
1296
1297 thread_t th = current_thread();
1298 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
1299 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
1300
1301 if (!tl) {
1302 return EPERM;
1303 }
1304
1305 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_NONE, tl->th_workq, 0, 0, 0, 0);
1306
1307 resource = reset_all ? THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD : resource;
1308 pthread_kern->proc_usynch_thread_qos_reset_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE);
1309
1310 return 0;
1311 }
1312
1313 static int
1314 _bsdthread_ctl_max_parallelism(struct proc __unused *p, user_addr_t __unused cmd,
1315 int qos, unsigned long flags, int *retval)
1316 {
1317 _Static_assert(QOS_PARALLELISM_COUNT_LOGICAL ==
1318 _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL, "logical");
1319 _Static_assert(QOS_PARALLELISM_REALTIME ==
1320 _PTHREAD_QOS_PARALLELISM_REALTIME, "realtime");
1321
1322 if (flags & ~(QOS_PARALLELISM_REALTIME | QOS_PARALLELISM_COUNT_LOGICAL)) {
1323 return EINVAL;
1324 }
1325
1326 if (flags & QOS_PARALLELISM_REALTIME) {
1327 if (qos) {
1328 return EINVAL;
1329 }
1330 } else if (qos == THREAD_QOS_UNSPECIFIED || qos >= THREAD_QOS_LAST) {
1331 return EINVAL;
1332 }
1333
1334 *retval = pthread_kern->qos_max_parallelism(qos, flags);
1335 return 0;
1336 }
1337
1338 int
1339 _bsdthread_ctl(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval)
1340 {
1341 switch (cmd) {
1342 case BSDTHREAD_CTL_SET_QOS:
1343 return _bsdthread_ctl_set_qos(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
1344 case BSDTHREAD_CTL_QOS_OVERRIDE_START:
1345 return _bsdthread_ctl_qos_override_start(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1346 case BSDTHREAD_CTL_QOS_OVERRIDE_END:
1347 return _bsdthread_ctl_qos_override_end(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
1348 case BSDTHREAD_CTL_QOS_OVERRIDE_RESET:
1349 return _bsdthread_ctl_qos_override_reset(p, cmd, arg1, arg2, arg3, retval);
1350 case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH:
1351 return _bsdthread_ctl_qos_override_dispatch(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1352 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD:
1353 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1354 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET:
1355 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, (int)arg1, arg2, arg3, retval);
1356 case BSDTHREAD_CTL_SET_SELF:
1357 return _bsdthread_ctl_set_self(p, cmd, (pthread_priority_t)arg1, (mach_port_name_t)arg2, (_pthread_set_flags_t)arg3, retval);
1358 case BSDTHREAD_CTL_QOS_MAX_PARALLELISM:
1359 return _bsdthread_ctl_max_parallelism(p, cmd, (int)arg1, (unsigned long)arg2, retval);
1360 default:
1361 return EINVAL;
1362 }
1363 }
1364
1365 #pragma mark - Workqueue Implementation
1366
1367 #pragma mark wq_flags
1368
1369 static inline uint32_t
1370 _wq_flags(struct workqueue *wq)
1371 {
1372 return atomic_load_explicit(&wq->wq_flags, memory_order_relaxed);
1373 }
1374
1375 static inline bool
1376 _wq_exiting(struct workqueue *wq)
1377 {
1378 return _wq_flags(wq) & WQ_EXITING;
1379 }
1380
1381 static inline uint32_t
1382 _wq_flags_or_orig(struct workqueue *wq, uint32_t v)
1383 {
1384 #if PTHREAD_INLINE_RMW_ATOMICS
1385 uint32_t state;
1386 do {
1387 state = _wq_flags(wq);
1388 } while (!OSCompareAndSwap(state, state | v, &wq->wq_flags));
1389 return state;
1390 #else
1391 return atomic_fetch_or_explicit(&wq->wq_flags, v, memory_order_relaxed);
1392 #endif
1393 }
1394
1395 static inline uint32_t
1396 _wq_flags_and_orig(struct workqueue *wq, uint32_t v)
1397 {
1398 #if PTHREAD_INLINE_RMW_ATOMICS
1399 uint32_t state;
1400 do {
1401 state = _wq_flags(wq);
1402 } while (!OSCompareAndSwap(state, state & v, &wq->wq_flags));
1403 return state;
1404 #else
1405 return atomic_fetch_and_explicit(&wq->wq_flags, v, memory_order_relaxed);
1406 #endif
1407 }
1408
1409 static inline bool
1410 WQ_TIMER_DELAYED_NEEDED(struct workqueue *wq)
1411 {
1412 uint32_t oldflags, newflags;
1413 do {
1414 oldflags = _wq_flags(wq);
1415 if (oldflags & (WQ_EXITING | WQ_ATIMER_DELAYED_RUNNING)) {
1416 return false;
1417 }
1418 newflags = oldflags | WQ_ATIMER_DELAYED_RUNNING;
1419 } while (!OSCompareAndSwap(oldflags, newflags, &wq->wq_flags));
1420 return true;
1421 }
1422
1423 static inline bool
1424 WQ_TIMER_IMMEDIATE_NEEDED(struct workqueue *wq)
1425 {
1426 uint32_t oldflags, newflags;
1427 do {
1428 oldflags = _wq_flags(wq);
1429 if (oldflags & (WQ_EXITING | WQ_ATIMER_IMMEDIATE_RUNNING)) {
1430 return false;
1431 }
1432 newflags = oldflags | WQ_ATIMER_IMMEDIATE_RUNNING;
1433 } while (!OSCompareAndSwap(oldflags, newflags, &wq->wq_flags));
1434 return true;
1435 }
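/*
 * Editor's note: descriptive comment, not part of the original source. Both
 * helpers above atomically set their WQ_ATIMER_*_RUNNING flag and return true
 * only when the caller performed the clear-to-set transition (and the
 * workqueue is not exiting), i.e. only that caller should arm the timer.
 */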
1436
1437 #pragma mark thread requests pacing
1438
1439 static inline uint32_t
1440 _wq_pacing_shift_for_pri(int pri)
1441 {
1442 return _wq_bucket_to_thread_qos(pri) - 1;
1443 }
1444
1445 static inline int
1446 _wq_highest_paced_priority(struct workqueue *wq)
1447 {
1448 uint8_t paced = wq->wq_paced;
1449 int msb = paced ? 32 - __builtin_clz(paced) : 0; // fls(paced) == bit + 1
1450 return WORKQUEUE_EVENT_MANAGER_BUCKET - msb;
1451 }
1452
1453 static inline uint8_t
1454 _wq_pacing_bit_for_pri(int pri)
1455 {
1456 return 1u << _wq_pacing_shift_for_pri(pri);
1457 }
1458
1459 static inline bool
1460 _wq_should_pace_priority(struct workqueue *wq, int pri)
1461 {
1462 return wq->wq_paced >= _wq_pacing_bit_for_pri(pri);
1463 }
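/*
 * Editor's note: illustrative example, not part of the original source.
 * wq_paced keeps one bit per bucket at position (thread QoS - 1), so higher
 * QoS buckets use higher bits. E.g. wq_paced == 0x04 means the bucket with
 * shift 2 (priority bucket 3) is paced; fls(0x04) == 3, so
 * _wq_highest_paced_priority() returns WORKQUEUE_EVENT_MANAGER_BUCKET - 3 == 3.
 * In _wq_should_pace_priority(), wq_paced >= bit is true exactly when some
 * bucket at this QoS or higher is paced, without scanning individual bits.
 */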
1464
1465 static inline void
1466 _wq_pacing_start(struct workqueue *wq, struct threadlist *tl)
1467 {
1468 uint8_t bit = _wq_pacing_bit_for_pri(tl->th_priority);
1469 assert((tl->th_flags & TH_LIST_PACING) == 0);
1470 assert((wq->wq_paced & bit) == 0);
1471 wq->wq_paced |= bit;
1472 tl->th_flags |= TH_LIST_PACING;
1473 }
1474
1475 static inline bool
1476 _wq_pacing_end(struct workqueue *wq, struct threadlist *tl)
1477 {
1478 if (tl->th_flags & TH_LIST_PACING) {
1479 uint8_t bit = _wq_pacing_bit_for_pri(tl->th_priority);
1480 assert((wq->wq_paced & bit) != 0);
1481 wq->wq_paced ^= bit;
1482 tl->th_flags &= ~TH_LIST_PACING;
1483 return wq->wq_paced < bit; // !_wq_should_pace_priority
1484 }
1485 return false;
1486 }
1487
1488 #pragma mark thread requests
1489
1490 static void
1491 _threadreq_init_alloced(struct threadreq *req, int priority, int flags)
1492 {
1493 assert((flags & TR_FLAG_ONSTACK) == 0);
1494 req->tr_state = TR_STATE_NEW;
1495 req->tr_priority = priority;
1496 req->tr_flags = flags;
1497 }
1498
1499 static void
1500 _threadreq_init_stack(struct threadreq *req, int priority, int flags)
1501 {
1502 req->tr_state = TR_STATE_NEW;
1503 req->tr_priority = priority;
1504 req->tr_flags = flags | TR_FLAG_ONSTACK;
1505 }
1506
1507 static void
1508 _threadreq_copy_prepare(struct workqueue *wq)
1509 {
1510 again:
1511 if (wq->wq_cached_threadreq) {
1512 return;
1513 }
1514
1515 workqueue_unlock(wq);
1516 struct threadreq *req = zalloc(pthread_zone_threadreq);
1517 workqueue_lock_spin(wq);
1518
1519 if (wq->wq_cached_threadreq) {
1520 /*
1521 * We lost the race and someone left behind an extra threadreq for us
1522 * to use. Throw away our request and retry.
1523 */
1524 workqueue_unlock(wq);
1525 zfree(pthread_zone_threadreq, req);
1526 workqueue_lock_spin(wq);
1527 goto again;
1528 } else {
1529 wq->wq_cached_threadreq = req;
1530 }
1531
1532 assert(wq->wq_cached_threadreq);
1533 }
1534
1535 static bool
1536 _threadreq_copy_prepare_noblock(struct workqueue *wq)
1537 {
1538 if (wq->wq_cached_threadreq) {
1539 return true;
1540 }
1541
1542 wq->wq_cached_threadreq = zalloc_noblock(pthread_zone_threadreq);
1543
1544 return wq->wq_cached_threadreq != NULL;
1545 }
1546
1547 static inline struct threadreq_head *
1548 _threadreq_list_for_req(struct workqueue *wq, const struct threadreq *req)
1549 {
1550 if (req->tr_flags & TR_FLAG_OVERCOMMIT) {
1551 return &wq->wq_overcommit_reqlist[req->tr_priority];
1552 } else {
1553 return &wq->wq_reqlist[req->tr_priority];
1554 }
1555 }
1556
1557 static void
1558 _threadreq_enqueue(struct workqueue *wq, struct threadreq *req)
1559 {
1560 assert(req && req->tr_state == TR_STATE_NEW);
1561 if (req->tr_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1562 assert(wq->wq_event_manager_threadreq.tr_state != TR_STATE_WAITING);
1563 memcpy(&wq->wq_event_manager_threadreq, req, sizeof(struct threadreq));
1564 req = &wq->wq_event_manager_threadreq;
1565 req->tr_flags &= ~(TR_FLAG_ONSTACK | TR_FLAG_NO_PACING);
1566 } else {
1567 if (req->tr_flags & TR_FLAG_ONSTACK) {
1568 assert(wq->wq_cached_threadreq);
1569 struct threadreq *newreq = wq->wq_cached_threadreq;
1570 wq->wq_cached_threadreq = NULL;
1571
1572 memcpy(newreq, req, sizeof(struct threadreq));
1573 newreq->tr_flags &= ~(TR_FLAG_ONSTACK | TR_FLAG_NO_PACING);
1574 req->tr_state = TR_STATE_DEAD;
1575 req = newreq;
1576 }
1577 TAILQ_INSERT_TAIL(_threadreq_list_for_req(wq, req), req, tr_entry);
1578 }
1579 req->tr_state = TR_STATE_WAITING;
1580 wq->wq_reqcount++;
1581 }
1582
1583 static void
1584 _threadreq_dequeue(struct workqueue *wq, struct threadreq *req)
1585 {
1586 if (req->tr_priority != WORKQUEUE_EVENT_MANAGER_BUCKET) {
1587 struct threadreq_head *req_list = _threadreq_list_for_req(wq, req);
1588 #if DEBUG
1589 struct threadreq *cursor = NULL;
1590 TAILQ_FOREACH(cursor, req_list, tr_entry) {
1591 if (cursor == req) break;
1592 }
1593 assert(cursor == req);
1594 #endif
1595 TAILQ_REMOVE(req_list, req, tr_entry);
1596 }
1597 wq->wq_reqcount--;
1598 }
1599
1600 /*
1601 * Mark a thread request as complete. At this point, it is treated as owned by
1602 * the submitting subsystem and you should assume it could be freed.
1603 *
1604 * Called with the workqueue lock held.
1605 */
1606 static int
1607 _threadreq_complete_and_unlock(proc_t p, struct workqueue *wq,
1608 struct threadreq *req, struct threadlist *tl)
1609 {
1610 struct threadreq *req_tofree = NULL;
1611 bool sync = (req->tr_state == TR_STATE_NEW);
1612 bool workloop = req->tr_flags & TR_FLAG_WORKLOOP;
1613 bool onstack = req->tr_flags & TR_FLAG_ONSTACK;
1614 bool kevent = req->tr_flags & TR_FLAG_KEVENT;
1615 bool unbinding = tl->th_flags & TH_LIST_UNBINDING;
1616 bool locked = true;
1617 bool waking_parked_thread = (tl->th_flags & TH_LIST_BUSY);
1618 int ret;
1619
1620 req->tr_state = TR_STATE_COMPLETE;
1621
1622 if (!workloop && !onstack && req != &wq->wq_event_manager_threadreq) {
1623 if (wq->wq_cached_threadreq) {
1624 req_tofree = req;
1625 } else {
1626 wq->wq_cached_threadreq = req;
1627 }
1628 }
1629
1630 if (tl->th_flags & TH_LIST_UNBINDING) {
1631 tl->th_flags &= ~TH_LIST_UNBINDING;
1632 assert((tl->th_flags & TH_LIST_KEVENT_BOUND));
1633 } else if (workloop || kevent) {
1634 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
1635 tl->th_flags |= TH_LIST_KEVENT_BOUND;
1636 }
1637
1638 if (workloop) {
1639 workqueue_unlock(wq);
1640 ret = pthread_kern->workloop_fulfill_threadreq(wq->wq_proc, (void*)req,
1641 tl->th_thread, sync ? WORKLOOP_FULFILL_THREADREQ_SYNC : 0);
1642 assert(ret == 0);
1643 locked = false;
1644 } else if (kevent) {
1645 unsigned int kevent_flags = KEVENT_FLAG_WORKQ;
1646 if (sync) {
1647 kevent_flags |= KEVENT_FLAG_SYNCHRONOUS_BIND;
1648 }
1649 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1650 kevent_flags |= KEVENT_FLAG_WORKQ_MANAGER;
1651 }
1652 workqueue_unlock(wq);
1653 ret = kevent_qos_internal_bind(wq->wq_proc,
1654 class_index_get_thread_qos(tl->th_priority), tl->th_thread,
1655 kevent_flags);
1656 if (ret != 0) {
1657 workqueue_lock_spin(wq);
1658 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
1659 locked = true;
1660 } else {
1661 locked = false;
1662 }
1663 }
1664
1665 /*
1666 * Run Thread, Run!
1667 */
1668 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 0, 0, 0, 0);
1669 PTHREAD_TRACE_WQ_REQ(TRACE_wq_runitem | DBG_FUNC_START, wq, req, tl->th_priority,
1670 thread_tid(current_thread()), thread_tid(tl->th_thread));
1671
1672 if (waking_parked_thread) {
1673 if (!locked) {
1674 workqueue_lock_spin(wq);
1675 }
1676 tl->th_flags &= ~(TH_LIST_BUSY);
1677 if ((tl->th_flags & TH_LIST_REMOVING_VOUCHER) == 0) {
1678 /*
1679 * If the thread is in the process of removing its voucher, then it
1680 * isn't actually in the wait event yet and we don't need to wake
1681 * it up. Save the trouble (and potential lock-ordering issues
1682 * (see 30617015)).
1683 */
1684 thread_wakeup_thread(tl, tl->th_thread);
1685 }
1686 workqueue_unlock(wq);
1687
1688 if (req_tofree) zfree(pthread_zone_threadreq, req_tofree);
1689 return WQ_RUN_TR_THREAD_STARTED;
1690 }
1691
1692 assert ((tl->th_flags & TH_LIST_PACING) == 0);
1693 if (locked) {
1694 workqueue_unlock(wq);
1695 }
1696 if (req_tofree) zfree(pthread_zone_threadreq, req_tofree);
1697 if (unbinding) {
1698 return WQ_RUN_TR_THREAD_STARTED;
1699 }
1700 _setup_wqthread(p, tl->th_thread, wq, tl, WQ_SETUP_CLEAR_VOUCHER);
1701 pthread_kern->unix_syscall_return(EJUSTRETURN);
1702 __builtin_unreachable();
1703 }
1704
1705 /*
1706 * Mark a thread request as cancelled. Has similar ownership semantics to the
1707 * complete call above.
1708 */
1709 static void
1710 _threadreq_cancel(struct workqueue *wq, struct threadreq *req)
1711 {
1712 assert(req->tr_state == TR_STATE_WAITING);
1713 req->tr_state = TR_STATE_DEAD;
1714
1715 assert((req->tr_flags & TR_FLAG_ONSTACK) == 0);
1716 if (req->tr_flags & TR_FLAG_WORKLOOP) {
1717 __assert_only int ret;
1718 ret = pthread_kern->workloop_fulfill_threadreq(wq->wq_proc, (void*)req,
1719 THREAD_NULL, WORKLOOP_FULFILL_THREADREQ_CANCEL);
1720 assert(ret == 0 || ret == ECANCELED);
1721 } else if (req != &wq->wq_event_manager_threadreq) {
1722 zfree(pthread_zone_threadreq, req);
1723 }
1724 }
1725
1726 #pragma mark workqueue lock
1727
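/*
 * Thin wrappers around the per-workqueue spin lock. The _kdp variant only
 * reports whether the lock is held and appears intended for debugger (kdp)
 * context, where taking the lock is not an option.
 */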
1728 static boolean_t workqueue_lock_spin_is_acquired_kdp(struct workqueue *wq) {
1729 return kdp_lck_spin_is_acquired(&wq->wq_lock);
1730 }
1731
1732 static void
1733 workqueue_lock_spin(struct workqueue *wq)
1734 {
1735 assert(ml_get_interrupts_enabled() == TRUE);
1736 lck_spin_lock(&wq->wq_lock);
1737 }
1738
1739 static bool
1740 workqueue_lock_try(struct workqueue *wq)
1741 {
1742 return lck_spin_try_lock(&wq->wq_lock);
1743 }
1744
1745 static void
1746 workqueue_unlock(struct workqueue *wq)
1747 {
1748 lck_spin_unlock(&wq->wq_lock);
1749 }
1750
1751 #pragma mark workqueue add timer
1752
1753 /**
1754 * Sets up the timer which will call out to workqueue_add_timer
1755 */
1756 static void
1757 workqueue_interval_timer_start(struct workqueue *wq)
1758 {
1759 uint64_t deadline;
1760
1761 /* n.b. wq_timer_interval is reset to 0 in workqueue_add_timer if the
1762 ATIMER_RUNNING flag is not present. The net effect here is that if a
1763 sequence of threads is required, we'll double the time before we give out
1764 the next one. */
1765 if (wq->wq_timer_interval == 0) {
1766 wq->wq_timer_interval = wq_stalled_window_usecs;
1767
1768 } else {
1769 wq->wq_timer_interval = wq->wq_timer_interval * 2;
1770
1771 if (wq->wq_timer_interval > wq_max_timer_interval_usecs) {
1772 wq->wq_timer_interval = wq_max_timer_interval_usecs;
1773 }
1774 }
1775 clock_interval_to_deadline(wq->wq_timer_interval, 1000, &deadline);
1776
1777 PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1778 _wq_flags(wq), wq->wq_timer_interval, 0);
1779
1780 thread_call_t call = wq->wq_atimer_delayed_call;
1781 if (thread_call_enter1_delayed(call, call, deadline)) {
1782 panic("delayed_call was already enqueued");
1783 }
1784 }
1785
1786 /**
1787 * Immediately trigger the workqueue_add_timer
1788 */
1789 static void
1790 workqueue_interval_timer_trigger(struct workqueue *wq)
1791 {
1792 PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1793 _wq_flags(wq), 0, 0);
1794
1795 thread_call_t call = wq->wq_atimer_immediate_call;
1796 if (thread_call_enter1(call, call)) {
1797 panic("immediate_call was already enqueued");
1798 }
1799 }
1800
1801 /**
1802 * returns whether lastblocked_tsp is within wq_stalled_window_usecs of cur_ts
1803 */
1804 static boolean_t
1805 wq_thread_is_busy(uint64_t cur_ts, _Atomic uint64_t *lastblocked_tsp)
1806 {
1807 clock_sec_t secs;
1808 clock_usec_t usecs;
1809 uint64_t lastblocked_ts;
1810 uint64_t elapsed;
1811
1812 lastblocked_ts = atomic_load_explicit(lastblocked_tsp, memory_order_relaxed);
1813 if (lastblocked_ts >= cur_ts) {
1814 /*
1815 * because the update of the timestamp when a thread blocks isn't
1816 * serialized against us looking at it (i.e. we don't hold the workq lock)
1817 * it's possible to have a timestamp that matches the current time or
1818 * that even looks to be in the future relative to when we grabbed the current
1819 * time... just treat this as a busy thread since it must have just blocked.
1820 */
1821 return (TRUE);
1822 }
1823 elapsed = cur_ts - lastblocked_ts;
1824
1825 pthread_kern->absolutetime_to_microtime(elapsed, &secs, &usecs);
1826
1827 return (secs == 0 && usecs < wq_stalled_window_usecs);
1828 }
1829
1830 /**
1831 * handler function for the timer
1832 */
1833 static void
1834 workqueue_add_timer(struct workqueue *wq, thread_call_t thread_call_self)
1835 {
1836 proc_t p = wq->wq_proc;
1837
1838 workqueue_lock_spin(wq);
1839
1840 PTHREAD_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_START, wq,
1841 _wq_flags(wq), wq->wq_nthreads, wq->wq_thidlecount, 0);
1842
1843 /*
1844 * There are two tricky issues here.
1845 *
1846 * First issue: we start the thread_calls that invoke this routine without
1847 * the workqueue lock held. The scheduler callback needs to trigger
1848 * reevaluation of the number of running threads but shouldn't take that
1849 * lock, so we can't use it to synchronize state around the thread_call.
1850 * As a result, it might re-enter the thread_call while this routine is
1851 * already running. This could cause it to fire a second time and we'll
1852 * have two add_timers running at once. Obviously, we don't want that to
1853 * keep stacking, so we need to keep it at two timers.
1854 *
1855 * Solution: use wq_flags (accessed via atomic CAS) to synchronize the
1856 * enqueue of the thread_call itself. When a thread needs to trigger the
1857 * add_timer, it checks for ATIMER_DELAYED_RUNNING and, when not set, sets
1858 * the flag then does a thread_call_enter. We'll then remove that flag
1859 * only once we've got the lock and it's safe for the thread_call to be
1860 * entered again.
1861 *
1862 * Second issue: we need to make sure that the two timers don't execute this
1863 * routine concurrently. We can't use the workqueue lock for this because
1864 * we'll need to drop it during our execution.
1865 *
1866 * Solution: use WQL_ATIMER_BUSY as a condition variable to indicate that
1867 * we are currently executing the routine and the next thread should wait.
1868 *
1869 * After all that, we arrive at the following four possible states:
1870 * !WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY no pending timer, no active timer
1871 * !WQ_ATIMER_DELAYED_RUNNING && WQL_ATIMER_BUSY no pending timer, 1 active timer
1872 * WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY 1 pending timer, no active timer
1873 * WQ_ATIMER_DELAYED_RUNNING && WQL_ATIMER_BUSY 1 pending timer, 1 active timer
1874 *
1875 * A further complication: sometimes we need to trigger this function to run
1876 * without delay. Because we aren't under a lock between setting
1877 * WQ_ATIMER_DELAYED_RUNNING and calling thread_call_enter, we can't simply
1878 * re-enter the thread call: if thread_call_enter() returned false, we
1879 * wouldn't be able to distinguish the case where the thread_call had
1880 * already fired from the case where it hadn't been entered yet from the
1881 * other thread. So, we use a separate thread_call for immediate
1882 * invocations, and a separate RUNNING flag, WQ_ATIMER_IMMEDIATE_RUNNING.
1883 */
1884
1885 while (wq->wq_lflags & WQL_ATIMER_BUSY) {
1886 wq->wq_lflags |= WQL_ATIMER_WAITING;
1887
1888 assert_wait((caddr_t)wq, (THREAD_UNINT));
1889 workqueue_unlock(wq);
1890
1891 thread_block(THREAD_CONTINUE_NULL);
1892
1893 workqueue_lock_spin(wq);
1894 }
1895 /*
1896 * Prevent _workqueue_mark_exiting() from going away
1897 */
1898 wq->wq_lflags |= WQL_ATIMER_BUSY;
1899
1900 /*
1901 * Decide which timer we are and remove the RUNNING flag.
1902 */
1903 if (thread_call_self == wq->wq_atimer_delayed_call) {
1904 uint64_t wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_DELAYED_RUNNING);
1905 if ((wq_flags & WQ_ATIMER_DELAYED_RUNNING) == 0) {
1906 panic("workqueue_add_timer(delayed) w/o WQ_ATIMER_DELAYED_RUNNING");
1907 }
1908 } else if (thread_call_self == wq->wq_atimer_immediate_call) {
1909 uint64_t wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_IMMEDIATE_RUNNING);
1910 if ((wq_flags & WQ_ATIMER_IMMEDIATE_RUNNING) == 0) {
1911 panic("workqueue_add_timer(immediate) w/o WQ_ATIMER_IMMEDIATE_RUNNING");
1912 }
1913 } else {
1914 panic("workqueue_add_timer can't figure out which timer it is");
1915 }
1916
1917 int ret = WQ_RUN_TR_THREAD_STARTED;
1918 while (ret == WQ_RUN_TR_THREAD_STARTED && wq->wq_reqcount) {
1919 ret = workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
1920
1921 workqueue_lock_spin(wq);
1922 }
1923 _threadreq_copy_prepare(wq);
1924
1925 /*
1926 * If we called WQ_TIMER_NEEDED above, then this flag will be set if that
1927 * call marked the timer running. If so, we let the timer interval grow.
1928 * Otherwise, we reset it back to 0.
1929 */
1930 uint32_t wq_flags = _wq_flags(wq);
1931 if (!(wq_flags & WQ_ATIMER_DELAYED_RUNNING)) {
1932 wq->wq_timer_interval = 0;
1933 }
1934
1935 wq->wq_lflags &= ~WQL_ATIMER_BUSY;
1936
1937 if ((wq_flags & WQ_EXITING) || (wq->wq_lflags & WQL_ATIMER_WAITING)) {
1938 /*
1939 * wakeup the thread hung up in _workqueue_mark_exiting or
1940 * workqueue_add_timer waiting for this timer to finish getting out of
1941 * the way
1942 */
1943 wq->wq_lflags &= ~WQL_ATIMER_WAITING;
1944 wakeup(wq);
1945 }
1946
1947 PTHREAD_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_END, wq, 0, wq->wq_nthreads, wq->wq_thidlecount, 0);
1948
1949 workqueue_unlock(wq);
1950 }
1951
1952 #pragma mark thread state tracking
1953
1954 // called by spinlock code when trying to yield to lock owner
1955 void
1956 _workqueue_thread_yielded(void)
1957 {
1958 }
1959
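/*
 * Scheduler callback invoked when a workqueue thread blocks or unblocks.
 *
 * This runs without the workqueue lock (and may run from interrupt context
 * on unblock), so it only touches the atomically maintained wq_thactive
 * counts. On block, if falling below the concurrency limit might allow a
 * pending constrained request to be admitted, it arms the delayed add
 * timer to redrive thread requests.
 */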
1960 static void
1961 workqueue_callback(int type, thread_t thread)
1962 {
1963 struct uthread *uth = pthread_kern->get_bsdthread_info(thread);
1964 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
1965 struct workqueue *wq = tl->th_workq;
1966 uint32_t old_count, req_qos, qos = tl->th_priority;
1967 wq_thactive_t old_thactive;
1968
1969 switch (type) {
1970 case SCHED_CALL_BLOCK: {
1971 bool start_timer = false;
1972
1973 old_thactive = _wq_thactive_dec(wq, tl->th_priority);
1974 req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
1975 old_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
1976 qos, NULL, NULL);
1977
1978 if (old_count == wq_max_concurrency[tl->th_priority]) {
1979 /*
1980 * The number of active threads at this priority has fallen below
1981 * the maximum number of concurrent threads that are allowed to run
1982 *
1983 * if we collide with another thread trying to update the
1984 * last_blocked (really unlikely since another thread would have to
1985 * get scheduled and then block after we start down this path), it's
1986 * not a problem. Either timestamp is adequate, so no need to retry
1987 */
1988 atomic_store_explicit(&wq->wq_lastblocked_ts[qos],
1989 mach_absolute_time(), memory_order_relaxed);
1990 }
1991
1992 if (req_qos == WORKQUEUE_EVENT_MANAGER_BUCKET || qos > req_qos) {
1993 /*
1994 * The blocking thread is at a lower QoS than the highest currently
1995 * pending constrained request, nothing has to be redriven
1996 */
1997 } else {
1998 uint32_t max_busycount, old_req_count;
1999 old_req_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
2000 req_qos, NULL, &max_busycount);
2001 /*
2002 * If it is possible that may_start_constrained_thread had refused
2003 * admission due to being over the max concurrency, we may need to
2004 * spin up a new thread.
2005 *
2006 * We take into account the maximum number of busy threads
2007 * that can affect may_start_constrained_thread, since looking at the
2008 * actual number may_start_constrained_thread will see is racy.
2009 *
2010 * IOW at NCPU = 4, for IN (req_qos = 1), if the old req count is
2011 * between NCPU (4) and NCPU - 2 (2) we need to redrive.
2012 */
2013 if (wq_max_concurrency[req_qos] <= old_req_count + max_busycount &&
2014 old_req_count <= wq_max_concurrency[req_qos]) {
2015 if (WQ_TIMER_DELAYED_NEEDED(wq)) {
2016 start_timer = true;
2017 workqueue_interval_timer_start(wq);
2018 }
2019 }
2020 }
2021
2022 PTHREAD_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_START, wq,
2023 old_count - 1, qos | (req_qos << 8),
2024 wq->wq_reqcount << 1 | start_timer, 0);
2025 break;
2026 }
2027 case SCHED_CALL_UNBLOCK: {
2028 /*
2029 * we cannot take the workqueue_lock here...
2030 * an UNBLOCK can occur from a timer event which
2031 * is run from an interrupt context... if the workqueue_lock
2032 * is already held by this processor, we'll deadlock...
2033 * the thread lock for the thread being UNBLOCKED
2034 * is also held
2035 */
2036 old_thactive = _wq_thactive_inc(wq, qos);
2037 if (pthread_debug_tracing) {
2038 req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
2039 old_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
2040 qos, NULL, NULL);
2041 PTHREAD_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_END, wq,
2042 old_count + 1, qos | (req_qos << 8),
2043 wq->wq_threads_scheduled, 0);
2044 }
2045 break;
2046 }
2047 }
2048 }
2049
2050 sched_call_t
2051 _workqueue_get_sched_callback(void)
2052 {
2053 return workqueue_callback;
2054 }
2055
2056 #pragma mark thread addition/removal
2057
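/*
 * Total VM reservation for one workqueue thread: a guard page, the default
 * stack, and the page-rounded pthread_t area.
 */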
2058 static mach_vm_size_t
2059 _workqueue_allocsize(struct workqueue *wq)
2060 {
2061 proc_t p = wq->wq_proc;
2062 mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);
2063 mach_vm_size_t pthread_size =
2064 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(wq->wq_map));
2065 return guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
2066 }
2067
2068 /**
2069 * pop goes the thread
2070 *
2071 * If fromexit is set, the call is from workqueue_exit(),
2072 * so some cleanups are to be avoided.
2073 */
2074 static void
2075 workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use)
2076 {
2077 struct uthread * uth;
2078 struct workqueue * wq = tl->th_workq;
2079
2080 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET){
2081 TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
2082 } else {
2083 TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
2084 }
2085
2086 if (fromexit == 0) {
2087 assert(wq->wq_nthreads && wq->wq_thidlecount);
2088 wq->wq_nthreads--;
2089 wq->wq_thidlecount--;
2090 }
2091
2092 /*
2093 * Clear the threadlist pointer in the uthread so that a
2094 * thread blocked awaiting wakeup for termination will not
2095 * access the thread list, as it is about to be
2096 * freed.
2097 */
2098 pthread_kern->thread_sched_call(tl->th_thread, NULL);
2099
2100 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
2101 if (uth != (struct uthread *)0) {
2102 pthread_kern->uthread_set_threadlist(uth, NULL);
2103 }
2104 if (fromexit == 0) {
2105 /* during exit the lock is not held */
2106 workqueue_unlock(wq);
2107 }
2108
2109 if ( (tl->th_flags & TH_LIST_NEW) || first_use ) {
2110 /*
2111 * thread was created, but never used...
2112 * need to clean up the stack and port ourselves
2113 * since we're not going to spin up through the
2114 * normal exit path triggered from Libc
2115 */
2116 if (fromexit == 0) {
2117 /* vm map is already deallocated when this is called from exit */
2118 (void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, _workqueue_allocsize(wq));
2119 }
2120 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(wq->wq_task), tl->th_thport);
2121 }
2122 /*
2123 * drop our ref on the thread
2124 */
2125 thread_deallocate(tl->th_thread);
2126
2127 zfree(pthread_zone_threadlist, tl);
2128 }
2129
2130
2131 /**
2132 * Try to add a new workqueue thread.
2133 *
2134 * - called with workq lock held
2135 * - dropped and retaken around thread creation
2136 * - return with workq lock held
2137 */
2138 static bool
2139 workqueue_addnewthread(proc_t p, struct workqueue *wq)
2140 {
2141 kern_return_t kret;
2142
2143 wq->wq_nthreads++;
2144
2145 workqueue_unlock(wq);
2146
2147 struct threadlist *tl = zalloc(pthread_zone_threadlist);
2148 bzero(tl, sizeof(struct threadlist));
2149
2150 thread_t th;
2151 kret = pthread_kern->thread_create_workq_waiting(wq->wq_task, wq_unpark_continue, tl, &th);
2152 if (kret != KERN_SUCCESS) {
2153 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 0, 0, 0);
2154 goto fail_free;
2155 }
2156
2157 mach_vm_offset_t stackaddr = pthread_kern->proc_get_stack_addr_hint(p);
2158
2159 mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);
2160 mach_vm_size_t pthread_size =
2161 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(wq->wq_map));
2162 mach_vm_size_t th_allocsize = guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
2163
2164 kret = mach_vm_map(wq->wq_map, &stackaddr,
2165 th_allocsize, page_size-1,
2166 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE, NULL,
2167 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
2168 VM_INHERIT_DEFAULT);
2169
2170 if (kret != KERN_SUCCESS) {
2171 kret = mach_vm_allocate(wq->wq_map,
2172 &stackaddr, th_allocsize,
2173 VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE);
2174 }
2175
2176 if (kret != KERN_SUCCESS) {
2177 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 1, 0, 0);
2178 goto fail_terminate;
2179 }
2180
2181 /*
2182 * The guard page is at the lowest address
2183 * The stack base is the highest address
2184 */
2185 kret = mach_vm_protect(wq->wq_map, stackaddr, guardsize, FALSE, VM_PROT_NONE);
2186 if (kret != KERN_SUCCESS) {
2187 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 2, 0, 0);
2188 goto fail_vm_deallocate;
2189 }
2190
2191
2192 pthread_kern->thread_set_tag(th, THREAD_TAG_PTHREAD | THREAD_TAG_WORKQUEUE);
2193 pthread_kern->thread_static_param(th, TRUE);
2194
2195 /*
2196 * convert_thread_to_port() consumes a reference
2197 */
2198 thread_reference(th);
2199 void *sright = (void *)pthread_kern->convert_thread_to_port(th);
2200 tl->th_thport = pthread_kern->ipc_port_copyout_send(sright,
2201 pthread_kern->task_get_ipcspace(wq->wq_task));
2202
2203 tl->th_flags = TH_LIST_INITED | TH_LIST_NEW;
2204 tl->th_thread = th;
2205 tl->th_workq = wq;
2206 tl->th_stackaddr = stackaddr;
2207 tl->th_priority = WORKQUEUE_NUM_BUCKETS;
2208
2209 struct uthread *uth;
2210 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
2211
2212 workqueue_lock_spin(wq);
2213
2214 void *current_tl = pthread_kern->uthread_get_threadlist(uth);
2215 if (current_tl == NULL) {
2216 pthread_kern->uthread_set_threadlist(uth, tl);
2217 TAILQ_INSERT_TAIL(&wq->wq_thidlelist, tl, th_entry);
2218 wq->wq_thidlecount++;
2219 } else if (current_tl == WQ_THREADLIST_EXITING_POISON) {
2220 /*
2221 * Failed thread creation race: The thread already woke up and has exited.
2222 */
2223 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 3, 0, 0);
2224 goto fail_unlock;
2225 } else {
2226 panic("Unexpected initial threadlist value");
2227 }
2228
2229 PTHREAD_TRACE_WQ(TRACE_wq_thread_create | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
2230
2231 return (TRUE);
2232
2233 fail_unlock:
2234 workqueue_unlock(wq);
2235 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(wq->wq_task),
2236 tl->th_thport);
2237
2238 fail_vm_deallocate:
2239 (void) mach_vm_deallocate(wq->wq_map, stackaddr, th_allocsize);
2240
2241 fail_terminate:
2242 if (pthread_kern->thread_will_park_or_terminate) {
2243 pthread_kern->thread_will_park_or_terminate(th);
2244 }
2245 (void)thread_terminate(th);
2246 thread_deallocate(th);
2247
2248 fail_free:
2249 zfree(pthread_zone_threadlist, tl);
2250
2251 workqueue_lock_spin(wq);
2252 wq->wq_nthreads--;
2253
2254 return (FALSE);
2255 }
2256
2257 /**
2258 * Setup per-process state for the workqueue.
2259 */
2260 int
2261 _workq_open(struct proc *p, __unused int32_t *retval)
2262 {
2263 struct workqueue * wq;
2264 char * ptr;
2265 uint32_t num_cpus;
2266 int error = 0;
2267
2268 if (pthread_kern->proc_get_register(p) == 0) {
2269 return EINVAL;
2270 }
2271
2272 num_cpus = pthread_kern->ml_get_max_cpus();
2273
2274 if (wq_init_constrained_limit) {
2275 uint32_t limit;
2276 /*
2277 * Set up the limit for the constrained pool. This is a
2278 * virtual pool in that we don't maintain it on a separate
2279 * idle and run list.
2280 */
2281 limit = num_cpus * WORKQUEUE_CONSTRAINED_FACTOR;
2282
2283 if (limit > wq_max_constrained_threads)
2284 wq_max_constrained_threads = limit;
2285
2286 wq_init_constrained_limit = 0;
2287
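/*
 * Clamp the global thread limit: wq_thactive packs per-bucket counts, so
 * staying at or below WQ_THACTIVE_BUCKET_HALF presumably keeps those
 * counters from overflowing, and we also leave headroom below the
 * system-wide thread limit.
 */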
2288 if (wq_max_threads > WQ_THACTIVE_BUCKET_HALF) {
2289 wq_max_threads = WQ_THACTIVE_BUCKET_HALF;
2290 }
2291 if (wq_max_threads > pthread_kern->config_thread_max - 20) {
2292 wq_max_threads = pthread_kern->config_thread_max - 20;
2293 }
2294 }
2295
2296 if (pthread_kern->proc_get_wqptr(p) == NULL) {
2297 if (pthread_kern->proc_init_wqptr_or_wait(p) == FALSE) {
2298 assert(pthread_kern->proc_get_wqptr(p) != NULL);
2299 goto out;
2300 }
2301
2302 ptr = (char *)zalloc(pthread_zone_workqueue);
2303 bzero(ptr, sizeof(struct workqueue));
2304
2305 wq = (struct workqueue *)ptr;
2306 wq->wq_proc = p;
2307 wq->wq_task = current_task();
2308 wq->wq_map = pthread_kern->current_map();
2309
2310 // Start the event manager at the priority hinted at by the policy engine
2311 int mgr_priority_hint = pthread_kern->task_get_default_manager_qos(current_task());
2312 wq->wq_event_manager_priority = (uint32_t)thread_qos_get_pthread_priority(mgr_priority_hint) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2313
2314 TAILQ_INIT(&wq->wq_thrunlist);
2315 TAILQ_INIT(&wq->wq_thidlelist);
2316 for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
2317 TAILQ_INIT(&wq->wq_overcommit_reqlist[i]);
2318 TAILQ_INIT(&wq->wq_reqlist[i]);
2319 }
2320
2321 wq->wq_atimer_delayed_call =
2322 thread_call_allocate_with_priority((thread_call_func_t)workqueue_add_timer,
2323 (thread_call_param_t)wq, THREAD_CALL_PRIORITY_KERNEL);
2324 wq->wq_atimer_immediate_call =
2325 thread_call_allocate_with_priority((thread_call_func_t)workqueue_add_timer,
2326 (thread_call_param_t)wq, THREAD_CALL_PRIORITY_KERNEL);
2327
2328 lck_spin_init(&wq->wq_lock, pthread_lck_grp, pthread_lck_attr);
2329
2330 wq->wq_cached_threadreq = zalloc(pthread_zone_threadreq);
2331 *(wq_thactive_t *)&wq->wq_thactive =
2332 (wq_thactive_t)WQ_THACTIVE_NO_PENDING_REQUEST <<
2333 WQ_THACTIVE_QOS_SHIFT;
2334
2335 pthread_kern->proc_set_wqptr(p, wq);
2336
2337 }
2338 out:
2339
2340 return(error);
2341 }
2342
2343 /*
2344 * Routine: workqueue_mark_exiting
2345 *
2346 * Function: Mark the work queue such that new threads will not be added to the
2347 * work queue after we return.
2348 *
2349 * Conditions: Called against the current process.
2350 */
2351 void
2352 _workqueue_mark_exiting(struct proc *p)
2353 {
2354 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
2355 if (!wq) return;
2356
2357 PTHREAD_TRACE_WQ(TRACE_wq_pthread_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);
2358
2359 workqueue_lock_spin(wq);
2360
2361 /*
2362 * We arm the add timer without holding the workqueue lock so we need
2363 * to synchronize with any running or soon to be running timers.
2364 *
2365 * Threads that intend to arm the timer atomically OR
2366 * WQ_ATIMER_{DELAYED,IMMEDIATE}_RUNNING into the wq_flags, only if
2367 * WQ_EXITING is not present. So, once we have set WQ_EXITING, we can
2368 * be sure that no new RUNNING flags will be set, but still need to
2369 * wait for the already running timers to complete.
2370 *
2371 * We always hold the workq lock when dropping WQ_ATIMER_RUNNING, so
2372 * the check for and sleep until clear is protected.
2373 */
2374 uint64_t wq_flags = _wq_flags_or_orig(wq, WQ_EXITING);
2375
2376 if (wq_flags & WQ_ATIMER_DELAYED_RUNNING) {
2377 if (thread_call_cancel(wq->wq_atimer_delayed_call) == TRUE) {
2378 wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_DELAYED_RUNNING);
2379 }
2380 }
2381 if (wq_flags & WQ_ATIMER_IMMEDIATE_RUNNING) {
2382 if (thread_call_cancel(wq->wq_atimer_immediate_call) == TRUE) {
2383 wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_IMMEDIATE_RUNNING);
2384 }
2385 }
2386 while ((_wq_flags(wq) & (WQ_ATIMER_DELAYED_RUNNING | WQ_ATIMER_IMMEDIATE_RUNNING)) ||
2387 (wq->wq_lflags & WQL_ATIMER_BUSY)) {
2388 assert_wait((caddr_t)wq, (THREAD_UNINT));
2389 workqueue_unlock(wq);
2390
2391 thread_block(THREAD_CONTINUE_NULL);
2392
2393 workqueue_lock_spin(wq);
2394 }
2395
2396 /*
2397 * Save off pending requests, will complete/free them below after unlocking
2398 */
2399 TAILQ_HEAD(, threadreq) local_list = TAILQ_HEAD_INITIALIZER(local_list);
2400
2401 for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
2402 TAILQ_CONCAT(&local_list, &wq->wq_overcommit_reqlist[i], tr_entry);
2403 TAILQ_CONCAT(&local_list, &wq->wq_reqlist[i], tr_entry);
2404 }
2405
2406 /*
2407 * XXX: Can't do a deferred cancel of the event manager request, so just smash it.
2408 */
2409 assert((wq->wq_event_manager_threadreq.tr_flags & TR_FLAG_WORKLOOP) == 0);
2410 wq->wq_event_manager_threadreq.tr_state = TR_STATE_DEAD;
2411
2412 workqueue_unlock(wq);
2413
2414 struct threadreq *tr, *tr_temp;
2415 TAILQ_FOREACH_SAFE(tr, &local_list, tr_entry, tr_temp) {
2416 _threadreq_cancel(wq, tr);
2417 }
2418 PTHREAD_TRACE(TRACE_wq_pthread_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
2419 }
2420
2421 /*
2422 * Routine: workqueue_exit
2423 *
2424 * Function: clean up the work queue structure(s) now that there are no threads
2425 * left running inside the work queue (except possibly current_thread).
2426 *
2427 * Conditions: Called by the last thread in the process.
2428 * Called against current process.
2429 */
2430 void
2431 _workqueue_exit(struct proc *p)
2432 {
2433 struct workqueue * wq;
2434 struct threadlist * tl, *tlist;
2435 struct uthread *uth;
2436
2437 wq = pthread_kern->proc_get_wqptr(p);
2438 if (wq != NULL) {
2439
2440 PTHREAD_TRACE_WQ(TRACE_wq_workqueue_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);
2441
2442 pthread_kern->proc_set_wqptr(p, NULL);
2443
2444 /*
2445 * Clean up workqueue data structures for threads that exited and
2446 * didn't get a chance to clean up after themselves.
2447 */
2448 TAILQ_FOREACH_SAFE(tl, &wq->wq_thrunlist, th_entry, tlist) {
2449 assert((tl->th_flags & TH_LIST_RUNNING) != 0);
2450
2451 pthread_kern->thread_sched_call(tl->th_thread, NULL);
2452
2453 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
2454 if (uth != (struct uthread *)0) {
2455 pthread_kern->uthread_set_threadlist(uth, NULL);
2456 }
2457 TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
2458
2459 /*
2460 * drop our last ref on the thread
2461 */
2462 thread_deallocate(tl->th_thread);
2463
2464 zfree(pthread_zone_threadlist, tl);
2465 }
2466 TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlelist, th_entry, tlist) {
2467 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
2468 assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
2469 workqueue_removethread(tl, true, false);
2470 }
2471 TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlemgrlist, th_entry, tlist) {
2472 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
2473 assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
2474 workqueue_removethread(tl, true, false);
2475 }
2476 if (wq->wq_cached_threadreq) {
2477 zfree(pthread_zone_threadreq, wq->wq_cached_threadreq);
2478 }
2479 thread_call_free(wq->wq_atimer_delayed_call);
2480 thread_call_free(wq->wq_atimer_immediate_call);
2481 lck_spin_destroy(&wq->wq_lock, pthread_lck_grp);
2482
2483 for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
2484 assert(TAILQ_EMPTY(&wq->wq_overcommit_reqlist[i]));
2485 assert(TAILQ_EMPTY(&wq->wq_reqlist[i]));
2486 }
2487
2488 zfree(pthread_zone_workqueue, wq);
2489
2490 PTHREAD_TRACE(TRACE_wq_workqueue_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
2491 }
2492 }
2493
2494
2495 #pragma mark workqueue thread manipulation
2496
2497
2498 /**
2499 * Entry point for libdispatch to ask for threads
2500 */
2501 static int
2502 wqops_queue_reqthreads(struct proc *p, int reqcount,
2503 pthread_priority_t priority)
2504 {
2505 bool overcommit = _pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
2506 bool event_manager = _pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2507 int class = event_manager ? WORKQUEUE_EVENT_MANAGER_BUCKET :
2508 pthread_priority_get_class_index(priority);
2509
2510 if ((reqcount <= 0) || (class < 0) || (class >= WORKQUEUE_NUM_BUCKETS) ||
2511 (overcommit && event_manager)) {
2512 return EINVAL;
2513 }
2514
2515 struct workqueue *wq;
2516 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2517 return EINVAL;
2518 }
2519
2520 workqueue_lock_spin(wq);
2521 _threadreq_copy_prepare(wq);
2522
2523 PTHREAD_TRACE_WQ(TRACE_wq_wqops_reqthreads | DBG_FUNC_NONE, wq, reqcount, priority, 0, 0);
2524
2525 int tr_flags = 0;
2526 if (overcommit) tr_flags |= TR_FLAG_OVERCOMMIT;
2527 if (reqcount > 1) {
2528 /*
2529 * When libdispatch asks for more than one thread, it wants to achieve
2530 * parallelism. Pacing would be detrimental to that goal, so treat
2531 * these requests specially and skip the pacing admission check.
2532 */
2533 tr_flags |= TR_FLAG_NO_PACING;
2534 }
2535
2536 while (reqcount-- && !_wq_exiting(wq)) {
2537 struct threadreq req;
2538 _threadreq_init_stack(&req, class, tr_flags);
2539
2540 workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, true);
2541
2542 workqueue_lock_spin(wq); /* reacquire */
2543 _threadreq_copy_prepare(wq);
2544 }
2545
2546 workqueue_unlock(wq);
2547
2548 return 0;
2549 }
2550
2551 /*
2552 * Used by the kevent system to request threads.
2553 *
2554 * Currently count is ignored and we always return one thread per invocation.
2555 */
2556 static thread_t
2557 _workq_kevent_reqthreads(struct proc *p, pthread_priority_t priority,
2558 bool no_emergency)
2559 {
2560 int wq_run_tr = WQ_RUN_TR_THROTTLED;
2561 bool emergency_thread = false;
2562 struct threadreq req;
2563
2564
2565 struct workqueue *wq;
2566 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2567 return THREAD_NULL;
2568 }
2569
2570 int class = pthread_priority_get_class_index(priority);
2571
2572 workqueue_lock_spin(wq);
2573 bool has_threadreq = _threadreq_copy_prepare_noblock(wq);
2574
2575 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, NULL, priority, 0, 0);
2576
2577 /*
2578 * Skip straight to event manager if that's what was requested
2579 */
2580 if ((_pthread_priority_get_qos_newest(priority) == QOS_CLASS_UNSPECIFIED) ||
2581 (_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)){
2582 goto event_manager;
2583 }
2584
2585 bool will_pace = _wq_should_pace_priority(wq, class);
2586 if ((wq->wq_thidlecount == 0 || will_pace) && has_threadreq == false) {
2587 /*
2588 * We'll need to persist the request and can't, so return the emergency
2589 * thread instead, which has a persistent request object.
2590 */
2591 emergency_thread = true;
2592 goto event_manager;
2593 }
2594
2595 /*
2596 * Handle overcommit requests
2597 */
2598 if ((_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0){
2599 _threadreq_init_stack(&req, class, TR_FLAG_KEVENT | TR_FLAG_OVERCOMMIT);
2600 wq_run_tr = workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, false);
2601 goto done;
2602 }
2603
2604 /*
2605 * Handle constrained requests
2606 */
2607 boolean_t may_start = may_start_constrained_thread(wq, class, NULL, false);
2608 if (may_start || no_emergency) {
2609 _threadreq_init_stack(&req, class, TR_FLAG_KEVENT);
2610 wq_run_tr = workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, false);
2611 goto done;
2612 } else {
2613 emergency_thread = true;
2614 }
2615
2616
2617 event_manager:
2618 _threadreq_init_stack(&req, WORKQUEUE_EVENT_MANAGER_BUCKET, TR_FLAG_KEVENT);
2619 wq_run_tr = workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, false);
2620
2621 done:
2622 if (wq_run_tr == WQ_RUN_TR_THREAD_NEEDED && WQ_TIMER_IMMEDIATE_NEEDED(wq)) {
2623 workqueue_interval_timer_trigger(wq);
2624 }
2625 return emergency_thread ? (void*)-1 : 0;
2626 }
2627
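/*
 * Legacy kevent entry point. Exactly one request per call is supported;
 * the priority and the no-emergency flag are extracted from the request
 * and forwarded to _workq_kevent_reqthreads.
 */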
2628 thread_t
2629 _workq_reqthreads(struct proc *p, __assert_only int requests_count,
2630 workq_reqthreads_req_t request)
2631 {
2632 assert(requests_count == 1);
2633
2634 pthread_priority_t priority = request->priority;
2635 bool no_emergency = request->count & WORKQ_REQTHREADS_NOEMERGENCY;
2636
2637 return _workq_kevent_reqthreads(p, priority, no_emergency);
2638 }
2639
2640
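/*
 * Kernel (kevent/workloop) entry point for submitting or redriving thread
 * requests. KEVENT requests are forwarded to _workq_kevent_reqthreads;
 * WORKLOOP requests use a caller-allocated threadreq and may be matched
 * directly with the current thread when it is unbinding; REDRIVE simply
 * re-runs the highest priority pending request.
 */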
2641 int
2642 workq_kern_threadreq(struct proc *p, workq_threadreq_t _req,
2643 enum workq_threadreq_type type, unsigned long priority, int flags)
2644 {
2645 struct workqueue *wq;
2646 int ret;
2647
2648 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2649 return EINVAL;
2650 }
2651
2652 switch (type) {
2653 case WORKQ_THREADREQ_KEVENT: {
2654 bool no_emergency = flags & WORKQ_THREADREQ_FLAG_NOEMERGENCY;
2655 (void)_workq_kevent_reqthreads(p, priority, no_emergency);
2656 return 0;
2657 }
2658 case WORKQ_THREADREQ_WORKLOOP:
2659 case WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL: {
2660 struct threadreq *req = (struct threadreq *)_req;
2661 int req_class = pthread_priority_get_class_index(priority);
2662 int req_flags = TR_FLAG_WORKLOOP;
2663 if ((_pthread_priority_get_flags(priority) &
2664 _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0){
2665 req_flags |= TR_FLAG_OVERCOMMIT;
2666 }
2667
2668 thread_t thread = current_thread();
2669 struct threadlist *tl = util_get_thread_threadlist_entry(thread);
2670
2671 if (tl && tl != WQ_THREADLIST_EXITING_POISON &&
2672 (tl->th_flags & TH_LIST_UNBINDING)) {
2673 /*
2674 * We're being called back synchronously from the context of
2675 * kevent_qos_internal_unbind from within wqops_thread_return(),
2676 * so we can try to match up this thread with this request!
2677 */
2678 } else {
2679 tl = NULL;
2680 }
2681
2682 _threadreq_init_alloced(req, req_class, req_flags);
2683 workqueue_lock_spin(wq);
2684 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, req, priority, 1, 0);
2685 ret = workqueue_run_threadreq_and_unlock(p, wq, tl, req, false);
2686 if (ret == WQ_RUN_TR_EXITING) {
2687 return ECANCELED;
2688 }
2689 if (ret == WQ_RUN_TR_THREAD_NEEDED) {
2690 if (type == WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL) {
2691 return EAGAIN;
2692 }
2693 if (WQ_TIMER_IMMEDIATE_NEEDED(wq)) {
2694 workqueue_interval_timer_trigger(wq);
2695 }
2696 }
2697 return 0;
2698 }
2699 case WORKQ_THREADREQ_REDRIVE:
2700 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, 0, 0, 4, 0);
2701 workqueue_lock_spin(wq);
2702 ret = workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
2703 if (ret == WQ_RUN_TR_EXITING) {
2704 return ECANCELED;
2705 }
2706 return 0;
2707 default:
2708 return ENOTSUP;
2709 }
2710 }
2711
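/*
 * Change the priority of, or cancel, a previously submitted workloop
 * thread request. The request must still be in the WAITING state (i.e.
 * enqueued and not yet bound to a thread).
 */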
2712 int
2713 workq_kern_threadreq_modify(struct proc *p, workq_threadreq_t _req,
2714 enum workq_threadreq_op operation, unsigned long arg1,
2715 unsigned long __unused arg2)
2716 {
2717 struct threadreq *req = (struct threadreq *)_req;
2718 struct workqueue *wq;
2719 int priclass, ret = 0, wq_tr_rc = WQ_RUN_TR_THROTTLED;
2720
2721 if (req == NULL || (wq = pthread_kern->proc_get_wqptr(p)) == NULL) {
2722 return EINVAL;
2723 }
2724
2725 workqueue_lock_spin(wq);
2726
2727 if (_wq_exiting(wq)) {
2728 ret = ECANCELED;
2729 goto out_unlock;
2730 }
2731
2732 /*
2733 * Find/validate the referenced request structure
2734 */
2735 if (req->tr_state != TR_STATE_WAITING) {
2736 ret = EINVAL;
2737 goto out_unlock;
2738 }
2739 assert(req->tr_priority < WORKQUEUE_EVENT_MANAGER_BUCKET);
2740 assert(req->tr_flags & TR_FLAG_WORKLOOP);
2741
2742 switch (operation) {
2743 case WORKQ_THREADREQ_CHANGE_PRI:
2744 case WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL:
2745 priclass = pthread_priority_get_class_index(arg1);
2746 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, req, arg1, 2, 0);
2747 if (req->tr_priority == priclass) {
2748 goto out_unlock;
2749 }
2750 _threadreq_dequeue(wq, req);
2751 req->tr_priority = priclass;
2752 req->tr_state = TR_STATE_NEW; // what was old is new again
2753 wq_tr_rc = workqueue_run_threadreq_and_unlock(p, wq, NULL, req, false);
2754 goto out;
2755
2756 case WORKQ_THREADREQ_CANCEL:
2757 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, req, 0, 3, 0);
2758 _threadreq_dequeue(wq, req);
2759 req->tr_state = TR_STATE_DEAD;
2760 break;
2761
2762 default:
2763 ret = ENOTSUP;
2764 break;
2765 }
2766
2767 out_unlock:
2768 workqueue_unlock(wq);
2769 out:
2770 if (wq_tr_rc == WQ_RUN_TR_THREAD_NEEDED) {
2771 if (operation == WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL) {
2772 ret = EAGAIN;
2773 } else if (WQ_TIMER_IMMEDIATE_NEEDED(wq)) {
2774 workqueue_interval_timer_trigger(wq);
2775 }
2776 }
2777 return ret;
2778 }
2779
2780
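/*
 * Handle a workqueue thread returning to the kernel after finishing its
 * work item: reset its signal mask, unbind it from kevent if needed,
 * squash any dispatch QoS overrides (fixing up the bucket accounting),
 * and then try to reuse the thread for the next request, parking it if
 * there is none.
 */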
2781 static int
2782 wqops_thread_return(struct proc *p, struct workqueue *wq)
2783 {
2784 thread_t th = current_thread();
2785 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
2786 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
2787
2788 /* reset signal mask on the workqueue thread to default state */
2789 if (pthread_kern->uthread_get_sigmask(uth) != (sigset_t)(~workq_threadmask)) {
2790 pthread_kern->proc_lock(p);
2791 pthread_kern->uthread_set_sigmask(uth, ~workq_threadmask);
2792 pthread_kern->proc_unlock(p);
2793 }
2794
2795 if (wq == NULL || !tl) {
2796 return EINVAL;
2797 }
2798
2799 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_START, tl->th_workq, 0, 0, 0, 0);
2800
2801 /*
2802 * This squash call has neat semantics: it removes the specified overrides,
2803 * replacing the current requested QoS with the previous effective QoS from
2804 * those overrides. This means we won't be preempted due to having our QoS
2805 * lowered. Of course, now our understanding of the thread's QoS is wrong,
2806 * so we'll adjust below.
2807 */
2808 bool was_manager = (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
2809 int new_qos;
2810
2811 if (!was_manager) {
2812 new_qos = pthread_kern->proc_usynch_thread_qos_squash_override_for_resource(th,
2813 THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD,
2814 THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE);
2815 }
2816
2817 PTHREAD_TRACE_WQ(TRACE_wq_runitem | DBG_FUNC_END, wq, tl->th_priority, 0, 0, 0);
2818
2819 workqueue_lock_spin(wq);
2820
2821 if (tl->th_flags & TH_LIST_KEVENT_BOUND) {
2822 unsigned int flags = KEVENT_FLAG_WORKQ;
2823 if (was_manager) {
2824 flags |= KEVENT_FLAG_WORKQ_MANAGER;
2825 }
2826
2827 tl->th_flags |= TH_LIST_UNBINDING;
2828 workqueue_unlock(wq);
2829 kevent_qos_internal_unbind(p, class_index_get_thread_qos(tl->th_priority), th, flags);
2830 if (!(tl->th_flags & TH_LIST_UNBINDING)) {
2831 _setup_wqthread(p, th, wq, tl, WQ_SETUP_CLEAR_VOUCHER);
2832 pthread_kern->unix_syscall_return(EJUSTRETURN);
2833 __builtin_unreachable();
2834 }
2835 workqueue_lock_spin(wq);
2836 tl->th_flags &= ~(TH_LIST_KEVENT_BOUND | TH_LIST_UNBINDING);
2837 }
2838
2839 if (!was_manager) {
2840 /* Fix up counters from the squash operation. */
2841 uint8_t old_bucket = tl->th_priority;
2842 uint8_t new_bucket = thread_qos_get_class_index(new_qos);
2843
2844 if (old_bucket != new_bucket) {
2845 _wq_thactive_move(wq, old_bucket, new_bucket);
2846 wq->wq_thscheduled_count[old_bucket]--;
2847 wq->wq_thscheduled_count[new_bucket]++;
2848
2849 PTHREAD_TRACE_WQ(TRACE_wq_thread_squash | DBG_FUNC_NONE, wq, tl->th_priority, new_bucket, 0, 0);
2850 tl->th_priority = new_bucket;
2851 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_END, tl->th_workq, new_qos, 0, 0, 0);
2852 }
2853 }
2854
2855 workqueue_run_threadreq_and_unlock(p, wq, tl, NULL, false);
2856 return 0;
2857 }
2858
2859 /**
2860 * Multiplexed call to interact with the workqueue mechanism
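 *
 * The options argument selects the operation (see the switch below).
 * Userspace reaches this through libpthread's __workq_kernreturn syscall
 * stub; an illustrative call (stub signature approximate) would be:
 *     __workq_kernreturn(WQOPS_QUEUE_REQTHREADS, NULL, nthreads, priority);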
2861 */
2862 int
2863 _workq_kernreturn(struct proc *p,
2864 int options,
2865 user_addr_t item,
2866 int arg2,
2867 int arg3,
2868 int32_t *retval)
2869 {
2870 struct workqueue *wq;
2871 int error = 0;
2872
2873 if (pthread_kern->proc_get_register(p) == 0) {
2874 return EINVAL;
2875 }
2876
2877 switch (options) {
2878 case WQOPS_QUEUE_NEWSPISUPP: {
2879 /*
2880 * arg2 = offset of serialno into dispatch queue
2881 * arg3 = kevent support
2882 */
2883 int offset = arg2;
2884 if (arg3 & 0x01){
2885 // If we get here, then userspace has indicated support for kevent delivery.
2886 }
2887
2888 pthread_kern->proc_set_dispatchqueue_serialno_offset(p, (uint64_t)offset);
2889 break;
2890 }
2891 case WQOPS_QUEUE_REQTHREADS: {
2892 /*
2893 * arg2 = number of threads to start
2894 * arg3 = priority
2895 */
2896 error = wqops_queue_reqthreads(p, arg2, arg3);
2897 break;
2898 }
2899 case WQOPS_SET_EVENT_MANAGER_PRIORITY: {
2900 /*
2901 * arg2 = priority for the manager thread
2902 *
2903 * if _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG is set, the
2904 * ~_PTHREAD_PRIORITY_FLAGS_MASK contains a scheduling priority instead
2905 * of a QOS value
2906 */
2907 pthread_priority_t pri = arg2;
2908
2909 wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2910 if (wq == NULL) {
2911 error = EINVAL;
2912 break;
2913 }
2914 workqueue_lock_spin(wq);
2915 if (pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){
2916 /*
2917 * If userspace passes a scheduling priority, that takes precedence
2918 * over any QoS. (So, userspace should take care not to accidentally
2919 * lower the priority this way.)
2920 */
2921 uint32_t sched_pri = pri & _PTHREAD_PRIORITY_SCHED_PRI_MASK;
2922 if (wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){
2923 wq->wq_event_manager_priority = MAX(sched_pri, wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_MASK)
2924 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2925 } else {
2926 wq->wq_event_manager_priority = sched_pri
2927 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2928 }
2929 } else if ((wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){
2930 int cur_qos = pthread_priority_get_thread_qos(wq->wq_event_manager_priority);
2931 int new_qos = pthread_priority_get_thread_qos(pri);
2932 wq->wq_event_manager_priority = (uint32_t)thread_qos_get_pthread_priority(MAX(cur_qos, new_qos)) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2933 }
2934 workqueue_unlock(wq);
2935 break;
2936 }
2937 case WQOPS_THREAD_KEVENT_RETURN:
2938 case WQOPS_THREAD_WORKLOOP_RETURN:
2939 wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2940 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_END, wq, options, 0, 0, 0);
2941 if (item != 0 && arg2 != 0) {
2942 int32_t kevent_retval;
2943 int ret;
2944 if (options == WQOPS_THREAD_KEVENT_RETURN) {
2945 ret = kevent_qos_internal(p, -1, item, arg2, item, arg2, NULL, NULL,
2946 KEVENT_FLAG_WORKQ | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS,
2947 &kevent_retval);
2948 } else /* options == WQOPS_THREAD_WORKLOOP_RETURN */ {
2949 kqueue_id_t kevent_id = -1;
2950 ret = kevent_id_internal(p, &kevent_id, item, arg2, item, arg2,
2951 NULL, NULL,
2952 KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS,
2953 &kevent_retval);
2954 }
2955 /*
2956 * We shouldn't be getting more errors out than events we put in, so
2957 * reusing the input buffer should always provide enough space. But,
2958 * the assert is commented out since we get errors in edge cases in the
2959 * process lifecycle.
2960 */
2961 //assert(ret == KERN_SUCCESS && kevent_retval >= 0);
2962 if (ret != KERN_SUCCESS){
2963 error = ret;
2964 break;
2965 } else if (kevent_retval > 0){
2966 assert(kevent_retval <= arg2);
2967 *retval = kevent_retval;
2968 error = 0;
2969 break;
2970 }
2971 }
2972 goto thread_return;
2973
2974 case WQOPS_THREAD_RETURN:
2975 wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2976 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_END, wq, options, 0, 0, 0);
2977 thread_return:
2978 error = wqops_thread_return(p, wq);
2979 // NOT REACHED except in case of error
2980 assert(error);
2981 break;
2982
2983 case WQOPS_SHOULD_NARROW: {
2984 /*
2985 * arg2 = priority to test
2986 * arg3 = unused
2987 */
2988 pthread_priority_t priority = arg2;
2989 thread_t th = current_thread();
2990 struct threadlist *tl = util_get_thread_threadlist_entry(th);
2991
2992 if (tl == NULL || (tl->th_flags & TH_LIST_CONSTRAINED) == 0) {
2993 error = EINVAL;
2994 break;
2995 }
2996
2997 int class = pthread_priority_get_class_index(priority);
2998 wq = tl->th_workq;
2999 workqueue_lock_spin(wq);
3000 bool should_narrow = !may_start_constrained_thread(wq, class, tl, false);
3001 workqueue_unlock(wq);
3002
3003 *retval = should_narrow;
3004 break;
3005 }
3006 default:
3007 error = EINVAL;
3008 break;
3009 }
3010
3011 switch (options) {
3012 case WQOPS_THREAD_KEVENT_RETURN:
3013 case WQOPS_THREAD_WORKLOOP_RETURN:
3014 case WQOPS_THREAD_RETURN:
3015 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START, wq, options, 0, 0, 0);
3016 break;
3017 }
3018 return (error);
3019 }
3020
3021 /*
3022 * We have no work to do, park ourselves on the idle list.
3023 *
3024 * Consumes the workqueue lock and does not return.
3025 */
3026 static void __dead2
3027 parkit(struct workqueue *wq, struct threadlist *tl, thread_t thread)
3028 {
3029 assert(thread == tl->th_thread);
3030 assert(thread == current_thread());
3031
3032 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_START, wq, 0, 0, 0, 0);
3033
3034 uint32_t us_to_wait = 0;
3035
3036 TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
3037
3038 tl->th_flags &= ~TH_LIST_RUNNING;
3039 tl->th_flags &= ~TH_LIST_KEVENT;
3040 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
3041
3042 if (tl->th_flags & TH_LIST_CONSTRAINED) {
3043 wq->wq_constrained_threads_scheduled--;
3044 tl->th_flags &= ~TH_LIST_CONSTRAINED;
3045 }
3046
3047 _wq_thactive_dec(wq, tl->th_priority);
3048 wq->wq_thscheduled_count[tl->th_priority]--;
3049 wq->wq_threads_scheduled--;
3050 uint32_t thidlecount = ++wq->wq_thidlecount;
3051
3052 pthread_kern->thread_sched_call(thread, NULL);
3053
3054 /*
3055 * We'd like to always have one manager thread parked so that we can have
3056 * low latency when we need to bring a manager thread up. If that idle
3057 * thread list is empty, make this thread a manager thread.
3058 *
3059 * XXX: This doesn't check that there's not a manager thread outstanding,
3060 * so it's based on the assumption that most manager callouts will change
3061 * their QoS before parking. If that stops being true, this may end up
3062 * costing us more than we gain.
3063 */
3064 if (TAILQ_EMPTY(&wq->wq_thidlemgrlist) &&
3065 tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET){
3066 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority | DBG_FUNC_NONE,
3067 wq, thread_tid(thread),
3068 (tl->th_priority << 16) | WORKQUEUE_EVENT_MANAGER_BUCKET, 2, 0);
3069 reset_priority(tl, pthread_priority_from_wq_class_index(wq, WORKQUEUE_EVENT_MANAGER_BUCKET));
3070 tl->th_priority = WORKQUEUE_EVENT_MANAGER_BUCKET;
3071 }
3072
3073 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET){
3074 TAILQ_INSERT_HEAD(&wq->wq_thidlemgrlist, tl, th_entry);
3075 } else {
3076 TAILQ_INSERT_HEAD(&wq->wq_thidlelist, tl, th_entry);
3077 }
3078
3079 /*
3080 * When we remove the voucher from the thread, we may lose our importance
3081 * causing us to get preempted, so we do this after putting the thread on
3082 * the idle list. That way, when we get our importance back we'll be able
3083 * to use this thread from e.g. the kevent call out to deliver a boosting
3084 * message.
3085 */
3086 tl->th_flags |= TH_LIST_REMOVING_VOUCHER;
3087 workqueue_unlock(wq);
3088 if (pthread_kern->thread_will_park_or_terminate) {
3089 pthread_kern->thread_will_park_or_terminate(tl->th_thread);
3090 }
3091 __assert_only kern_return_t kr;
3092 kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
3093 assert(kr == KERN_SUCCESS);
3094 workqueue_lock_spin(wq);
3095 tl->th_flags &= ~(TH_LIST_REMOVING_VOUCHER);
3096
3097 if ((tl->th_flags & TH_LIST_RUNNING) == 0) {
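/*
 * Scale the idle timeout with the size of the idle pool: the first
 * parked threads wait roughly the full reduce-pool window, each
 * additional idle thread trims about 1% off, and once more than 100
 * threads are idle we wait only 1% of the window.
 */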
3098 if (thidlecount < 101) {
3099 us_to_wait = wq_reduce_pool_window_usecs - ((thidlecount-2) * (wq_reduce_pool_window_usecs / 100));
3100 } else {
3101 us_to_wait = wq_reduce_pool_window_usecs / 100;
3102 }
3103
3104 thread_set_pending_block_hint(thread, kThreadWaitParkedWorkQueue);
3105 assert_wait_timeout_with_leeway((caddr_t)tl, (THREAD_INTERRUPTIBLE),
3106 TIMEOUT_URGENCY_SYS_BACKGROUND|TIMEOUT_URGENCY_LEEWAY, us_to_wait,
3107 wq_reduce_pool_window_usecs/10, NSEC_PER_USEC);
3108
3109 workqueue_unlock(wq);
3110
3111 thread_block(wq_unpark_continue);
3112 panic("thread_block(wq_unpark_continue) returned!");
3113 } else {
3114 workqueue_unlock(wq);
3115
3116 /*
3117 * While we had the lock dropped to unset our voucher, someone came
3118 * around and made us runnable. But because we weren't waiting on the
3119 * event, their wakeup() was ineffectual. To correct for that, we just
3120 * run the continuation ourselves.
3121 */
3122 wq_unpark_continue(NULL, THREAD_AWAKENED);
3123 }
3124 }
3125
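/*
 * Admission check for constrained (non-overcommit) requests: a new thread
 * may start only if the process-wide constrained thread cap has not been
 * reached and the active + recently-blocked ("busy") thread count at this
 * QoS and above is still below the configured max concurrency.
 */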
3126 static bool
3127 may_start_constrained_thread(struct workqueue *wq, uint32_t at_priclass,
3128 struct threadlist *tl, bool may_start_timer)
3129 {
3130 uint32_t req_qos = _wq_thactive_best_constrained_req_qos(wq);
3131 wq_thactive_t thactive;
3132
3133 if (may_start_timer && at_priclass < req_qos) {
3134 /*
3135 * When called from workqueue_run_threadreq_and_unlock(), pre-post the
3136 * new, higher priority into the thactive state so that
3137 * workqueue_callback() makes the right decision.
3138 *
3139 * If the admission check passes, workqueue_run_threadreq_and_unlock
3140 * will reset this value before running the request.
3141 */
3142 thactive = _wq_thactive_set_best_constrained_req_qos(wq, req_qos,
3143 at_priclass);
3144 #ifdef __LP64__
3145 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update, 1, (uint64_t)thactive,
3146 (uint64_t)(thactive >> 64), 0, 0);
3147 #endif
3148 } else {
3149 thactive = _wq_thactive(wq);
3150 }
3151
3152 uint32_t constrained_threads = wq->wq_constrained_threads_scheduled;
3153 if (tl && (tl->th_flags & TH_LIST_CONSTRAINED)) {
3154 /*
3155 * don't count the current thread as scheduled
3156 */
3157 constrained_threads--;
3158 }
3159 if (constrained_threads >= wq_max_constrained_threads) {
3160 PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 1,
3161 wq->wq_constrained_threads_scheduled,
3162 wq_max_constrained_threads, 0);
3163 /*
3164 * we need 1 or more constrained threads to return to the kernel before
3165 * we can dispatch additional work
3166 */
3167 return false;
3168 }
3169
3170 /*
3171 * Compute a metric for how many threads are active. We find the
3172 * highest priority request outstanding and then add up the number of
3173 * active threads in that and all higher-priority buckets. We'll also add
3174 * any "busy" threads which are not active but blocked recently enough that
3175 * we can't be sure they've gone idle yet. We'll then compare this metric
3176 * to our max concurrency to decide whether to add a new thread.
3177 */
3178
3179 uint32_t busycount, thactive_count;
3180
3181 thactive_count = _wq_thactive_aggregate_downto_qos(wq, thactive,
3182 at_priclass, &busycount, NULL);
3183
3184 if (tl && tl->th_priority <= at_priclass) {
3185 /*
3186 * don't count this thread as currently active
3187 */
3188 assert(thactive_count > 0);
3189 thactive_count--;
3190 }
3191
3192 if (thactive_count + busycount < wq_max_concurrency[at_priclass]) {
3193 PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 2,
3194 thactive_count, busycount, 0);
3195 return true;
3196 } else {
3197 PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 3,
3198 thactive_count, busycount, 0);
3199 }
3200
3201 if (busycount && may_start_timer) {
3202 /*
3203 * If this is called from the add timer, we won't have another timer
3204 * fire when the thread exits the "busy" state, so rearm the timer.
3205 */
3206 if (WQ_TIMER_DELAYED_NEEDED(wq)) {
3207 workqueue_interval_timer_start(wq);
3208 }
3209 }
3210
3211 return false;
3212 }
3213
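/*
 * Take a thread off the appropriate idle list (the manager idle list is
 * preferred for event manager requests), mark it running and busy, move
 * it to the run list, and update the scheduled/active counts.
 *
 * Called with the workqueue lock held; panics if no idle thread exists.
 */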
3214 static struct threadlist *
3215 pop_from_thidlelist(struct workqueue *wq, uint32_t priclass)
3216 {
3217 assert(wq->wq_thidlecount);
3218
3219 struct threadlist *tl = NULL;
3220
3221 if (!TAILQ_EMPTY(&wq->wq_thidlemgrlist) &&
3222 (priclass == WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlelist))){
3223 tl = TAILQ_FIRST(&wq->wq_thidlemgrlist);
3224 TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
3225 assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
3226 } else if (!TAILQ_EMPTY(&wq->wq_thidlelist) &&
3227 (priclass != WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlemgrlist))){
3228 tl = TAILQ_FIRST(&wq->wq_thidlelist);
3229 TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
3230 assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
3231 } else {
3232 panic("pop_from_thidlelist called with no threads available");
3233 }
3234 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
3235
3236 assert(wq->wq_thidlecount);
3237 wq->wq_thidlecount--;
3238
3239 TAILQ_INSERT_TAIL(&wq->wq_thrunlist, tl, th_entry);
3240
3241 tl->th_flags |= TH_LIST_RUNNING | TH_LIST_BUSY;
3242
3243 wq->wq_threads_scheduled++;
3244 wq->wq_thscheduled_count[priclass]++;
3245 _wq_thactive_inc(wq, priclass);
3246 return tl;
3247 }
3248
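/*
 * Map a bucket index back to a pthread priority; the event manager bucket
 * uses the priority configured on the workqueue.
 */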
3249 static pthread_priority_t
3250 pthread_priority_from_wq_class_index(struct workqueue *wq, int index)
3251 {
3252 if (index == WORKQUEUE_EVENT_MANAGER_BUCKET){
3253 return wq->wq_event_manager_priority;
3254 } else {
3255 return class_index_get_pthread_priority(index);
3256 }
3257 }
3258
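/*
 * Apply a pthread priority to a workqueue thread: either a QoS value
 * (undoing any raw scheduling priority previously set for the event
 * manager), or, when _PTHREAD_PRIORITY_SCHED_PRI_FLAG is set, a raw
 * timeshare scheduling priority with the QoS cleared.
 */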
3259 static void
3260 reset_priority(struct threadlist *tl, pthread_priority_t pri)
3261 {
3262 kern_return_t ret;
3263 thread_t th = tl->th_thread;
3264
3265 if ((pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){
3266 ret = pthread_kern->thread_set_workq_qos(th, pthread_priority_get_thread_qos(pri), 0);
3267 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3268
3269 if (tl->th_flags & TH_LIST_EVENT_MGR_SCHED_PRI) {
3270
3271 /* Reset priority to default (masked by QoS) */
3272
3273 ret = pthread_kern->thread_set_workq_pri(th, 31, POLICY_TIMESHARE);
3274 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3275
3276 tl->th_flags &= ~TH_LIST_EVENT_MGR_SCHED_PRI;
3277 }
3278 } else {
3279 ret = pthread_kern->thread_set_workq_qos(th, THREAD_QOS_UNSPECIFIED, 0);
3280 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3281 ret = pthread_kern->thread_set_workq_pri(th, (pri & (~_PTHREAD_PRIORITY_FLAGS_MASK)), POLICY_TIMESHARE);
3282 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3283
3284 tl->th_flags |= TH_LIST_EVENT_MGR_SCHED_PRI;
3285 }
3286 }
3287
3288 /*
3289 * Picks the best request to run, and returns the best overcommit fallback
3290 * if the best pick is non overcommit and risks failing its admission check.
3291 */
3292 static struct threadreq *
3293 workqueue_best_threadreqs(struct workqueue *wq, struct threadlist *tl,
3294 struct threadreq **fallback)
3295 {
3296 struct threadreq *req, *best_req = NULL;
3297 int priclass, prilimit;
3298
3299 if ((wq->wq_event_manager_threadreq.tr_state == TR_STATE_WAITING) &&
3300 ((wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0) ||
3301 (tl && tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET))) {
3302 /*
3303 * There's an event manager request and either:
3304 * - no event manager currently running
3305 * - we are re-using the event manager
3306 */
3307 req = &wq->wq_event_manager_threadreq;
3308 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select | DBG_FUNC_NONE, wq, req, 1, 0, 0);
3309 return req;
3310 }
3311
3312 if (tl) {
3313 prilimit = WORKQUEUE_EVENT_MANAGER_BUCKET;
3314 } else {
3315 prilimit = _wq_highest_paced_priority(wq);
3316 }
3317 for (priclass = 0; priclass < prilimit; priclass++) {
3318 req = TAILQ_FIRST(&wq->wq_overcommit_reqlist[priclass]);
3319 if (req) {
3320 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select | DBG_FUNC_NONE, wq, req, 2, 0, 0);
3321 if (best_req) {
3322 *fallback = req;
3323 } else {
3324 best_req = req;
3325 }
3326 break;
3327 }
3328 if (!best_req) {
3329 best_req = TAILQ_FIRST(&wq->wq_reqlist[priclass]);
3330 if (best_req) {
3331 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select | DBG_FUNC_NONE, wq, best_req, 3, 0, 0);
3332 }
3333 }
3334 }
3335 return best_req;
3336 }
3337
3338 /**
3339 * Runs a thread request on a thread
3340 *
3341 * - if thread is THREAD_NULL, will find a thread and run the request there.
3342 * Otherwise, the thread must be the current thread.
3343 *
3344 * - if req is NULL, will find the highest priority request and run that. If
3345 * it is not NULL, it must be a threadreq object in state NEW. If it can not
3346 * be run immediately, it will be enqueued and moved to state WAITING.
3347 *
3348 * Either way, the thread request object serviced will be moved to state
3349 * PENDING and attached to the threadlist.
3350 *
3351 * Should be called with the workqueue lock held. Will drop it.
3352 *
3353 * WARNING: _workq_kevent_reqthreads needs to be able to preflight any
3354 * admission checks in this function. If you are changing this function,
3355 * keep that one up-to-date.
3356 *
3357 * - if parking_tl is non NULL, then the current thread is parking. This will
3358 * try to reuse this thread for a request. If no match is found, it will be
3359 * parked.
3360 */
3361 static int
3362 workqueue_run_threadreq_and_unlock(proc_t p, struct workqueue *wq,
3363 struct threadlist *parking_tl, struct threadreq *req,
3364 bool may_add_new_thread)
3365 {
3366 struct threadreq *incoming_req = req;
3367
3368 struct threadlist *tl = parking_tl;
3369 int rc = WQ_RUN_TR_THROTTLED;
3370
3371 assert(tl == NULL || tl->th_thread == current_thread());
3372 assert(req == NULL || req->tr_state == TR_STATE_NEW);
3373 assert(!may_add_new_thread || !tl);
3374
3375 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq | DBG_FUNC_START, wq, req,
3376 tl ? thread_tid(tl->th_thread) : 0,
3377 req ? (req->tr_priority << 16 | req->tr_flags) : 0, 0);
3378
3379 /*
3380 * Special cases when provided an event manager request
3381 */
3382 if (req && req->tr_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
3383 // Clients must not rely on identity of event manager requests
3384 assert(req->tr_flags & TR_FLAG_ONSTACK);
3385 // You can't be both overcommit and event manager
3386 assert((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0);
3387
3388 /*
3389 * We can only ever have one event manager request, so coalesce them if
3390 * there's already one outstanding.
3391 */
3392 if (wq->wq_event_manager_threadreq.tr_state == TR_STATE_WAITING) {
3393 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_mgr_merge | DBG_FUNC_NONE, wq, req, 0, 0, 0);
3394
3395 struct threadreq *existing_req = &wq->wq_event_manager_threadreq;
3396 if (req->tr_flags & TR_FLAG_KEVENT) {
3397 existing_req->tr_flags |= TR_FLAG_KEVENT;
3398 }
3399
3400 req = existing_req;
3401 incoming_req = NULL;
3402 }
3403
3404 if (wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] &&
3405 (!tl || tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET)){
3406 /*
3407 * There can only be one event manager running at a time.
3408 */
3409 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 1, 0, 0, 0);
3410 goto done;
3411 }
3412 }
3413
3414 again: // Start again after creating a thread
3415
3416 if (_wq_exiting(wq)) {
3417 rc = WQ_RUN_TR_EXITING;
3418 goto exiting;
3419 }
3420
3421 /*
3422 * Thread request selection and admission control
3423 */
3424 struct threadreq *fallback = NULL;
3425 if (req) {
3426 if ((req->tr_flags & TR_FLAG_NO_PACING) == 0 &&
3427 _wq_should_pace_priority(wq, req->tr_priority)) {
3428 /*
3429 * If a request fails the pacing admission check, then thread
3430 * requests are redriven when the pacing thread is finally scheduled
3431 * when it calls _wq_pacing_end() in wq_unpark_continue().
3432 */
3433 goto done;
3434 }
3435 } else if (wq->wq_reqcount == 0) {
3436 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 2, 0, 0, 0);
3437 goto done;
3438 } else if ((req = workqueue_best_threadreqs(wq, tl, &fallback)) == NULL) {
3439 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 3, 0, 0, 0);
3440 goto done;
3441 }
3442
3443 if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0 &&
3444 (req->tr_priority < WORKQUEUE_EVENT_MANAGER_BUCKET)) {
3445 if (!may_start_constrained_thread(wq, req->tr_priority, parking_tl, true)) {
3446 if (!fallback) {
3447 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 4, 0, 0, 0);
3448 goto done;
3449 }
3450 assert(req->tr_state == TR_STATE_WAITING);
3451 req = fallback;
3452 }
3453 }
3454
3455 /*
3456 * Thread selection.
3457 */
3458 if (parking_tl) {
3459 if (tl->th_priority != req->tr_priority) {
3460 _wq_thactive_move(wq, tl->th_priority, req->tr_priority);
3461 wq->wq_thscheduled_count[tl->th_priority]--;
3462 wq->wq_thscheduled_count[req->tr_priority]++;
3463 }
3464 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select | DBG_FUNC_NONE,
3465 wq, 1, thread_tid(tl->th_thread), 0, 0);
3466 } else if (wq->wq_thidlecount) {
3467 tl = pop_from_thidlelist(wq, req->tr_priority);
3468 /*
3469 * This call will update wq_thscheduled_count and wq_thactive_count for
3470 * the provided priority. It will not set the returned thread to that
3471 * priority. This matches the behavior of the parking_tl clause above.
3472 */
3473 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select | DBG_FUNC_NONE,
3474 wq, 2, thread_tid(tl->th_thread), 0, 0);
3475 } else /* no idle threads */ {
3476 if (!may_add_new_thread || wq->wq_nthreads >= wq_max_threads) {
3477 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 5,
3478 may_add_new_thread, wq->wq_nthreads, 0);
3479 if (wq->wq_nthreads < wq_max_threads) {
3480 rc = WQ_RUN_TR_THREAD_NEEDED;
3481 }
3482 goto done;
3483 }
3484
3485 bool added_thread = workqueue_addnewthread(p, wq);
3486 /*
3487 * workqueue_addnewthread will drop and re-take the lock, so we
3488 * need to ensure we still have a cached request.
3489 *
3490 * It also means we have to pick a new request, since our old pick may
3491 * not be valid anymore.
3492 */
3493 req = incoming_req;
3494 if (req && (req->tr_flags & TR_FLAG_ONSTACK)) {
3495 _threadreq_copy_prepare(wq);
3496 }
3497
3498 if (added_thread) {
3499 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select | DBG_FUNC_NONE,
3500 wq, 3, 0, 0, 0);
3501 goto again;
3502 } else if (_wq_exiting(wq)) {
3503 rc = WQ_RUN_TR_EXITING;
3504 goto exiting;
3505 } else {
3506 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 6, 0, 0, 0);
3507 /*
3508 * Something caused thread creation to fail. Kick off the timer in
3509 * the hope that it'll succeed next time.
3510 */
3511 if (WQ_TIMER_DELAYED_NEEDED(wq)) {
3512 workqueue_interval_timer_start(wq);
3513 }
3514 goto done;
3515 }
3516 }
3517
3518 /*
3519 * Setup thread, mark request as complete and run with it.
3520 */
3521 if (req->tr_state == TR_STATE_WAITING) {
3522 _threadreq_dequeue(wq, req);
3523 }
3524 if (tl->th_priority != req->tr_priority) {
3525 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority | DBG_FUNC_NONE,
3526 wq, thread_tid(tl->th_thread),
3527 (tl->th_priority << 16) | req->tr_priority, 1, 0);
3528 reset_priority(tl, pthread_priority_from_wq_class_index(wq, req->tr_priority));
3529 tl->th_priority = (uint8_t)req->tr_priority;
3530 }
3531 if (req->tr_flags & TR_FLAG_OVERCOMMIT) {
3532 if ((tl->th_flags & TH_LIST_CONSTRAINED) != 0) {
3533 tl->th_flags &= ~TH_LIST_CONSTRAINED;
3534 wq->wq_constrained_threads_scheduled--;
3535 }
3536 } else {
3537 if ((tl->th_flags & TH_LIST_CONSTRAINED) == 0) {
3538 tl->th_flags |= TH_LIST_CONSTRAINED;
3539 wq->wq_constrained_threads_scheduled++;
3540 }
3541 }
3542
3543 if (!parking_tl && !(req->tr_flags & TR_FLAG_NO_PACING)) {
3544 _wq_pacing_start(wq, tl);
3545 }
3546 if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) {
3547 uint32_t old_qos, new_qos;
3548
3549 /*
3550 * If we are scheduling a constrained thread request, we may need to
3551 * update the best constrained qos in the thactive atomic state.
3552 */
3553 for (new_qos = 0; new_qos < WQ_THACTIVE_NO_PENDING_REQUEST; new_qos++) {
3554 if (TAILQ_FIRST(&wq->wq_reqlist[new_qos]))
3555 break;
3556 }
3557 old_qos = _wq_thactive_best_constrained_req_qos(wq);
3558 if (old_qos != new_qos) {
3559 wq_thactive_t v = _wq_thactive_set_best_constrained_req_qos(wq,
3560 old_qos, new_qos);
3561 #ifdef __LP64__
3562 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update, 2, (uint64_t)v,
3563 (uint64_t)(v >> 64), 0, 0);
3564 #else
3565 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update, 2, v, 0, 0, 0);
3566 #endif
3567 }
3568 }
3569 {
3570 uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI;
3571 if (req->tr_flags & TR_FLAG_OVERCOMMIT)
3572 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
3573 if (req->tr_flags & TR_FLAG_KEVENT)
3574 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
3575 if (req->tr_flags & TR_FLAG_WORKLOOP)
3576 upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
3577 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET)
3578 upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
3579 tl->th_upcall_flags = upcall_flags >> WQ_FLAG_THREAD_PRIOSHIFT;
3580 }
3581 if (req->tr_flags & TR_FLAG_KEVENT) {
3582 tl->th_flags |= TH_LIST_KEVENT;
3583 } else {
3584 tl->th_flags &= ~TH_LIST_KEVENT;
3585 }
3586 return _threadreq_complete_and_unlock(p, wq, req, tl);
3587
3588 done:
3589 if (incoming_req) {
3590 _threadreq_enqueue(wq, incoming_req);
3591 }
3592
3593 exiting:
3594
3595 if (parking_tl && !(parking_tl->th_flags & TH_LIST_UNBINDING)) {
3596 parkit(wq, parking_tl, parking_tl->th_thread);
3597 __builtin_unreachable();
3598 }
3599
3600 workqueue_unlock(wq);
3601
3602 return rc;
3603 }
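
/*
 * Minimal caller sketch (never compiled): per the contract documented above,
 * the workqueue lock is taken by the caller and always consumed, and the
 * WQ_RUN_TR_* return code describes the outcome. The function name and the
 * handling shown for each code are illustrative assumptions, not code from
 * this file.
 */
#if 0
static void
run_threadreq_example(proc_t p, struct workqueue *wq, struct threadreq *req)
{
	workqueue_lock_spin(wq);
	/* req must be in state NEW; the lock is dropped before returning */
	int rc = workqueue_run_threadreq_and_unlock(p, wq, NULL, req, false);
	switch (rc) {
	case WQ_RUN_TR_EXITING:
		/* the workqueue is being torn down */
		break;
	case WQ_RUN_TR_THREAD_NEEDED:
		/* no idle thread and creating one was not allowed here; a caller
		 * would redrive from a context where may_add_new_thread is true */
		break;
	default:
		/* the request was either run on a thread or enqueued */
		break;
	}
}
#endif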
3604
3605 /**
3606 * parked thread wakes up
3607 */
3608 static void __dead2
3609 wq_unpark_continue(void* __unused ptr, wait_result_t wait_result)
3610 {
3611 boolean_t first_use = false;
3612 thread_t th = current_thread();
3613 proc_t p = current_proc();
3614
3615 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
3616 if (uth == NULL) goto done;
3617
3618 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
3619 if (wq == NULL) goto done;
3620
3621 workqueue_lock_spin(wq);
3622
3623 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
3624 assert(tl != WQ_THREADLIST_EXITING_POISON);
3625 if (tl == NULL) {
3626 /*
3627 * We woke up before addnewthread() was finished setting us up. Go
3628 * ahead and exit, but before we do, poison the threadlist variable so
3629 * that addnewthread() doesn't still think we are valid.
3630 */
3631 pthread_kern->uthread_set_threadlist(uth, WQ_THREADLIST_EXITING_POISON);
3632 workqueue_unlock(wq);
3633 goto done;
3634 }
3635
3636 assert(tl->th_flags & TH_LIST_INITED);
3637
3638 if ((tl->th_flags & TH_LIST_NEW)){
3639 tl->th_flags &= ~(TH_LIST_NEW);
3640 first_use = true;
3641 }
3642
3643 if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) {
3644 /*
3645 * The normal wakeup path.
3646 */
3647 goto return_to_user;
3648 }
3649
3650 if ((tl->th_flags & TH_LIST_RUNNING) == 0 &&
3651 wait_result == THREAD_TIMED_OUT &&
3652 tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET &&
3653 TAILQ_FIRST(&wq->wq_thidlemgrlist) == tl &&
3654 TAILQ_NEXT(tl, th_entry) == NULL){
3655 /*
3656 * If we are the only idle manager and we popped for self-destruction,
3657 * then don't actually exit. Instead, free our stack to save some
3658 * memory and re-park.
3659 */
3660
3661 workqueue_unlock(wq);
3662
3663 vm_map_t vmap = wq->wq_map;
3664
3665 // Keep this in sync with _setup_wqthread()
3666 const vm_size_t guardsize = vm_map_page_size(vmap);
3667 const user_addr_t freeaddr = (user_addr_t)tl->th_stackaddr + guardsize;
3668 const vm_map_offset_t freesize = vm_map_trunc_page_mask((PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET) - 1, vm_map_page_mask(vmap)) - guardsize;
3669
3670 __assert_only int kr = mach_vm_behavior_set(vmap, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
3671 #if MACH_ASSERT
3672 if (kr != KERN_SUCCESS && kr != KERN_INVALID_ADDRESS) {
3673 os_log_error(OS_LOG_DEFAULT, "unable to make thread stack reusable (kr: %d)", kr);
3674 }
3675 #endif
3676
3677 workqueue_lock_spin(wq);
3678
3679 if ( !(tl->th_flags & TH_LIST_RUNNING)) {
3680 thread_set_pending_block_hint(th, kThreadWaitParkedWorkQueue);
3681 assert_wait((caddr_t)tl, (THREAD_INTERRUPTIBLE));
3682
3683 workqueue_unlock(wq);
3684
3685 thread_block(wq_unpark_continue);
3686 __builtin_unreachable();
3687 }
3688 }
3689
3690 if ((tl->th_flags & TH_LIST_RUNNING) == 0) {
3691 assert((tl->th_flags & TH_LIST_BUSY) == 0);
3692 if (!first_use) {
3693 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_END, wq, 0, 0, 0, 0);
3694 }
3695 /*
3696 * We were set running, but not for the purpose of actually running a
3697 * work item. This could be because the timer elapsed or because the
3698 * thread was aborted. Either way, we need to return to userspace to exit.
3699 *
3700 * The call to workqueue_removethread will consume the lock.
3701 */
3702
3703 if (!first_use &&
3704 (tl->th_priority < qos_class_get_class_index(WQ_THREAD_CLEANUP_QOS) ||
3705 (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET))) {
3706 // Reset the QoS to something low for the pthread cleanup
3707 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority | DBG_FUNC_NONE,
3708 wq, thread_tid(th),
3709 (tl->th_priority << 16) | qos_class_get_class_index(WQ_THREAD_CLEANUP_QOS), 3, 0);
3710 pthread_priority_t cleanup_pri = _pthread_priority_make_newest(WQ_THREAD_CLEANUP_QOS, 0, 0);
3711 reset_priority(tl, cleanup_pri);
3712 }
3713
3714 workqueue_removethread(tl, 0, first_use);
3715
3716 if (first_use){
3717 pthread_kern->thread_bootstrap_return();
3718 } else {
3719 pthread_kern->unix_syscall_return(0);
3720 }
3721 __builtin_unreachable();
3722 }
3723
3724 /*
3725 * The timer woke us up or the thread was aborted. However, we have
3726 * already started to make this a runnable thread. Wait for that to
3727 * finish, then continue to userspace.
3728 */
3729 while ((tl->th_flags & TH_LIST_BUSY)) {
3730 assert_wait((caddr_t)tl, (THREAD_UNINT));
3731
3732 workqueue_unlock(wq);
3733
3734 thread_block(THREAD_CONTINUE_NULL);
3735
3736 workqueue_lock_spin(wq);
3737 }
3738
3739 return_to_user:
3740 if (!first_use) {
3741 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_END, wq, 0, 0, 0, 0);
3742 }
3743 if (_wq_pacing_end(wq, tl) && wq->wq_reqcount) {
3744 workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
3745 } else {
3746 workqueue_unlock(wq);
3747 }
3748 _setup_wqthread(p, th, wq, tl, first_use ? WQ_SETUP_FIRST_USE : 0);
3749 pthread_kern->thread_sched_call(th, workqueue_callback);
3750 done:
3751 if (first_use){
3752 pthread_kern->thread_bootstrap_return();
3753 } else {
3754 pthread_kern->unix_syscall_return(EJUSTRETURN);
3755 }
3756 panic("Our attempt to return to userspace failed...");
3757 }
3758
3759 /**
3760 * configures initial thread stack/registers to jump into:
3761 * _pthread_wqthread(pthread_t self, mach_port_t kport, void *stackaddr, void *keventlist, int upcall_flags, int nkevents);
3762 * to get there we jump through assembly stubs in pthread_asm.s. Those
3763 * routines set up a stack frame, using the current stack pointer, and marshal
3764 * arguments from registers to the stack as required by the ABI.
3765 *
3766 * One odd thing we do here is to start the pthread_t 4k below what would be the
3767 * top of the stack otherwise. This is because usually only the first 4k of the
3768 * pthread_t will be used and so we want to put it on the same 16k page as the
3769 * top of the stack to save memory.
3770 *
3771 * When we are done the stack will look like:
3772 * |-----------| th_stackaddr + th_allocsize
3773 * |pthread_t | th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET
3774 * |kevent list| optionally - at most WQ_KEVENT_LIST_LEN events
3775 * |kevent data| optionally - at most WQ_KEVENT_DATA_SIZE bytes
3776 * |stack gap | bottom aligned to 16 bytes, and at least as big as stack_gap_min
3777 * | STACK |
3778 * | ⇓ |
3779 * | |
3780 * |guard page | guardsize
3781 * |-----------| th_stackaddr
3782 */
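
/*
 * Worked example of the layout above, assuming (for illustration only) a
 * 4 KB guard page, PTH_DEFAULT_STACKSIZE of 512 KB, PTHREAD_T_OFFSET of 0 and
 * C_64_REDZONE_LEN of 128, for a 64-bit process whose th_stackaddr is
 * 0x100000000:
 *
 *   guardsize         = 0x1000
 *   pthread_self_addr = 0x100000000 + 0x80000 + 0x1000 = 0x100081000
 *   stack_top_addr    = (0x100081000 - 0x80) & -16     = 0x100080f80
 *   stack_bottom_addr = 0x100000000 + 0x1000           = 0x100001000
 */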
3783 void
3784 _setup_wqthread(proc_t p, thread_t th, struct workqueue *wq,
3785 struct threadlist *tl, int setup_flags)
3786 {
3787 int error;
3788 if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) {
3789 /*
3790 * For preemption reasons, we want to reset the voucher as late as
3791 * possible, so we do it in two places:
3792 * - Just before parking (i.e. in parkit())
3793 * - Prior to doing the setup for the next workitem (i.e. here)
3794 *
3795 * Those two places are sufficient to ensure we always reset it before
3796 * it goes back out to user space, but be careful to not break that
3797 * guarantee.
3798 */
3799 __assert_only kern_return_t kr;
3800 kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
3801 assert(kr == KERN_SUCCESS);
3802 }
3803
3804 uint32_t upcall_flags = tl->th_upcall_flags << WQ_FLAG_THREAD_PRIOSHIFT;
3805 if (!(setup_flags & WQ_SETUP_FIRST_USE)) {
3806 upcall_flags |= WQ_FLAG_THREAD_REUSE;
3807 }
3808
3809 /*
3810 * Put the QoS class value into the lower bits of the reuse_thread register;
3811 * this is where the thread priority used to be stored anyway.
3812 */
3813 pthread_priority_t priority = pthread_priority_from_wq_class_index(wq, tl->th_priority);
3814 upcall_flags |= (_pthread_priority_get_qos_newest(priority) & WQ_FLAG_THREAD_PRIOMASK);
3815
3816 const vm_size_t guardsize = vm_map_page_size(tl->th_workq->wq_map);
3817 const vm_size_t stack_gap_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_REDZONE_LEN;
3818 const vm_size_t stack_align_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_STK_ALIGN;
3819
3820 user_addr_t pthread_self_addr = (user_addr_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET);
3821 user_addr_t stack_top_addr = (user_addr_t)((pthread_self_addr - stack_gap_min) & -stack_align_min);
3822 user_addr_t stack_bottom_addr = (user_addr_t)(tl->th_stackaddr + guardsize);
3823
3824 user_addr_t wqstart_fnptr = pthread_kern->proc_get_wqthread(p);
3825 if (!wqstart_fnptr) {
3826 panic("workqueue thread start function pointer is NULL");
3827 }
3828
3829 if (setup_flags & WQ_SETUP_FIRST_USE) {
3830 uint32_t tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
3831 if (tsd_offset) {
3832 mach_vm_offset_t th_tsd_base = (mach_vm_offset_t)pthread_self_addr + tsd_offset;
3833 kern_return_t kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
3834 if (kret == KERN_SUCCESS) {
3835 upcall_flags |= WQ_FLAG_THREAD_TSD_BASE_SET;
3836 }
3837 }
3838
3839 /*
3840 * Pre-fault the first page of the new thread's stack and the page that will
3841 * contain the pthread_t structure.
3842 */
3843 vm_map_t vmap = pthread_kern->current_map();
3844 if (vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)) !=
3845 vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap))){
3846 vm_fault( vmap,
3847 vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)),
3848 VM_PROT_READ | VM_PROT_WRITE,
3849 FALSE,
3850 THREAD_UNINT, NULL, 0);
3851 }
3852 vm_fault( vmap,
3853 vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap)),
3854 VM_PROT_READ | VM_PROT_WRITE,
3855 FALSE,
3856 THREAD_UNINT, NULL, 0);
3857 }
3858
3859 user_addr_t kevent_list = NULL;
3860 int kevent_count = 0;
3861 if (upcall_flags & WQ_FLAG_THREAD_KEVENT){
3862 bool workloop = upcall_flags & WQ_FLAG_THREAD_WORKLOOP;
3863
3864 kevent_list = pthread_self_addr - WQ_KEVENT_LIST_LEN * sizeof(struct kevent_qos_s);
3865 kevent_count = WQ_KEVENT_LIST_LEN;
3866
3867 user_addr_t kevent_id_addr = kevent_list;
3868 if (workloop) {
3869 /*
3870 * The kevent ID goes just below the kevent list. Sufficiently new
3871 * userspace will know to look there. Old userspace will just
3872 * ignore it.
3873 */
3874 kevent_id_addr -= sizeof(kqueue_id_t);
3875 }
3876
3877 user_addr_t kevent_data_buf = kevent_id_addr - WQ_KEVENT_DATA_SIZE;
3878 user_size_t kevent_data_available = WQ_KEVENT_DATA_SIZE;
3879
3880 int32_t events_out = 0;
3881
3882 assert(tl->th_flags & TH_LIST_KEVENT_BOUND);
3883 unsigned int flags = KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE;
3884 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
3885 flags |= KEVENT_FLAG_WORKQ_MANAGER;
3886 }
3887 int ret = 0;
3888 if (workloop) {
3889 flags |= KEVENT_FLAG_WORKLOOP;
3890 kqueue_id_t kevent_id = -1;
3891 ret = kevent_id_internal(p, &kevent_id,
3892 NULL, 0, kevent_list, kevent_count,
3893 kevent_data_buf, &kevent_data_available,
3894 flags, &events_out);
3895 copyout(&kevent_id, kevent_id_addr, sizeof(kevent_id));
3896 } else {
3897 flags |= KEVENT_FLAG_WORKQ;
3898 ret = kevent_qos_internal(p,
3899 class_index_get_thread_qos(tl->th_priority),
3900 NULL, 0, kevent_list, kevent_count,
3901 kevent_data_buf, &kevent_data_available,
3902 flags, &events_out);
3903 }
3904
3905 // squash any errors into just empty output
3906 if (ret != KERN_SUCCESS || events_out == -1){
3907 events_out = 0;
3908 kevent_data_available = WQ_KEVENT_DATA_SIZE;
3909 }
3910
3911 // We shouldn't get data out if there aren't events available
3912 assert(events_out != 0 || kevent_data_available == WQ_KEVENT_DATA_SIZE);
3913
3914 if (events_out > 0){
3915 if (kevent_data_available == WQ_KEVENT_DATA_SIZE){
3916 stack_top_addr = (kevent_id_addr - stack_gap_min) & -stack_align_min;
3917 } else {
3918 stack_top_addr = (kevent_data_buf + kevent_data_available - stack_gap_min) & -stack_align_min;
3919 }
3920
3921 kevent_count = events_out;
3922 } else {
3923 kevent_list = NULL;
3924 kevent_count = 0;
3925 }
3926 }
3927
3928 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START, wq, 0, 0, 0, 0);
3929
3930 #if defined(__i386__) || defined(__x86_64__)
3931 if (proc_is64bit(p) == 0) {
3932 x86_thread_state32_t state = {
3933 .eip = (unsigned int)wqstart_fnptr,
3934 .eax = /* arg0 */ (unsigned int)pthread_self_addr,
3935 .ebx = /* arg1 */ (unsigned int)tl->th_thport,
3936 .ecx = /* arg2 */ (unsigned int)stack_bottom_addr,
3937 .edx = /* arg3 */ (unsigned int)kevent_list,
3938 .edi = /* arg4 */ (unsigned int)upcall_flags,
3939 .esi = /* arg5 */ (unsigned int)kevent_count,
3940
3941 .esp = (int)((vm_offset_t)stack_top_addr),
3942 };
3943
3944 error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
3945 if (error != KERN_SUCCESS) {
3946 panic("%s: thread_set_wq_state failed: %d", __func__, error);
3947 }
3948 } else {
3949 x86_thread_state64_t state64 = {
3950 // x86-64 already passes all the arguments in registers, so we just put them in their final place here
3951 .rip = (uint64_t)wqstart_fnptr,
3952 .rdi = (uint64_t)pthread_self_addr,
3953 .rsi = (uint64_t)tl->th_thport,
3954 .rdx = (uint64_t)stack_bottom_addr,
3955 .rcx = (uint64_t)kevent_list,
3956 .r8 = (uint64_t)upcall_flags,
3957 .r9 = (uint64_t)kevent_count,
3958
3959 .rsp = (uint64_t)(stack_top_addr)
3960 };
3961
3962 error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
3963 if (error != KERN_SUCCESS) {
3964 panic("%s: thread_set_wq_state failed: %d", __func__, error);
3965 }
3966 }
3967 #else
3968 #error setup_wqthread not defined for this architecture
3969 #endif
3970 }
3971
3972 #if DEBUG
3973 static int wq_kevent_test SYSCTL_HANDLER_ARGS {
3974 //(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
3975 #pragma unused(oidp, arg1, arg2)
3976 int error;
3977 struct workq_reqthreads_req_s requests[64] = {};
3978
3979 if (req->newlen > sizeof(requests) || req->newlen < sizeof(struct workq_reqthreads_req_s))
3980 return EINVAL;
3981
3982 error = copyin(req->newptr, requests, req->newlen);
3983 if (error) return error;
3984
3985 _workq_reqthreads(req->p, (int)(req->newlen / sizeof(struct workq_reqthreads_req_s)), requests);
3986
3987 return 0;
3988 }
3989 #endif // DEBUG
3990
3991 #pragma mark - Misc
3992
3993 int
3994 _fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
3995 {
3996 struct workqueue * wq;
3997 int error = 0;
3998 int activecount;
3999
4000 if ((wq = pthread_kern->proc_get_wqptr(p)) == NULL) {
4001 return EINVAL;
4002 }
4003
4004 /*
4005 * This is sometimes called from interrupt context by the kperf sampler.
4006 * In that case, it's not safe to spin trying to take the lock since we
4007 * might already hold it. So, we just try-lock it and error out if it's
4008 * already held. Since this is just a debugging aid, and all our callers
4009 * are able to handle an error, that's fine.
4010 */
4011 bool locked = workqueue_lock_try(wq);
4012 if (!locked) {
4013 return EBUSY;
4014 }
4015
4016 activecount = _wq_thactive_aggregate_downto_qos(wq, _wq_thactive(wq),
4017 WORKQUEUE_NUM_BUCKETS - 1, NULL, NULL);
4018 pwqinfo->pwq_nthreads = wq->wq_nthreads;
4019 pwqinfo->pwq_runthreads = activecount;
4020 pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
4021 pwqinfo->pwq_state = 0;
4022
4023 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
4024 pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
4025 }
4026
4027 if (wq->wq_nthreads >= wq_max_threads) {
4028 pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
4029 }
4030
4031 workqueue_unlock(wq);
4032 return(error);
4033 }
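
/*
 * The fields filled in above are what userspace sees through libproc's
 * PROC_PIDWORKQUEUEINFO flavor. A minimal userspace sketch (illustrative
 * only, not part of this file):
 */
#if 0
#include <sys/types.h>
#include <sys/proc_info.h>
#include <libproc.h>
#include <stdio.h>

static void
print_workqueue_info(pid_t pid)
{
	struct proc_workqueueinfo pwqinfo;
	int ret = proc_pidinfo(pid, PROC_PIDWORKQUEUEINFO, 0,
	    &pwqinfo, sizeof(pwqinfo));
	if (ret == (int)sizeof(pwqinfo)) {
		printf("threads=%u running=%u blocked=%u state=0x%x\n",
		    pwqinfo.pwq_nthreads, pwqinfo.pwq_runthreads,
		    pwqinfo.pwq_blockedthreads, pwqinfo.pwq_state);
	}
}
#endif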
4034
4035 uint32_t
4036 _get_pwq_state_kdp(proc_t p)
4037 {
4038 if (p == NULL) {
4039 return 0;
4040 }
4041
4042 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
4043
4044 if (wq == NULL || workqueue_lock_spin_is_acquired_kdp(wq)) {
4045 return 0;
4046 }
4047
4048 uint32_t pwq_state = WQ_FLAGS_AVAILABLE;
4049
4050 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
4051 pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
4052 }
4053
4054 if (wq->wq_nthreads >= wq_max_threads) {
4055 pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
4056 }
4057
4058 return pwq_state;
4059 }
4060
4061 int
4062 _thread_selfid(__unused struct proc *p, uint64_t *retval)
4063 {
4064 thread_t thread = current_thread();
4065 *retval = thread_tid(thread);
4066 return KERN_SUCCESS;
4067 }
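
/*
 * Userspace typically reaches this id through pthread_threadid_np(), which
 * returns the same 64-bit value (falling back to this syscall when the id is
 * not already cached). A minimal sketch (illustrative only, not part of this
 * file):
 */
#if 0
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static void
print_own_thread_id(void)
{
	uint64_t tid = 0;
	if (pthread_threadid_np(NULL, &tid) == 0) {
		printf("thread id: %llu\n", (unsigned long long)tid);
	}
}
#endif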
4068
4069 void
4070 _pthread_init(void)
4071 {
4072 pthread_lck_grp_attr = lck_grp_attr_alloc_init();
4073 pthread_lck_grp = lck_grp_alloc_init("pthread", pthread_lck_grp_attr);
4074
4075 /*
4076 * allocate the lock attribute for pthread synchronizers
4077 */
4078 pthread_lck_attr = lck_attr_alloc_init();
4079
4080 pthread_list_mlock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);
4081
4082 pth_global_hashinit();
4083 psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL);
4084 psynch_zoneinit();
4085
4086 pthread_zone_workqueue = zinit(sizeof(struct workqueue),
4087 1024 * sizeof(struct workqueue), 8192, "pthread.workqueue");
4088 pthread_zone_threadlist = zinit(sizeof(struct threadlist),
4089 1024 * sizeof(struct threadlist), 8192, "pthread.threadlist");
4090 pthread_zone_threadreq = zinit(sizeof(struct threadreq),
4091 1024 * sizeof(struct threadreq), 8192, "pthread.threadreq");
4092
4093 /*
4094 * register sysctls
4095 */
4096 sysctl_register_oid(&sysctl__kern_wq_stalled_window_usecs);
4097 sysctl_register_oid(&sysctl__kern_wq_reduce_pool_window_usecs);
4098 sysctl_register_oid(&sysctl__kern_wq_max_timer_interval_usecs);
4099 sysctl_register_oid(&sysctl__kern_wq_max_threads);
4100 sysctl_register_oid(&sysctl__kern_wq_max_constrained_threads);
4101 sysctl_register_oid(&sysctl__kern_pthread_debug_tracing);
4102
4103 #if DEBUG
4104 sysctl_register_oid(&sysctl__debug_wq_kevent_test);
4105 #endif
4106
4107 for (int i = 0; i < WORKQUEUE_NUM_BUCKETS; i++) {
4108 uint32_t thread_qos = _wq_bucket_to_thread_qos(i);
4109 wq_max_concurrency[i] = pthread_kern->qos_max_parallelism(thread_qos,
4110 QOS_PARALLELISM_COUNT_LOGICAL);
4111 }
4112 wq_max_concurrency[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1;
4113 }