[apple/libpthread.git] / kern / kern_support.c
1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995-2005 Apple Computer, Inc. All Rights Reserved */
29 /*
30 * pthread_synch.c
31 */
32
33 #pragma mark - Front Matter
34
35 #define _PTHREAD_CONDATTR_T
36 #define _PTHREAD_COND_T
37 #define _PTHREAD_MUTEXATTR_T
38 #define _PTHREAD_MUTEX_T
39 #define _PTHREAD_RWLOCKATTR_T
40 #define _PTHREAD_RWLOCK_T
41
42 #undef pthread_mutexattr_t
43 #undef pthread_mutex_t
44 #undef pthread_condattr_t
45 #undef pthread_cond_t
46 #undef pthread_rwlockattr_t
47 #undef pthread_rwlock_t
48
49 #include <sys/cdefs.h>
50 #include <os/log.h>
51
52 // <rdar://problem/26158937> panic() should be marked noreturn
53 extern void panic(const char *string, ...) __printflike(1,2) __dead2;
54
55 #include <sys/param.h>
56 #include <sys/queue.h>
57 #include <sys/resourcevar.h>
58 //#include <sys/proc_internal.h>
59 #include <sys/kauth.h>
60 #include <sys/systm.h>
61 #include <sys/timeb.h>
62 #include <sys/times.h>
63 #include <sys/acct.h>
64 #include <sys/kernel.h>
65 #include <sys/wait.h>
66 #include <sys/signalvar.h>
67 #include <sys/sysctl.h>
68 #include <sys/syslog.h>
69 #include <sys/stat.h>
70 #include <sys/lock.h>
71 #include <sys/kdebug.h>
72 //#include <sys/sysproto.h>
73 #include <sys/vm.h>
74 #include <sys/user.h> /* for coredump */
75 #include <sys/proc_info.h> /* for fill_procworkqueue */
76
77 #include <mach/mach_port.h>
78 #include <mach/mach_types.h>
79 #include <mach/semaphore.h>
80 #include <mach/sync_policy.h>
81 #include <mach/task.h>
82 #include <mach/vm_prot.h>
83 #include <kern/kern_types.h>
84 #include <kern/task.h>
85 #include <kern/clock.h>
86 #include <mach/kern_return.h>
87 #include <kern/thread.h>
88 #include <kern/zalloc.h>
89 #include <kern/sched_prim.h> /* for thread_exception_return */
90 #include <kern/processor.h>
91 #include <kern/assert.h>
92 #include <mach/mach_vm.h>
93 #include <mach/mach_param.h>
94 #include <mach/thread_status.h>
95 #include <mach/thread_policy.h>
96 #include <mach/message.h>
97 #include <mach/port.h>
98 //#include <vm/vm_protos.h>
99 #include <vm/vm_fault.h>
100 #include <vm/vm_map.h>
101 #include <mach/thread_act.h> /* for thread_resume */
102 #include <machine/machine_routines.h>
103 #include <mach/shared_region.h>
104
105 #include <libkern/OSAtomic.h>
106 #include <libkern/libkern.h>
107
108 #include <sys/pthread_shims.h>
109 #include "kern_internal.h"
110
111 // XXX: Dirty import for sys/signalvar.h that's wrapped in BSD_KERNEL_PRIVATE
112 #define sigcantmask (sigmask(SIGKILL) | sigmask(SIGSTOP))
113
114 // XXX: Ditto for thread tags from kern/thread.h
115 #define THREAD_TAG_MAINTHREAD 0x1
116 #define THREAD_TAG_PTHREAD 0x10
117 #define THREAD_TAG_WORKQUEUE 0x20
118
119 lck_grp_attr_t *pthread_lck_grp_attr;
120 lck_grp_t *pthread_lck_grp;
121 lck_attr_t *pthread_lck_attr;
122
123 zone_t pthread_zone_workqueue;
124 zone_t pthread_zone_threadlist;
125 zone_t pthread_zone_threadreq;
126
127 extern void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64);
128 extern void workqueue_thread_yielded(void);
129
130 #define WQ_SETUP_FIRST_USE 1
131 #define WQ_SETUP_CLEAR_VOUCHER 2
132 static void _setup_wqthread(proc_t p, thread_t th, struct workqueue *wq,
133 struct threadlist *tl, int flags);
134
135 static void reset_priority(struct threadlist *tl, pthread_priority_t pri);
136 static pthread_priority_t pthread_priority_from_wq_class_index(struct workqueue *wq, int index);
137
138 static void wq_unpark_continue(void* ptr, wait_result_t wait_result) __dead2;
139
140 static bool workqueue_addnewthread(proc_t p, struct workqueue *wq);
141 static void workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use);
142 static void workqueue_lock_spin(struct workqueue *);
143 static void workqueue_unlock(struct workqueue *);
144
145 #define WQ_RUN_TR_THROTTLED 0
146 #define WQ_RUN_TR_THREAD_NEEDED 1
147 #define WQ_RUN_TR_THREAD_STARTED 2
148 #define WQ_RUN_TR_EXITING 3
149 static int workqueue_run_threadreq_and_unlock(proc_t p, struct workqueue *wq,
150 struct threadlist *tl, struct threadreq *req, bool may_add_new_thread);
151
152 static bool may_start_constrained_thread(struct workqueue *wq,
153 uint32_t at_priclass, struct threadlist *tl, bool may_start_timer);
154
155 static mach_vm_offset_t stack_addr_hint(proc_t p, vm_map_t vmap);
156 static boolean_t wq_thread_is_busy(uint64_t cur_ts,
157 _Atomic uint64_t *lastblocked_tsp);
158
159 int proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc);
160 int proc_setalltargetconc(pid_t pid, int32_t * targetconcp);
161
162 #define WQ_MAXPRI_MIN 0 /* low prio queue num */
163 #define WQ_MAXPRI_MAX 2 /* max prio queuenum */
164 #define WQ_PRI_NUM 3 /* number of prio work queues */
165
166 #define C_32_STK_ALIGN 16
167 #define C_64_STK_ALIGN 16
168 #define C_64_REDZONE_LEN 128
169
170 #define PTHREAD_T_OFFSET 0
171
172 /*
173 * Flags field passed to bsdthread_create and back in pthread_start
174 31 <---------------------------------> 0
175 _________________________________________
176 | flags(8) | policy(8) | importance(16) |
177 -----------------------------------------
178 */
179
180 #define PTHREAD_START_CUSTOM 0x01000000
181 #define PTHREAD_START_SETSCHED 0x02000000
182 #define PTHREAD_START_DETACHED 0x04000000
183 #define PTHREAD_START_QOSCLASS 0x08000000
184 #define PTHREAD_START_TSD_BASE_SET 0x10000000
185 #define PTHREAD_START_QOSCLASS_MASK 0x00ffffff
186 #define PTHREAD_START_POLICY_BITSHIFT 16
187 #define PTHREAD_START_POLICY_MASK 0xff
188 #define PTHREAD_START_IMPORTANCE_MASK 0xffff
189
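/*
 * Illustrative sketch (not part of the original source): how a flags word
 * built with PTHREAD_START_SETSCHED decomposes, using the masks above.
 * _example_decode_start_flags is a hypothetical helper for exposition only;
 * _bsdthread_create performs the same extraction inline.
 */
#if 0
static inline void
_example_decode_start_flags(uint32_t flags, unsigned int *policy,
		unsigned int *importance)
{
	/* the low 16 bits carry the importance, the next 8 bits the policy */
	*importance = flags & PTHREAD_START_IMPORTANCE_MASK;
	*policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;
	/* the top 8 bits are the PTHREAD_START_* behavior flags themselves */
}
#endif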
190 #define SCHED_OTHER POLICY_TIMESHARE
191 #define SCHED_FIFO POLICY_FIFO
192 #define SCHED_RR POLICY_RR
193
194 #define BASEPRI_DEFAULT 31
195
196 #pragma mark sysctls
197
198 static uint32_t wq_stalled_window_usecs = WQ_STALLED_WINDOW_USECS;
199 static uint32_t wq_reduce_pool_window_usecs = WQ_REDUCE_POOL_WINDOW_USECS;
200 static uint32_t wq_max_timer_interval_usecs = WQ_MAX_TIMER_INTERVAL_USECS;
201 static uint32_t wq_max_threads = WORKQUEUE_MAXTHREADS;
202 static uint32_t wq_max_constrained_threads = WORKQUEUE_MAXTHREADS / 8;
203 static uint32_t wq_max_concurrency[WORKQUEUE_NUM_BUCKETS + 1]; // set to ncpus on load
204
205 SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
206 &wq_stalled_window_usecs, 0, "");
207
208 SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
209 &wq_reduce_pool_window_usecs, 0, "");
210
211 SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
212 &wq_max_timer_interval_usecs, 0, "");
213
214 SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
215 &wq_max_threads, 0, "");
216
217 SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
218 &wq_max_constrained_threads, 0, "");
219
220 #ifdef DEBUG
221 static int wq_kevent_test SYSCTL_HANDLER_ARGS;
222 SYSCTL_PROC(_debug, OID_AUTO, wq_kevent_test, CTLFLAG_MASKED | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLTYPE_OPAQUE, NULL, 0, wq_kevent_test, 0, "-");
223 #endif
224
225 static uint32_t wq_init_constrained_limit = 1;
226
227 uint32_t pthread_debug_tracing = 1;
228
229 SYSCTL_INT(_kern, OID_AUTO, pthread_debug_tracing, CTLFLAG_RW | CTLFLAG_LOCKED,
230 &pthread_debug_tracing, 0, "");
231
232 static uint32_t pthread_mutex_default_policy;
233
234 SYSCTL_INT(_kern, OID_AUTO, pthread_mutex_default_policy, CTLFLAG_RW | CTLFLAG_LOCKED,
235 &pthread_mutex_default_policy, 0, "");
236
237 /*
238 * +-----+-----+-----+-----+-----+-----+-----+
239 * | MT | BG | UT | DE | IN | UN | mgr |
240 * +-----+-----+-----+-----+-----+-----+-----+-----+
241 * | pri | 5 | 4 | 3 | 2 | 1 | 0 | 6 |
242 * | qos | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
243 * +-----+-----+-----+-----+-----+-----+-----+-----+
244 */
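/*
 * Worked example (illustrative, derived from the table above): a regular
 * bucket pri maps to thread QoS WORKQUEUE_EVENT_MANAGER_BUCKET - pri
 * (bucket 0 -> QoS 6, bucket 5 -> QoS 1), and the event manager bucket maps
 * to WORKQUEUE_EVENT_MANAGER_BUCKET + 1, i.e. thread QoS 7.
 */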
245 static inline uint32_t
246 _wq_bucket_to_thread_qos(int pri)
247 {
248 if (pri == WORKQUEUE_EVENT_MANAGER_BUCKET) {
249 return WORKQUEUE_EVENT_MANAGER_BUCKET + 1;
250 }
251 return WORKQUEUE_EVENT_MANAGER_BUCKET - pri;
252 }
253
254 #pragma mark wq_thactive
255
256 #if defined(__LP64__)
257 // Layout is:
258 // 7 * 16 bits for each QoS bucket request count (including manager)
259 // 3 bits of best QoS among all pending constrained requests
260 // 13 bits of zeroes
261 #define WQ_THACTIVE_BUCKET_WIDTH 16
262 #define WQ_THACTIVE_QOS_SHIFT (7 * WQ_THACTIVE_BUCKET_WIDTH)
263 #else
264 // Layout is:
265 // 6 * 10 bits for each QoS bucket request count (except manager)
266 // 1 bit for the manager bucket
267 // 3 bits of best QoS among all pending constrained requests
268 #define WQ_THACTIVE_BUCKET_WIDTH 10
269 #define WQ_THACTIVE_QOS_SHIFT (6 * WQ_THACTIVE_BUCKET_WIDTH + 1)
270 #endif
271 #define WQ_THACTIVE_BUCKET_MASK ((1U << WQ_THACTIVE_BUCKET_WIDTH) - 1)
272 #define WQ_THACTIVE_BUCKET_HALF (1U << (WQ_THACTIVE_BUCKET_WIDTH - 1))
273 #define WQ_THACTIVE_NO_PENDING_REQUEST 6
274
275 _Static_assert(sizeof(wq_thactive_t) * CHAR_BIT - WQ_THACTIVE_QOS_SHIFT >= 3,
276 "Make sure we have space to encode a QoS");
277
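/*
 * Illustrative sketch (not part of the original source): extracting a single
 * regular bucket count from a wq_thactive_t snapshot, per the layout above
 * (on 32-bit the manager bucket is only one bit wide, see the comment).
 * _example_bucket_count is a hypothetical helper; the accessors below do the
 * real work with atomics and aggregation.
 */
#if 0
static inline uint32_t
_example_bucket_count(wq_thactive_t v, int bucket)
{
	/* bucket i occupies WQ_THACTIVE_BUCKET_WIDTH bits starting at bit
	 * i * WQ_THACTIVE_BUCKET_WIDTH */
	return (uint32_t)(v >> (bucket * WQ_THACTIVE_BUCKET_WIDTH)) &
			WQ_THACTIVE_BUCKET_MASK;
}
#endif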
278 static inline wq_thactive_t
279 _wq_thactive_fetch_and_add(struct workqueue *wq, wq_thactive_t offset)
280 {
281 #if PTHREAD_INLINE_RMW_ATOMICS || !defined(__LP64__)
282 return atomic_fetch_add_explicit(&wq->wq_thactive, offset,
283 memory_order_relaxed);
284 #else
285 return pthread_kern->atomic_fetch_add_128_relaxed(&wq->wq_thactive, offset);
286 #endif
287 }
288
289 static inline wq_thactive_t
290 _wq_thactive(struct workqueue *wq)
291 {
292 #if PTHREAD_INLINE_RMW_ATOMICS || !defined(__LP64__)
293 return atomic_load_explicit(&wq->wq_thactive, memory_order_relaxed);
294 #else
295 return pthread_kern->atomic_load_128_relaxed(&wq->wq_thactive);
296 #endif
297 }
298
299 #define WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(tha) \
300 ((tha) >> WQ_THACTIVE_QOS_SHIFT)
301
302 static inline uint32_t
303 _wq_thactive_best_constrained_req_qos(struct workqueue *wq)
304 {
305 // Avoid expensive atomic operations: the three bits we're loading are in
306 // a single byte, and always updated under the workqueue lock
307 wq_thactive_t v = *(wq_thactive_t *)&wq->wq_thactive;
308 return WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(v);
309 }
310
311 static inline wq_thactive_t
312 _wq_thactive_set_best_constrained_req_qos(struct workqueue *wq,
313 uint32_t orig_qos, uint32_t new_qos)
314 {
315 wq_thactive_t v;
316 v = (wq_thactive_t)(new_qos - orig_qos) << WQ_THACTIVE_QOS_SHIFT;
317 /*
318 * We can do an atomic add relative to the initial load because updates
319 * to this qos are always serialized under the workqueue lock.
320 */
321 return _wq_thactive_fetch_and_add(wq, v) + v;
322 }
323
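/*
 * Worked example (illustrative): moving the recorded best constrained QoS
 * from q0 to q1 adds (q1 - q0) << WQ_THACTIVE_QOS_SHIFT.  Since that field
 * sits above all the per-bucket counters, the add cannot disturb them, and
 * the workqueue lock keeps concurrent updates to the field serialized.
 */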
324 static inline wq_thactive_t
325 _wq_thactive_offset_for_qos(int qos)
326 {
327 return (wq_thactive_t)1 << (qos * WQ_THACTIVE_BUCKET_WIDTH);
328 }
329
330 static inline wq_thactive_t
331 _wq_thactive_inc(struct workqueue *wq, int qos)
332 {
333 return _wq_thactive_fetch_and_add(wq, _wq_thactive_offset_for_qos(qos));
334 }
335
336 static inline wq_thactive_t
337 _wq_thactive_dec(struct workqueue *wq, int qos)
338 {
339 return _wq_thactive_fetch_and_add(wq, -_wq_thactive_offset_for_qos(qos));
340 }
341
342 static inline wq_thactive_t
343 _wq_thactive_move(struct workqueue *wq, int oldqos, int newqos)
344 {
345 return _wq_thactive_fetch_and_add(wq, _wq_thactive_offset_for_qos(newqos) -
346 _wq_thactive_offset_for_qos(oldqos));
347 }
348
349 static inline uint32_t
350 _wq_thactive_aggregate_downto_qos(struct workqueue *wq, wq_thactive_t v,
351 int qos, uint32_t *busycount, uint32_t *max_busycount)
352 {
353 uint32_t count = 0, active;
354 uint64_t curtime;
355
356 #ifndef __LP64__
357 /*
358 * On 32-bit, the manager bucket is a single bit and the best constrained
359 * request QoS 3 bits are where the 10 bits of a regular QoS bucket count
360 * would be. Mask them out.
361 */
362 v &= ~(~0ull << WQ_THACTIVE_QOS_SHIFT);
363 #endif
364 if (busycount) {
365 curtime = mach_absolute_time();
366 *busycount = 0;
367 }
368 if (max_busycount) {
369 *max_busycount = qos + 1;
370 }
371 for (int i = 0; i <= qos; i++, v >>= WQ_THACTIVE_BUCKET_WIDTH) {
372 active = v & WQ_THACTIVE_BUCKET_MASK;
373 count += active;
374 if (busycount && wq->wq_thscheduled_count[i] > active) {
375 if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i])) {
376 /*
377 * We only consider the last blocked thread for a given bucket
378 * as busy because we don't want to take the list lock in each
379 * sched callback. However this is an approximation that could
380 * contribute to thread creation storms.
381 */
382 (*busycount)++;
383 }
384 }
385 }
386 return count;
387 }
388
389 #pragma mark - Process/Thread Setup/Teardown syscalls
390
391 static mach_vm_offset_t
392 stack_addr_hint(proc_t p, vm_map_t vmap)
393 {
394 mach_vm_offset_t stackaddr;
395 mach_vm_offset_t aslr_offset;
396 bool proc64bit = proc_is64bit(p);
397
398 // We can't safely take random values % something unless it's a power of two
399 _Static_assert(powerof2(PTH_DEFAULT_STACKSIZE), "PTH_DEFAULT_STACKSIZE is a power-of-two");
400
401 #if defined(__i386__) || defined(__x86_64__)
402 if (proc64bit) {
403 // Matches vm_map_get_max_aslr_slide_pages's image shift in xnu
404 aslr_offset = random() % (1 << 28); // about 512 stacks
405 } else {
406 // Actually bigger than the image shift, we've got ~256MB to work with
407 aslr_offset = random() % (16 * PTH_DEFAULT_STACKSIZE);
408 }
409 aslr_offset = vm_map_trunc_page_mask(aslr_offset, vm_map_page_mask(vmap));
410 if (proc64bit) {
411 // Above nanomalloc range (see NANOZONE_SIGNATURE)
412 stackaddr = 0x700000000000 + aslr_offset;
413 } else {
414 stackaddr = SHARED_REGION_BASE_I386 + SHARED_REGION_SIZE_I386 + aslr_offset;
415 }
416 #elif defined(__arm__) || defined(__arm64__)
417 user_addr_t main_thread_stack_top = 0;
418 if (pthread_kern->proc_get_user_stack) {
419 main_thread_stack_top = pthread_kern->proc_get_user_stack(p);
420 }
421 if (proc64bit && main_thread_stack_top) {
422 // The main thread stack position is randomly slid by xnu (c.f.
423 // load_main() in mach_loader.c), so basing pthread stack allocations
424 // where the main thread stack ends is already ASLRd and doing so
425 // avoids creating a gap in the process address space that may cause
426 // extra PTE memory usage. rdar://problem/33328206
427 stackaddr = vm_map_trunc_page_mask((vm_map_offset_t)main_thread_stack_top,
428 vm_map_page_mask(vmap));
429 } else {
430 // vm_map_get_max_aslr_slide_pages ensures 1MB of slide; we do better
431 aslr_offset = random() % ((proc64bit ? 4 : 2) * PTH_DEFAULT_STACKSIZE);
432 aslr_offset = vm_map_trunc_page_mask((vm_map_offset_t)aslr_offset,
433 vm_map_page_mask(vmap));
434 if (proc64bit) {
435 // 64 stacks below shared region
436 stackaddr = SHARED_REGION_BASE_ARM64 - 64 * PTH_DEFAULT_STACKSIZE - aslr_offset;
437 } else {
438 // If you try to slide down from this point, you risk ending up in memory consumed by malloc
439 stackaddr = SHARED_REGION_BASE_ARM - 32 * PTH_DEFAULT_STACKSIZE + aslr_offset;
440 }
441 }
442 #else
443 #error Need to define a stack address hint for this architecture
444 #endif
445 return stackaddr;
446 }
447
448 /**
449 * bsdthread_create system call. Used by pthread_create.
450 */
451 int
452 _bsdthread_create(struct proc *p, user_addr_t user_func, user_addr_t user_funcarg, user_addr_t user_stack, user_addr_t user_pthread, uint32_t flags, user_addr_t *retval)
453 {
454 kern_return_t kret;
455 void * sright;
456 int error = 0;
457 int allocated = 0;
458 mach_vm_offset_t stackaddr;
459 mach_vm_size_t th_allocsize = 0;
460 mach_vm_size_t th_guardsize;
461 mach_vm_offset_t th_stack;
462 mach_vm_offset_t th_pthread;
463 mach_vm_offset_t th_tsd_base;
464 mach_port_name_t th_thport;
465 thread_t th;
466 vm_map_t vmap = pthread_kern->current_map();
467 task_t ctask = current_task();
468 unsigned int policy, importance;
469 uint32_t tsd_offset;
470
471 int isLP64 = 0;
472
473 if (pthread_kern->proc_get_register(p) == 0) {
474 return EINVAL;
475 }
476
477 PTHREAD_TRACE(TRACE_pthread_thread_create | DBG_FUNC_START, flags, 0, 0, 0, 0);
478
479 isLP64 = proc_is64bit(p);
480 th_guardsize = vm_map_page_size(vmap);
481
482 stackaddr = pthread_kern->proc_get_stack_addr_hint(p);
483 kret = pthread_kern->thread_create(ctask, &th);
484 if (kret != KERN_SUCCESS)
485 return(ENOMEM);
486 thread_reference(th);
487
488 pthread_kern->thread_set_tag(th, THREAD_TAG_PTHREAD);
489
490 sright = (void *)pthread_kern->convert_thread_to_port(th);
491 th_thport = pthread_kern->ipc_port_copyout_send(sright, pthread_kern->task_get_ipcspace(ctask));
492 if (!MACH_PORT_VALID(th_thport)) {
493 error = EMFILE; // userland will convert this into a crash
494 goto out;
495 }
496
497 if ((flags & PTHREAD_START_CUSTOM) == 0) {
498 mach_vm_size_t pthread_size =
499 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(vmap));
500 th_allocsize = th_guardsize + user_stack + pthread_size;
501 user_stack += PTHREAD_T_OFFSET;
502
503 kret = mach_vm_map(vmap, &stackaddr,
504 th_allocsize,
505 page_size-1,
506 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE , NULL,
507 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
508 VM_INHERIT_DEFAULT);
509 if (kret != KERN_SUCCESS){
510 kret = mach_vm_allocate(vmap,
511 &stackaddr, th_allocsize,
512 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE);
513 }
514 if (kret != KERN_SUCCESS) {
515 error = ENOMEM;
516 goto out;
517 }
518
519 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, th_allocsize, stackaddr, 0, 2, 0);
520
521 allocated = 1;
522 /*
523 * The guard page is at the lowest address
524 * The stack base is the highest address
525 */
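/*
 * Illustrative layout of the allocation (derived from the sizes computed in
 * this block):
 *
 *   stackaddr                                th_stack == th_pthread
 *   | guard (th_guardsize) | stack (user_stack) | pthread_t + TSD (pthread_size) |
 *
 * The stack grows down from th_stack toward the guard page, and the
 * pthread_t with its TSD sits immediately above the stack.
 */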
526 kret = mach_vm_protect(vmap, stackaddr, th_guardsize, FALSE, VM_PROT_NONE);
527
528 if (kret != KERN_SUCCESS) {
529 error = ENOMEM;
530 goto out1;
531 }
532
533 th_pthread = stackaddr + th_guardsize + user_stack;
534 th_stack = th_pthread;
535
536 /*
537 * Pre-fault the first page of the new thread's stack and the page that will
538 * contain the pthread_t structure.
539 */
540 if (vm_map_trunc_page_mask((vm_map_offset_t)(th_stack - C_64_REDZONE_LEN), vm_map_page_mask(vmap)) !=
541 vm_map_trunc_page_mask((vm_map_offset_t)th_pthread, vm_map_page_mask(vmap))){
542 vm_fault( vmap,
543 vm_map_trunc_page_mask((vm_map_offset_t)(th_stack - C_64_REDZONE_LEN), vm_map_page_mask(vmap)),
544 VM_PROT_READ | VM_PROT_WRITE,
545 FALSE,
546 THREAD_UNINT, NULL, 0);
547 }
548
549 vm_fault( vmap,
550 vm_map_trunc_page_mask((vm_map_offset_t)th_pthread, vm_map_page_mask(vmap)),
551 VM_PROT_READ | VM_PROT_WRITE,
552 FALSE,
553 THREAD_UNINT, NULL, 0);
554
555 } else {
556 th_stack = user_stack;
557 th_pthread = user_pthread;
558
559 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, 0, 0, 0, 3, 0);
560 }
561
562 tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
563 if (tsd_offset) {
564 th_tsd_base = th_pthread + tsd_offset;
565 kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
566 if (kret == KERN_SUCCESS) {
567 flags |= PTHREAD_START_TSD_BASE_SET;
568 }
569 }
570
571 #if defined(__i386__) || defined(__x86_64__)
572 /*
573 * Set up i386 registers & function call.
574 */
575 if (isLP64 == 0) {
576 x86_thread_state32_t state = {
577 .eip = (unsigned int)pthread_kern->proc_get_threadstart(p),
578 .eax = (unsigned int)th_pthread,
579 .ebx = (unsigned int)th_thport,
580 .ecx = (unsigned int)user_func,
581 .edx = (unsigned int)user_funcarg,
582 .edi = (unsigned int)user_stack,
583 .esi = (unsigned int)flags,
584 /*
585 * set stack pointer
586 */
587 .esp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN))
588 };
589
590 error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
591 if (error != KERN_SUCCESS) {
592 error = EINVAL;
593 goto out;
594 }
595 } else {
596 x86_thread_state64_t state64 = {
597 .rip = (uint64_t)pthread_kern->proc_get_threadstart(p),
598 .rdi = (uint64_t)th_pthread,
599 .rsi = (uint64_t)(th_thport),
600 .rdx = (uint64_t)user_func,
601 .rcx = (uint64_t)user_funcarg,
602 .r8 = (uint64_t)user_stack,
603 .r9 = (uint64_t)flags,
604 /*
605 * set stack pointer aligned to 16 byte boundary
606 */
607 .rsp = (uint64_t)(th_stack - C_64_REDZONE_LEN)
608 };
609
610 error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
611 if (error != KERN_SUCCESS) {
612 error = EINVAL;
613 goto out;
614 }
615
616 }
617 #elif defined(__arm__)
618 arm_thread_state_t state = {
619 .pc = (int)pthread_kern->proc_get_threadstart(p),
620 .r[0] = (unsigned int)th_pthread,
621 .r[1] = (unsigned int)th_thport,
622 .r[2] = (unsigned int)user_func,
623 .r[3] = (unsigned int)user_funcarg,
624 .r[4] = (unsigned int)user_stack,
625 .r[5] = (unsigned int)flags,
626
627 /* Set r7 & lr to 0 for better back tracing */
628 .r[7] = 0,
629 .lr = 0,
630
631 /*
632 * set stack pointer
633 */
634 .sp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN))
635 };
636
637 (void) pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
638
639 #else
640 #error bsdthread_create not defined for this architecture
641 #endif
642
643 if ((flags & PTHREAD_START_SETSCHED) != 0) {
644 /* Set scheduling parameters if needed */
645 thread_extended_policy_data_t extinfo;
646 thread_precedence_policy_data_t precedinfo;
647
648 importance = (flags & PTHREAD_START_IMPORTANCE_MASK);
649 policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;
650
651 if (policy == SCHED_OTHER) {
652 extinfo.timeshare = 1;
653 } else {
654 extinfo.timeshare = 0;
655 }
656
657 thread_policy_set(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT);
658
659 precedinfo.importance = (importance - BASEPRI_DEFAULT);
660 thread_policy_set(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT);
661 } else if ((flags & PTHREAD_START_QOSCLASS) != 0) {
662 /* Set thread QoS class if requested. */
663 pthread_priority_t priority = (pthread_priority_t)(flags & PTHREAD_START_QOSCLASS_MASK);
664
665 thread_qos_policy_data_t qos;
666 qos.qos_tier = pthread_priority_get_thread_qos(priority);
667 qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 :
668 _pthread_priority_get_relpri(priority);
669
670 pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
671 }
672
673 if (pthread_kern->proc_get_mach_thread_self_tsd_offset) {
674 uint64_t mach_thread_self_offset =
675 pthread_kern->proc_get_mach_thread_self_tsd_offset(p);
676 if (mach_thread_self_offset && tsd_offset) {
677 bool proc64bit = proc_is64bit(p);
678 if (proc64bit) {
679 uint64_t th_thport_tsd = (uint64_t)th_thport;
680 error = copyout(&th_thport_tsd, th_pthread + tsd_offset +
681 mach_thread_self_offset, sizeof(th_thport_tsd));
682 } else {
683 uint32_t th_thport_tsd = (uint32_t)th_thport;
684 error = copyout(&th_thport_tsd, th_pthread + tsd_offset +
685 mach_thread_self_offset, sizeof(th_thport_tsd));
686 }
687 if (error) {
688 goto out1;
689 }
690 }
691 }
692
693 kret = pthread_kern->thread_resume(th);
694 if (kret != KERN_SUCCESS) {
695 error = EINVAL;
696 goto out1;
697 }
698 thread_deallocate(th); /* drop the creator reference */
699
700 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_END, error, th_pthread, 0, 0, 0);
701
702 // cast required as mach_vm_offset_t is always 64 bits even on 32-bit platforms
703 *retval = (user_addr_t)th_pthread;
704
705 return(0);
706
707 out1:
708 if (allocated != 0) {
709 (void)mach_vm_deallocate(vmap, stackaddr, th_allocsize);
710 }
711 out:
712 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(ctask), th_thport);
713 if (pthread_kern->thread_will_park_or_terminate) {
714 pthread_kern->thread_will_park_or_terminate(th);
715 }
716 (void)thread_terminate(th);
717 (void)thread_deallocate(th);
718 return(error);
719 }
720
721 /**
722 * bsdthread_terminate system call. Used by pthread_terminate.
723 */
724 int
725 _bsdthread_terminate(__unused struct proc *p,
726 user_addr_t stackaddr,
727 size_t size,
728 uint32_t kthport,
729 uint32_t sem,
730 __unused int32_t *retval)
731 {
732 mach_vm_offset_t freeaddr;
733 mach_vm_size_t freesize;
734 kern_return_t kret;
735 thread_t th = current_thread();
736
737 freeaddr = (mach_vm_offset_t)stackaddr;
738 freesize = size;
739
740 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_START, freeaddr, freesize, kthport, 0xff, 0);
741
742 if ((freesize != (mach_vm_size_t)0) && (freeaddr != (mach_vm_offset_t)0)) {
743 if (pthread_kern->thread_get_tag(th) & THREAD_TAG_MAINTHREAD){
744 vm_map_t user_map = pthread_kern->current_map();
745 freesize = vm_map_trunc_page_mask((vm_map_offset_t)freesize - 1, vm_map_page_mask(user_map));
746 kret = mach_vm_behavior_set(user_map, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
747 assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
748 kret = kret ? kret : mach_vm_protect(user_map, freeaddr, freesize, FALSE, VM_PROT_NONE);
749 assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
750 } else {
751 kret = mach_vm_deallocate(pthread_kern->current_map(), freeaddr, freesize);
752 if (kret != KERN_SUCCESS) {
753 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);
754 return(EINVAL);
755 }
756 }
757 }
758
759 if (pthread_kern->thread_will_park_or_terminate) {
760 pthread_kern->thread_will_park_or_terminate(th);
761 }
762 (void)thread_terminate(th);
763 if (sem != MACH_PORT_NULL) {
764 kret = pthread_kern->semaphore_signal_internal_trap(sem);
765 if (kret != KERN_SUCCESS) {
766 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);
767 return(EINVAL);
768 }
769 }
770
771 if (kthport != MACH_PORT_NULL) {
772 pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(current_task()), kthport);
773 }
774
775 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0, 0, 0, 0);
776
777 pthread_kern->thread_exception_return();
778 panic("bsdthread_terminate: still running\n");
779
780 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0xff, 0, 0, 0);
781
782 return(0);
783 }
784
785 /**
786 * bsdthread_register system call. Performs per-process setup. Responsible for
787 * returning capability bits to userspace and receiving userspace function addresses.
788 */
789 int
790 _bsdthread_register(struct proc *p,
791 user_addr_t threadstart,
792 user_addr_t wqthread,
793 int pthsize,
794 user_addr_t pthread_init_data,
795 user_addr_t pthread_init_data_size,
796 uint64_t dispatchqueue_offset,
797 int32_t *retval)
798 {
799 struct _pthread_registration_data data = {};
800 uint32_t max_tsd_offset;
801 kern_return_t kr;
802 size_t pthread_init_sz = 0;
803
804 /* syscall randomizer test can pass bogus values */
805 if (pthsize < 0 || pthsize > MAX_PTHREAD_SIZE) {
806 return(EINVAL);
807 }
808 /*
809 * If we have pthread_init_data, then we use that and target_concptr
810 * (which is an offset) to get the data.
811 */
812 if (pthread_init_data != 0) {
813 if (pthread_init_data_size < sizeof(data.version)) {
814 return EINVAL;
815 }
816 pthread_init_sz = MIN(sizeof(data), (size_t)pthread_init_data_size);
817 int ret = copyin(pthread_init_data, &data, pthread_init_sz);
818 if (ret) {
819 return ret;
820 }
821 if (data.version != (size_t)pthread_init_data_size) {
822 return EINVAL;
823 }
824 } else {
825 data.dispatch_queue_offset = dispatchqueue_offset;
826 }
827
828 /* We have to do this before proc_get_register so that it resets after fork */
829 mach_vm_offset_t stackaddr = stack_addr_hint(p, pthread_kern->current_map());
830 pthread_kern->proc_set_stack_addr_hint(p, (user_addr_t)stackaddr);
831
832 /* prevent multiple registrations */
833 if (pthread_kern->proc_get_register(p) != 0) {
834 return(EINVAL);
835 }
836
837 pthread_kern->proc_set_threadstart(p, threadstart);
838 pthread_kern->proc_set_wqthread(p, wqthread);
839 pthread_kern->proc_set_pthsize(p, pthsize);
840 pthread_kern->proc_set_register(p);
841
842 uint32_t tsd_slot_sz = proc_is64bit(p) ? sizeof(uint64_t) : sizeof(uint32_t);
843 if ((uint32_t)pthsize >= tsd_slot_sz &&
844 data.tsd_offset <= (uint32_t)(pthsize - tsd_slot_sz)) {
845 max_tsd_offset = ((uint32_t)pthsize - data.tsd_offset - tsd_slot_sz);
846 } else {
847 data.tsd_offset = 0;
848 max_tsd_offset = 0;
849 }
850 pthread_kern->proc_set_pthread_tsd_offset(p, data.tsd_offset);
851
852 if (data.dispatch_queue_offset > max_tsd_offset) {
853 data.dispatch_queue_offset = 0;
854 }
855 pthread_kern->proc_set_dispatchqueue_offset(p, data.dispatch_queue_offset);
856
857 if (pthread_kern->proc_set_return_to_kernel_offset) {
858 if (data.return_to_kernel_offset > max_tsd_offset) {
859 data.return_to_kernel_offset = 0;
860 }
861 pthread_kern->proc_set_return_to_kernel_offset(p,
862 data.return_to_kernel_offset);
863 }
864
865 if (pthread_kern->proc_set_mach_thread_self_tsd_offset) {
866 if (data.mach_thread_self_offset > max_tsd_offset) {
867 data.mach_thread_self_offset = 0;
868 }
869 pthread_kern->proc_set_mach_thread_self_tsd_offset(p,
870 data.mach_thread_self_offset);
871 }
872
873 if (pthread_init_data != 0) {
874 /* Outgoing data that userspace expects as a reply */
875 data.version = sizeof(struct _pthread_registration_data);
876 if (pthread_kern->qos_main_thread_active()) {
877 mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
878 thread_qos_policy_data_t qos;
879 boolean_t gd = FALSE;
880
881 kr = pthread_kern->thread_policy_get(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
882 if (kr != KERN_SUCCESS || qos.qos_tier == THREAD_QOS_UNSPECIFIED) {
883 /* An unspecified QoS means the kernel wants us to impose the legacy QoS class upon the thread. */
884 qos.qos_tier = THREAD_QOS_LEGACY;
885 qos.tier_importance = 0;
886
887 kr = pthread_kern->thread_policy_set_internal(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
888 }
889
890 if (kr == KERN_SUCCESS) {
891 data.main_qos = thread_qos_get_pthread_priority(qos.qos_tier);
892 } else {
893 data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);
894 }
895 } else {
896 data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);
897 }
898
899 data.mutex_default_policy = pthread_mutex_default_policy;
900
901 kr = copyout(&data, pthread_init_data, pthread_init_sz);
902 if (kr != KERN_SUCCESS) {
903 return EINVAL;
904 }
905 }
906
907 /* return the supported feature set as the return value. */
908 *retval = PTHREAD_FEATURE_SUPPORTED;
909
910 return(0);
911 }
912
913 #pragma mark - QoS Manipulation
914
915 int
916 _bsdthread_ctl_set_qos(struct proc *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t tsd_priority_addr, user_addr_t arg3, int *retval)
917 {
918 int rv;
919 thread_t th;
920
921 pthread_priority_t priority;
922
923 /* Unused parameters must be zero. */
924 if (arg3 != 0) {
925 return EINVAL;
926 }
927
928 /* QoS is stored in a given slot in the pthread TSD. We need to copy that in and set our QoS based on it. */
929 if (proc_is64bit(p)) {
930 uint64_t v;
931 rv = copyin(tsd_priority_addr, &v, sizeof(v));
932 if (rv) goto out;
933 priority = (int)(v & 0xffffffff);
934 } else {
935 uint32_t v;
936 rv = copyin(tsd_priority_addr, &v, sizeof(v));
937 if (rv) goto out;
938 priority = v;
939 }
940
941 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
942 return ESRCH;
943 }
944
945 /* <rdar://problem/16211829> Disable pthread_set_qos_class_np() on threads other than pthread_self */
946 if (th != current_thread()) {
947 thread_deallocate(th);
948 return EPERM;
949 }
950
951 rv = _bsdthread_ctl_set_self(p, 0, priority, 0, _PTHREAD_SET_SELF_QOS_FLAG, retval);
952
953 /* Static param the thread; we just set QoS on it, so it's stuck in QoS land now. */
954 /* pthread_kern->thread_static_param(th, TRUE); */ // see <rdar://problem/16433744> for details
955
956 thread_deallocate(th);
957
958 out:
959 return rv;
960 }
961
962 static inline struct threadlist *
963 util_get_thread_threadlist_entry(thread_t th)
964 {
965 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
966 if (uth) {
967 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
968 return tl;
969 }
970 return NULL;
971 }
972
973 boolean_t
974 _workq_thread_has_been_unbound(thread_t th, int qos_class)
975 {
976 struct threadlist *tl = util_get_thread_threadlist_entry(th);
977 if (!tl) {
978 return FALSE;
979 }
980
981 struct workqueue *wq = tl->th_workq;
982 workqueue_lock_spin(wq);
983
984 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
985 goto failure;
986 } else if (qos_class != class_index_get_thread_qos(tl->th_priority)) {
987 goto failure;
988 }
989
990 if ((tl->th_flags & TH_LIST_KEVENT_BOUND)){
991 goto failure;
992 }
993 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
994
995 workqueue_unlock(wq);
996 return TRUE;
997
998 failure:
999 workqueue_unlock(wq);
1000 return FALSE;
1001 }
1002
1003 int
1004 _bsdthread_ctl_set_self(struct proc *p, user_addr_t __unused cmd, pthread_priority_t priority, mach_port_name_t voucher, _pthread_set_flags_t flags, int __unused *retval)
1005 {
1006 thread_qos_policy_data_t qos;
1007 mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
1008 boolean_t gd = FALSE;
1009 thread_t th = current_thread();
1010 struct workqueue *wq = NULL;
1011 struct threadlist *tl = NULL;
1012
1013 kern_return_t kr;
1014 int qos_rv = 0, voucher_rv = 0, fixedpri_rv = 0;
1015
1016 if ((flags & _PTHREAD_SET_SELF_WQ_KEVENT_UNBIND) != 0) {
1017 tl = util_get_thread_threadlist_entry(th);
1018 if (tl) {
1019 wq = tl->th_workq;
1020 } else {
1021 goto qos;
1022 }
1023
1024 workqueue_lock_spin(wq);
1025 if (tl->th_flags & TH_LIST_KEVENT_BOUND) {
1026 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
1027 unsigned int kevent_flags = KEVENT_FLAG_WORKQ | KEVENT_FLAG_UNBIND_CHECK_FLAGS;
1028 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1029 kevent_flags |= KEVENT_FLAG_WORKQ_MANAGER;
1030 }
1031
1032 workqueue_unlock(wq);
1033 __assert_only int ret = kevent_qos_internal_unbind(p, class_index_get_thread_qos(tl->th_priority), th, kevent_flags);
1034 assert(ret == 0);
1035 } else {
1036 workqueue_unlock(wq);
1037 }
1038 }
1039
1040 qos:
1041 if ((flags & _PTHREAD_SET_SELF_QOS_FLAG) != 0) {
1042 kr = pthread_kern->thread_policy_get(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
1043 if (kr != KERN_SUCCESS) {
1044 qos_rv = EINVAL;
1045 goto voucher;
1046 }
1047
1048 /*
1049 * If we have main-thread QoS then we don't allow a thread to come out
1050 * of QOS_CLASS_UNSPECIFIED.
1051 */
1052 if (pthread_kern->qos_main_thread_active() && qos.qos_tier ==
1053 THREAD_QOS_UNSPECIFIED) {
1054 qos_rv = EPERM;
1055 goto voucher;
1056 }
1057
1058 if (!tl) {
1059 tl = util_get_thread_threadlist_entry(th);
1060 if (tl) wq = tl->th_workq;
1061 }
1062
1063 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self | DBG_FUNC_START, wq, qos.qos_tier, qos.tier_importance, 0, 0);
1064
1065 qos.qos_tier = pthread_priority_get_thread_qos(priority);
1066 qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 : _pthread_priority_get_relpri(priority);
1067
1068 if (qos.qos_tier == QOS_CLASS_UNSPECIFIED ||
1069 qos.tier_importance > 0 || qos.tier_importance < THREAD_QOS_MIN_TIER_IMPORTANCE) {
1070 qos_rv = EINVAL;
1071 goto voucher;
1072 }
1073
1074 /*
1075 * If we're a workqueue thread, the threadlist item priority needs adjusting,
1076 * along with the bucket we were running in.
1077 */
1078 if (tl) {
1079 bool try_run_threadreq = false;
1080
1081 workqueue_lock_spin(wq);
1082 kr = pthread_kern->thread_set_workq_qos(th, qos.qos_tier, qos.tier_importance);
1083 assert(kr == KERN_SUCCESS || kr == KERN_TERMINATED);
1084
1085 /* Fix up counters. */
1086 uint8_t old_bucket = tl->th_priority;
1087 uint8_t new_bucket = pthread_priority_get_class_index(priority);
1088
1089 if (old_bucket != new_bucket) {
1090 _wq_thactive_move(wq, old_bucket, new_bucket);
1091 wq->wq_thscheduled_count[old_bucket]--;
1092 wq->wq_thscheduled_count[new_bucket]++;
1093 if (old_bucket == WORKQUEUE_EVENT_MANAGER_BUCKET ||
1094 old_bucket < new_bucket) {
1095 /*
1096 * if the QoS of the thread was lowered, then this could
1097 * allow for a higher QoS thread request to run, so we need
1098 * to reevaluate.
1099 */
1100 try_run_threadreq = true;
1101 }
1102 tl->th_priority = new_bucket;
1103 }
1104
1105 bool old_overcommit = !(tl->th_flags & TH_LIST_CONSTRAINED);
1106 bool new_overcommit = priority & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
1107 if (!old_overcommit && new_overcommit) {
1108 if (wq->wq_constrained_threads_scheduled-- ==
1109 wq_max_constrained_threads) {
1110 try_run_threadreq = true;
1111 }
1112 tl->th_flags &= ~TH_LIST_CONSTRAINED;
1113 } else if (old_overcommit && !new_overcommit) {
1114 wq->wq_constrained_threads_scheduled++;
1115 tl->th_flags |= TH_LIST_CONSTRAINED;
1116 }
1117
1118 if (try_run_threadreq) {
1119 workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
1120 } else {
1121 workqueue_unlock(wq);
1122 }
1123 } else {
1124 kr = pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
1125 if (kr != KERN_SUCCESS) {
1126 qos_rv = EINVAL;
1127 }
1128 }
1129
1130 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self | DBG_FUNC_END, wq, qos.qos_tier, qos.tier_importance, 0, 0);
1131 }
1132
1133 voucher:
1134 if ((flags & _PTHREAD_SET_SELF_VOUCHER_FLAG) != 0) {
1135 kr = pthread_kern->thread_set_voucher_name(voucher);
1136 if (kr != KERN_SUCCESS) {
1137 voucher_rv = ENOENT;
1138 goto fixedpri;
1139 }
1140 }
1141
1142 fixedpri:
1143 if (qos_rv) goto done;
1144 if ((flags & _PTHREAD_SET_SELF_FIXEDPRIORITY_FLAG) != 0) {
1145 thread_extended_policy_data_t extpol = {.timeshare = 0};
1146
1147 if (!tl) tl = util_get_thread_threadlist_entry(th);
1148 if (tl) {
1149 /* Not allowed on workqueue threads */
1150 fixedpri_rv = ENOTSUP;
1151 goto done;
1152 }
1153
1154 kr = pthread_kern->thread_policy_set_internal(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
1155 if (kr != KERN_SUCCESS) {
1156 fixedpri_rv = EINVAL;
1157 goto done;
1158 }
1159 } else if ((flags & _PTHREAD_SET_SELF_TIMESHARE_FLAG) != 0) {
1160 thread_extended_policy_data_t extpol = {.timeshare = 1};
1161
1162 if (!tl) tl = util_get_thread_threadlist_entry(th);
1163 if (tl) {
1164 /* Not allowed on workqueue threads */
1165 fixedpri_rv = ENOTSUP;
1166 goto done;
1167 }
1168
1169 kr = pthread_kern->thread_policy_set_internal(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
1170 if (kr != KERN_SUCCESS) {
1171 fixedpri_rv = EINVAL;
1172 goto done;
1173 }
1174 }
1175
1176 done:
1177 if (qos_rv && voucher_rv) {
1178 /* Both failed, give that a unique error. */
1179 return EBADMSG;
1180 }
1181
1182 if (qos_rv) {
1183 return qos_rv;
1184 }
1185
1186 if (voucher_rv) {
1187 return voucher_rv;
1188 }
1189
1190 if (fixedpri_rv) {
1191 return fixedpri_rv;
1192 }
1193
1194 return 0;
1195 }
1196
1197 int
1198 _bsdthread_ctl_qos_override_start(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval)
1199 {
1200 thread_t th;
1201 int rv = 0;
1202
1203 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
1204 return ESRCH;
1205 }
1206
1207 int override_qos = pthread_priority_get_thread_qos(priority);
1208
1209 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1210 if (tl) {
1211 PTHREAD_TRACE_WQ(TRACE_wq_override_start | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0);
1212 }
1213
1214 /* The only failure case here is if we pass a tid and have it look up the thread; since we pass the uthread, this always succeeds. */
1215 pthread_kern->proc_usynch_thread_qos_add_override_for_resource_check_owner(th, override_qos, TRUE,
1216 resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE, USER_ADDR_NULL, MACH_PORT_NULL);
1217 thread_deallocate(th);
1218 return rv;
1219 }
1220
1221 int
1222 _bsdthread_ctl_qos_override_end(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t resource, user_addr_t arg3, int __unused *retval)
1223 {
1224 thread_t th;
1225 int rv = 0;
1226
1227 if (arg3 != 0) {
1228 return EINVAL;
1229 }
1230
1231 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
1232 return ESRCH;
1233 }
1234
1235 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
1236
1237 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1238 if (tl) {
1239 PTHREAD_TRACE_WQ(TRACE_wq_override_end | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 0, 0, 0);
1240 }
1241
1242 pthread_kern->proc_usynch_thread_qos_remove_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);
1243
1244 thread_deallocate(th);
1245 return rv;
1246 }
1247
1248 static int
1249 _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, user_addr_t ulock_addr)
1250 {
1251 thread_t th;
1252 int rv = 0;
1253
1254 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
1255 return ESRCH;
1256 }
1257
1258 int override_qos = pthread_priority_get_thread_qos(priority);
1259
1260 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1261 if (!tl) {
1262 thread_deallocate(th);
1263 return EPERM;
1264 }
1265
1266 PTHREAD_TRACE_WQ(TRACE_wq_override_dispatch | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0);
1267
1268 rv = pthread_kern->proc_usynch_thread_qos_add_override_for_resource_check_owner(th, override_qos, TRUE,
1269 resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE, ulock_addr, kport);
1270
1271 thread_deallocate(th);
1272 return rv;
1273 }
1274
1275 int _bsdthread_ctl_qos_dispatch_asynchronous_override_add(struct proc __unused *p, user_addr_t __unused cmd,
1276 mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval)
1277 {
1278 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport, priority, resource, USER_ADDR_NULL);
1279 }
1280
1281 int
1282 _bsdthread_ctl_qos_override_dispatch(struct proc *p __unused, user_addr_t cmd __unused, mach_port_name_t kport, pthread_priority_t priority, user_addr_t ulock_addr, int __unused *retval)
1283 {
1284 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport, priority, USER_ADDR_NULL, ulock_addr);
1285 }
1286
1287 int
1288 _bsdthread_ctl_qos_override_reset(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval)
1289 {
1290 if (arg1 != 0 || arg2 != 0 || arg3 != 0) {
1291 return EINVAL;
1292 }
1293
1294 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, 1 /* reset_all */, 0, 0, retval);
1295 }
1296
1297 int
1298 _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(struct proc __unused *p, user_addr_t __unused cmd, int reset_all, user_addr_t resource, user_addr_t arg3, int __unused *retval)
1299 {
1300 if ((reset_all && (resource != 0)) || arg3 != 0) {
1301 return EINVAL;
1302 }
1303
1304 thread_t th = current_thread();
1305 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
1306 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
1307
1308 if (!tl) {
1309 return EPERM;
1310 }
1311
1312 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_NONE, tl->th_workq, 0, 0, 0, 0);
1313
1314 resource = reset_all ? THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD : resource;
1315 pthread_kern->proc_usynch_thread_qos_reset_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE);
1316
1317 return 0;
1318 }
1319
1320 static int
1321 _bsdthread_ctl_max_parallelism(struct proc __unused *p, user_addr_t __unused cmd,
1322 int qos, unsigned long flags, int *retval)
1323 {
1324 _Static_assert(QOS_PARALLELISM_COUNT_LOGICAL ==
1325 _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL, "logical");
1326 _Static_assert(QOS_PARALLELISM_REALTIME ==
1327 _PTHREAD_QOS_PARALLELISM_REALTIME, "realtime");
1328
1329 if (flags & ~(QOS_PARALLELISM_REALTIME | QOS_PARALLELISM_COUNT_LOGICAL)) {
1330 return EINVAL;
1331 }
1332
1333 if (flags & QOS_PARALLELISM_REALTIME) {
1334 if (qos) {
1335 return EINVAL;
1336 }
1337 } else if (qos == THREAD_QOS_UNSPECIFIED || qos >= THREAD_QOS_LAST) {
1338 return EINVAL;
1339 }
1340
1341 *retval = pthread_kern->qos_max_parallelism(qos, flags);
1342 return 0;
1343 }
1344
1345 int
1346 _bsdthread_ctl(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval)
1347 {
1348 switch (cmd) {
1349 case BSDTHREAD_CTL_SET_QOS:
1350 return _bsdthread_ctl_set_qos(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
1351 case BSDTHREAD_CTL_QOS_OVERRIDE_START:
1352 return _bsdthread_ctl_qos_override_start(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1353 case BSDTHREAD_CTL_QOS_OVERRIDE_END:
1354 return _bsdthread_ctl_qos_override_end(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
1355 case BSDTHREAD_CTL_QOS_OVERRIDE_RESET:
1356 return _bsdthread_ctl_qos_override_reset(p, cmd, arg1, arg2, arg3, retval);
1357 case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH:
1358 return _bsdthread_ctl_qos_override_dispatch(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1359 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD:
1360 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1361 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET:
1362 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, (int)arg1, arg2, arg3, retval);
1363 case BSDTHREAD_CTL_SET_SELF:
1364 return _bsdthread_ctl_set_self(p, cmd, (pthread_priority_t)arg1, (mach_port_name_t)arg2, (_pthread_set_flags_t)arg3, retval);
1365 case BSDTHREAD_CTL_QOS_MAX_PARALLELISM:
1366 return _bsdthread_ctl_max_parallelism(p, cmd, (int)arg1, (unsigned long)arg2, retval);
1367 default:
1368 return EINVAL;
1369 }
1370 }
1371
1372 #pragma mark - Workqueue Implementation
1373
1374 #pragma mark wq_flags
1375
1376 static inline uint32_t
1377 _wq_flags(struct workqueue *wq)
1378 {
1379 return atomic_load_explicit(&wq->wq_flags, memory_order_relaxed);
1380 }
1381
1382 static inline bool
1383 _wq_exiting(struct workqueue *wq)
1384 {
1385 return _wq_flags(wq) & WQ_EXITING;
1386 }
1387
1388 static inline uint32_t
1389 _wq_flags_or_orig(struct workqueue *wq, uint32_t v)
1390 {
1391 #if PTHREAD_INLINE_RMW_ATOMICS
1392 uint32_t state;
1393 do {
1394 state = _wq_flags(wq);
1395 } while (!OSCompareAndSwap(state, state | v, &wq->wq_flags));
1396 return state;
1397 #else
1398 return atomic_fetch_or_explicit(&wq->wq_flags, v, memory_order_relaxed);
1399 #endif
1400 }
1401
1402 static inline uint32_t
1403 _wq_flags_and_orig(struct workqueue *wq, uint32_t v)
1404 {
1405 #if PTHREAD_INLINE_RMW_ATOMICS
1406 uint32_t state;
1407 do {
1408 state = _wq_flags(wq);
1409 } while (!OSCompareAndSwap(state, state & v, &wq->wq_flags));
1410 return state;
1411 #else
1412 return atomic_fetch_and_explicit(&wq->wq_flags, v, memory_order_relaxed);
1413 #endif
1414 }
1415
1416 static inline bool
1417 WQ_TIMER_DELAYED_NEEDED(struct workqueue *wq)
1418 {
1419 uint32_t oldflags, newflags;
1420 do {
1421 oldflags = _wq_flags(wq);
1422 if (oldflags & (WQ_EXITING | WQ_ATIMER_DELAYED_RUNNING)) {
1423 return false;
1424 }
1425 newflags = oldflags | WQ_ATIMER_DELAYED_RUNNING;
1426 } while (!OSCompareAndSwap(oldflags, newflags, &wq->wq_flags));
1427 return true;
1428 }
1429
1430 static inline bool
1431 WQ_TIMER_IMMEDIATE_NEEDED(struct workqueue *wq)
1432 {
1433 uint32_t oldflags, newflags;
1434 do {
1435 oldflags = _wq_flags(wq);
1436 if (oldflags & (WQ_EXITING | WQ_ATIMER_IMMEDIATE_RUNNING)) {
1437 return false;
1438 }
1439 newflags = oldflags | WQ_ATIMER_IMMEDIATE_RUNNING;
1440 } while (!OSCompareAndSwap(oldflags, newflags, &wq->wq_flags));
1441 return true;
1442 }
1443
1444 #pragma mark thread requests pacing
1445
1446 static inline uint32_t
1447 _wq_pacing_shift_for_pri(int pri)
1448 {
1449 return _wq_bucket_to_thread_qos(pri) - 1;
1450 }
1451
1452 static inline int
1453 _wq_highest_paced_priority(struct workqueue *wq)
1454 {
1455 uint8_t paced = wq->wq_paced;
1456 int msb = paced ? 32 - __builtin_clz(paced) : 0; // fls(paced) == bit + 1
1457 return WORKQUEUE_EVENT_MANAGER_BUCKET - msb;
1458 }
1459
1460 static inline uint8_t
1461 _wq_pacing_bit_for_pri(int pri)
1462 {
1463 return 1u << _wq_pacing_shift_for_pri(pri);
1464 }
1465
1466 static inline bool
1467 _wq_should_pace_priority(struct workqueue *wq, int pri)
1468 {
1469 return wq->wq_paced >= _wq_pacing_bit_for_pri(pri);
1470 }
1471
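/*
 * Worked example (illustrative): a thread pacing in the IN bucket (pri 1,
 * thread QoS 5 per the table at the top of the file) sets 1u << 4 in
 * wq_paced.  While that bit is set, _wq_should_pace_priority() is true for
 * every bucket of equal or lower QoS (pri >= 1), because wq_paced >= the
 * bucket's pacing bit exactly when some bit at or above that position is set.
 */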
1472 static inline void
1473 _wq_pacing_start(struct workqueue *wq, struct threadlist *tl)
1474 {
1475 uint8_t bit = _wq_pacing_bit_for_pri(tl->th_priority);
1476 assert((tl->th_flags & TH_LIST_PACING) == 0);
1477 assert((wq->wq_paced & bit) == 0);
1478 wq->wq_paced |= bit;
1479 tl->th_flags |= TH_LIST_PACING;
1480 }
1481
1482 static inline bool
1483 _wq_pacing_end(struct workqueue *wq, struct threadlist *tl)
1484 {
1485 if (tl->th_flags & TH_LIST_PACING) {
1486 uint8_t bit = _wq_pacing_bit_for_pri(tl->th_priority);
1487 assert((wq->wq_paced & bit) != 0);
1488 wq->wq_paced ^= bit;
1489 tl->th_flags &= ~TH_LIST_PACING;
1490 return wq->wq_paced < bit; // !_wq_should_pace_priority
1491 }
1492 return false;
1493 }
1494
1495 #pragma mark thread requests
1496
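/*
 * Overview (added for clarity, derived from the functions below): a thread
 * request starts out TR_STATE_NEW (_threadreq_init_alloced/_init_stack),
 * becomes TR_STATE_WAITING once queued by _threadreq_enqueue (on-stack
 * requests are copied into the cached zone request at that point and the
 * original is marked TR_STATE_DEAD), and finishes as TR_STATE_COMPLETE via
 * _threadreq_complete_and_unlock (which also accepts a TR_STATE_NEW request
 * on the synchronous path) or as TR_STATE_DEAD via _threadreq_cancel.
 */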
1497 static void
1498 _threadreq_init_alloced(struct threadreq *req, int priority, int flags)
1499 {
1500 assert((flags & TR_FLAG_ONSTACK) == 0);
1501 req->tr_state = TR_STATE_NEW;
1502 req->tr_priority = priority;
1503 req->tr_flags = flags;
1504 }
1505
1506 static void
1507 _threadreq_init_stack(struct threadreq *req, int priority, int flags)
1508 {
1509 req->tr_state = TR_STATE_NEW;
1510 req->tr_priority = priority;
1511 req->tr_flags = flags | TR_FLAG_ONSTACK;
1512 }
1513
1514 static void
1515 _threadreq_copy_prepare(struct workqueue *wq)
1516 {
1517 again:
1518 if (wq->wq_cached_threadreq) {
1519 return;
1520 }
1521
1522 workqueue_unlock(wq);
1523 struct threadreq *req = zalloc(pthread_zone_threadreq);
1524 workqueue_lock_spin(wq);
1525
1526 if (wq->wq_cached_threadreq) {
1527 /*
1528 * We lost the race and someone left behind an extra threadreq for us
1529 * to use. Throw away our request and retry.
1530 */
1531 workqueue_unlock(wq);
1532 zfree(pthread_zone_threadreq, req);
1533 workqueue_lock_spin(wq);
1534 goto again;
1535 } else {
1536 wq->wq_cached_threadreq = req;
1537 }
1538
1539 assert(wq->wq_cached_threadreq);
1540 }
1541
1542 static bool
1543 _threadreq_copy_prepare_noblock(struct workqueue *wq)
1544 {
1545 if (wq->wq_cached_threadreq) {
1546 return true;
1547 }
1548
1549 wq->wq_cached_threadreq = zalloc_noblock(pthread_zone_threadreq);
1550
1551 return wq->wq_cached_threadreq != NULL;
1552 }
1553
1554 static inline struct threadreq_head *
1555 _threadreq_list_for_req(struct workqueue *wq, const struct threadreq *req)
1556 {
1557 if (req->tr_flags & TR_FLAG_OVERCOMMIT) {
1558 return &wq->wq_overcommit_reqlist[req->tr_priority];
1559 } else {
1560 return &wq->wq_reqlist[req->tr_priority];
1561 }
1562 }
1563
1564 static void
1565 _threadreq_enqueue(struct workqueue *wq, struct threadreq *req)
1566 {
1567 assert(req && req->tr_state == TR_STATE_NEW);
1568 if (req->tr_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1569 assert(wq->wq_event_manager_threadreq.tr_state != TR_STATE_WAITING);
1570 memcpy(&wq->wq_event_manager_threadreq, req, sizeof(struct threadreq));
1571 req = &wq->wq_event_manager_threadreq;
1572 req->tr_flags &= ~(TR_FLAG_ONSTACK | TR_FLAG_NO_PACING);
1573 } else {
1574 if (req->tr_flags & TR_FLAG_ONSTACK) {
1575 assert(wq->wq_cached_threadreq);
1576 struct threadreq *newreq = wq->wq_cached_threadreq;
1577 wq->wq_cached_threadreq = NULL;
1578
1579 memcpy(newreq, req, sizeof(struct threadreq));
1580 newreq->tr_flags &= ~(TR_FLAG_ONSTACK | TR_FLAG_NO_PACING);
1581 req->tr_state = TR_STATE_DEAD;
1582 req = newreq;
1583 }
1584 TAILQ_INSERT_TAIL(_threadreq_list_for_req(wq, req), req, tr_entry);
1585 }
1586 req->tr_state = TR_STATE_WAITING;
1587 wq->wq_reqcount++;
1588 }
1589
1590 static void
1591 _threadreq_dequeue(struct workqueue *wq, struct threadreq *req)
1592 {
1593 if (req->tr_priority != WORKQUEUE_EVENT_MANAGER_BUCKET) {
1594 struct threadreq_head *req_list = _threadreq_list_for_req(wq, req);
1595 #if DEBUG
1596 struct threadreq *cursor = NULL;
1597 TAILQ_FOREACH(cursor, req_list, tr_entry) {
1598 if (cursor == req) break;
1599 }
1600 assert(cursor == req);
1601 #endif
1602 TAILQ_REMOVE(req_list, req, tr_entry);
1603 }
1604 wq->wq_reqcount--;
1605 }
1606
1607 /*
1608 * Mark a thread request as complete. At this point, it is treated as owned by
1609 * the submitting subsystem and you should assume it could be freed.
1610 *
1611 * Called with the workqueue lock held.
1612 */
1613 static int
1614 _threadreq_complete_and_unlock(proc_t p, struct workqueue *wq,
1615 struct threadreq *req, struct threadlist *tl)
1616 {
1617 struct threadreq *req_tofree = NULL;
1618 bool sync = (req->tr_state == TR_STATE_NEW);
1619 bool workloop = req->tr_flags & TR_FLAG_WORKLOOP;
1620 bool onstack = req->tr_flags & TR_FLAG_ONSTACK;
1621 bool kevent = req->tr_flags & TR_FLAG_KEVENT;
1622 bool unbinding = tl->th_flags & TH_LIST_UNBINDING;
1623 bool locked = true;
1624 bool waking_parked_thread = (tl->th_flags & TH_LIST_BUSY);
1625 int ret;
1626
1627 req->tr_state = TR_STATE_COMPLETE;
1628
1629 if (!workloop && !onstack && req != &wq->wq_event_manager_threadreq) {
1630 if (wq->wq_cached_threadreq) {
1631 req_tofree = req;
1632 } else {
1633 wq->wq_cached_threadreq = req;
1634 }
1635 }
1636
1637 if (tl->th_flags & TH_LIST_UNBINDING) {
1638 tl->th_flags &= ~TH_LIST_UNBINDING;
1639 assert((tl->th_flags & TH_LIST_KEVENT_BOUND));
1640 } else if (workloop || kevent) {
1641 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
1642 tl->th_flags |= TH_LIST_KEVENT_BOUND;
1643 }
1644
1645 if (workloop) {
1646 workqueue_unlock(wq);
1647 ret = pthread_kern->workloop_fulfill_threadreq(wq->wq_proc, (void*)req,
1648 tl->th_thread, sync ? WORKLOOP_FULFILL_THREADREQ_SYNC : 0);
1649 assert(ret == 0);
1650 locked = false;
1651 } else if (kevent) {
1652 unsigned int kevent_flags = KEVENT_FLAG_WORKQ;
1653 if (sync) {
1654 kevent_flags |= KEVENT_FLAG_SYNCHRONOUS_BIND;
1655 }
1656 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1657 kevent_flags |= KEVENT_FLAG_WORKQ_MANAGER;
1658 }
1659 workqueue_unlock(wq);
1660 ret = kevent_qos_internal_bind(wq->wq_proc,
1661 class_index_get_thread_qos(tl->th_priority), tl->th_thread,
1662 kevent_flags);
1663 if (ret != 0) {
1664 workqueue_lock_spin(wq);
1665 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
1666 locked = true;
1667 } else {
1668 locked = false;
1669 }
1670 }
1671
1672 /*
1673 * Run Thread, Run!
1674 */
1675 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 0, 0, 0, 0);
1676 PTHREAD_TRACE_WQ_REQ(TRACE_wq_runitem | DBG_FUNC_START, wq, req, tl->th_priority,
1677 thread_tid(current_thread()), thread_tid(tl->th_thread));
1678
1679 if (waking_parked_thread) {
1680 if (!locked) {
1681 workqueue_lock_spin(wq);
1682 }
1683 tl->th_flags &= ~(TH_LIST_BUSY);
1684 if ((tl->th_flags & TH_LIST_REMOVING_VOUCHER) == 0) {
1685 /*
1686 * If the thread is in the process of removing its voucher, then it
1687 * isn't actually in the wait event yet and we don't need to wake
1688 * it up. Save the trouble (and potential lock-ordering issues
1689 * (see 30617015)).
1690 */
1691 thread_wakeup_thread(tl, tl->th_thread);
1692 }
1693 workqueue_unlock(wq);
1694
1695 if (req_tofree) zfree(pthread_zone_threadreq, req_tofree);
1696 return WQ_RUN_TR_THREAD_STARTED;
1697 }
1698
1699 assert ((tl->th_flags & TH_LIST_PACING) == 0);
1700 if (locked) {
1701 workqueue_unlock(wq);
1702 }
1703 if (req_tofree) zfree(pthread_zone_threadreq, req_tofree);
1704 if (unbinding) {
1705 return WQ_RUN_TR_THREAD_STARTED;
1706 }
1707 _setup_wqthread(p, tl->th_thread, wq, tl, WQ_SETUP_CLEAR_VOUCHER);
1708 pthread_kern->unix_syscall_return(EJUSTRETURN);
1709 __builtin_unreachable();
1710 }
1711
1712 /*
1713 * Mark a thread request as cancelled. Has similar ownership semantics to the
1714 * complete call above.
1715 */
1716 static void
1717 _threadreq_cancel(struct workqueue *wq, struct threadreq *req)
1718 {
1719 assert(req->tr_state == TR_STATE_WAITING);
1720 req->tr_state = TR_STATE_DEAD;
1721
1722 assert((req->tr_flags & TR_FLAG_ONSTACK) == 0);
1723 if (req->tr_flags & TR_FLAG_WORKLOOP) {
1724 __assert_only int ret;
1725 ret = pthread_kern->workloop_fulfill_threadreq(wq->wq_proc, (void*)req,
1726 THREAD_NULL, WORKLOOP_FULFILL_THREADREQ_CANCEL);
1727 assert(ret == 0 || ret == ECANCELED);
1728 } else if (req != &wq->wq_event_manager_threadreq) {
1729 zfree(pthread_zone_threadreq, req);
1730 }
1731 }
1732
1733 #pragma mark workqueue lock
1734
1735 static boolean_t workqueue_lock_spin_is_acquired_kdp(struct workqueue *wq) {
1736 return kdp_lck_spin_is_acquired(&wq->wq_lock);
1737 }
1738
1739 static void
1740 workqueue_lock_spin(struct workqueue *wq)
1741 {
1742 assert(ml_get_interrupts_enabled() == TRUE);
1743 lck_spin_lock(&wq->wq_lock);
1744 }
1745
1746 static bool
1747 workqueue_lock_try(struct workqueue *wq)
1748 {
1749 return lck_spin_try_lock(&wq->wq_lock);
1750 }
1751
1752 static void
1753 workqueue_unlock(struct workqueue *wq)
1754 {
1755 lck_spin_unlock(&wq->wq_lock);
1756 }
1757
1758 #pragma mark workqueue add timer
1759
1760 /**
1761 * Sets up the timer which will call out to workqueue_add_timer
1762 */
1763 static void
1764 workqueue_interval_timer_start(struct workqueue *wq)
1765 {
1766 uint64_t deadline;
1767
1768 /* n.b. wq_timer_interval is reset to 0 in workqueue_add_timer if the
1769 WQ_ATIMER_DELAYED_RUNNING flag is not present. The net effect here is that if a
1770 sequence of threads is required, we'll double the time before we give out
1771 the next one. */
1772 if (wq->wq_timer_interval == 0) {
1773 wq->wq_timer_interval = wq_stalled_window_usecs;
1774
1775 } else {
1776 wq->wq_timer_interval = wq->wq_timer_interval * 2;
1777
1778 if (wq->wq_timer_interval > wq_max_timer_interval_usecs) {
1779 wq->wq_timer_interval = wq_max_timer_interval_usecs;
1780 }
1781 }
1782 clock_interval_to_deadline(wq->wq_timer_interval, 1000, &deadline);
1783
1784 PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1785 _wq_flags(wq), wq->wq_timer_interval, 0);
1786
1787 thread_call_t call = wq->wq_atimer_delayed_call;
1788 if (thread_call_enter1_delayed(call, call, deadline)) {
1789 panic("delayed_call was already enqueued");
1790 }
1791 }
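
/*
 * Illustrative sketch (not compiled): the back-off behaviour of the delayed
 * add-timer interval computed above.  It starts at wq_stalled_window_usecs,
 * doubles on every re-arm, and is capped at wq_max_timer_interval_usecs;
 * workqueue_add_timer() resets it to 0 once the timer stops being marked
 * running.  The helper name below is hypothetical.
 */
#if 0
static uint32_t
wq_next_timer_interval_usecs(uint32_t current_interval)
{
	if (current_interval == 0) {
		/* first arm: wait one stall window */
		return wq_stalled_window_usecs;
	}
	if (current_interval > wq_max_timer_interval_usecs / 2) {
		/* doubling would overshoot the cap */
		return wq_max_timer_interval_usecs;
	}
	/* exponential back-off between arms */
	return current_interval * 2;
}
#endif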
1792
1793 /**
1794 * Immediately trigger the workqueue_add_timer
1795 */
1796 static void
1797 workqueue_interval_timer_trigger(struct workqueue *wq)
1798 {
1799 PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1800 _wq_flags(wq), 0, 0);
1801
1802 thread_call_t call = wq->wq_atimer_immediate_call;
1803 if (thread_call_enter1(call, call)) {
1804 panic("immediate_call was already enqueued");
1805 }
1806 }
1807
1808 /**
1809 * returns whether lastblocked_tsp is within wq_stalled_window_usecs of cur_ts
1810 */
1811 static boolean_t
1812 wq_thread_is_busy(uint64_t cur_ts, _Atomic uint64_t *lastblocked_tsp)
1813 {
1814 clock_sec_t secs;
1815 clock_usec_t usecs;
1816 uint64_t lastblocked_ts;
1817 uint64_t elapsed;
1818
1819 lastblocked_ts = atomic_load_explicit(lastblocked_tsp, memory_order_relaxed);
1820 if (lastblocked_ts >= cur_ts) {
1821 /*
1822 * because the update of the timestamp when a thread blocks isn't
1823 * serialized against us looking at it (i.e. we don't hold the workq lock),
1824 * it's possible to have a timestamp that matches the current time or
1825 * that even looks to be in the future relative to when we grabbed the current
1826 * time... just treat this as a busy thread since it must have just blocked.
1827 */
1828 return (TRUE);
1829 }
1830 elapsed = cur_ts - lastblocked_ts;
1831
1832 pthread_kern->absolutetime_to_microtime(elapsed, &secs, &usecs);
1833
1834 return (secs == 0 && usecs < wq_stalled_window_usecs);
1835 }
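
/*
 * Sketch (not compiled) of how a caller can use the check above: a bucket
 * counts as "busy" when its most recent blocker did so within the stall
 * window, which is how the aggregation code charges a busycount against the
 * concurrency budget.  The helper name is hypothetical.
 */
#if 0
static boolean_t
wq_bucket_recently_blocked(struct workqueue *wq, uint32_t bucket)
{
	/* compare the per-bucket last-blocked timestamp against "now" */
	return wq_thread_is_busy(mach_absolute_time(),
			&wq->wq_lastblocked_ts[bucket]);
}
#endif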
1836
1837 /**
1838 * handler function for the timer
1839 */
1840 static void
1841 workqueue_add_timer(struct workqueue *wq, thread_call_t thread_call_self)
1842 {
1843 proc_t p = wq->wq_proc;
1844
1845 workqueue_lock_spin(wq);
1846
1847 PTHREAD_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_START, wq,
1848 _wq_flags(wq), wq->wq_nthreads, wq->wq_thidlecount, 0);
1849
1850 /*
1851 * There's two tricky issues here.
1852 *
1853 * First issue: we start the thread_calls that invoke this routine without
1854 * the workqueue lock held. The scheduler callback needs to trigger
1855 * reevaluation of the number of running threads but shouldn't take that
1856 * lock, so we can't use it to synchronize state around the thread_call.
1857 * As a result, it might re-enter the thread_call while this routine is
1858 * already running. This could cause it to fire a second time and we'll
1859 * have two add_timers running at once. Obviously, we don't want that to
1860 * keep stacking, so we need to keep it at two timers.
1861 *
1862 * Solution: use wq_flags (accessed via atomic CAS) to synchronize the
1863 * enqueue of the thread_call itself. When a thread needs to trigger the
1864 * add_timer, it checks for ATIMER_DELAYED_RUNNING and, when not set, sets
1865 * the flag then does a thread_call_enter. We'll then remove that flag
1866 * only once we've got the lock and it's safe for the thread_call to be
1867 * entered again.
1868 *
1869 * Second issue: we need to make sure that the two timers don't execute this
1870 * routine concurrently. We can't use the workqueue lock for this because
1871 * we'll need to drop it during our execution.
1872 *
1873 * Solution: use WQL_ATIMER_BUSY as a condition variable to indicate that
1874 * we are currently executing the routine and the next thread should wait.
1875 *
1876 * After all that, we arrive at the following four possible states:
1877 * !WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY no pending timer, no active timer
1878 * !WQ_ATIMER_DELAYED_RUNNING && WQL_ATIMER_BUSY no pending timer, 1 active timer
1879 * WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY 1 pending timer, no active timer
1880 * WQ_ATIMER_DELAYED_RUNNING && WQL_ATIMER_BUSY 1 pending timer, 1 active timer
1881 *
1882 * A further complication: sometimes we need to trigger this function to run
1883 * without delay. Because we aren't under a lock between setting
1884 * WQ_ATIMER_DELAYED_RUNNING and calling thread_call_enter, we can't simply
1885 * re-enter the thread call: if thread_call_enter() returned false, we
1886 * wouldn't be able to distinguish the case where the thread_call had
1887 * already fired from the case where it hadn't been entered yet from the
1888 * other thread. So, we use a separate thread_call for immediate
1889 * invocations, and a separate RUNNING flag, WQ_ATIMER_IMMEDIATE_RUNNING.
1890 */
1891
1892 while (wq->wq_lflags & WQL_ATIMER_BUSY) {
1893 wq->wq_lflags |= WQL_ATIMER_WAITING;
1894
1895 assert_wait((caddr_t)wq, (THREAD_UNINT));
1896 workqueue_unlock(wq);
1897
1898 thread_block(THREAD_CONTINUE_NULL);
1899
1900 workqueue_lock_spin(wq);
1901 }
1902 /*
1903 * Mark ourselves busy so that _workqueue_mark_exiting() waits for us to finish
1904 */
1905 wq->wq_lflags |= WQL_ATIMER_BUSY;
1906
1907 /*
1908 * Decide which timer we are and remove the RUNNING flag.
1909 */
1910 if (thread_call_self == wq->wq_atimer_delayed_call) {
1911 uint64_t wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_DELAYED_RUNNING);
1912 if ((wq_flags & WQ_ATIMER_DELAYED_RUNNING) == 0) {
1913 panic("workqueue_add_timer(delayed) w/o WQ_ATIMER_DELAYED_RUNNING");
1914 }
1915 } else if (thread_call_self == wq->wq_atimer_immediate_call) {
1916 uint64_t wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_IMMEDIATE_RUNNING);
1917 if ((wq_flags & WQ_ATIMER_IMMEDIATE_RUNNING) == 0) {
1918 panic("workqueue_add_timer(immediate) w/o WQ_ATIMER_IMMEDIATE_RUNNING");
1919 }
1920 } else {
1921 panic("workqueue_add_timer can't figure out which timer it is");
1922 }
1923
1924 int ret = WQ_RUN_TR_THREAD_STARTED;
1925 while (ret == WQ_RUN_TR_THREAD_STARTED && wq->wq_reqcount) {
1926 ret = workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
1927
1928 workqueue_lock_spin(wq);
1929 }
1930 _threadreq_copy_prepare(wq);
1931
1932 /*
1933 * If we called WQ_TIMER_NEEDED above, then this flag will be set if that
1934 * call marked the timer running. If so, we let the timer interval grow.
1935 * Otherwise, we reset it back to 0.
1936 */
1937 uint32_t wq_flags = _wq_flags(wq);
1938 if (!(wq_flags & WQ_ATIMER_DELAYED_RUNNING)) {
1939 wq->wq_timer_interval = 0;
1940 }
1941
1942 wq->wq_lflags &= ~WQL_ATIMER_BUSY;
1943
1944 if ((wq_flags & WQ_EXITING) || (wq->wq_lflags & WQL_ATIMER_WAITING)) {
1945 /*
1946 * wakeup the thread hung up in _workqueue_mark_exiting or
1947 * workqueue_add_timer waiting for this timer to finish getting out of
1948 * the way
1949 */
1950 wq->wq_lflags &= ~WQL_ATIMER_WAITING;
1951 wakeup(wq);
1952 }
1953
1954 PTHREAD_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_END, wq, 0, wq->wq_nthreads, wq->wq_thidlecount, 0);
1955
1956 workqueue_unlock(wq);
1957 }
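
/*
 * Illustrative sketch (not compiled) of the arming protocol described in the
 * comment at the top of workqueue_add_timer(): a thread that wants the
 * delayed timer atomically sets WQ_ATIMER_DELAYED_RUNNING (skipping if
 * WQ_EXITING or the flag is already set) and only the winner enqueues the
 * thread_call.  The helper name, the wq_flags field access and the exact
 * atomics below are assumptions; the real check lives behind the
 * WQ_TIMER_DELAYED_NEEDED() macro.
 */
#if 0
static bool
wq_try_arm_delayed_timer(struct workqueue *wq)
{
	uint32_t old_flags = _wq_flags(wq);
	do {
		if (old_flags & (WQ_EXITING | WQ_ATIMER_DELAYED_RUNNING)) {
			/* exiting, or someone else already armed the timer */
			return false;
		}
	} while (!atomic_compare_exchange_weak(&wq->wq_flags, &old_flags,
			old_flags | WQ_ATIMER_DELAYED_RUNNING));

	/* we won the race: actually enqueue the thread_call */
	workqueue_interval_timer_start(wq);
	return true;
}
#endif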
1958
1959 #pragma mark thread state tracking
1960
1961 // called by spinlock code when trying to yield to lock owner
1962 void
1963 _workqueue_thread_yielded(void)
1964 {
1965 }
1966
1967 static void
1968 workqueue_callback(int type, thread_t thread)
1969 {
1970 struct uthread *uth = pthread_kern->get_bsdthread_info(thread);
1971 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
1972 struct workqueue *wq = tl->th_workq;
1973 uint32_t old_count, req_qos, qos = tl->th_priority;
1974 wq_thactive_t old_thactive;
1975
1976 switch (type) {
1977 case SCHED_CALL_BLOCK: {
1978 bool start_timer = false;
1979
1980 old_thactive = _wq_thactive_dec(wq, tl->th_priority);
1981 req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
1982 old_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
1983 qos, NULL, NULL);
1984
1985 if (old_count == wq_max_concurrency[tl->th_priority]) {
1986 /*
1987 * The number of active threads at this priority has fallen below
1988 * the maximum number of concurrent threads that are allowed to run
1989 *
1990 * if we collide with another thread trying to update the
1991 * last_blocked (really unlikely since another thread would have to
1992 * get scheduled and then block after we start down this path), it's
1993 * not a problem. Either timestamp is adequate, so no need to retry
1994 */
1995 atomic_store_explicit(&wq->wq_lastblocked_ts[qos],
1996 mach_absolute_time(), memory_order_relaxed);
1997 }
1998
1999 if (req_qos == WORKQUEUE_EVENT_MANAGER_BUCKET || qos > req_qos) {
2000 /*
2001 * The blocking thread is at a lower QoS than the highest currently
2002 * pending constrained request, nothing has to be redriven
2003 */
2004 } else {
2005 uint32_t max_busycount, old_req_count;
2006 old_req_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
2007 req_qos, NULL, &max_busycount);
2008 /*
2009 * If it is possible that may_start_constrained_thread had refused
2010 * admission due to being over the max concurrency, we may need to
2011 * spin up a new thread.
2012 *
2013 * We take into account the maximum number of busy threads
2014 * that can affect may_start_constrained_thread as looking at the
2015 * actual number may_start_constrained_thread will see is racy.
2016 *
2017 * IOW at NCPU = 4, for IN (req_qos = 1), if the old req count is
2018 * between NCPU (4) and NCPU - 2 (2) we need to redrive.
2019 */
2020 if (wq_max_concurrency[req_qos] <= old_req_count + max_busycount &&
2021 old_req_count <= wq_max_concurrency[req_qos]) {
2022 if (WQ_TIMER_DELAYED_NEEDED(wq)) {
2023 start_timer = true;
2024 workqueue_interval_timer_start(wq);
2025 }
2026 }
2027 }
2028
2029 PTHREAD_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_START, wq,
2030 old_count - 1, qos | (req_qos << 8),
2031 wq->wq_reqcount << 1 | start_timer, 0);
2032 break;
2033 }
2034 case SCHED_CALL_UNBLOCK: {
2035 /*
2036 * we cannot take the workqueue_lock here...
2037 * an UNBLOCK can occur from a timer event which
2038 * is run from an interrupt context... if the workqueue_lock
2039 * is already held by this processor, we'll deadlock...
2040 * the thread lock for the thread being UNBLOCKED
2041 * is also held
2042 */
2043 old_thactive = _wq_thactive_inc(wq, qos);
2044 if (pthread_debug_tracing) {
2045 req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
2046 old_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
2047 qos, NULL, NULL);
2048 PTHREAD_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_END, wq,
2049 old_count + 1, qos | (req_qos << 8),
2050 wq->wq_threads_scheduled, 0);
2051 }
2052 break;
2053 }
2054 }
2055 }
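
/*
 * Worked example (sketch, not compiled) of the redrive window used in the
 * SCHED_CALL_BLOCK case above, assuming NCPU = 4 so
 * wq_max_concurrency[req_qos] == 4 and a max_busycount of 2: the check fires
 * for old request-level counts of 2, 3 or 4, i.e. anywhere a racy
 * may_start_constrained_thread() could have seen the bucket at or above its
 * limit.  The constants are assumptions for illustration only.
 */
#if 0
static bool
wq_should_redrive_example(uint32_t old_req_count)
{
	const uint32_t max_concurrency = 4;	/* assumed: NCPU = 4 */
	const uint32_t max_busycount = 2;	/* assumed for illustration */
	return max_concurrency <= old_req_count + max_busycount &&
			old_req_count <= max_concurrency;
}
#endif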
2056
2057 sched_call_t
2058 _workqueue_get_sched_callback(void)
2059 {
2060 return workqueue_callback;
2061 }
2062
2063 #pragma mark thread addition/removal
2064
2065 static mach_vm_size_t
2066 _workqueue_allocsize(struct workqueue *wq)
2067 {
2068 proc_t p = wq->wq_proc;
2069 mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);
2070 mach_vm_size_t pthread_size =
2071 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(wq->wq_map));
2072 return guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
2073 }
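
/*
 * Layout sketch (not compiled) of the region sized above, lowest address
 * first: one guard page, then the default stack, then the rounded pthread_t
 * area.  The offsets follow from the allocation math here and from the
 * mach_vm_protect() of the guard page in workqueue_addnewthread(); the
 * helper name is hypothetical.
 */
#if 0
static void
wq_stack_layout_example(struct workqueue *wq, mach_vm_offset_t stackaddr)
{
	mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);

	mach_vm_offset_t guard_lo = stackaddr;				/* VM_PROT_NONE */
	mach_vm_offset_t stack_lo = stackaddr + guardsize;		/* stack limit */
	mach_vm_offset_t stack_hi = stack_lo + PTH_DEFAULT_STACKSIZE;	/* initial sp grows down from here */

	/* the pthread_t / TSD area occupies the rounded pthread_size bytes above stack_hi */
	(void)guard_lo; (void)stack_hi;
}
#endif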
2074
2075 /**
2076 * pop goes the thread
2077 *
2078 * If fromexit is set, the call is from workqueue_exit(),
2079 * so some cleanups are to be avoided.
2080 */
2081 static void
2082 workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use)
2083 {
2084 struct uthread * uth;
2085 struct workqueue * wq = tl->th_workq;
2086
2087 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET){
2088 TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
2089 } else {
2090 TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
2091 }
2092
2093 if (fromexit == 0) {
2094 assert(wq->wq_nthreads && wq->wq_thidlecount);
2095 wq->wq_nthreads--;
2096 wq->wq_thidlecount--;
2097 }
2098
2099 /*
2100 * Clear the threadlist pointer in the uthread so that a
2101 * blocked thread, woken up for termination, will not
2102 * access the threadlist as it is about to be
2103 * freed.
2104 */
2105 pthread_kern->thread_sched_call(tl->th_thread, NULL);
2106
2107 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
2108 if (uth != (struct uthread *)0) {
2109 pthread_kern->uthread_set_threadlist(uth, NULL);
2110 }
2111 if (fromexit == 0) {
2112 /* during exit the lock is not held */
2113 workqueue_unlock(wq);
2114 }
2115
2116 if ( (tl->th_flags & TH_LIST_NEW) || first_use ) {
2117 /*
2118 * thread was created, but never used...
2119 * need to clean up the stack and port ourselves
2120 * since we're not going to spin up through the
2121 * normal exit path triggered from Libc
2122 */
2123 if (fromexit == 0) {
2124 /* vm map is already deallocated when this is called from exit */
2125 (void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, _workqueue_allocsize(wq));
2126 }
2127 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(wq->wq_task), tl->th_thport);
2128 }
2129 /*
2130 * drop our ref on the thread
2131 */
2132 thread_deallocate(tl->th_thread);
2133
2134 zfree(pthread_zone_threadlist, tl);
2135 }
2136
2137
2138 /**
2139 * Try to add a new workqueue thread.
2140 *
2141 * - called with workq lock held
2142 * - dropped and retaken around thread creation
2143 * - return with workq lock held
2144 */
2145 static bool
2146 workqueue_addnewthread(proc_t p, struct workqueue *wq)
2147 {
2148 kern_return_t kret;
2149
2150 wq->wq_nthreads++;
2151
2152 workqueue_unlock(wq);
2153
2154 struct threadlist *tl = zalloc(pthread_zone_threadlist);
2155 bzero(tl, sizeof(struct threadlist));
2156
2157 thread_t th;
2158 kret = pthread_kern->thread_create_workq_waiting(wq->wq_task, wq_unpark_continue, tl, &th);
2159 if (kret != KERN_SUCCESS) {
2160 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 0, 0, 0);
2161 goto fail_free;
2162 }
2163
2164 mach_vm_offset_t stackaddr = pthread_kern->proc_get_stack_addr_hint(p);
2165
2166 mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);
2167 mach_vm_size_t pthread_size =
2168 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(wq->wq_map));
2169 mach_vm_size_t th_allocsize = guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
2170
2171 kret = mach_vm_map(wq->wq_map, &stackaddr,
2172 th_allocsize, page_size-1,
2173 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE, NULL,
2174 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
2175 VM_INHERIT_DEFAULT);
2176
2177 if (kret != KERN_SUCCESS) {
2178 kret = mach_vm_allocate(wq->wq_map,
2179 &stackaddr, th_allocsize,
2180 VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE);
2181 }
2182
2183 if (kret != KERN_SUCCESS) {
2184 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 1, 0, 0);
2185 goto fail_terminate;
2186 }
2187
2188 /*
2189 * The guard page is at the lowest address
2190 * The stack base is the highest address
2191 */
2192 kret = mach_vm_protect(wq->wq_map, stackaddr, guardsize, FALSE, VM_PROT_NONE);
2193 if (kret != KERN_SUCCESS) {
2194 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 2, 0, 0);
2195 goto fail_vm_deallocate;
2196 }
2197
2198
2199 pthread_kern->thread_set_tag(th, THREAD_TAG_PTHREAD | THREAD_TAG_WORKQUEUE);
2200 pthread_kern->thread_static_param(th, TRUE);
2201
2202 /*
2203 * convert_thread_to_port() consumes a reference
2204 */
2205 thread_reference(th);
2206 void *sright = (void *)pthread_kern->convert_thread_to_port(th);
2207 tl->th_thport = pthread_kern->ipc_port_copyout_send(sright,
2208 pthread_kern->task_get_ipcspace(wq->wq_task));
2209
2210 tl->th_flags = TH_LIST_INITED | TH_LIST_NEW;
2211 tl->th_thread = th;
2212 tl->th_workq = wq;
2213 tl->th_stackaddr = stackaddr;
2214 tl->th_priority = WORKQUEUE_NUM_BUCKETS;
2215
2216 struct uthread *uth;
2217 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
2218
2219 workqueue_lock_spin(wq);
2220
2221 void *current_tl = pthread_kern->uthread_get_threadlist(uth);
2222 if (current_tl == NULL) {
2223 pthread_kern->uthread_set_threadlist(uth, tl);
2224 TAILQ_INSERT_TAIL(&wq->wq_thidlelist, tl, th_entry);
2225 wq->wq_thidlecount++;
2226 } else if (current_tl == WQ_THREADLIST_EXITING_POISON) {
2227 /*
2228 * Failed thread creation race: The thread already woke up and has exited.
2229 */
2230 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 3, 0, 0);
2231 goto fail_unlock;
2232 } else {
2233 panic("Unexpected initial threadlist value");
2234 }
2235
2236 PTHREAD_TRACE_WQ(TRACE_wq_thread_create | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
2237
2238 return (TRUE);
2239
2240 fail_unlock:
2241 workqueue_unlock(wq);
2242 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(wq->wq_task),
2243 tl->th_thport);
2244
2245 fail_vm_deallocate:
2246 (void) mach_vm_deallocate(wq->wq_map, stackaddr, th_allocsize);
2247
2248 fail_terminate:
2249 if (pthread_kern->thread_will_park_or_terminate) {
2250 pthread_kern->thread_will_park_or_terminate(th);
2251 }
2252 (void)thread_terminate(th);
2253 thread_deallocate(th);
2254
2255 fail_free:
2256 zfree(pthread_zone_threadlist, tl);
2257
2258 workqueue_lock_spin(wq);
2259 wq->wq_nthreads--;
2260
2261 return (FALSE);
2262 }
2263
2264 /**
2265 * Setup per-process state for the workqueue.
2266 */
2267 int
2268 _workq_open(struct proc *p, __unused int32_t *retval)
2269 {
2270 struct workqueue * wq;
2271 char * ptr;
2272 uint32_t num_cpus;
2273 int error = 0;
2274
2275 if (pthread_kern->proc_get_register(p) == 0) {
2276 return EINVAL;
2277 }
2278
2279 num_cpus = pthread_kern->ml_get_max_cpus();
2280
2281 if (wq_init_constrained_limit) {
2282 uint32_t limit;
2283 /*
2284 * Set up the limit for the constrained pool.
2285 * This is a virtual pool in that we don't
2286 * maintain it on separate idle and run lists.
2287 */
2288 limit = num_cpus * WORKQUEUE_CONSTRAINED_FACTOR;
2289
2290 if (limit > wq_max_constrained_threads)
2291 wq_max_constrained_threads = limit;
2292
2293 wq_init_constrained_limit = 0;
2294
2295 if (wq_max_threads > WQ_THACTIVE_BUCKET_HALF) {
2296 wq_max_threads = WQ_THACTIVE_BUCKET_HALF;
2297 }
2298 if (wq_max_threads > pthread_kern->config_thread_max - 20) {
2299 wq_max_threads = pthread_kern->config_thread_max - 20;
2300 }
2301 }
2302
2303 if (pthread_kern->proc_get_wqptr(p) == NULL) {
2304 if (pthread_kern->proc_init_wqptr_or_wait(p) == FALSE) {
2305 assert(pthread_kern->proc_get_wqptr(p) != NULL);
2306 goto out;
2307 }
2308
2309 ptr = (char *)zalloc(pthread_zone_workqueue);
2310 bzero(ptr, sizeof(struct workqueue));
2311
2312 wq = (struct workqueue *)ptr;
2313 wq->wq_proc = p;
2314 wq->wq_task = current_task();
2315 wq->wq_map = pthread_kern->current_map();
2316
2317 // Start the event manager at the priority hinted at by the policy engine
2318 int mgr_priority_hint = pthread_kern->task_get_default_manager_qos(current_task());
2319 wq->wq_event_manager_priority = (uint32_t)thread_qos_get_pthread_priority(mgr_priority_hint) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2320
2321 TAILQ_INIT(&wq->wq_thrunlist);
2322 TAILQ_INIT(&wq->wq_thidlelist);
2323 for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
2324 TAILQ_INIT(&wq->wq_overcommit_reqlist[i]);
2325 TAILQ_INIT(&wq->wq_reqlist[i]);
2326 }
2327
2328 wq->wq_atimer_delayed_call =
2329 thread_call_allocate_with_priority((thread_call_func_t)workqueue_add_timer,
2330 (thread_call_param_t)wq, THREAD_CALL_PRIORITY_KERNEL);
2331 wq->wq_atimer_immediate_call =
2332 thread_call_allocate_with_priority((thread_call_func_t)workqueue_add_timer,
2333 (thread_call_param_t)wq, THREAD_CALL_PRIORITY_KERNEL);
2334
2335 lck_spin_init(&wq->wq_lock, pthread_lck_grp, pthread_lck_attr);
2336
2337 wq->wq_cached_threadreq = zalloc(pthread_zone_threadreq);
2338 *(wq_thactive_t *)&wq->wq_thactive =
2339 (wq_thactive_t)WQ_THACTIVE_NO_PENDING_REQUEST <<
2340 WQ_THACTIVE_QOS_SHIFT;
2341
2342 pthread_kern->proc_set_wqptr(p, wq);
2343
2344 }
2345 out:
2346
2347 return(error);
2348 }
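
/*
 * Sketch (not compiled) of the one-time limits computed above: the
 * constrained-pool cap is only ever raised to num_cpus *
 * WORKQUEUE_CONSTRAINED_FACTOR, and the overall thread cap is clamped by
 * both the thactive encoding (WQ_THACTIVE_BUCKET_HALF) and the global
 * thread limit with 20 threads of headroom.  The helper name is
 * hypothetical.
 */
#if 0
static uint32_t
wq_effective_max_threads(uint32_t requested, uint32_t config_thread_max)
{
	uint32_t max = requested;
	if (max > WQ_THACTIVE_BUCKET_HALF) {
		max = WQ_THACTIVE_BUCKET_HALF;	/* must fit the thactive counters */
	}
	if (max > config_thread_max - 20) {
		max = config_thread_max - 20;	/* leave headroom for other threads */
	}
	return max;
}
#endif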
2349
2350 /*
2351 * Routine: workqueue_mark_exiting
2352 *
2353 * Function: Mark the work queue such that new threads will not be added to the
2354 * work queue after we return.
2355 *
2356 * Conditions: Called against the current process.
2357 */
2358 void
2359 _workqueue_mark_exiting(struct proc *p)
2360 {
2361 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
2362 if (!wq) return;
2363
2364 PTHREAD_TRACE_WQ(TRACE_wq_pthread_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);
2365
2366 workqueue_lock_spin(wq);
2367
2368 /*
2369 * We arm the add timer without holding the workqueue lock so we need
2370 * to synchronize with any running or soon to be running timers.
2371 *
2372 * Threads that intend to arm the timer atomically OR
2373 * WQ_ATIMER_{DELAYED,IMMEDIATE}_RUNNING into the wq_flags, only if
2374 * WQ_EXITING is not present. So, once we have set WQ_EXITING, we can
2375 * be sure that no new RUNNING flags will be set, but still need to
2376 * wait for the already running timers to complete.
2377 *
2378 * We always hold the workq lock when dropping WQ_ATIMER_RUNNING, so
2379 * the check for and sleep until clear is protected.
2380 */
2381 uint64_t wq_flags = _wq_flags_or_orig(wq, WQ_EXITING);
2382
2383 if (wq_flags & WQ_ATIMER_DELAYED_RUNNING) {
2384 if (thread_call_cancel(wq->wq_atimer_delayed_call) == TRUE) {
2385 wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_DELAYED_RUNNING);
2386 }
2387 }
2388 if (wq_flags & WQ_ATIMER_IMMEDIATE_RUNNING) {
2389 if (thread_call_cancel(wq->wq_atimer_immediate_call) == TRUE) {
2390 wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_IMMEDIATE_RUNNING);
2391 }
2392 }
2393 while ((_wq_flags(wq) & (WQ_ATIMER_DELAYED_RUNNING | WQ_ATIMER_IMMEDIATE_RUNNING)) ||
2394 (wq->wq_lflags & WQL_ATIMER_BUSY)) {
2395 assert_wait((caddr_t)wq, (THREAD_UNINT));
2396 workqueue_unlock(wq);
2397
2398 thread_block(THREAD_CONTINUE_NULL);
2399
2400 workqueue_lock_spin(wq);
2401 }
2402
2403 /*
2404 * Save off pending requests, will complete/free them below after unlocking
2405 */
2406 TAILQ_HEAD(, threadreq) local_list = TAILQ_HEAD_INITIALIZER(local_list);
2407
2408 for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
2409 TAILQ_CONCAT(&local_list, &wq->wq_overcommit_reqlist[i], tr_entry);
2410 TAILQ_CONCAT(&local_list, &wq->wq_reqlist[i], tr_entry);
2411 }
2412
2413 /*
2414 * XXX: Can't do a deferred cancel of the event manager request, so just smash it.
2415 */
2416 assert((wq->wq_event_manager_threadreq.tr_flags & TR_FLAG_WORKLOOP) == 0);
2417 wq->wq_event_manager_threadreq.tr_state = TR_STATE_DEAD;
2418
2419 workqueue_unlock(wq);
2420
2421 struct threadreq *tr, *tr_temp;
2422 TAILQ_FOREACH_SAFE(tr, &local_list, tr_entry, tr_temp) {
2423 _threadreq_cancel(wq, tr);
2424 }
2425 PTHREAD_TRACE(TRACE_wq_pthread_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
2426 }
2427
2428 /*
2429 * Routine: workqueue_exit
2430 *
2431 * Function: clean up the work queue structure(s) now that there are no threads
2432 * left running inside the work queue (except possibly current_thread).
2433 *
2434 * Conditions: Called by the last thread in the process.
2435 * Called against current process.
2436 */
2437 void
2438 _workqueue_exit(struct proc *p)
2439 {
2440 struct workqueue * wq;
2441 struct threadlist * tl, *tlist;
2442 struct uthread *uth;
2443
2444 wq = pthread_kern->proc_get_wqptr(p);
2445 if (wq != NULL) {
2446
2447 PTHREAD_TRACE_WQ(TRACE_wq_workqueue_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);
2448
2449 pthread_kern->proc_set_wqptr(p, NULL);
2450
2451 /*
2452 * Clean up workqueue data structures for threads that exited and
2453 * didn't get a chance to clean up after themselves.
2454 */
2455 TAILQ_FOREACH_SAFE(tl, &wq->wq_thrunlist, th_entry, tlist) {
2456 assert((tl->th_flags & TH_LIST_RUNNING) != 0);
2457
2458 pthread_kern->thread_sched_call(tl->th_thread, NULL);
2459
2460 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
2461 if (uth != (struct uthread *)0) {
2462 pthread_kern->uthread_set_threadlist(uth, NULL);
2463 }
2464 TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
2465
2466 /*
2467 * drop our last ref on the thread
2468 */
2469 thread_deallocate(tl->th_thread);
2470
2471 zfree(pthread_zone_threadlist, tl);
2472 }
2473 TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlelist, th_entry, tlist) {
2474 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
2475 assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
2476 workqueue_removethread(tl, true, false);
2477 }
2478 TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlemgrlist, th_entry, tlist) {
2479 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
2480 assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
2481 workqueue_removethread(tl, true, false);
2482 }
2483 if (wq->wq_cached_threadreq) {
2484 zfree(pthread_zone_threadreq, wq->wq_cached_threadreq);
2485 }
2486 thread_call_free(wq->wq_atimer_delayed_call);
2487 thread_call_free(wq->wq_atimer_immediate_call);
2488 lck_spin_destroy(&wq->wq_lock, pthread_lck_grp);
2489
2490 for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
2491 assert(TAILQ_EMPTY(&wq->wq_overcommit_reqlist[i]));
2492 assert(TAILQ_EMPTY(&wq->wq_reqlist[i]));
2493 }
2494
2495 zfree(pthread_zone_workqueue, wq);
2496
2497 PTHREAD_TRACE(TRACE_wq_workqueue_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
2498 }
2499 }
2500
2501
2502 #pragma mark workqueue thread manipulation
2503
2504
2505 /**
2506 * Entry point for libdispatch to ask for threads
2507 */
2508 static int
2509 wqops_queue_reqthreads(struct proc *p, int reqcount,
2510 pthread_priority_t priority)
2511 {
2512 bool overcommit = _pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
2513 bool event_manager = _pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2514 int class = event_manager ? WORKQUEUE_EVENT_MANAGER_BUCKET :
2515 pthread_priority_get_class_index(priority);
2516
2517 if ((reqcount <= 0) || (class < 0) || (class >= WORKQUEUE_NUM_BUCKETS) ||
2518 (overcommit && event_manager)) {
2519 return EINVAL;
2520 }
2521
2522 struct workqueue *wq;
2523 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2524 return EINVAL;
2525 }
2526
2527 workqueue_lock_spin(wq);
2528 _threadreq_copy_prepare(wq);
2529
2530 PTHREAD_TRACE_WQ(TRACE_wq_wqops_reqthreads | DBG_FUNC_NONE, wq, reqcount, priority, 0, 0);
2531
2532 int tr_flags = 0;
2533 if (overcommit) tr_flags |= TR_FLAG_OVERCOMMIT;
2534 if (reqcount > 1) {
2535 /*
2536 * When libdispatch asks for more than one thread, it wants to achieve
2537 * parallelism. Pacing would be detrimental to that goal, so treat
2538 * these requests specially and skip the pacing admission check.
2539 */
2540 tr_flags |= TR_FLAG_NO_PACING;
2541 }
2542
2543 while (reqcount-- && !_wq_exiting(wq)) {
2544 struct threadreq req;
2545 _threadreq_init_stack(&req, class, tr_flags);
2546
2547 workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, true);
2548
2549 workqueue_lock_spin(wq); /* reacquire */
2550 _threadreq_copy_prepare(wq);
2551 }
2552
2553 workqueue_unlock(wq);
2554
2555 return 0;
2556 }
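
/*
 * Userspace-side sketch (not compiled here) of how this entry point is
 * reached: libdispatch issues the workq_kernreturn syscall with
 * WQOPS_QUEUE_REQTHREADS, a thread count in arg2 and a pthread_priority_t
 * in arg3 (see _workq_kernreturn below).  The wrapper name and exact
 * signature are assumptions for illustration.
 */
#if 0
extern int __workq_kernreturn(int options, void *item, int arg2, int arg3);

static void
request_two_overcommit_threads(pthread_priority_t pri)
{
	/* ask the kernel for two overcommit threads at the given priority */
	__workq_kernreturn(WQOPS_QUEUE_REQTHREADS, NULL, 2,
			(int)(pri | _PTHREAD_PRIORITY_OVERCOMMIT_FLAG));
}
#endif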
2557
2558 /*
2559 * Used by the kevent system to request threads.
2560 *
2561 * Currently count is ignored and we always return one thread per invocation.
2562 */
2563 static thread_t
2564 _workq_kevent_reqthreads(struct proc *p, pthread_priority_t priority,
2565 bool no_emergency)
2566 {
2567 int wq_run_tr = WQ_RUN_TR_THROTTLED;
2568 bool emergency_thread = false;
2569 struct threadreq req;
2570
2571
2572 struct workqueue *wq;
2573 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2574 return THREAD_NULL;
2575 }
2576
2577 int class = pthread_priority_get_class_index(priority);
2578
2579 workqueue_lock_spin(wq);
2580 bool has_threadreq = _threadreq_copy_prepare_noblock(wq);
2581
2582 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, NULL, priority, 0, 0);
2583
2584 /*
2585 * Skip straight to event manager if that's what was requested
2586 */
2587 if ((_pthread_priority_get_qos_newest(priority) == QOS_CLASS_UNSPECIFIED) ||
2588 (_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)){
2589 goto event_manager;
2590 }
2591
2592 bool will_pace = _wq_should_pace_priority(wq, class);
2593 if ((wq->wq_thidlecount == 0 || will_pace) && has_threadreq == false) {
2594 /*
2595 * We'll need to persist the request and can't, so return the emergency
2596 * thread instead, which has a persistent request object.
2597 */
2598 emergency_thread = true;
2599 goto event_manager;
2600 }
2601
2602 /*
2603 * Handle overcommit requests
2604 */
2605 if ((_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0){
2606 _threadreq_init_stack(&req, class, TR_FLAG_KEVENT | TR_FLAG_OVERCOMMIT);
2607 wq_run_tr = workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, false);
2608 goto done;
2609 }
2610
2611 /*
2612 * Handle constrained requests
2613 */
2614 boolean_t may_start = may_start_constrained_thread(wq, class, NULL, false);
2615 if (may_start || no_emergency) {
2616 _threadreq_init_stack(&req, class, TR_FLAG_KEVENT);
2617 wq_run_tr = workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, false);
2618 goto done;
2619 } else {
2620 emergency_thread = true;
2621 }
2622
2623
2624 event_manager:
2625 _threadreq_init_stack(&req, WORKQUEUE_EVENT_MANAGER_BUCKET, TR_FLAG_KEVENT);
2626 wq_run_tr = workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, false);
2627
2628 done:
2629 if (wq_run_tr == WQ_RUN_TR_THREAD_NEEDED && WQ_TIMER_IMMEDIATE_NEEDED(wq)) {
2630 workqueue_interval_timer_trigger(wq);
2631 }
2632 return emergency_thread ? (void*)-1 : 0;
2633 }
2634
2635 thread_t
2636 _workq_reqthreads(struct proc *p, __assert_only int requests_count,
2637 workq_reqthreads_req_t request)
2638 {
2639 assert(requests_count == 1);
2640
2641 pthread_priority_t priority = request->priority;
2642 bool no_emergency = request->count & WORKQ_REQTHREADS_NOEMERGENCY;
2643
2644 return _workq_kevent_reqthreads(p, priority, no_emergency);
2645 }
2646
2647
2648 int
2649 workq_kern_threadreq(struct proc *p, workq_threadreq_t _req,
2650 enum workq_threadreq_type type, unsigned long priority, int flags)
2651 {
2652 struct workqueue *wq;
2653 int ret;
2654
2655 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2656 return EINVAL;
2657 }
2658
2659 switch (type) {
2660 case WORKQ_THREADREQ_KEVENT: {
2661 bool no_emergency = flags & WORKQ_THREADREQ_FLAG_NOEMERGENCY;
2662 (void)_workq_kevent_reqthreads(p, priority, no_emergency);
2663 return 0;
2664 }
2665 case WORKQ_THREADREQ_WORKLOOP:
2666 case WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL: {
2667 struct threadreq *req = (struct threadreq *)_req;
2668 int req_class = pthread_priority_get_class_index(priority);
2669 int req_flags = TR_FLAG_WORKLOOP;
2670 if ((_pthread_priority_get_flags(priority) &
2671 _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0){
2672 req_flags |= TR_FLAG_OVERCOMMIT;
2673 }
2674
2675 thread_t thread = current_thread();
2676 struct threadlist *tl = util_get_thread_threadlist_entry(thread);
2677
2678 if (tl && tl != WQ_THREADLIST_EXITING_POISON &&
2679 (tl->th_flags & TH_LIST_UNBINDING)) {
2680 /*
2681 * We're called back synchronously from the context of
2682 * kevent_qos_internal_unbind() from within wqops_thread_return(),
2683 * so we can try to match up this thread with this request!
2684 */
2685 } else {
2686 tl = NULL;
2687 }
2688
2689 _threadreq_init_alloced(req, req_class, req_flags);
2690 workqueue_lock_spin(wq);
2691 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, req, priority, 1, 0);
2692 ret = workqueue_run_threadreq_and_unlock(p, wq, tl, req, false);
2693 if (ret == WQ_RUN_TR_EXITING) {
2694 return ECANCELED;
2695 }
2696 if (ret == WQ_RUN_TR_THREAD_NEEDED) {
2697 if (type == WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL) {
2698 return EAGAIN;
2699 }
2700 if (WQ_TIMER_IMMEDIATE_NEEDED(wq)) {
2701 workqueue_interval_timer_trigger(wq);
2702 }
2703 }
2704 return 0;
2705 }
2706 case WORKQ_THREADREQ_REDRIVE:
2707 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, 0, 0, 4, 0);
2708 workqueue_lock_spin(wq);
2709 ret = workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
2710 if (ret == WQ_RUN_TR_EXITING) {
2711 return ECANCELED;
2712 }
2713 return 0;
2714 default:
2715 return ENOTSUP;
2716 }
2717 }
2718
2719 int
2720 workq_kern_threadreq_modify(struct proc *p, workq_threadreq_t _req,
2721 enum workq_threadreq_op operation, unsigned long arg1,
2722 unsigned long __unused arg2)
2723 {
2724 struct threadreq *req = (struct threadreq *)_req;
2725 struct workqueue *wq;
2726 int priclass, ret = 0, wq_tr_rc = WQ_RUN_TR_THROTTLED;
2727
2728 if (req == NULL || (wq = pthread_kern->proc_get_wqptr(p)) == NULL) {
2729 return EINVAL;
2730 }
2731
2732 workqueue_lock_spin(wq);
2733
2734 if (_wq_exiting(wq)) {
2735 ret = ECANCELED;
2736 goto out_unlock;
2737 }
2738
2739 /*
2740 * Find/validate the referenced request structure
2741 */
2742 if (req->tr_state != TR_STATE_WAITING) {
2743 ret = EINVAL;
2744 goto out_unlock;
2745 }
2746 assert(req->tr_priority < WORKQUEUE_EVENT_MANAGER_BUCKET);
2747 assert(req->tr_flags & TR_FLAG_WORKLOOP);
2748
2749 switch (operation) {
2750 case WORKQ_THREADREQ_CHANGE_PRI:
2751 case WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL:
2752 priclass = pthread_priority_get_class_index(arg1);
2753 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, req, arg1, 2, 0);
2754 if (req->tr_priority == priclass) {
2755 goto out_unlock;
2756 }
2757 _threadreq_dequeue(wq, req);
2758 req->tr_priority = priclass;
2759 req->tr_state = TR_STATE_NEW; // what was old is new again
2760 wq_tr_rc = workqueue_run_threadreq_and_unlock(p, wq, NULL, req, false);
2761 goto out;
2762
2763 case WORKQ_THREADREQ_CANCEL:
2764 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, req, 0, 3, 0);
2765 _threadreq_dequeue(wq, req);
2766 req->tr_state = TR_STATE_DEAD;
2767 break;
2768
2769 default:
2770 ret = ENOTSUP;
2771 break;
2772 }
2773
2774 out_unlock:
2775 workqueue_unlock(wq);
2776 out:
2777 if (wq_tr_rc == WQ_RUN_TR_THREAD_NEEDED) {
2778 if (operation == WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL) {
2779 ret = EAGAIN;
2780 } else if (WQ_TIMER_IMMEDIATE_NEEDED(wq)) {
2781 workqueue_interval_timer_trigger(wq);
2782 }
2783 }
2784 return ret;
2785 }
2786
2787
2788 static int
2789 wqops_thread_return(struct proc *p, struct workqueue *wq)
2790 {
2791 thread_t th = current_thread();
2792 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
2793 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
2794
2795 /* reset signal mask on the workqueue thread to default state */
2796 if (pthread_kern->uthread_get_sigmask(uth) != (sigset_t)(~workq_threadmask)) {
2797 pthread_kern->proc_lock(p);
2798 pthread_kern->uthread_set_sigmask(uth, ~workq_threadmask);
2799 pthread_kern->proc_unlock(p);
2800 }
2801
2802 if (wq == NULL || !tl) {
2803 return EINVAL;
2804 }
2805
2806 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_START, tl->th_workq, 0, 0, 0, 0);
2807
2808 /*
2809 * This squash call has neat semantics: it removes the specified overrides,
2810 * replacing the current requested QoS with the previous effective QoS from
2811 * those overrides. This means we won't be preempted due to having our QoS
2812 * lowered. Of course, now our understanding of the thread's QoS is wrong,
2813 * so we'll adjust below.
2814 */
2815 bool was_manager = (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
2816 int new_qos;
2817
2818 if (!was_manager) {
2819 new_qos = pthread_kern->proc_usynch_thread_qos_squash_override_for_resource(th,
2820 THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD,
2821 THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE);
2822 }
2823
2824 PTHREAD_TRACE_WQ(TRACE_wq_runitem | DBG_FUNC_END, wq, tl->th_priority, 0, 0, 0);
2825
2826 workqueue_lock_spin(wq);
2827
2828 if (tl->th_flags & TH_LIST_KEVENT_BOUND) {
2829 unsigned int flags = KEVENT_FLAG_WORKQ;
2830 if (was_manager) {
2831 flags |= KEVENT_FLAG_WORKQ_MANAGER;
2832 }
2833
2834 tl->th_flags |= TH_LIST_UNBINDING;
2835 workqueue_unlock(wq);
2836 kevent_qos_internal_unbind(p, class_index_get_thread_qos(tl->th_priority), th, flags);
2837 if (!(tl->th_flags & TH_LIST_UNBINDING)) {
2838 _setup_wqthread(p, th, wq, tl, WQ_SETUP_CLEAR_VOUCHER);
2839 pthread_kern->unix_syscall_return(EJUSTRETURN);
2840 __builtin_unreachable();
2841 }
2842 workqueue_lock_spin(wq);
2843 tl->th_flags &= ~(TH_LIST_KEVENT_BOUND | TH_LIST_UNBINDING);
2844 }
2845
2846 if (!was_manager) {
2847 /* Fix up counters from the squash operation. */
2848 uint8_t old_bucket = tl->th_priority;
2849 uint8_t new_bucket = thread_qos_get_class_index(new_qos);
2850
2851 if (old_bucket != new_bucket) {
2852 _wq_thactive_move(wq, old_bucket, new_bucket);
2853 wq->wq_thscheduled_count[old_bucket]--;
2854 wq->wq_thscheduled_count[new_bucket]++;
2855
2856 PTHREAD_TRACE_WQ(TRACE_wq_thread_squash | DBG_FUNC_NONE, wq, tl->th_priority, new_bucket, 0, 0);
2857 tl->th_priority = new_bucket;
2858 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_END, tl->th_workq, new_qos, 0, 0, 0);
2859 }
2860 }
2861
2862 workqueue_run_threadreq_and_unlock(p, wq, tl, NULL, false);
2863 return 0;
2864 }
2865
2866 /**
2867 * Multiplexed call to interact with the workqueue mechanism
2868 */
2869 int
2870 _workq_kernreturn(struct proc *p,
2871 int options,
2872 user_addr_t item,
2873 int arg2,
2874 int arg3,
2875 int32_t *retval)
2876 {
2877 struct workqueue *wq;
2878 int error = 0;
2879
2880 if (pthread_kern->proc_get_register(p) == 0) {
2881 return EINVAL;
2882 }
2883
2884 switch (options) {
2885 case WQOPS_QUEUE_NEWSPISUPP: {
2886 /*
2887 * arg2 = offset of serialno into dispatch queue
2888 * arg3 = kevent support
2889 */
2890 int offset = arg2;
2891 if (arg3 & 0x01){
2892 // If we get here, then userspace has indicated support for kevent delivery.
2893 }
2894
2895 pthread_kern->proc_set_dispatchqueue_serialno_offset(p, (uint64_t)offset);
2896 break;
2897 }
2898 case WQOPS_QUEUE_REQTHREADS: {
2899 /*
2900 * arg2 = number of threads to start
2901 * arg3 = priority
2902 */
2903 error = wqops_queue_reqthreads(p, arg2, arg3);
2904 break;
2905 }
2906 case WQOPS_SET_EVENT_MANAGER_PRIORITY: {
2907 /*
2908 * arg2 = priority for the manager thread
2909 *
2910 * if _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG is set, the
2911 * ~_PTHREAD_PRIORITY_FLAGS_MASK contains a scheduling priority instead
2912 * of a QOS value
2913 */
2914 pthread_priority_t pri = arg2;
2915
2916 wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2917 if (wq == NULL) {
2918 error = EINVAL;
2919 break;
2920 }
2921 workqueue_lock_spin(wq);
2922 if (pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){
2923 /*
2924 * If userspace passes a scheduling priority, that takes precedence
2925 * over any QoS. (So, userspace should take care not to accidentally
2926 * lower the priority this way.)
2927 */
2928 uint32_t sched_pri = pri & _PTHREAD_PRIORITY_SCHED_PRI_MASK;
2929 if (wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){
2930 wq->wq_event_manager_priority = MAX(sched_pri, wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_MASK)
2931 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2932 } else {
2933 wq->wq_event_manager_priority = sched_pri
2934 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2935 }
2936 } else if ((wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){
2937 int cur_qos = pthread_priority_get_thread_qos(wq->wq_event_manager_priority);
2938 int new_qos = pthread_priority_get_thread_qos(pri);
2939 wq->wq_event_manager_priority = (uint32_t)thread_qos_get_pthread_priority(MAX(cur_qos, new_qos)) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2940 }
2941 workqueue_unlock(wq);
2942 break;
2943 }
2944 case WQOPS_THREAD_KEVENT_RETURN:
2945 case WQOPS_THREAD_WORKLOOP_RETURN:
2946 wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2947 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_END, wq, options, 0, 0, 0);
2948 if (item != 0 && arg2 != 0) {
2949 int32_t kevent_retval;
2950 int ret;
2951 if (options == WQOPS_THREAD_KEVENT_RETURN) {
2952 ret = kevent_qos_internal(p, -1, item, arg2, item, arg2, NULL, NULL,
2953 KEVENT_FLAG_WORKQ | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS,
2954 &kevent_retval);
2955 } else /* options == WQOPS_THREAD_WORKLOOP_RETURN */ {
2956 kqueue_id_t kevent_id = -1;
2957 ret = kevent_id_internal(p, &kevent_id, item, arg2, item, arg2,
2958 NULL, NULL,
2959 KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS,
2960 &kevent_retval);
2961 }
2962 /*
2963 * We shouldn't be getting more errors out than events we put in, so
2964 * reusing the input buffer should always provide enough space. But,
2965 * the assert is commented out since we get errors in edge cases in the
2966 * process lifecycle.
2967 */
2968 //assert(ret == KERN_SUCCESS && kevent_retval >= 0);
2969 if (ret != KERN_SUCCESS){
2970 error = ret;
2971 break;
2972 } else if (kevent_retval > 0){
2973 assert(kevent_retval <= arg2);
2974 *retval = kevent_retval;
2975 error = 0;
2976 break;
2977 }
2978 }
2979 goto thread_return;
2980
2981 case WQOPS_THREAD_RETURN:
2982 wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2983 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_END, wq, options, 0, 0, 0);
2984 thread_return:
2985 error = wqops_thread_return(p, wq);
2986 // NOT REACHED except in case of error
2987 assert(error);
2988 break;
2989
2990 case WQOPS_SHOULD_NARROW: {
2991 /*
2992 * arg2 = priority to test
2993 * arg3 = unused
2994 */
2995 pthread_priority_t priority = arg2;
2996 thread_t th = current_thread();
2997 struct threadlist *tl = util_get_thread_threadlist_entry(th);
2998
2999 if (tl == NULL || (tl->th_flags & TH_LIST_CONSTRAINED) == 0) {
3000 error = EINVAL;
3001 break;
3002 }
3003
3004 int class = pthread_priority_get_class_index(priority);
3005 wq = tl->th_workq;
3006 workqueue_lock_spin(wq);
3007 bool should_narrow = !may_start_constrained_thread(wq, class, tl, false);
3008 workqueue_unlock(wq);
3009
3010 *retval = should_narrow;
3011 break;
3012 }
3013 default:
3014 error = EINVAL;
3015 break;
3016 }
3017
3018 switch (options) {
3019 case WQOPS_THREAD_KEVENT_RETURN:
3020 case WQOPS_THREAD_WORKLOOP_RETURN:
3021 case WQOPS_THREAD_RETURN:
3022 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START, wq, options, 0, 0, 0);
3023 break;
3024 }
3025 return (error);
3026 }
3027
3028 /*
3029 * We have no work to do, park ourselves on the idle list.
3030 *
3031 * Consumes the workqueue lock and does not return.
3032 */
3033 static void __dead2
3034 parkit(struct workqueue *wq, struct threadlist *tl, thread_t thread)
3035 {
3036 assert(thread == tl->th_thread);
3037 assert(thread == current_thread());
3038
3039 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_START, wq, 0, 0, 0, 0);
3040
3041 uint32_t us_to_wait = 0;
3042
3043 TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
3044
3045 tl->th_flags &= ~TH_LIST_RUNNING;
3046 tl->th_flags &= ~TH_LIST_KEVENT;
3047 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
3048
3049 if (tl->th_flags & TH_LIST_CONSTRAINED) {
3050 wq->wq_constrained_threads_scheduled--;
3051 tl->th_flags &= ~TH_LIST_CONSTRAINED;
3052 }
3053
3054 _wq_thactive_dec(wq, tl->th_priority);
3055 wq->wq_thscheduled_count[tl->th_priority]--;
3056 wq->wq_threads_scheduled--;
3057 uint32_t thidlecount = ++wq->wq_thidlecount;
3058
3059 pthread_kern->thread_sched_call(thread, NULL);
3060
3061 /*
3062 * We'd like to always have one manager thread parked so that we can have
3063 * low latency when we need to bring a manager thread up. If that idle
3064 * thread list is empty, make this thread a manager thread.
3065 *
3066 * XXX: This doesn't check that there's not a manager thread outstanding,
3067 * so it's based on the assumption that most manager callouts will change
3068 * their QoS before parking. If that stops being true, this may end up
3069 * costing us more than we gain.
3070 */
3071 if (TAILQ_EMPTY(&wq->wq_thidlemgrlist) &&
3072 tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET){
3073 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority | DBG_FUNC_NONE,
3074 wq, thread_tid(thread),
3075 (tl->th_priority << 16) | WORKQUEUE_EVENT_MANAGER_BUCKET, 2, 0);
3076 reset_priority(tl, pthread_priority_from_wq_class_index(wq, WORKQUEUE_EVENT_MANAGER_BUCKET));
3077 tl->th_priority = WORKQUEUE_EVENT_MANAGER_BUCKET;
3078 }
3079
3080 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET){
3081 TAILQ_INSERT_HEAD(&wq->wq_thidlemgrlist, tl, th_entry);
3082 } else {
3083 TAILQ_INSERT_HEAD(&wq->wq_thidlelist, tl, th_entry);
3084 }
3085
3086 /*
3087 * When we remove the voucher from the thread, we may lose our importance
3088 * causing us to get preempted, so we do this after putting the thread on
3089 * the idle list. That way, when we get our importance back, we'll be able
3090 * to use this thread from e.g. the kevent call out to deliver a boosting
3091 * message.
3092 */
3093 tl->th_flags |= TH_LIST_REMOVING_VOUCHER;
3094 workqueue_unlock(wq);
3095 if (pthread_kern->thread_will_park_or_terminate) {
3096 pthread_kern->thread_will_park_or_terminate(tl->th_thread);
3097 }
3098 __assert_only kern_return_t kr;
3099 kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
3100 assert(kr == KERN_SUCCESS);
3101 workqueue_lock_spin(wq);
3102 tl->th_flags &= ~(TH_LIST_REMOVING_VOUCHER);
3103
3104 if ((tl->th_flags & TH_LIST_RUNNING) == 0) {
3105 if (thidlecount < 101) {
3106 us_to_wait = wq_reduce_pool_window_usecs - ((thidlecount-2) * (wq_reduce_pool_window_usecs / 100));
3107 } else {
3108 us_to_wait = wq_reduce_pool_window_usecs / 100;
3109 }
3110
3111 thread_set_pending_block_hint(thread, kThreadWaitParkedWorkQueue);
3112 assert_wait_timeout_with_leeway((caddr_t)tl, (THREAD_INTERRUPTIBLE),
3113 TIMEOUT_URGENCY_SYS_BACKGROUND|TIMEOUT_URGENCY_LEEWAY, us_to_wait,
3114 wq_reduce_pool_window_usecs/10, NSEC_PER_USEC);
3115
3116 workqueue_unlock(wq);
3117
3118 thread_block(wq_unpark_continue);
3119 panic("thread_block(wq_unpark_continue) returned!");
3120 } else {
3121 workqueue_unlock(wq);
3122
3123 /*
3124 * While we'd dropped the lock to unset our voucher, someone came
3125 * around and made us runnable. But because we weren't waiting on the
3126 * event their wakeup() was ineffectual. To correct for that, we just
3127 * run the continuation ourselves.
3128 */
3129 wq_unpark_continue(NULL, THREAD_AWAKENED);
3130 }
3131 }
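
/*
 * Sketch (not compiled) of the park timeout computed above: the more idle
 * threads are already parked, the sooner this one gives up and terminates.
 * With the default wq_reduce_pool_window_usecs (on the order of seconds),
 * early parkers wait roughly the full window while the hundredth and later
 * wait only about 1% of it.  The helper name is hypothetical.
 */
#if 0
static uint32_t
wq_park_timeout_usecs(uint32_t thidlecount)
{
	if (thidlecount < 101) {
		return wq_reduce_pool_window_usecs -
				((thidlecount - 2) * (wq_reduce_pool_window_usecs / 100));
	}
	return wq_reduce_pool_window_usecs / 100;
}
#endif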
3132
3133 static bool
3134 may_start_constrained_thread(struct workqueue *wq, uint32_t at_priclass,
3135 struct threadlist *tl, bool may_start_timer)
3136 {
3137 uint32_t req_qos = _wq_thactive_best_constrained_req_qos(wq);
3138 wq_thactive_t thactive;
3139
3140 if (may_start_timer && at_priclass < req_qos) {
3141 /*
3142 * When called from workqueue_run_threadreq_and_unlock(), pre-post the
3143 * new, higher priority into the thactive state so that
3144 * workqueue_callback() makes the right decision.
3145 *
3146 * If the admission check passes, workqueue_run_threadreq_and_unlock
3147 * will reset this value before running the request.
3148 */
3149 thactive = _wq_thactive_set_best_constrained_req_qos(wq, req_qos,
3150 at_priclass);
3151 #ifdef __LP64__
3152 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update, 1, (uint64_t)thactive,
3153 (uint64_t)(thactive >> 64), 0, 0);
3154 #endif
3155 } else {
3156 thactive = _wq_thactive(wq);
3157 }
3158
3159 uint32_t constrained_threads = wq->wq_constrained_threads_scheduled;
3160 if (tl && (tl->th_flags & TH_LIST_CONSTRAINED)) {
3161 /*
3162 * don't count the current thread as scheduled
3163 */
3164 constrained_threads--;
3165 }
3166 if (constrained_threads >= wq_max_constrained_threads) {
3167 PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 1,
3168 wq->wq_constrained_threads_scheduled,
3169 wq_max_constrained_threads, 0);
3170 /*
3171 * we need 1 or more constrained threads to return to the kernel before
3172 * we can dispatch additional work
3173 */
3174 return false;
3175 }
3176
3177 /*
3178 * Compute a metric for how many threads are active. We find the
3179 * highest priority request outstanding and then add up the number of
3180 * active threads in that and all higher-priority buckets. We'll also add
3181 * any "busy" threads which are not active but blocked recently enough that
3182 * we can't be sure they've gone idle yet. We'll then compare this metric
3183 * to our max concurrency to decide whether to add a new thread.
3184 */
3185
3186 uint32_t busycount, thactive_count;
3187
3188 thactive_count = _wq_thactive_aggregate_downto_qos(wq, thactive,
3189 at_priclass, &busycount, NULL);
3190
3191 if (tl && tl->th_priority <= at_priclass) {
3192 /*
3193 * don't count this thread as currently active
3194 */
3195 assert(thactive_count > 0);
3196 thactive_count--;
3197 }
3198
3199 if (thactive_count + busycount < wq_max_concurrency[at_priclass]) {
3200 PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 2,
3201 thactive_count, busycount, 0);
3202 return true;
3203 } else {
3204 PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 3,
3205 thactive_count, busycount, 0);
3206 }
3207
3208 if (busycount && may_start_timer) {
3209 /*
3210 * If this is called from the add timer, we won't have another timer
3211 * fire when the thread exits the "busy" state, so rearm the timer.
3212 */
3213 if (WQ_TIMER_DELAYED_NEEDED(wq)) {
3214 workqueue_interval_timer_start(wq);
3215 }
3216 }
3217
3218 return false;
3219 }
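
/*
 * Worked example (sketch, not compiled) of the admission metric above,
 * assuming wq_max_concurrency[at_priclass] == 4 (NCPU = 4): 2 active threads
 * plus 1 busy thread is 3 < 4, so a new constrained thread is admitted;
 * 3 active plus 1 busy is 4, which is not < 4, so admission is refused and
 * the add timer re-checks once the busy thread settles.  The constant is an
 * assumption for illustration.
 */
#if 0
static bool
wq_constrained_admission_example(uint32_t thactive_count, uint32_t busycount)
{
	const uint32_t max_concurrency = 4;	/* assumed: NCPU = 4 */
	return thactive_count + busycount < max_concurrency;
}
#endif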
3220
3221 static struct threadlist *
3222 pop_from_thidlelist(struct workqueue *wq, uint32_t priclass)
3223 {
3224 assert(wq->wq_thidlecount);
3225
3226 struct threadlist *tl = NULL;
3227
3228 if (!TAILQ_EMPTY(&wq->wq_thidlemgrlist) &&
3229 (priclass == WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlelist))){
3230 tl = TAILQ_FIRST(&wq->wq_thidlemgrlist);
3231 TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
3232 assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
3233 } else if (!TAILQ_EMPTY(&wq->wq_thidlelist) &&
3234 (priclass != WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlemgrlist))){
3235 tl = TAILQ_FIRST(&wq->wq_thidlelist);
3236 TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
3237 assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
3238 } else {
3239 panic("pop_from_thidlelist called with no threads available");
3240 }
3241 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
3242
3243 assert(wq->wq_thidlecount);
3244 wq->wq_thidlecount--;
3245
3246 TAILQ_INSERT_TAIL(&wq->wq_thrunlist, tl, th_entry);
3247
3248 tl->th_flags |= TH_LIST_RUNNING | TH_LIST_BUSY;
3249
3250 wq->wq_threads_scheduled++;
3251 wq->wq_thscheduled_count[priclass]++;
3252 _wq_thactive_inc(wq, priclass);
3253 return tl;
3254 }
3255
3256 static pthread_priority_t
3257 pthread_priority_from_wq_class_index(struct workqueue *wq, int index)
3258 {
3259 if (index == WORKQUEUE_EVENT_MANAGER_BUCKET){
3260 return wq->wq_event_manager_priority;
3261 } else {
3262 return class_index_get_pthread_priority(index);
3263 }
3264 }
3265
3266 static void
3267 reset_priority(struct threadlist *tl, pthread_priority_t pri)
3268 {
3269 kern_return_t ret;
3270 thread_t th = tl->th_thread;
3271
3272 if ((pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){
3273 ret = pthread_kern->thread_set_workq_qos(th, pthread_priority_get_thread_qos(pri), 0);
3274 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3275
3276 if (tl->th_flags & TH_LIST_EVENT_MGR_SCHED_PRI) {
3277
3278 /* Reset priority to default (masked by QoS) */
3279
3280 ret = pthread_kern->thread_set_workq_pri(th, 31, POLICY_TIMESHARE);
3281 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3282
3283 tl->th_flags &= ~TH_LIST_EVENT_MGR_SCHED_PRI;
3284 }
3285 } else {
3286 ret = pthread_kern->thread_set_workq_qos(th, THREAD_QOS_UNSPECIFIED, 0);
3287 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3288 ret = pthread_kern->thread_set_workq_pri(th, (pri & (~_PTHREAD_PRIORITY_FLAGS_MASK)), POLICY_TIMESHARE);
3289 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3290
3291 tl->th_flags |= TH_LIST_EVENT_MGR_SCHED_PRI;
3292 }
3293 }
3294
3295 /*
3296 * Picks the best request to run, and returns the best overcommit fallback
3297 * if the best pick is non-overcommit and risks failing its admission check.
3298 */
3299 static struct threadreq *
3300 workqueue_best_threadreqs(struct workqueue *wq, struct threadlist *tl,
3301 struct threadreq **fallback)
3302 {
3303 struct threadreq *req, *best_req = NULL;
3304 int priclass, prilimit;
3305
3306 if ((wq->wq_event_manager_threadreq.tr_state == TR_STATE_WAITING) &&
3307 ((wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0) ||
3308 (tl && tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET))) {
3309 /*
3310 * There's an event manager request and either:
3311 * - no event manager currently running
3312 * - we are re-using the event manager
3313 */
3314 req = &wq->wq_event_manager_threadreq;
3315 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select | DBG_FUNC_NONE, wq, req, 1, 0, 0);
3316 return req;
3317 }
3318
3319 if (tl) {
3320 prilimit = WORKQUEUE_EVENT_MANAGER_BUCKET;
3321 } else {
3322 prilimit = _wq_highest_paced_priority(wq);
3323 }
3324 for (priclass = 0; priclass < prilimit; priclass++) {
3325 req = TAILQ_FIRST(&wq->wq_overcommit_reqlist[priclass]);
3326 if (req) {
3327 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select | DBG_FUNC_NONE, wq, req, 2, 0, 0);
3328 if (best_req) {
3329 *fallback = req;
3330 } else {
3331 best_req = req;
3332 }
3333 break;
3334 }
3335 if (!best_req) {
3336 best_req = TAILQ_FIRST(&wq->wq_reqlist[priclass]);
3337 if (best_req) {
3338 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select | DBG_FUNC_NONE, wq, best_req, 3, 0, 0);
3339 }
3340 }
3341 }
3342 return best_req;
3343 }
3344
3345 /**
3346 * Runs a thread request on a thread
3347 *
3348 * - if thread is THREAD_NULL, will find a thread and run the request there.
3349 * Otherwise, the thread must be the current thread.
3350 *
3351 * - if req is NULL, will find the highest priority request and run that. If
3352 * it is not NULL, it must be a threadreq object in state NEW. If it can not
3353 * be run immediately, it will be enqueued and moved to state WAITING.
3354 *
3355 * Either way, the thread request object serviced will be moved to state
3356 * PENDING and attached to the threadlist.
3357 *
3358 * Should be called with the workqueue lock held. Will drop it.
3359 *
3360 * WARNING: _workq_kevent_reqthreads needs to be able to preflight any
3361 * admission checks in this function. If you are changing this function,
3362 * keep that one up-to-date.
3363 *
3364 * - if parking_tl is non-NULL, then the current thread is parking. This will
3365 * try to reuse this thread for a request. If no match is found, it will be
3366 * parked.
3367 */
3368 static int
3369 workqueue_run_threadreq_and_unlock(proc_t p, struct workqueue *wq,
3370 struct threadlist *parking_tl, struct threadreq *req,
3371 bool may_add_new_thread)
3372 {
3373 struct threadreq *incoming_req = req;
3374
3375 struct threadlist *tl = parking_tl;
3376 int rc = WQ_RUN_TR_THROTTLED;
3377
3378 assert(tl == NULL || tl->th_thread == current_thread());
3379 assert(req == NULL || req->tr_state == TR_STATE_NEW);
3380 assert(!may_add_new_thread || !tl);
3381
3382 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq | DBG_FUNC_START, wq, req,
3383 tl ? thread_tid(tl->th_thread) : 0,
3384 req ? (req->tr_priority << 16 | req->tr_flags) : 0, 0);
3385
3386 /*
3387 * Special cases when provided an event manager request
3388 */
3389 if (req && req->tr_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
3390 // Clients must not rely on identity of event manager requests
3391 assert(req->tr_flags & TR_FLAG_ONSTACK);
3392 // You can't be both overcommit and event manager
3393 assert((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0);
3394
3395 /*
3396 * We can only ever have one event manager request, so coalesce them if
3397 * there's already one outstanding.
3398 */
3399 if (wq->wq_event_manager_threadreq.tr_state == TR_STATE_WAITING) {
3400 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_mgr_merge | DBG_FUNC_NONE, wq, req, 0, 0, 0);
3401
3402 struct threadreq *existing_req = &wq->wq_event_manager_threadreq;
3403 if (req->tr_flags & TR_FLAG_KEVENT) {
3404 existing_req->tr_flags |= TR_FLAG_KEVENT;
3405 }
3406
3407 req = existing_req;
3408 incoming_req = NULL;
3409 }
3410
3411 if (wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] &&
3412 (!tl || tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET)){
3413 /*
3414 * There can only be one event manager running at a time.
3415 */
3416 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 1, 0, 0, 0);
3417 goto done;
3418 }
3419 }
3420
3421 again: // Start again after creating a thread
3422
3423 if (_wq_exiting(wq)) {
3424 rc = WQ_RUN_TR_EXITING;
3425 goto exiting;
3426 }
3427
3428 /*
3429 * Thread request selection and admission control
3430 */
3431 struct threadreq *fallback = NULL;
3432 if (req) {
3433 if ((req->tr_flags & TR_FLAG_NO_PACING) == 0 &&
3434 _wq_should_pace_priority(wq, req->tr_priority)) {
3435 /*
3436 * If a request fails the pacing admission check, then thread
3437 * requests are redriven when the pacing thread is finally scheduled
3438 * when it calls _wq_pacing_end() in wq_unpark_continue().
3439 */
3440 goto done;
3441 }
3442 } else if (wq->wq_reqcount == 0) {
3443 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 2, 0, 0, 0);
3444 goto done;
3445 } else if ((req = workqueue_best_threadreqs(wq, tl, &fallback)) == NULL) {
3446 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 3, 0, 0, 0);
3447 goto done;
3448 }
3449
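/*
 * Admission control: a constrained (non-overcommit) request below the
 * event manager bucket may only be serviced if
 * may_start_constrained_thread() allows it; otherwise switch to the
 * overcommit fallback found above, if there is one.
 */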
3450 if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0 &&
3451 (req->tr_priority < WORKQUEUE_EVENT_MANAGER_BUCKET)) {
3452 if (!may_start_constrained_thread(wq, req->tr_priority, parking_tl, true)) {
3453 if (!fallback) {
3454 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 4, 0, 0, 0);
3455 goto done;
3456 }
3457 assert(req->tr_state == TR_STATE_WAITING);
3458 req = fallback;
3459 }
3460 }
3461
3462 /*
3463 * Thread selection.
3464 */
3465 if (parking_tl) {
3466 if (tl->th_priority != req->tr_priority) {
3467 _wq_thactive_move(wq, tl->th_priority, req->tr_priority);
3468 wq->wq_thscheduled_count[tl->th_priority]--;
3469 wq->wq_thscheduled_count[req->tr_priority]++;
3470 }
3471 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select | DBG_FUNC_NONE,
3472 wq, 1, thread_tid(tl->th_thread), 0, 0);
3473 } else if (wq->wq_thidlecount) {
3474 tl = pop_from_thidlelist(wq, req->tr_priority);
3475 /*
3476 * This call will update wq_thscheduled_count and wq_thactive_count for
3477 * the provided priority. It will not set the returned thread to that
3478 * priority. This matches the behavior of the parking_tl clause above.
3479 */
3480 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select | DBG_FUNC_NONE,
3481 wq, 2, thread_tid(tl->th_thread), 0, 0);
3482 } else /* no idle threads */ {
3483 if (!may_add_new_thread || wq->wq_nthreads >= wq_max_threads) {
3484 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 5,
3485 may_add_new_thread, wq->wq_nthreads, 0);
3486 if (wq->wq_nthreads < wq_max_threads) {
3487 rc = WQ_RUN_TR_THREAD_NEEDED;
3488 }
3489 goto done;
3490 }
3491
3492 bool added_thread = workqueue_addnewthread(p, wq);
3493 /*
3494 * workqueue_addnewthread will drop and re-take the lock, so we
3495 * need to ensure we still have a cached request.
3496 *
3497 * It also means we have to pick a new request, since our old pick may
3498 * not be valid anymore.
3499 */
3500 req = incoming_req;
3501 if (req && (req->tr_flags & TR_FLAG_ONSTACK)) {
3502 _threadreq_copy_prepare(wq);
3503 }
3504
3505 if (added_thread) {
3506 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select | DBG_FUNC_NONE,
3507 wq, 3, 0, 0, 0);
3508 goto again;
3509 } else if (_wq_exiting(wq)) {
3510 rc = WQ_RUN_TR_EXITING;
3511 goto exiting;
3512 } else {
3513 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 6, 0, 0, 0);
3514 /*
3515 * Something caused thread creation to fail. Kick off the timer in
3516 * the hope that it'll succeed next time.
3517 */
3518 if (WQ_TIMER_DELAYED_NEEDED(wq)) {
3519 workqueue_interval_timer_start(wq);
3520 }
3521 goto done;
3522 }
3523 }
3524
3525 /*
3526 * Setup thread, mark request as complete and run with it.
3527 */
3528 if (req->tr_state == TR_STATE_WAITING) {
3529 _threadreq_dequeue(wq, req);
3530 }
3531 if (tl->th_priority != req->tr_priority) {
3532 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority | DBG_FUNC_NONE,
3533 wq, thread_tid(tl->th_thread),
3534 (tl->th_priority << 16) | req->tr_priority, 1, 0);
3535 reset_priority(tl, pthread_priority_from_wq_class_index(wq, req->tr_priority));
3536 tl->th_priority = (uint8_t)req->tr_priority;
3537 }
3538 if (req->tr_flags & TR_FLAG_OVERCOMMIT) {
3539 if ((tl->th_flags & TH_LIST_CONSTRAINED) != 0) {
3540 tl->th_flags &= ~TH_LIST_CONSTRAINED;
3541 wq->wq_constrained_threads_scheduled--;
3542 }
3543 } else {
3544 if ((tl->th_flags & TH_LIST_CONSTRAINED) == 0) {
3545 tl->th_flags |= TH_LIST_CONSTRAINED;
3546 wq->wq_constrained_threads_scheduled++;
3547 }
3548 }
3549
3550 if (!parking_tl && !(req->tr_flags & TR_FLAG_NO_PACING)) {
3551 _wq_pacing_start(wq, tl);
3552 }
3553 if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) {
3554 uint32_t old_qos, new_qos;
3555
3556 /*
3557 * If we are scheduling a constrained thread request, we may need to
3558 * update the best constrained qos in the thactive atomic state.
3559 */
3560 for (new_qos = 0; new_qos < WQ_THACTIVE_NO_PENDING_REQUEST; new_qos++) {
3561 if (TAILQ_FIRST(&wq->wq_reqlist[new_qos]))
3562 break;
3563 }
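/*
 * new_qos now holds the first class with a pending constrained request,
 * or WQ_THACTIVE_NO_PENDING_REQUEST if every constrained list is empty.
 */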
3564 old_qos = _wq_thactive_best_constrained_req_qos(wq);
3565 if (old_qos != new_qos) {
3566 wq_thactive_t v = _wq_thactive_set_best_constrained_req_qos(wq,
3567 old_qos, new_qos);
3568 #ifdef __LP64__
3569 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update, 2, (uint64_t)v,
3570 (uint64_t)(v >> 64), 0, 0);
3571 #else
3572 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update, 2, v, 0, 0, 0);
3573 #endif
3574 }
3575 }
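/*
 * Compute the upcall flags for userspace. They are stored shifted down by
 * WQ_FLAG_THREAD_PRIOSHIFT; _setup_wqthread() shifts them back up and
 * fills the low bits with the pthread priority before the upcall.
 */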
3576 {
3577 uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI;
3578 if (req->tr_flags & TR_FLAG_OVERCOMMIT)
3579 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
3580 if (req->tr_flags & TR_FLAG_KEVENT)
3581 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
3582 if (req->tr_flags & TR_FLAG_WORKLOOP)
3583 upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
3584 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET)
3585 upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
3586 tl->th_upcall_flags = upcall_flags >> WQ_FLAG_THREAD_PRIOSHIFT;
3587 }
3588 if (req->tr_flags & TR_FLAG_KEVENT) {
3589 tl->th_flags |= TH_LIST_KEVENT;
3590 } else {
3591 tl->th_flags &= ~TH_LIST_KEVENT;
3592 }
3593 return _threadreq_complete_and_unlock(p, wq, req, tl);
3594
3595 done:
3596 if (incoming_req) {
3597 _threadreq_enqueue(wq, incoming_req);
3598 }
3599
3600 exiting:
3601
3602 if (parking_tl && !(parking_tl->th_flags & TH_LIST_UNBINDING)) {
3603 parkit(wq, parking_tl, parking_tl->th_thread);
3604 __builtin_unreachable();
3605 }
3606
3607 workqueue_unlock(wq);
3608
3609 return rc;
3610 }
3611
3612 /**
3613 * Continuation run when a parked workqueue thread wakes up (or its idle timeout fires)
3614 */
3615 static void __dead2
3616 wq_unpark_continue(void* __unused ptr, wait_result_t wait_result)
3617 {
3618 boolean_t first_use = false;
3619 thread_t th = current_thread();
3620 proc_t p = current_proc();
3621
3622 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
3623 if (uth == NULL) goto done;
3624
3625 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
3626 if (wq == NULL) goto done;
3627
3628 workqueue_lock_spin(wq);
3629
3630 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
3631 assert(tl != WQ_THREADLIST_EXITING_POISON);
3632 if (tl == NULL) {
3633 /*
3634 * We woke up before addnewthread() was finished setting us up. Go
3635 * ahead and exit, but first poison the threadlist variable so that
3636 * addnewthread() doesn't think we are still valid.
3637 */
3638 pthread_kern->uthread_set_threadlist(uth, WQ_THREADLIST_EXITING_POISON);
3639 workqueue_unlock(wq);
3640 goto done;
3641 }
3642
3643 assert(tl->th_flags & TH_LIST_INITED);
3644
3645 if ((tl->th_flags & TH_LIST_NEW)){
3646 tl->th_flags &= ~(TH_LIST_NEW);
3647 first_use = true;
3648 }
3649
3650 if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) {
3651 /*
3652 * The normal wakeup path.
3653 */
3654 goto return_to_user;
3655 }
3656
3657 if ((tl->th_flags & TH_LIST_RUNNING) == 0 &&
3658 wait_result == THREAD_TIMED_OUT &&
3659 tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET &&
3660 TAILQ_FIRST(&wq->wq_thidlemgrlist) == tl &&
3661 TAILQ_NEXT(tl, th_entry) == NULL){
3662 /*
3663 * If we are the only idle manager and we popped for self-destruction,
3664 * then don't actually exit. Instead, free our stack to save some
3665 * memory and re-park.
3666 */
3667
3668 workqueue_unlock(wq);
3669
3670 vm_map_t vmap = wq->wq_map;
3671
3672 // Keep this in sync with _setup_wqthread()
3673 const vm_size_t guardsize = vm_map_page_size(vmap);
3674 const user_addr_t freeaddr = (user_addr_t)tl->th_stackaddr + guardsize;
3675 const vm_map_offset_t freesize = vm_map_trunc_page_mask((PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET) - 1, vm_map_page_mask(vmap)) - guardsize;
3676
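// Mark the stack (above the guard page) reusable so the VM system can
// reclaim its pages while we stay parked.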
3677 __assert_only int kr = mach_vm_behavior_set(vmap, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
3678 #if MACH_ASSERT
3679 if (kr != KERN_SUCCESS && kr != KERN_INVALID_ADDRESS) {
3680 os_log_error(OS_LOG_DEFAULT, "unable to make thread stack reusable (kr: %d)", kr);
3681 }
3682 #endif
3683
3684 workqueue_lock_spin(wq);
3685
3686 if ( !(tl->th_flags & TH_LIST_RUNNING)) {
3687 thread_set_pending_block_hint(th, kThreadWaitParkedWorkQueue);
3688 assert_wait((caddr_t)tl, (THREAD_INTERRUPTIBLE));
3689
3690 workqueue_unlock(wq);
3691
3692 thread_block(wq_unpark_continue);
3693 __builtin_unreachable();
3694 }
3695 }
3696
3697 if ((tl->th_flags & TH_LIST_RUNNING) == 0) {
3698 assert((tl->th_flags & TH_LIST_BUSY) == 0);
3699 if (!first_use) {
3700 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_END, wq, 0, 0, 0, 0);
3701 }
3702 /*
3703 * We were set running, but not for the purposes of actually running.
3704 * This could be because the timer elapsed, or because the thread was
3705 * aborted. Either way, we need to return to userspace to exit.
3706 *
3707 * The call to workqueue_removethread will consume the lock.
3708 */
3709
3710 if (!first_use &&
3711 (tl->th_priority < qos_class_get_class_index(WQ_THREAD_CLEANUP_QOS) ||
3712 (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET))) {
3713 // Reset the QoS to something low for the pthread cleanup
3714 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority | DBG_FUNC_NONE,
3715 wq, thread_tid(th),
3716 (tl->th_priority << 16) | qos_class_get_class_index(WQ_THREAD_CLEANUP_QOS), 3, 0);
3717 pthread_priority_t cleanup_pri = _pthread_priority_make_newest(WQ_THREAD_CLEANUP_QOS, 0, 0);
3718 reset_priority(tl, cleanup_pri);
3719 }
3720
3721 workqueue_removethread(tl, 0, first_use);
3722
3723 if (first_use){
3724 pthread_kern->thread_bootstrap_return();
3725 } else {
3726 pthread_kern->unix_syscall_return(0);
3727 }
3728 __builtin_unreachable();
3729 }
3730
3731 /*
3732 * The timer woke us up or the thread was aborted. However, we have
3733 * already started to make this a runnable thread. Wait for that to
3734 * finish, then continue to userspace.
3735 */
3736 while ((tl->th_flags & TH_LIST_BUSY)) {
3737 assert_wait((caddr_t)tl, (THREAD_UNINT));
3738
3739 workqueue_unlock(wq);
3740
3741 thread_block(THREAD_CONTINUE_NULL);
3742
3743 workqueue_lock_spin(wq);
3744 }
3745
3746 return_to_user:
3747 if (!first_use) {
3748 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_END, wq, 0, 0, 0, 0);
3749 }
3750 if (_wq_pacing_end(wq, tl) && wq->wq_reqcount) {
3751 workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
3752 } else {
3753 workqueue_unlock(wq);
3754 }
3755 _setup_wqthread(p, th, wq, tl, first_use ? WQ_SETUP_FIRST_USE : 0);
3756 pthread_kern->thread_sched_call(th, workqueue_callback);
3757 done:
3758 if (first_use){
3759 pthread_kern->thread_bootstrap_return();
3760 } else {
3761 pthread_kern->unix_syscall_return(EJUSTRETURN);
3762 }
3763 panic("Our attempt to return to userspace failed...");
3764 }
3765
3766 /**
3767 * configures initial thread stack/registers to jump into:
3768 * _pthread_wqthread(pthread_t self, mach_port_t kport, void *stackaddr, void *keventlist, int upcall_flags, int nkevents);
3769 * to get there we jump through assembly stubs in pthread_asm.s. Those
3770 * routines set up a stack frame, using the current stack pointer, and marshal
3771 * arguments from registers to the stack as required by the ABI.
3772 *
3773 * One odd thing we do here is to start the pthread_t 4k below what would be the
3774 * top of the stack otherwise. This is because usually only the first 4k of the
3775 * pthread_t will be used and so we want to put it on the same 16k page as the
3776 * top of the stack to save memory.
3777 *
3778 * When we are done the stack will look like:
3779 * |-----------| th_stackaddr + th_allocsize
3780 * |pthread_t | th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET
3781 * |kevent list| optionally - at most WQ_KEVENT_LIST_LEN events
3782 * |kevent data| optionally - at most WQ_KEVENT_DATA_SIZE bytes
3783 * |stack gap | bottom aligned to 16 bytes, and at least as big as stack_gap_min
3784 * | STACK |
3785 * | ⇓ |
3786 * | |
3787 * |guard page | guardsize
3788 * |-----------| th_stackaddr
3789 */
3790 void
3791 _setup_wqthread(proc_t p, thread_t th, struct workqueue *wq,
3792 struct threadlist *tl, int setup_flags)
3793 {
3794 int error;
3795 if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) {
3796 /*
3797 * For preemption reasons, we want to reset the voucher as late as
3798 * possible, so we do it in two places:
3799 * - Just before parking (i.e. in parkit())
3800 * - Prior to doing the setup for the next workitem (i.e. here)
3801 *
3802 * Those two places are sufficient to ensure we always reset it before
3803 * it goes back out to user space, but be careful to not break that
3804 * guarantee.
3805 */
3806 __assert_only kern_return_t kr;
3807 kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
3808 assert(kr == KERN_SUCCESS);
3809 }
3810
3811 uint32_t upcall_flags = tl->th_upcall_flags << WQ_FLAG_THREAD_PRIOSHIFT;
3812 if (!(setup_flags & WQ_SETUP_FIRST_USE)) {
3813 upcall_flags |= WQ_FLAG_THREAD_REUSE;
3814 }
3815
3816 /*
3817 * Put the QoS class value into the lower bits of the reuse_thread register; this is where
3818 * the thread priority used to be stored anyway.
3819 */
3820 pthread_priority_t priority = pthread_priority_from_wq_class_index(wq, tl->th_priority);
3821 upcall_flags |= (_pthread_priority_get_qos_newest(priority) & WQ_FLAG_THREAD_PRIOMASK);
3822
3823 const vm_size_t guardsize = vm_map_page_size(tl->th_workq->wq_map);
3824 const vm_size_t stack_gap_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_REDZONE_LEN;
3825 const vm_size_t stack_align_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_STK_ALIGN;
3826
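/*
 * The following mirror the layout diagram above: pthread_self_addr is
 * where the pthread_t will live, stack_top_addr is the first usable
 * (aligned) stack slot at least stack_gap_min below it, and
 * stack_bottom_addr is the lowest stack address, just above the guard
 * page.
 */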
3827 user_addr_t pthread_self_addr = (user_addr_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET);
3828 user_addr_t stack_top_addr = (user_addr_t)((pthread_self_addr - stack_gap_min) & -stack_align_min);
3829 user_addr_t stack_bottom_addr = (user_addr_t)(tl->th_stackaddr + guardsize);
3830
3831 user_addr_t wqstart_fnptr = pthread_kern->proc_get_wqthread(p);
3832 if (!wqstart_fnptr) {
3833 panic("workqueue thread start function pointer is NULL");
3834 }
3835
3836 if (setup_flags & WQ_SETUP_FIRST_USE) {
3837 uint32_t tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
3838 if (tsd_offset) {
3839 mach_vm_offset_t th_tsd_base = (mach_vm_offset_t)pthread_self_addr + tsd_offset;
3840 kern_return_t kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
3841 if (kret == KERN_SUCCESS) {
3842 upcall_flags |= WQ_FLAG_THREAD_TSD_BASE_SET;
3843 }
3844 }
3845
3846 /*
3847 * Pre-fault the first page of the new thread's stack and the page that will
3848 * contain the pthread_t structure.
3849 */
3850 vm_map_t vmap = pthread_kern->current_map();
3851 if (vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)) !=
3852 vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap))){
3853 vm_fault( vmap,
3854 vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)),
3855 VM_PROT_READ | VM_PROT_WRITE,
3856 FALSE,
3857 THREAD_UNINT, NULL, 0);
3858 }
3859 vm_fault( vmap,
3860 vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap)),
3861 VM_PROT_READ | VM_PROT_WRITE,
3862 FALSE,
3863 THREAD_UNINT, NULL, 0);
3864 }
3865
3866 user_addr_t kevent_list = NULL;
3867 int kevent_count = 0;
3868 if (upcall_flags & WQ_FLAG_THREAD_KEVENT){
3869 bool workloop = upcall_flags & WQ_FLAG_THREAD_WORKLOOP;
3870
3871 kevent_list = pthread_self_addr - WQ_KEVENT_LIST_LEN * sizeof(struct kevent_qos_s);
3872 kevent_count = WQ_KEVENT_LIST_LEN;
3873
3874 user_addr_t kevent_id_addr = kevent_list;
3875 if (workloop) {
3876 /*
3877 * The kevent ID goes just below the kevent list. Sufficiently new
3878 * userspace will know to look there. Old userspace will just
3879 * ignore it.
3880 */
3881 kevent_id_addr -= sizeof(kqueue_id_t);
3882 }
3883
3884 user_addr_t kevent_data_buf = kevent_id_addr - WQ_KEVENT_DATA_SIZE;
3885 user_size_t kevent_data_available = WQ_KEVENT_DATA_SIZE;
3886
3887 int32_t events_out = 0;
3888
3889 assert(tl->th_flags & TH_LIST_KEVENT_BOUND);
3890 unsigned int flags = KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE;
3891 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
3892 flags |= KEVENT_FLAG_WORKQ_MANAGER;
3893 }
3894 int ret = 0;
3895 if (workloop) {
3896 flags |= KEVENT_FLAG_WORKLOOP;
3897 kqueue_id_t kevent_id = -1;
3898 ret = kevent_id_internal(p, &kevent_id,
3899 NULL, 0, kevent_list, kevent_count,
3900 kevent_data_buf, &kevent_data_available,
3901 flags, &events_out);
3902 copyout(&kevent_id, kevent_id_addr, sizeof(kevent_id));
3903 } else {
3904 flags |= KEVENT_FLAG_WORKQ;
3905 ret = kevent_qos_internal(p,
3906 class_index_get_thread_qos(tl->th_priority),
3907 NULL, 0, kevent_list, kevent_count,
3908 kevent_data_buf, &kevent_data_available,
3909 flags, &events_out);
3910 }
3911
3912 // squash any errors into just empty output
3913 if (ret != KERN_SUCCESS || events_out == -1){
3914 events_out = 0;
3915 kevent_data_available = WQ_KEVENT_DATA_SIZE;
3916 }
3917
3918 // We shouldn't get data out if there aren't events available
3919 assert(events_out != 0 || kevent_data_available == WQ_KEVENT_DATA_SIZE);
3920
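// If the kevent call returned events, keep them on the stack and move the
// stack top down so it does not overlap the list or any out-of-line data
// that was used; otherwise pass userspace an empty kevent list.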
3921 if (events_out > 0){
3922 if (kevent_data_available == WQ_KEVENT_DATA_SIZE){
3923 stack_top_addr = (kevent_id_addr - stack_gap_min) & -stack_align_min;
3924 } else {
3925 stack_top_addr = (kevent_data_buf + kevent_data_available - stack_gap_min) & -stack_align_min;
3926 }
3927
3928 kevent_count = events_out;
3929 } else {
3930 kevent_list = NULL;
3931 kevent_count = 0;
3932 }
3933 }
3934
3935 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START, wq, 0, 0, 0, 0);
3936
3937 #if defined(__i386__) || defined(__x86_64__)
3938 if (proc_is64bit(p) == 0) {
3939 x86_thread_state32_t state = {
3940 .eip = (unsigned int)wqstart_fnptr,
3941 .eax = /* arg0 */ (unsigned int)pthread_self_addr,
3942 .ebx = /* arg1 */ (unsigned int)tl->th_thport,
3943 .ecx = /* arg2 */ (unsigned int)stack_bottom_addr,
3944 .edx = /* arg3 */ (unsigned int)kevent_list,
3945 .edi = /* arg4 */ (unsigned int)upcall_flags,
3946 .esi = /* arg5 */ (unsigned int)kevent_count,
3947
3948 .esp = (int)((vm_offset_t)stack_top_addr),
3949 };
3950
3951 error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
3952 if (error != KERN_SUCCESS) {
3953 panic("%s: thread_set_wq_state failed: %d", __func__, error);
3954 }
3955 } else {
3956 x86_thread_state64_t state64 = {
3957 // x86-64 already passes all the arguments in registers, so we just put them in their final place here
3958 .rip = (uint64_t)wqstart_fnptr,
3959 .rdi = (uint64_t)pthread_self_addr,
3960 .rsi = (uint64_t)tl->th_thport,
3961 .rdx = (uint64_t)stack_bottom_addr,
3962 .rcx = (uint64_t)kevent_list,
3963 .r8 = (uint64_t)upcall_flags,
3964 .r9 = (uint64_t)kevent_count,
3965
3966 .rsp = (uint64_t)(stack_top_addr)
3967 };
3968
3969 error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
3970 if (error != KERN_SUCCESS) {
3971 panic("%s: thread_set_wq_state failed: %d", __func__, error);
3972 }
3973 }
3974 #else
3975 #error setup_wqthread not defined for this architecture
3976 #endif
3977 }
3978
3979 #if DEBUG
3980 static int wq_kevent_test SYSCTL_HANDLER_ARGS {
3981 //(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
3982 #pragma unused(oidp, arg1, arg2)
3983 int error;
3984 struct workq_reqthreads_req_s requests[64] = {};
3985
3986 if (req->newlen > sizeof(requests) || req->newlen < sizeof(struct workq_reqthreads_req_s))
3987 return EINVAL;
3988
3989 error = copyin(req->newptr, requests, req->newlen);
3990 if (error) return error;
3991
3992 _workq_reqthreads(req->p, (int)(req->newlen / sizeof(struct workq_reqthreads_req_s)), requests);
3993
3994 return 0;
3995 }
3996 #endif // DEBUG
3997
3998 #pragma mark - Misc
3999
4000 int
4001 _fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
4002 {
4003 struct workqueue * wq;
4004 int error = 0;
4005 int activecount;
4006
4007 if ((wq = pthread_kern->proc_get_wqptr(p)) == NULL) {
4008 return EINVAL;
4009 }
4010
4011 /*
4012 * This is sometimes called from interrupt context by the kperf sampler.
4013 * In that case, it's not safe to spin trying to take the lock since we
4014 * might already hold it. So, we just try-lock it and error out if it's
4015 * already held. Since this is just a debugging aid, and all our callers
4016 * are able to handle an error, that's fine.
4017 */
4018 bool locked = workqueue_lock_try(wq);
4019 if (!locked) {
4020 return EBUSY;
4021 }
4022
4023 activecount = _wq_thactive_aggregate_downto_qos(wq, _wq_thactive(wq),
4024 WORKQUEUE_NUM_BUCKETS - 1, NULL, NULL);
4025 pwqinfo->pwq_nthreads = wq->wq_nthreads;
4026 pwqinfo->pwq_runthreads = activecount;
4027 pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
4028 pwqinfo->pwq_state = 0;
4029
4030 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
4031 pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
4032 }
4033
4034 if (wq->wq_nthreads >= wq_max_threads) {
4035 pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
4036 }
4037
4038 workqueue_unlock(wq);
4039 return(error);
4040 }
4041
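/*
 * Kernel debugger (kdp) variant of the workqueue state query: it must not
 * block, so it returns 0 when there is no workqueue or the workqueue lock
 * is already held.
 */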
4042 uint32_t
4043 _get_pwq_state_kdp(proc_t p)
4044 {
4045 if (p == NULL) {
4046 return 0;
4047 }
4048
4049 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
4050
4051 if (wq == NULL || workqueue_lock_spin_is_acquired_kdp(wq)) {
4052 return 0;
4053 }
4054
4055 uint32_t pwq_state = WQ_FLAGS_AVAILABLE;
4056
4057 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
4058 pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
4059 }
4060
4061 if (wq->wq_nthreads >= wq_max_threads) {
4062 pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
4063 }
4064
4065 return pwq_state;
4066 }
4067
4068 int
4069 _thread_selfid(__unused struct proc *p, uint64_t *retval)
4070 {
4071 thread_t thread = current_thread();
4072 *retval = thread_tid(thread);
4073 return KERN_SUCCESS;
4074 }
4075
4076 void
4077 _pthread_init(void)
4078 {
4079 pthread_lck_grp_attr = lck_grp_attr_alloc_init();
4080 pthread_lck_grp = lck_grp_alloc_init("pthread", pthread_lck_grp_attr);
4081
4082 /*
4083 * allocate the lock attribute for pthread synchronizers
4084 */
4085 pthread_lck_attr = lck_attr_alloc_init();
4086
4087 pthread_list_mlock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);
4088
4089 pth_global_hashinit();
4090 psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL);
4091 psynch_zoneinit();
4092
4093 pthread_zone_workqueue = zinit(sizeof(struct workqueue),
4094 1024 * sizeof(struct workqueue), 8192, "pthread.workqueue");
4095 pthread_zone_threadlist = zinit(sizeof(struct threadlist),
4096 1024 * sizeof(struct threadlist), 8192, "pthread.threadlist");
4097 pthread_zone_threadreq = zinit(sizeof(struct threadreq),
4098 1024 * sizeof(struct threadreq), 8192, "pthread.threadreq");
4099
4100 int policy_bootarg;
4101 if (PE_parse_boot_argn("pthread_mutex_default_policy", &policy_bootarg, sizeof(policy_bootarg))) {
4102 pthread_mutex_default_policy = policy_bootarg;
4103 }
4104
4105 /*
4106 * register sysctls
4107 */
4108 sysctl_register_oid(&sysctl__kern_wq_stalled_window_usecs);
4109 sysctl_register_oid(&sysctl__kern_wq_reduce_pool_window_usecs);
4110 sysctl_register_oid(&sysctl__kern_wq_max_timer_interval_usecs);
4111 sysctl_register_oid(&sysctl__kern_wq_max_threads);
4112 sysctl_register_oid(&sysctl__kern_wq_max_constrained_threads);
4113 sysctl_register_oid(&sysctl__kern_pthread_debug_tracing);
4114 sysctl_register_oid(&sysctl__kern_pthread_mutex_default_policy);
4115
4116 #if DEBUG
4117 sysctl_register_oid(&sysctl__debug_wq_kevent_test);
4118 #endif
4119
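/*
 * Seed the per-bucket concurrency targets from the scheduler's reported
 * parallelism for each thread QoS; the event manager bucket is always
 * serialized to a single thread.
 */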
4120 for (int i = 0; i < WORKQUEUE_NUM_BUCKETS; i++) {
4121 uint32_t thread_qos = _wq_bucket_to_thread_qos(i);
4122 wq_max_concurrency[i] = pthread_kern->qos_max_parallelism(thread_qos,
4123 QOS_PARALLELISM_COUNT_LOGICAL);
4124 }
4125 wq_max_concurrency[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1;
4126 }