1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995-2005 Apple Computer, Inc. All Rights Reserved */
29 /*
30 * pthread_synch.c
31 */
32
33 #pragma mark - Front Matter
34
35 #define _PTHREAD_CONDATTR_T
36 #define _PTHREAD_COND_T
37 #define _PTHREAD_MUTEXATTR_T
38 #define _PTHREAD_MUTEX_T
39 #define _PTHREAD_RWLOCKATTR_T
40 #define _PTHREAD_RWLOCK_T
41
42 #undef pthread_mutexattr_t
43 #undef pthread_mutex_t
44 #undef pthread_condattr_t
45 #undef pthread_cond_t
46 #undef pthread_rwlockattr_t
47 #undef pthread_rwlock_t
48
49 #include <sys/cdefs.h>
50
51 // <rdar://problem/26158937> panic() should be marked noreturn
52 extern void panic(const char *string, ...) __printflike(1,2) __dead2;
53
54 #include <sys/param.h>
55 #include <sys/queue.h>
56 #include <sys/resourcevar.h>
57 //#include <sys/proc_internal.h>
58 #include <sys/kauth.h>
59 #include <sys/systm.h>
60 #include <sys/timeb.h>
61 #include <sys/times.h>
62 #include <sys/acct.h>
63 #include <sys/kernel.h>
64 #include <sys/wait.h>
65 #include <sys/signalvar.h>
66 #include <sys/sysctl.h>
67 #include <sys/syslog.h>
68 #include <sys/stat.h>
69 #include <sys/lock.h>
70 #include <sys/kdebug.h>
71 //#include <sys/sysproto.h>
72 #include <sys/vm.h>
73 #include <sys/user.h> /* for coredump */
74 #include <sys/proc_info.h> /* for fill_procworkqueue */
75
76 #include <mach/mach_port.h>
77 #include <mach/mach_types.h>
78 #include <mach/semaphore.h>
79 #include <mach/sync_policy.h>
80 #include <mach/task.h>
81 #include <mach/vm_prot.h>
82 #include <kern/kern_types.h>
83 #include <kern/task.h>
84 #include <kern/clock.h>
85 #include <mach/kern_return.h>
86 #include <kern/thread.h>
87 #include <kern/zalloc.h>
88 #include <kern/sched_prim.h> /* for thread_exception_return */
89 #include <kern/processor.h>
90 #include <kern/assert.h>
91 #include <mach/mach_vm.h>
92 #include <mach/mach_param.h>
93 #include <mach/thread_status.h>
94 #include <mach/thread_policy.h>
95 #include <mach/message.h>
96 #include <mach/port.h>
97 //#include <vm/vm_protos.h>
98 #include <vm/vm_fault.h>
99 #include <vm/vm_map.h>
100 #include <mach/thread_act.h> /* for thread_resume */
101 #include <machine/machine_routines.h>
102 #include <mach/shared_region.h>
103
104 #include <libkern/OSAtomic.h>
105 #include <libkern/libkern.h>
106
107 #include <sys/pthread_shims.h>
108 #include "kern_internal.h"
109
110 // XXX: Dirty import for sys/signalvar.h that's wrapped in BSD_KERNEL_PRIVATE
111 #define sigcantmask (sigmask(SIGKILL) | sigmask(SIGSTOP))
112
113 // XXX: Ditto for thread tags from kern/thread.h
114 #define THREAD_TAG_MAINTHREAD 0x1
115 #define THREAD_TAG_PTHREAD 0x10
116 #define THREAD_TAG_WORKQUEUE 0x20
117
118 lck_grp_attr_t *pthread_lck_grp_attr;
119 lck_grp_t *pthread_lck_grp;
120 lck_attr_t *pthread_lck_attr;
121
122 zone_t pthread_zone_workqueue;
123 zone_t pthread_zone_threadlist;
124 zone_t pthread_zone_threadreq;
125
126 extern void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64);
127 extern void workqueue_thread_yielded(void);
128
129 #define WQ_SETUP_FIRST_USE 1
130 #define WQ_SETUP_CLEAR_VOUCHER 2
131 static void _setup_wqthread(proc_t p, thread_t th, struct workqueue *wq,
132 struct threadlist *tl, int flags);
133
134 static void reset_priority(struct threadlist *tl, pthread_priority_t pri);
135 static pthread_priority_t pthread_priority_from_wq_class_index(struct workqueue *wq, int index);
136
137 static void wq_unpark_continue(void* ptr, wait_result_t wait_result) __dead2;
138
139 static bool workqueue_addnewthread(proc_t p, struct workqueue *wq);
140 static void workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use);
141 static void workqueue_lock_spin(struct workqueue *);
142 static void workqueue_unlock(struct workqueue *);
143
144 #define WQ_RUN_TR_THROTTLED 0
145 #define WQ_RUN_TR_THREAD_NEEDED 1
146 #define WQ_RUN_TR_THREAD_STARTED 2
147 #define WQ_RUN_TR_EXITING 3
148 static int workqueue_run_threadreq_and_unlock(proc_t p, struct workqueue *wq,
149 struct threadlist *tl, struct threadreq *req, bool may_add_new_thread);
150
151 static bool may_start_constrained_thread(struct workqueue *wq,
152 uint32_t at_priclass, struct threadlist *tl, bool may_start_timer);
153
154 static mach_vm_offset_t stack_addr_hint(proc_t p, vm_map_t vmap);
155 static boolean_t wq_thread_is_busy(uint64_t cur_ts,
156 _Atomic uint64_t *lastblocked_tsp);
157
158 int proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc);
159 int proc_setalltargetconc(pid_t pid, int32_t * targetconcp);
160
161 #define WQ_MAXPRI_MIN 0 /* low prio queue num */
162 #define WQ_MAXPRI_MAX 2 /* max prio queuenum */
163 #define WQ_PRI_NUM 3 /* number of prio work queues */
164
165 #define C_32_STK_ALIGN 16
166 #define C_64_STK_ALIGN 16
167 #define C_64_REDZONE_LEN 128
168
169 #define PTHREAD_T_OFFSET 0
170
171 /*
172 * Flags field passed to bsdthread_create and back in pthread_start
173 31 <---------------------------------> 0
174 _________________________________________
175 | flags(8) | policy(8) | importance(16) |
176 -----------------------------------------
177 */
178
179 #define PTHREAD_START_CUSTOM 0x01000000
180 #define PTHREAD_START_SETSCHED 0x02000000
181 #define PTHREAD_START_DETACHED 0x04000000
182 #define PTHREAD_START_QOSCLASS 0x08000000
183 #define PTHREAD_START_TSD_BASE_SET 0x10000000
184 #define PTHREAD_START_QOSCLASS_MASK 0x00ffffff
185 #define PTHREAD_START_POLICY_BITSHIFT 16
186 #define PTHREAD_START_POLICY_MASK 0xff
187 #define PTHREAD_START_IMPORTANCE_MASK 0xffff
188
189 #define SCHED_OTHER POLICY_TIMESHARE
190 #define SCHED_FIFO POLICY_FIFO
191 #define SCHED_RR POLICY_RR
192
193 #define BASEPRI_DEFAULT 31
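/*
 * Worked example (illustrative only, assuming POLICY_RR == 2 as in
 * <mach/policy.h>): per the layout above, a flags word of 0x02020010
 * decodes as
 *
 *   flags & PTHREAD_START_SETSCHED                      -> set
 *   (flags >> PTHREAD_START_POLICY_BITSHIFT)
 *           & PTHREAD_START_POLICY_MASK                 -> 2 (SCHED_RR)
 *   flags & PTHREAD_START_IMPORTANCE_MASK               -> 0x10 (16)
 *
 * and _bsdthread_create() below turns that importance into a precedence
 * of 16 - BASEPRI_DEFAULT = -15 relative to the default priority.
 */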
194
195 #pragma mark sysctls
196
197 static uint32_t wq_stalled_window_usecs = WQ_STALLED_WINDOW_USECS;
198 static uint32_t wq_reduce_pool_window_usecs = WQ_REDUCE_POOL_WINDOW_USECS;
199 static uint32_t wq_max_timer_interval_usecs = WQ_MAX_TIMER_INTERVAL_USECS;
200 static uint32_t wq_max_threads = WORKQUEUE_MAXTHREADS;
201 static uint32_t wq_max_constrained_threads = WORKQUEUE_MAXTHREADS / 8;
202 static uint32_t wq_max_concurrency[WORKQUEUE_NUM_BUCKETS + 1]; // set to ncpus on load
203
204 SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
205 &wq_stalled_window_usecs, 0, "");
206
207 SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
208 &wq_reduce_pool_window_usecs, 0, "");
209
210 SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
211 &wq_max_timer_interval_usecs, 0, "");
212
213 SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
214 &wq_max_threads, 0, "");
215
216 SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
217 &wq_max_constrained_threads, 0, "");
218
219 #ifdef DEBUG
220 static int wq_kevent_test SYSCTL_HANDLER_ARGS;
221 SYSCTL_PROC(_debug, OID_AUTO, wq_kevent_test, CTLFLAG_MASKED | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLTYPE_OPAQUE, NULL, 0, wq_kevent_test, 0, "-");
222 #endif
223
224 static uint32_t wq_init_constrained_limit = 1;
225
226 uint32_t pthread_debug_tracing = 1;
227
228 SYSCTL_INT(_kern, OID_AUTO, pthread_debug_tracing, CTLFLAG_RW | CTLFLAG_LOCKED,
229 &pthread_debug_tracing, 0, "")
230
231 /*
232  *        +-----+-----+-----+-----+-----+-----+-----+
233  *        | MT  | BG  | UT  | DE  | IN  | UN  | mgr |
234  * +------+-----+-----+-----+-----+-----+-----+-----+
235  * | pri  |  5  |  4  |  3  |  2  |  1  |  0  |  6  |
236  * | qos  |  1  |  2  |  3  |  4  |  5  |  6  |  7  |
237  * +------+-----+-----+-----+-----+-----+-----+-----+
238  */
239 static inline uint32_t
240 _wq_bucket_to_thread_qos(int pri)
241 {
242 if (pri == WORKQUEUE_EVENT_MANAGER_BUCKET) {
243 return WORKQUEUE_EVENT_MANAGER_BUCKET + 1;
244 }
245 return WORKQUEUE_EVENT_MANAGER_BUCKET - pri;
246 }
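/*
 * For example, with the table above and the event manager being bucket 6:
 *
 *   _wq_bucket_to_thread_qos(5) == 1   (MT, maintenance)
 *   _wq_bucket_to_thread_qos(0) == 6   (UN, user-interactive)
 *   _wq_bucket_to_thread_qos(6) == 7   (event manager)
 */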
247
248 #pragma mark wq_thactive
249
250 #if defined(__LP64__)
251 // Layout is:
252 // 7 * 16 bits for each QoS bucket request count (including manager)
253 // 3 bits of best QoS among all pending constrained requests
254 // 13 bits of zeroes
255 #define WQ_THACTIVE_BUCKET_WIDTH 16
256 #define WQ_THACTIVE_QOS_SHIFT (7 * WQ_THACTIVE_BUCKET_WIDTH)
257 #else
258 // Layout is:
259 // 6 * 10 bits for each QoS bucket request count (except manager)
260 // 1 bit for the manager bucket
261 // 3 bits of best QoS among all pending constrained requests
262 #define WQ_THACTIVE_BUCKET_WIDTH 10
263 #define WQ_THACTIVE_QOS_SHIFT (6 * WQ_THACTIVE_BUCKET_WIDTH + 1)
264 #endif
265 #define WQ_THACTIVE_BUCKET_MASK ((1U << WQ_THACTIVE_BUCKET_WIDTH) - 1)
266 #define WQ_THACTIVE_BUCKET_HALF (1U << (WQ_THACTIVE_BUCKET_WIDTH - 1))
267 #define WQ_THACTIVE_NO_PENDING_REQUEST 6
268
269 _Static_assert(sizeof(wq_thactive_t) * CHAR_BIT - WQ_THACTIVE_QOS_SHIFT >= 3,
270 "Make sure we have space to encode a QoS");
271
272 static inline wq_thactive_t
273 _wq_thactive_fetch_and_add(struct workqueue *wq, wq_thactive_t offset)
274 {
275 #if PTHREAD_INLINE_RMW_ATOMICS || !defined(__LP64__)
276 return atomic_fetch_add_explicit(&wq->wq_thactive, offset,
277 memory_order_relaxed);
278 #else
279 return pthread_kern->atomic_fetch_add_128_relaxed(&wq->wq_thactive, offset);
280 #endif
281 }
282
283 static inline wq_thactive_t
284 _wq_thactive(struct workqueue *wq)
285 {
286 #if PTHREAD_INLINE_RMW_ATOMICS || !defined(__LP64__)
287 return atomic_load_explicit(&wq->wq_thactive, memory_order_relaxed);
288 #else
289 return pthread_kern->atomic_load_128_relaxed(&wq->wq_thactive);
290 #endif
291 }
292
293 #define WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(tha) \
294 ((tha) >> WQ_THACTIVE_QOS_SHIFT)
295
296 static inline uint32_t
297 _wq_thactive_best_constrained_req_qos(struct workqueue *wq)
298 {
299 // Avoid expensive atomic operations: the three bits we're loading are in
300 // a single byte, and always updated under the workqueue lock
301 wq_thactive_t v = *(wq_thactive_t *)&wq->wq_thactive;
302 return WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(v);
303 }
304
305 static inline wq_thactive_t
306 _wq_thactive_set_best_constrained_req_qos(struct workqueue *wq,
307 uint32_t orig_qos, uint32_t new_qos)
308 {
309 wq_thactive_t v;
310 v = (wq_thactive_t)(new_qos - orig_qos) << WQ_THACTIVE_QOS_SHIFT;
311 /*
312 * We can do an atomic add relative to the initial load because updates
313 * to this qos are always serialized under the workqueue lock.
314 */
315 return _wq_thactive_fetch_and_add(wq, v) + v;
316 }
317
318 static inline wq_thactive_t
319 _wq_thactive_offset_for_qos(int qos)
320 {
321 return (wq_thactive_t)1 << (qos * WQ_THACTIVE_BUCKET_WIDTH);
322 }
323
324 static inline wq_thactive_t
325 _wq_thactive_inc(struct workqueue *wq, int qos)
326 {
327 return _wq_thactive_fetch_and_add(wq, _wq_thactive_offset_for_qos(qos));
328 }
329
330 static inline wq_thactive_t
331 _wq_thactive_dec(struct workqueue *wq, int qos)
332 {
333 return _wq_thactive_fetch_and_add(wq, -_wq_thactive_offset_for_qos(qos));
334 }
335
336 static inline wq_thactive_t
337 _wq_thactive_move(struct workqueue *wq, int oldqos, int newqos)
338 {
339 return _wq_thactive_fetch_and_add(wq, _wq_thactive_offset_for_qos(newqos) -
340 _wq_thactive_offset_for_qos(oldqos));
341 }
342
343 static inline uint32_t
344 _wq_thactive_aggregate_downto_qos(struct workqueue *wq, wq_thactive_t v,
345 int qos, uint32_t *busycount, uint32_t *max_busycount)
346 {
347 uint32_t count = 0, active;
348 uint64_t curtime;
349
350 #ifndef __LP64__
351 /*
352 * on 32-bit, the manager bucket is a single bit and the 3 bits of best
353 * constrained request QoS sit where the 10 bits of a regular QoS bucket count
354 * would be. Mask them out.
355 */
356 v &= ~(~0ull << WQ_THACTIVE_QOS_SHIFT);
357 #endif
358 if (busycount) {
359 curtime = mach_absolute_time();
360 *busycount = 0;
361 }
362 if (max_busycount) {
363 *max_busycount = qos + 1;
364 }
365 for (int i = 0; i <= qos; i++, v >>= WQ_THACTIVE_BUCKET_WIDTH) {
366 active = v & WQ_THACTIVE_BUCKET_MASK;
367 count += active;
368 if (busycount && wq->wq_thscheduled_count[i] > active) {
369 if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i])) {
370 /*
371 * We only consider the last blocked thread for a given bucket
372 * as busy because we don't want to take the list lock in each
373 * sched callback. However this is an approximation that could
374 * contribute to thread creation storms.
375 */
376 (*busycount)++;
377 }
378 }
379 }
380 return count;
381 }
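/*
 * Example (illustrative, LP64 layout): a snapshot with one active
 * user-initiated thread (bucket 1) and two active default-QoS threads
 * (bucket 2) is
 *
 *   v = ((wq_thactive_t)1 << (1 * WQ_THACTIVE_BUCKET_WIDTH)) |
 *       ((wq_thactive_t)2 << (2 * WQ_THACTIVE_BUCKET_WIDTH));
 *
 * and _wq_thactive_aggregate_downto_qos(wq, v, 2, NULL, NULL) sums
 * buckets 0 through 2 and returns 3.
 */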
382
383 #pragma mark - Process/Thread Setup/Teardown syscalls
384
385 static mach_vm_offset_t
386 stack_addr_hint(proc_t p, vm_map_t vmap)
387 {
388 mach_vm_offset_t stackaddr;
389 mach_vm_offset_t aslr_offset;
390 bool proc64bit = proc_is64bit(p);
391
392 // We can't safely take random values % something unless it's a power of two
393 _Static_assert(powerof2(PTH_DEFAULT_STACKSIZE), "PTH_DEFAULT_STACKSIZE is a power-of-two");
394
395 #if defined(__i386__) || defined(__x86_64__)
396 if (proc64bit) {
397 // Matches vm_map_get_max_aslr_slide_pages's image shift in xnu
398 aslr_offset = random() % (1 << 28); // about 512 stacks
399 } else {
400 // Actually bigger than the image shift, we've got ~256MB to work with
401 aslr_offset = random() % (16 * PTH_DEFAULT_STACKSIZE);
402 }
403 aslr_offset = vm_map_trunc_page_mask(aslr_offset, vm_map_page_mask(vmap));
404 if (proc64bit) {
405 // Above nanomalloc range (see NANOZONE_SIGNATURE)
406 stackaddr = 0x700000000000 + aslr_offset;
407 } else {
408 stackaddr = SHARED_REGION_BASE_I386 + SHARED_REGION_SIZE_I386 + aslr_offset;
409 }
410 #elif defined(__arm__) || defined(__arm64__)
411 user_addr_t main_thread_stack_top = 0;
412 if (pthread_kern->proc_get_user_stack) {
413 main_thread_stack_top = pthread_kern->proc_get_user_stack(p);
414 }
415 if (proc64bit && main_thread_stack_top) {
416 // The main thread stack position is randomly slid by xnu (cf.
417 // load_main() in mach_loader.c), so basing pthread stack allocations
418 // where the main thread stack ends is already ASLR'd, and doing so
419 // avoids creating a gap in the process address space that may cause
420 // extra PTE memory usage. rdar://problem/33328206
421 stackaddr = vm_map_trunc_page_mask((vm_map_offset_t)main_thread_stack_top,
422 vm_map_page_mask(vmap));
423 } else {
424 // vm_map_get_max_aslr_slide_pages ensures 1MB of slide; we do better
425 aslr_offset = random() % ((proc64bit ? 4 : 2) * PTH_DEFAULT_STACKSIZE);
426 aslr_offset = vm_map_trunc_page_mask((vm_map_offset_t)aslr_offset,
427 vm_map_page_mask(vmap));
428 if (proc64bit) {
429 // 64 stacks below shared region
430 stackaddr = SHARED_REGION_BASE_ARM64 - 64 * PTH_DEFAULT_STACKSIZE - aslr_offset;
431 } else {
432 // If you try to slide down from this point, you risk ending up in memory consumed by malloc
433 stackaddr = SHARED_REGION_BASE_ARM - 32 * PTH_DEFAULT_STACKSIZE + aslr_offset;
434 }
435 }
436 #else
437 #error Need to define a stack address hint for this architecture
438 #endif
439 return stackaddr;
440 }
441
442 /**
443 * bsdthread_create system call. Used by pthread_create.
444 */
445 int
446 _bsdthread_create(struct proc *p, user_addr_t user_func, user_addr_t user_funcarg, user_addr_t user_stack, user_addr_t user_pthread, uint32_t flags, user_addr_t *retval)
447 {
448 kern_return_t kret;
449 void * sright;
450 int error = 0;
451 int allocated = 0;
452 mach_vm_offset_t stackaddr;
453 mach_vm_size_t th_allocsize = 0;
454 mach_vm_size_t th_guardsize;
455 mach_vm_offset_t th_stack;
456 mach_vm_offset_t th_pthread;
457 mach_vm_offset_t th_tsd_base;
458 mach_port_name_t th_thport;
459 thread_t th;
460 vm_map_t vmap = pthread_kern->current_map();
461 task_t ctask = current_task();
462 unsigned int policy, importance;
463 uint32_t tsd_offset;
464
465 int isLP64 = 0;
466
467 if (pthread_kern->proc_get_register(p) == 0) {
468 return EINVAL;
469 }
470
471 PTHREAD_TRACE(TRACE_pthread_thread_create | DBG_FUNC_START, flags, 0, 0, 0, 0);
472
473 isLP64 = proc_is64bit(p);
474 th_guardsize = vm_map_page_size(vmap);
475
476 stackaddr = pthread_kern->proc_get_stack_addr_hint(p);
477 kret = pthread_kern->thread_create(ctask, &th);
478 if (kret != KERN_SUCCESS)
479 return(ENOMEM);
480 thread_reference(th);
481
482 pthread_kern->thread_set_tag(th, THREAD_TAG_PTHREAD);
483
484 sright = (void *)pthread_kern->convert_thread_to_port(th);
485 th_thport = pthread_kern->ipc_port_copyout_send(sright, pthread_kern->task_get_ipcspace(ctask));
486 if (!MACH_PORT_VALID(th_thport)) {
487 error = EMFILE; // userland will convert this into a crash
488 goto out;
489 }
490
491 if ((flags & PTHREAD_START_CUSTOM) == 0) {
492 mach_vm_size_t pthread_size =
493 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(vmap));
494 th_allocsize = th_guardsize + user_stack + pthread_size;
495 user_stack += PTHREAD_T_OFFSET;
496
497 kret = mach_vm_map(vmap, &stackaddr,
498 th_allocsize,
499 page_size-1,
500 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE , NULL,
501 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
502 VM_INHERIT_DEFAULT);
503 if (kret != KERN_SUCCESS){
504 kret = mach_vm_allocate(vmap,
505 &stackaddr, th_allocsize,
506 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE);
507 }
508 if (kret != KERN_SUCCESS) {
509 error = ENOMEM;
510 goto out;
511 }
512
513 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, th_allocsize, stackaddr, 0, 2, 0);
514
515 allocated = 1;
516 /*
517 * The guard page is at the lowest address
518 * The stack base is the highest address
519 */
520 kret = mach_vm_protect(vmap, stackaddr, th_guardsize, FALSE, VM_PROT_NONE);
521
522 if (kret != KERN_SUCCESS) {
523 error = ENOMEM;
524 goto out1;
525 }
526
527 th_pthread = stackaddr + th_guardsize + user_stack;
528 th_stack = th_pthread;
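/*
 * Resulting layout of the fresh allocation, low to high addresses
 * (sketch of the values just computed):
 *
 *   stackaddr                              guard page, th_guardsize bytes,
 *                                          protected VM_PROT_NONE above
 *   stackaddr + th_guardsize               bottom of the stack; the stack
 *                                          grows down from th_stack
 *   stackaddr + th_guardsize + user_stack  th_stack == th_pthread, followed
 *                                          by pthread_size bytes for the
 *                                          pthread_t and TSD, ending at
 *                                          stackaddr + th_allocsize
 */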
529
530 /*
531 * Pre-fault the first page of the new thread's stack and the page that will
532 * contain the pthread_t structure.
533 */
534 if (vm_map_trunc_page_mask((vm_map_offset_t)(th_stack - C_64_REDZONE_LEN), vm_map_page_mask(vmap)) !=
535 vm_map_trunc_page_mask((vm_map_offset_t)th_pthread, vm_map_page_mask(vmap))){
536 vm_fault( vmap,
537 vm_map_trunc_page_mask((vm_map_offset_t)(th_stack - C_64_REDZONE_LEN), vm_map_page_mask(vmap)),
538 VM_PROT_READ | VM_PROT_WRITE,
539 FALSE,
540 THREAD_UNINT, NULL, 0);
541 }
542
543 vm_fault( vmap,
544 vm_map_trunc_page_mask((vm_map_offset_t)th_pthread, vm_map_page_mask(vmap)),
545 VM_PROT_READ | VM_PROT_WRITE,
546 FALSE,
547 THREAD_UNINT, NULL, 0);
548
549 } else {
550 th_stack = user_stack;
551 th_pthread = user_pthread;
552
553 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, 0, 0, 0, 3, 0);
554 }
555
556 tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
557 if (tsd_offset) {
558 th_tsd_base = th_pthread + tsd_offset;
559 kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
560 if (kret == KERN_SUCCESS) {
561 flags |= PTHREAD_START_TSD_BASE_SET;
562 }
563 }
564
565 #if defined(__i386__) || defined(__x86_64__)
566 /*
567 * Set up i386 registers & function call.
568 */
569 if (isLP64 == 0) {
570 x86_thread_state32_t state = {
571 .eip = (unsigned int)pthread_kern->proc_get_threadstart(p),
572 .eax = (unsigned int)th_pthread,
573 .ebx = (unsigned int)th_thport,
574 .ecx = (unsigned int)user_func,
575 .edx = (unsigned int)user_funcarg,
576 .edi = (unsigned int)user_stack,
577 .esi = (unsigned int)flags,
578 /*
579 * set stack pointer
580 */
581 .esp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN))
582 };
583
584 error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
585 if (error != KERN_SUCCESS) {
586 error = EINVAL;
587 goto out;
588 }
589 } else {
590 x86_thread_state64_t state64 = {
591 .rip = (uint64_t)pthread_kern->proc_get_threadstart(p),
592 .rdi = (uint64_t)th_pthread,
593 .rsi = (uint64_t)(th_thport),
594 .rdx = (uint64_t)user_func,
595 .rcx = (uint64_t)user_funcarg,
596 .r8 = (uint64_t)user_stack,
597 .r9 = (uint64_t)flags,
598 /*
599 * set stack pointer aligned to 16 byte boundary
600 */
601 .rsp = (uint64_t)(th_stack - C_64_REDZONE_LEN)
602 };
603
604 error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
605 if (error != KERN_SUCCESS) {
606 error = EINVAL;
607 goto out;
608 }
609
610 }
611 #elif defined(__arm__)
612 arm_thread_state_t state = {
613 .pc = (int)pthread_kern->proc_get_threadstart(p),
614 .r[0] = (unsigned int)th_pthread,
615 .r[1] = (unsigned int)th_thport,
616 .r[2] = (unsigned int)user_func,
617 .r[3] = (unsigned int)user_funcarg,
618 .r[4] = (unsigned int)user_stack,
619 .r[5] = (unsigned int)flags,
620
621 /* Set r7 & lr to 0 for better back tracing */
622 .r[7] = 0,
623 .lr = 0,
624
625 /*
626 * set stack pointer
627 */
628 .sp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN))
629 };
630
631 (void) pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
632
633 #else
634 #error bsdthread_create not defined for this architecture
635 #endif
636
637 if ((flags & PTHREAD_START_SETSCHED) != 0) {
638 /* Set scheduling parameters if needed */
639 thread_extended_policy_data_t extinfo;
640 thread_precedence_policy_data_t precedinfo;
641
642 importance = (flags & PTHREAD_START_IMPORTANCE_MASK);
643 policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;
644
645 if (policy == SCHED_OTHER) {
646 extinfo.timeshare = 1;
647 } else {
648 extinfo.timeshare = 0;
649 }
650
651 thread_policy_set(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT);
652
653 precedinfo.importance = (importance - BASEPRI_DEFAULT);
654 thread_policy_set(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT);
655 } else if ((flags & PTHREAD_START_QOSCLASS) != 0) {
656 /* Set thread QoS class if requested. */
657 pthread_priority_t priority = (pthread_priority_t)(flags & PTHREAD_START_QOSCLASS_MASK);
658
659 thread_qos_policy_data_t qos;
660 qos.qos_tier = pthread_priority_get_thread_qos(priority);
661 qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 :
662 _pthread_priority_get_relpri(priority);
663
664 pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
665 }
666
667 if (pthread_kern->proc_get_mach_thread_self_tsd_offset) {
668 uint64_t mach_thread_self_offset =
669 pthread_kern->proc_get_mach_thread_self_tsd_offset(p);
670 if (mach_thread_self_offset && tsd_offset) {
671 bool proc64bit = proc_is64bit(p);
672 if (proc64bit) {
673 uint64_t th_thport_tsd = (uint64_t)th_thport;
674 error = copyout(&th_thport_tsd, th_pthread + tsd_offset +
675 mach_thread_self_offset, sizeof(th_thport_tsd));
676 } else {
677 uint32_t th_thport_tsd = (uint32_t)th_thport;
678 error = copyout(&th_thport_tsd, th_pthread + tsd_offset +
679 mach_thread_self_offset, sizeof(th_thport_tsd));
680 }
681 if (error) {
682 goto out1;
683 }
684 }
685 }
686
687 kret = pthread_kern->thread_resume(th);
688 if (kret != KERN_SUCCESS) {
689 error = EINVAL;
690 goto out1;
691 }
692 thread_deallocate(th); /* drop the creator reference */
693
694 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_END, error, th_pthread, 0, 0, 0);
695
696 // cast required as mach_vm_offset_t is always 64 bits even on 32-bit platforms
697 *retval = (user_addr_t)th_pthread;
698
699 return(0);
700
701 out1:
702 if (allocated != 0) {
703 (void)mach_vm_deallocate(vmap, stackaddr, th_allocsize);
704 }
705 out:
706 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(ctask), th_thport);
707 if (pthread_kern->thread_will_park_or_terminate) {
708 pthread_kern->thread_will_park_or_terminate(th);
709 }
710 (void)thread_terminate(th);
711 (void)thread_deallocate(th);
712 return(error);
713 }
714
715 /**
716 * bsdthread_terminate system call. Used by pthread_terminate
717 */
718 int
719 _bsdthread_terminate(__unused struct proc *p,
720 user_addr_t stackaddr,
721 size_t size,
722 uint32_t kthport,
723 uint32_t sem,
724 __unused int32_t *retval)
725 {
726 mach_vm_offset_t freeaddr;
727 mach_vm_size_t freesize;
728 kern_return_t kret;
729 thread_t th = current_thread();
730
731 freeaddr = (mach_vm_offset_t)stackaddr;
732 freesize = size;
733
734 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_START, freeaddr, freesize, kthport, 0xff, 0);
735
736 if ((freesize != (mach_vm_size_t)0) && (freeaddr != (mach_vm_offset_t)0)) {
737 if (pthread_kern->thread_get_tag(th) & THREAD_TAG_MAINTHREAD){
738 vm_map_t user_map = pthread_kern->current_map();
739 freesize = vm_map_trunc_page_mask((vm_map_offset_t)freesize - 1, vm_map_page_mask(user_map));
740 kret = mach_vm_behavior_set(user_map, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
741 assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
742 kret = kret ? kret : mach_vm_protect(user_map, freeaddr, freesize, FALSE, VM_PROT_NONE);
743 assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
744 } else {
745 kret = mach_vm_deallocate(pthread_kern->current_map(), freeaddr, freesize);
746 if (kret != KERN_SUCCESS) {
747 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);
748 return(EINVAL);
749 }
750 }
751 }
752
753 if (pthread_kern->thread_will_park_or_terminate) {
754 pthread_kern->thread_will_park_or_terminate(th);
755 }
756 (void)thread_terminate(th);
757 if (sem != MACH_PORT_NULL) {
758 kret = pthread_kern->semaphore_signal_internal_trap(sem);
759 if (kret != KERN_SUCCESS) {
760 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);
761 return(EINVAL);
762 }
763 }
764
765 if (kthport != MACH_PORT_NULL) {
766 pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(current_task()), kthport);
767 }
768
769 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0, 0, 0, 0);
770
771 pthread_kern->thread_exception_return();
772 panic("bsdthread_terminate: still running\n");
773
774 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0xff, 0, 0, 0);
775
776 return(0);
777 }
778
779 /**
780 * bsdthread_register system call. Performs per-process setup. Responsible for
781 * returning capability bits to userspace and receiving userspace function addresses.
782 */
783 int
784 _bsdthread_register(struct proc *p,
785 user_addr_t threadstart,
786 user_addr_t wqthread,
787 int pthsize,
788 user_addr_t pthread_init_data,
789 user_addr_t pthread_init_data_size,
790 uint64_t dispatchqueue_offset,
791 int32_t *retval)
792 {
793 struct _pthread_registration_data data = {};
794 uint32_t max_tsd_offset;
795 kern_return_t kr;
796 size_t pthread_init_sz = 0;
797
798 /* syscall randomizer test can pass bogus values */
799 if (pthsize < 0 || pthsize > MAX_PTHREAD_SIZE) {
800 return(EINVAL);
801 }
802 /*
803 * If we have pthread_init_data, then we use that and target_concptr
804 * (which is an offset) to get the data.
805 */
806 if (pthread_init_data != 0) {
807 if (pthread_init_data_size < sizeof(data.version)) {
808 return EINVAL;
809 }
810 pthread_init_sz = MIN(sizeof(data), (size_t)pthread_init_data_size);
811 int ret = copyin(pthread_init_data, &data, pthread_init_sz);
812 if (ret) {
813 return ret;
814 }
815 if (data.version != (size_t)pthread_init_data_size) {
816 return EINVAL;
817 }
818 } else {
819 data.dispatch_queue_offset = dispatchqueue_offset;
820 }
821
822 /* We have to do this before proc_get_register so that it resets after fork */
823 mach_vm_offset_t stackaddr = stack_addr_hint(p, pthread_kern->current_map());
824 pthread_kern->proc_set_stack_addr_hint(p, (user_addr_t)stackaddr);
825
826 /* prevent multiple registrations */
827 if (pthread_kern->proc_get_register(p) != 0) {
828 return(EINVAL);
829 }
830
831 pthread_kern->proc_set_threadstart(p, threadstart);
832 pthread_kern->proc_set_wqthread(p, wqthread);
833 pthread_kern->proc_set_pthsize(p, pthsize);
834 pthread_kern->proc_set_register(p);
835
836 uint32_t tsd_slot_sz = proc_is64bit(p) ? sizeof(uint64_t) : sizeof(uint32_t);
837 if ((uint32_t)pthsize >= tsd_slot_sz &&
838 data.tsd_offset <= (uint32_t)(pthsize - tsd_slot_sz)) {
839 max_tsd_offset = ((uint32_t)pthsize - data.tsd_offset - tsd_slot_sz);
840 } else {
841 data.tsd_offset = 0;
842 max_tsd_offset = 0;
843 }
844 pthread_kern->proc_set_pthread_tsd_offset(p, data.tsd_offset);
845
846 if (data.dispatch_queue_offset > max_tsd_offset) {
847 data.dispatch_queue_offset = 0;
848 }
849 pthread_kern->proc_set_dispatchqueue_offset(p, data.dispatch_queue_offset);
850
851 if (pthread_kern->proc_set_return_to_kernel_offset) {
852 if (data.return_to_kernel_offset > max_tsd_offset) {
853 data.return_to_kernel_offset = 0;
854 }
855 pthread_kern->proc_set_return_to_kernel_offset(p,
856 data.return_to_kernel_offset);
857 }
858
859 if (pthread_kern->proc_set_mach_thread_self_tsd_offset) {
860 if (data.mach_thread_self_offset > max_tsd_offset) {
861 data.mach_thread_self_offset = 0;
862 }
863 pthread_kern->proc_set_mach_thread_self_tsd_offset(p,
864 data.mach_thread_self_offset);
865 }
866
867 if (pthread_init_data != 0) {
868 /* Outgoing data that userspace expects as a reply */
869 data.version = sizeof(struct _pthread_registration_data);
870 if (pthread_kern->qos_main_thread_active()) {
871 mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
872 thread_qos_policy_data_t qos;
873 boolean_t gd = FALSE;
874
875 kr = pthread_kern->thread_policy_get(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
876 if (kr != KERN_SUCCESS || qos.qos_tier == THREAD_QOS_UNSPECIFIED) {
877 /* An unspecified QoS means the kernel wants us to impose legacy QoS upon the thread. */
878 qos.qos_tier = THREAD_QOS_LEGACY;
879 qos.tier_importance = 0;
880
881 kr = pthread_kern->thread_policy_set_internal(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
882 }
883
884 if (kr == KERN_SUCCESS) {
885 data.main_qos = thread_qos_get_pthread_priority(qos.qos_tier);
886 } else {
887 data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);
888 }
889 } else {
890 data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);
891 }
892
893 kr = copyout(&data, pthread_init_data, pthread_init_sz);
894 if (kr != KERN_SUCCESS) {
895 return EINVAL;
896 }
897 }
898
899 /* return the supported feature set as the return value. */
900 *retval = PTHREAD_FEATURE_SUPPORTED;
901
902 return(0);
903 }
904
905 #pragma mark - QoS Manipulation
906
907 int
908 _bsdthread_ctl_set_qos(struct proc *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t tsd_priority_addr, user_addr_t arg3, int *retval)
909 {
910 int rv;
911 thread_t th;
912
913 pthread_priority_t priority;
914
915 /* Unused parameters must be zero. */
916 if (arg3 != 0) {
917 return EINVAL;
918 }
919
920 /* QoS is stored in a given slot in the pthread TSD. We need to copy that in and set our QoS based on it. */
921 if (proc_is64bit(p)) {
922 uint64_t v;
923 rv = copyin(tsd_priority_addr, &v, sizeof(v));
924 if (rv) goto out;
925 priority = (int)(v & 0xffffffff);
926 } else {
927 uint32_t v;
928 rv = copyin(tsd_priority_addr, &v, sizeof(v));
929 if (rv) goto out;
930 priority = v;
931 }
932
933 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
934 return ESRCH;
935 }
936
937 /* <rdar://problem/16211829> Disable pthread_set_qos_class_np() on threads other than pthread_self */
938 if (th != current_thread()) {
939 thread_deallocate(th);
940 return EPERM;
941 }
942
943 rv = _bsdthread_ctl_set_self(p, 0, priority, 0, _PTHREAD_SET_SELF_QOS_FLAG, retval);
944
945 /* Static param the thread: we just set QoS on it, so it's stuck in QoS land now. */
946 /* pthread_kern->thread_static_param(th, TRUE); */ // see <rdar://problem/16433744> for details
947
948 thread_deallocate(th);
949
950 out:
951 return rv;
952 }
953
954 static inline struct threadlist *
955 util_get_thread_threadlist_entry(thread_t th)
956 {
957 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
958 if (uth) {
959 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
960 return tl;
961 }
962 return NULL;
963 }
964
965 boolean_t
966 _workq_thread_has_been_unbound(thread_t th, int qos_class)
967 {
968 struct threadlist *tl = util_get_thread_threadlist_entry(th);
969 if (!tl) {
970 return FALSE;
971 }
972
973 struct workqueue *wq = tl->th_workq;
974 workqueue_lock_spin(wq);
975
976 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
977 goto failure;
978 } else if (qos_class != class_index_get_thread_qos(tl->th_priority)) {
979 goto failure;
980 }
981
982 if ((tl->th_flags & TH_LIST_KEVENT_BOUND)){
983 goto failure;
984 }
985 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
986
987 workqueue_unlock(wq);
988 return TRUE;
989
990 failure:
991 workqueue_unlock(wq);
992 return FALSE;
993 }
994
995 int
996 _bsdthread_ctl_set_self(struct proc *p, user_addr_t __unused cmd, pthread_priority_t priority, mach_port_name_t voucher, _pthread_set_flags_t flags, int __unused *retval)
997 {
998 thread_qos_policy_data_t qos;
999 mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
1000 boolean_t gd = FALSE;
1001 thread_t th = current_thread();
1002 struct workqueue *wq = NULL;
1003 struct threadlist *tl = NULL;
1004
1005 kern_return_t kr;
1006 int qos_rv = 0, voucher_rv = 0, fixedpri_rv = 0;
1007
1008 if ((flags & _PTHREAD_SET_SELF_WQ_KEVENT_UNBIND) != 0) {
1009 tl = util_get_thread_threadlist_entry(th);
1010 if (tl) {
1011 wq = tl->th_workq;
1012 } else {
1013 goto qos;
1014 }
1015
1016 workqueue_lock_spin(wq);
1017 if (tl->th_flags & TH_LIST_KEVENT_BOUND) {
1018 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
1019 unsigned int kevent_flags = KEVENT_FLAG_WORKQ | KEVENT_FLAG_UNBIND_CHECK_FLAGS;
1020 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1021 kevent_flags |= KEVENT_FLAG_WORKQ_MANAGER;
1022 }
1023
1024 workqueue_unlock(wq);
1025 __assert_only int ret = kevent_qos_internal_unbind(p, class_index_get_thread_qos(tl->th_priority), th, kevent_flags);
1026 assert(ret == 0);
1027 } else {
1028 workqueue_unlock(wq);
1029 }
1030 }
1031
1032 qos:
1033 if ((flags & _PTHREAD_SET_SELF_QOS_FLAG) != 0) {
1034 kr = pthread_kern->thread_policy_get(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
1035 if (kr != KERN_SUCCESS) {
1036 qos_rv = EINVAL;
1037 goto voucher;
1038 }
1039
1040 /*
1041 * If we have main-thread QoS then we don't allow a thread to come out
1042 * of QOS_CLASS_UNSPECIFIED.
1043 */
1044 if (pthread_kern->qos_main_thread_active() && qos.qos_tier ==
1045 THREAD_QOS_UNSPECIFIED) {
1046 qos_rv = EPERM;
1047 goto voucher;
1048 }
1049
1050 if (!tl) {
1051 tl = util_get_thread_threadlist_entry(th);
1052 if (tl) wq = tl->th_workq;
1053 }
1054
1055 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self | DBG_FUNC_START, wq, qos.qos_tier, qos.tier_importance, 0, 0);
1056
1057 qos.qos_tier = pthread_priority_get_thread_qos(priority);
1058 qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 : _pthread_priority_get_relpri(priority);
1059
1060 if (qos.qos_tier == QOS_CLASS_UNSPECIFIED ||
1061 qos.tier_importance > 0 || qos.tier_importance < THREAD_QOS_MIN_TIER_IMPORTANCE) {
1062 qos_rv = EINVAL;
1063 goto voucher;
1064 }
1065
1066 /*
1067 * If we're a workqueue thread, the threadlist item priority needs adjusting,
1068 * along with the bucket we were running in.
1069 */
1070 if (tl) {
1071 bool try_run_threadreq = false;
1072
1073 workqueue_lock_spin(wq);
1074 kr = pthread_kern->thread_set_workq_qos(th, qos.qos_tier, qos.tier_importance);
1075 assert(kr == KERN_SUCCESS || kr == KERN_TERMINATED);
1076
1077 /* Fix up counters. */
1078 uint8_t old_bucket = tl->th_priority;
1079 uint8_t new_bucket = pthread_priority_get_class_index(priority);
1080
1081 if (old_bucket != new_bucket) {
1082 _wq_thactive_move(wq, old_bucket, new_bucket);
1083 wq->wq_thscheduled_count[old_bucket]--;
1084 wq->wq_thscheduled_count[new_bucket]++;
1085 if (old_bucket == WORKQUEUE_EVENT_MANAGER_BUCKET ||
1086 old_bucket < new_bucket) {
1087 /*
1088 * if the QoS of the thread was lowered, then this could
1089 * allow for a higher QoS thread request to run, so we need
1090 * to reevaluate.
1091 */
1092 try_run_threadreq = true;
1093 }
1094 tl->th_priority = new_bucket;
1095 }
1096
1097 bool old_overcommit = !(tl->th_flags & TH_LIST_CONSTRAINED);
1098 bool new_overcommit = priority & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
1099 if (!old_overcommit && new_overcommit) {
1100 if (wq->wq_constrained_threads_scheduled-- ==
1101 wq_max_constrained_threads) {
1102 try_run_threadreq = true;
1103 }
1104 tl->th_flags &= ~TH_LIST_CONSTRAINED;
1105 } else if (old_overcommit && !new_overcommit) {
1106 wq->wq_constrained_threads_scheduled++;
1107 tl->th_flags |= TH_LIST_CONSTRAINED;
1108 }
1109
1110 if (try_run_threadreq) {
1111 workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
1112 } else {
1113 workqueue_unlock(wq);
1114 }
1115 } else {
1116 kr = pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
1117 if (kr != KERN_SUCCESS) {
1118 qos_rv = EINVAL;
1119 }
1120 }
1121
1122 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self | DBG_FUNC_END, wq, qos.qos_tier, qos.tier_importance, 0, 0);
1123 }
1124
1125 voucher:
1126 if ((flags & _PTHREAD_SET_SELF_VOUCHER_FLAG) != 0) {
1127 kr = pthread_kern->thread_set_voucher_name(voucher);
1128 if (kr != KERN_SUCCESS) {
1129 voucher_rv = ENOENT;
1130 goto fixedpri;
1131 }
1132 }
1133
1134 fixedpri:
1135 if (qos_rv) goto done;
1136 if ((flags & _PTHREAD_SET_SELF_FIXEDPRIORITY_FLAG) != 0) {
1137 thread_extended_policy_data_t extpol = {.timeshare = 0};
1138
1139 if (!tl) tl = util_get_thread_threadlist_entry(th);
1140 if (tl) {
1141 /* Not allowed on workqueue threads */
1142 fixedpri_rv = ENOTSUP;
1143 goto done;
1144 }
1145
1146 kr = pthread_kern->thread_policy_set_internal(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
1147 if (kr != KERN_SUCCESS) {
1148 fixedpri_rv = EINVAL;
1149 goto done;
1150 }
1151 } else if ((flags & _PTHREAD_SET_SELF_TIMESHARE_FLAG) != 0) {
1152 thread_extended_policy_data_t extpol = {.timeshare = 1};
1153
1154 if (!tl) tl = util_get_thread_threadlist_entry(th);
1155 if (tl) {
1156 /* Not allowed on workqueue threads */
1157 fixedpri_rv = ENOTSUP;
1158 goto done;
1159 }
1160
1161 kr = pthread_kern->thread_policy_set_internal(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
1162 if (kr != KERN_SUCCESS) {
1163 fixedpri_rv = EINVAL;
1164 goto done;
1165 }
1166 }
1167
1168 done:
1169 if (qos_rv && voucher_rv) {
1170 /* Both failed; give that a unique error. */
1171 return EBADMSG;
1172 }
1173
1174 if (qos_rv) {
1175 return qos_rv;
1176 }
1177
1178 if (voucher_rv) {
1179 return voucher_rv;
1180 }
1181
1182 if (fixedpri_rv) {
1183 return fixedpri_rv;
1184 }
1185
1186 return 0;
1187 }
1188
1189 int
1190 _bsdthread_ctl_qos_override_start(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval)
1191 {
1192 thread_t th;
1193 int rv = 0;
1194
1195 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
1196 return ESRCH;
1197 }
1198
1199 int override_qos = pthread_priority_get_thread_qos(priority);
1200
1201 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1202 if (tl) {
1203 PTHREAD_TRACE_WQ(TRACE_wq_override_start | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0);
1204 }
1205
1206 /* The only failure case here is if we pass a tid and have it look up the thread; we pass the uthread, so this always succeeds. */
1207 pthread_kern->proc_usynch_thread_qos_add_override_for_resource_check_owner(th, override_qos, TRUE,
1208 resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE, USER_ADDR_NULL, MACH_PORT_NULL);
1209 thread_deallocate(th);
1210 return rv;
1211 }
1212
1213 int
1214 _bsdthread_ctl_qos_override_end(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t resource, user_addr_t arg3, int __unused *retval)
1215 {
1216 thread_t th;
1217 int rv = 0;
1218
1219 if (arg3 != 0) {
1220 return EINVAL;
1221 }
1222
1223 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
1224 return ESRCH;
1225 }
1226
1227 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
1228
1229 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1230 if (tl) {
1231 PTHREAD_TRACE_WQ(TRACE_wq_override_end | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 0, 0, 0);
1232 }
1233
1234 pthread_kern->proc_usynch_thread_qos_remove_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);
1235
1236 thread_deallocate(th);
1237 return rv;
1238 }
1239
1240 static int
1241 _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, user_addr_t ulock_addr)
1242 {
1243 thread_t th;
1244 int rv = 0;
1245
1246 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
1247 return ESRCH;
1248 }
1249
1250 int override_qos = pthread_priority_get_thread_qos(priority);
1251
1252 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1253 if (!tl) {
1254 thread_deallocate(th);
1255 return EPERM;
1256 }
1257
1258 PTHREAD_TRACE_WQ(TRACE_wq_override_dispatch | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0);
1259
1260 rv = pthread_kern->proc_usynch_thread_qos_add_override_for_resource_check_owner(th, override_qos, TRUE,
1261 resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE, ulock_addr, kport);
1262
1263 thread_deallocate(th);
1264 return rv;
1265 }
1266
1267 int _bsdthread_ctl_qos_dispatch_asynchronous_override_add(struct proc __unused *p, user_addr_t __unused cmd,
1268 mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval)
1269 {
1270 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport, priority, resource, USER_ADDR_NULL);
1271 }
1272
1273 int
1274 _bsdthread_ctl_qos_override_dispatch(struct proc *p __unused, user_addr_t cmd __unused, mach_port_name_t kport, pthread_priority_t priority, user_addr_t ulock_addr, int __unused *retval)
1275 {
1276 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport, priority, USER_ADDR_NULL, ulock_addr);
1277 }
1278
1279 int
1280 _bsdthread_ctl_qos_override_reset(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval)
1281 {
1282 if (arg1 != 0 || arg2 != 0 || arg3 != 0) {
1283 return EINVAL;
1284 }
1285
1286 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, 1 /* reset_all */, 0, 0, retval);
1287 }
1288
1289 int
1290 _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(struct proc __unused *p, user_addr_t __unused cmd, int reset_all, user_addr_t resource, user_addr_t arg3, int __unused *retval)
1291 {
1292 if ((reset_all && (resource != 0)) || arg3 != 0) {
1293 return EINVAL;
1294 }
1295
1296 thread_t th = current_thread();
1297 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
1298 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
1299
1300 if (!tl) {
1301 return EPERM;
1302 }
1303
1304 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_NONE, tl->th_workq, 0, 0, 0, 0);
1305
1306 resource = reset_all ? THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD : resource;
1307 pthread_kern->proc_usynch_thread_qos_reset_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE);
1308
1309 return 0;
1310 }
1311
1312 static int
1313 _bsdthread_ctl_max_parallelism(struct proc __unused *p, user_addr_t __unused cmd,
1314 int qos, unsigned long flags, int *retval)
1315 {
1316 _Static_assert(QOS_PARALLELISM_COUNT_LOGICAL ==
1317 _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL, "logical");
1318 _Static_assert(QOS_PARALLELISM_REALTIME ==
1319 _PTHREAD_QOS_PARALLELISM_REALTIME, "realtime");
1320
1321 if (flags & ~(QOS_PARALLELISM_REALTIME | QOS_PARALLELISM_COUNT_LOGICAL)) {
1322 return EINVAL;
1323 }
1324
1325 if (flags & QOS_PARALLELISM_REALTIME) {
1326 if (qos) {
1327 return EINVAL;
1328 }
1329 } else if (qos == THREAD_QOS_UNSPECIFIED || qos >= THREAD_QOS_LAST) {
1330 return EINVAL;
1331 }
1332
1333 *retval = pthread_kern->qos_max_parallelism(qos, flags);
1334 return 0;
1335 }
1336
1337 int
1338 _bsdthread_ctl(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval)
1339 {
1340 switch (cmd) {
1341 case BSDTHREAD_CTL_SET_QOS:
1342 return _bsdthread_ctl_set_qos(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
1343 case BSDTHREAD_CTL_QOS_OVERRIDE_START:
1344 return _bsdthread_ctl_qos_override_start(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1345 case BSDTHREAD_CTL_QOS_OVERRIDE_END:
1346 return _bsdthread_ctl_qos_override_end(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
1347 case BSDTHREAD_CTL_QOS_OVERRIDE_RESET:
1348 return _bsdthread_ctl_qos_override_reset(p, cmd, arg1, arg2, arg3, retval);
1349 case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH:
1350 return _bsdthread_ctl_qos_override_dispatch(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1351 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD:
1352 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1353 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET:
1354 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, (int)arg1, arg2, arg3, retval);
1355 case BSDTHREAD_CTL_SET_SELF:
1356 return _bsdthread_ctl_set_self(p, cmd, (pthread_priority_t)arg1, (mach_port_name_t)arg2, (_pthread_set_flags_t)arg3, retval);
1357 case BSDTHREAD_CTL_QOS_MAX_PARALLELISM:
1358 return _bsdthread_ctl_max_parallelism(p, cmd, (int)arg1, (unsigned long)arg2, retval);
1359 default:
1360 return EINVAL;
1361 }
1362 }
1363
1364 #pragma mark - Workqueue Implementation
1365
1366 #pragma mark wq_flags
1367
1368 static inline uint32_t
1369 _wq_flags(struct workqueue *wq)
1370 {
1371 return atomic_load_explicit(&wq->wq_flags, memory_order_relaxed);
1372 }
1373
1374 static inline bool
1375 _wq_exiting(struct workqueue *wq)
1376 {
1377 return _wq_flags(wq) & WQ_EXITING;
1378 }
1379
1380 static inline uint32_t
1381 _wq_flags_or_orig(struct workqueue *wq, uint32_t v)
1382 {
1383 #if PTHREAD_INLINE_RMW_ATOMICS
1384 uint32_t state;
1385 do {
1386 state = _wq_flags(wq);
1387 } while (!OSCompareAndSwap(state, state | v, &wq->wq_flags));
1388 return state;
1389 #else
1390 return atomic_fetch_or_explicit(&wq->wq_flags, v, memory_order_relaxed);
1391 #endif
1392 }
1393
1394 static inline uint32_t
1395 _wq_flags_and_orig(struct workqueue *wq, uint32_t v)
1396 {
1397 #if PTHREAD_INLINE_RMW_ATOMICS
1398 uint32_t state;
1399 do {
1400 state = _wq_flags(wq);
1401 } while (!OSCompareAndSwap(state, state & v, &wq->wq_flags));
1402 return state;
1403 #else
1404 return atomic_fetch_and_explicit(&wq->wq_flags, v, memory_order_relaxed);
1405 #endif
1406 }
1407
1408 static inline bool
1409 WQ_TIMER_DELAYED_NEEDED(struct workqueue *wq)
1410 {
1411 uint32_t oldflags, newflags;
1412 do {
1413 oldflags = _wq_flags(wq);
1414 if (oldflags & (WQ_EXITING | WQ_ATIMER_DELAYED_RUNNING)) {
1415 return false;
1416 }
1417 newflags = oldflags | WQ_ATIMER_DELAYED_RUNNING;
1418 } while (!OSCompareAndSwap(oldflags, newflags, &wq->wq_flags));
1419 return true;
1420 }
1421
1422 static inline bool
1423 WQ_TIMER_IMMEDIATE_NEEDED(struct workqueue *wq)
1424 {
1425 uint32_t oldflags, newflags;
1426 do {
1427 oldflags = _wq_flags(wq);
1428 if (oldflags & (WQ_EXITING | WQ_ATIMER_IMMEDIATE_RUNNING)) {
1429 return false;
1430 }
1431 newflags = oldflags | WQ_ATIMER_IMMEDIATE_RUNNING;
1432 } while (!OSCompareAndSwap(oldflags, newflags, &wq->wq_flags));
1433 return true;
1434 }
1435
1436 #pragma mark thread requests pacing
1437
1438 static inline uint32_t
1439 _wq_pacing_shift_for_pri(int pri)
1440 {
1441 return _wq_bucket_to_thread_qos(pri) - 1;
1442 }
1443
1444 static inline int
1445 _wq_highest_paced_priority(struct workqueue *wq)
1446 {
1447 uint8_t paced = wq->wq_paced;
1448 int msb = paced ? 32 - __builtin_clz(paced) : 0; // fls(paced) == bit + 1
1449 return WORKQUEUE_EVENT_MANAGER_BUCKET - msb;
1450 }
1451
1452 static inline uint8_t
1453 _wq_pacing_bit_for_pri(int pri)
1454 {
1455 return 1u << _wq_pacing_shift_for_pri(pri);
1456 }
1457
1458 static inline bool
1459 _wq_should_pace_priority(struct workqueue *wq, int pri)
1460 {
1461 return wq->wq_paced >= _wq_pacing_bit_for_pri(pri);
1462 }
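/*
 * Worked example (illustrative, event manager being bucket 6 as in the
 * bucket/QoS table near the top of this file): bucket 3 (utility) maps to
 * thread QoS 3, so its pacing bit is 1 << 2 == 0x04.  With
 * wq_paced == 0x06, _wq_highest_paced_priority() computes msb == 3 and
 * returns bucket 6 - 3 == 3, and _wq_should_pace_priority() is true for
 * buckets 3, 4 and 5 but false for the higher-QoS buckets and the event
 * manager.
 */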
1463
1464 static inline void
1465 _wq_pacing_start(struct workqueue *wq, struct threadlist *tl)
1466 {
1467 uint8_t bit = _wq_pacing_bit_for_pri(tl->th_priority);
1468 assert((tl->th_flags & TH_LIST_PACING) == 0);
1469 assert((wq->wq_paced & bit) == 0);
1470 wq->wq_paced |= bit;
1471 tl->th_flags |= TH_LIST_PACING;
1472 }
1473
1474 static inline bool
1475 _wq_pacing_end(struct workqueue *wq, struct threadlist *tl)
1476 {
1477 if (tl->th_flags & TH_LIST_PACING) {
1478 uint8_t bit = _wq_pacing_bit_for_pri(tl->th_priority);
1479 assert((wq->wq_paced & bit) != 0);
1480 wq->wq_paced ^= bit;
1481 tl->th_flags &= ~TH_LIST_PACING;
1482 return wq->wq_paced < bit; // !_wq_should_pace_priority
1483 }
1484 return false;
1485 }
1486
1487 #pragma mark thread requests
1488
1489 static void
1490 _threadreq_init_alloced(struct threadreq *req, int priority, int flags)
1491 {
1492 assert((flags & TR_FLAG_ONSTACK) == 0);
1493 req->tr_state = TR_STATE_NEW;
1494 req->tr_priority = priority;
1495 req->tr_flags = flags;
1496 }
1497
1498 static void
1499 _threadreq_init_stack(struct threadreq *req, int priority, int flags)
1500 {
1501 req->tr_state = TR_STATE_NEW;
1502 req->tr_priority = priority;
1503 req->tr_flags = flags | TR_FLAG_ONSTACK;
1504 }
1505
1506 static void
1507 _threadreq_copy_prepare(struct workqueue *wq)
1508 {
1509 again:
1510 if (wq->wq_cached_threadreq) {
1511 return;
1512 }
1513
1514 workqueue_unlock(wq);
1515 struct threadreq *req = zalloc(pthread_zone_threadreq);
1516 workqueue_lock_spin(wq);
1517
1518 if (wq->wq_cached_threadreq) {
1519 /*
1520 * We lost the race and someone left behind an extra threadreq for us
1521 * to use. Throw away our request and retry.
1522 */
1523 workqueue_unlock(wq);
1524 zfree(pthread_zone_threadreq, req);
1525 workqueue_lock_spin(wq);
1526 goto again;
1527 } else {
1528 wq->wq_cached_threadreq = req;
1529 }
1530
1531 assert(wq->wq_cached_threadreq);
1532 }
1533
1534 static bool
1535 _threadreq_copy_prepare_noblock(struct workqueue *wq)
1536 {
1537 if (wq->wq_cached_threadreq) {
1538 return true;
1539 }
1540
1541 wq->wq_cached_threadreq = zalloc_noblock(pthread_zone_threadreq);
1542
1543 return wq->wq_cached_threadreq != NULL;
1544 }
1545
1546 static inline struct threadreq_head *
1547 _threadreq_list_for_req(struct workqueue *wq, const struct threadreq *req)
1548 {
1549 if (req->tr_flags & TR_FLAG_OVERCOMMIT) {
1550 return &wq->wq_overcommit_reqlist[req->tr_priority];
1551 } else {
1552 return &wq->wq_reqlist[req->tr_priority];
1553 }
1554 }
1555
1556 static void
1557 _threadreq_enqueue(struct workqueue *wq, struct threadreq *req)
1558 {
1559 assert(req && req->tr_state == TR_STATE_NEW);
1560 if (req->tr_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1561 assert(wq->wq_event_manager_threadreq.tr_state != TR_STATE_WAITING);
1562 memcpy(&wq->wq_event_manager_threadreq, req, sizeof(struct threadreq));
1563 req = &wq->wq_event_manager_threadreq;
1564 req->tr_flags &= ~(TR_FLAG_ONSTACK | TR_FLAG_NO_PACING);
1565 } else {
1566 if (req->tr_flags & TR_FLAG_ONSTACK) {
1567 assert(wq->wq_cached_threadreq);
1568 struct threadreq *newreq = wq->wq_cached_threadreq;
1569 wq->wq_cached_threadreq = NULL;
1570
1571 memcpy(newreq, req, sizeof(struct threadreq));
1572 newreq->tr_flags &= ~(TR_FLAG_ONSTACK | TR_FLAG_NO_PACING);
1573 req->tr_state = TR_STATE_DEAD;
1574 req = newreq;
1575 }
1576 TAILQ_INSERT_TAIL(_threadreq_list_for_req(wq, req), req, tr_entry);
1577 }
1578 req->tr_state = TR_STATE_WAITING;
1579 wq->wq_reqcount++;
1580 }
1581
1582 static void
1583 _threadreq_dequeue(struct workqueue *wq, struct threadreq *req)
1584 {
1585 if (req->tr_priority != WORKQUEUE_EVENT_MANAGER_BUCKET) {
1586 struct threadreq_head *req_list = _threadreq_list_for_req(wq, req);
1587 #if DEBUG
1588 struct threadreq *cursor = NULL;
1589 TAILQ_FOREACH(cursor, req_list, tr_entry) {
1590 if (cursor == req) break;
1591 }
1592 assert(cursor == req);
1593 #endif
1594 TAILQ_REMOVE(req_list, req, tr_entry);
1595 }
1596 wq->wq_reqcount--;
1597 }
1598
1599 /*
1600 * Mark a thread request as complete. At this point, it is treated as owned by
1601 * the submitting subsystem and you should assume it could be freed.
1602 *
1603 * Called with the workqueue lock held.
1604 */
1605 static int
1606 _threadreq_complete_and_unlock(proc_t p, struct workqueue *wq,
1607 struct threadreq *req, struct threadlist *tl)
1608 {
1609 struct threadreq *req_tofree = NULL;
1610 bool sync = (req->tr_state == TR_STATE_NEW);
1611 bool workloop = req->tr_flags & TR_FLAG_WORKLOOP;
1612 bool onstack = req->tr_flags & TR_FLAG_ONSTACK;
1613 bool kevent = req->tr_flags & TR_FLAG_KEVENT;
1614 bool unbinding = tl->th_flags & TH_LIST_UNBINDING;
1615 bool locked = true;
1616 bool waking_parked_thread = (tl->th_flags & TH_LIST_BUSY);
1617 int ret;
1618
1619 req->tr_state = TR_STATE_COMPLETE;
1620
1621 if (!workloop && !onstack && req != &wq->wq_event_manager_threadreq) {
1622 if (wq->wq_cached_threadreq) {
1623 req_tofree = req;
1624 } else {
1625 wq->wq_cached_threadreq = req;
1626 }
1627 }
1628
1629 if (tl->th_flags & TH_LIST_UNBINDING) {
1630 tl->th_flags &= ~TH_LIST_UNBINDING;
1631 assert((tl->th_flags & TH_LIST_KEVENT_BOUND));
1632 } else if (workloop || kevent) {
1633 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
1634 tl->th_flags |= TH_LIST_KEVENT_BOUND;
1635 }
1636
1637 if (workloop) {
1638 workqueue_unlock(wq);
1639 ret = pthread_kern->workloop_fulfill_threadreq(wq->wq_proc, (void*)req,
1640 tl->th_thread, sync ? WORKLOOP_FULFILL_THREADREQ_SYNC : 0);
1641 assert(ret == 0);
1642 locked = false;
1643 } else if (kevent) {
1644 unsigned int kevent_flags = KEVENT_FLAG_WORKQ;
1645 if (sync) {
1646 kevent_flags |= KEVENT_FLAG_SYNCHRONOUS_BIND;
1647 }
1648 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1649 kevent_flags |= KEVENT_FLAG_WORKQ_MANAGER;
1650 }
1651 workqueue_unlock(wq);
1652 ret = kevent_qos_internal_bind(wq->wq_proc,
1653 class_index_get_thread_qos(tl->th_priority), tl->th_thread,
1654 kevent_flags);
1655 if (ret != 0) {
1656 workqueue_lock_spin(wq);
1657 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
1658 locked = true;
1659 } else {
1660 locked = false;
1661 }
1662 }
1663
1664 /*
1665 * Run Thread, Run!
1666 */
1667 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 0, 0, 0, 0);
1668 PTHREAD_TRACE_WQ_REQ(TRACE_wq_runitem | DBG_FUNC_START, wq, req, tl->th_priority,
1669 thread_tid(current_thread()), thread_tid(tl->th_thread));
1670
1671 if (waking_parked_thread) {
1672 if (!locked) {
1673 workqueue_lock_spin(wq);
1674 }
1675 tl->th_flags &= ~(TH_LIST_BUSY);
1676 if ((tl->th_flags & TH_LIST_REMOVING_VOUCHER) == 0) {
1677 /*
1678 * If the thread is in the process of removing its voucher, then it
1679 * isn't actually in the wait event yet and we don't need to wake
1680 * it up. Save the trouble (and potential lock-ordering issues
1681 * (see 30617015)).
1682 */
1683 thread_wakeup_thread(tl, tl->th_thread);
1684 }
1685 workqueue_unlock(wq);
1686
1687 if (req_tofree) zfree(pthread_zone_threadreq, req_tofree);
1688 return WQ_RUN_TR_THREAD_STARTED;
1689 }
1690
1691 assert ((tl->th_flags & TH_LIST_PACING) == 0);
1692 if (locked) {
1693 workqueue_unlock(wq);
1694 }
1695 if (req_tofree) zfree(pthread_zone_threadreq, req_tofree);
1696 if (unbinding) {
1697 return WQ_RUN_TR_THREAD_STARTED;
1698 }
1699 _setup_wqthread(p, tl->th_thread, wq, tl, WQ_SETUP_CLEAR_VOUCHER);
1700 pthread_kern->unix_syscall_return(EJUSTRETURN);
1701 __builtin_unreachable();
1702 }
1703
1704 /*
1705 * Mark a thread request as cancelled. Has similar ownership semantics to the
1706 * complete call above.
1707 */
1708 static void
1709 _threadreq_cancel(struct workqueue *wq, struct threadreq *req)
1710 {
1711 assert(req->tr_state == TR_STATE_WAITING);
1712 req->tr_state = TR_STATE_DEAD;
1713
1714 assert((req->tr_flags & TR_FLAG_ONSTACK) == 0);
1715 if (req->tr_flags & TR_FLAG_WORKLOOP) {
1716 __assert_only int ret;
1717 ret = pthread_kern->workloop_fulfill_threadreq(wq->wq_proc, (void*)req,
1718 THREAD_NULL, WORKLOOP_FULFILL_THREADREQ_CANCEL);
1719 assert(ret == 0 || ret == ECANCELED);
1720 } else if (req != &wq->wq_event_manager_threadreq) {
1721 zfree(pthread_zone_threadreq, req);
1722 }
1723 }
1724
1725 #pragma mark workqueue lock
1726
1727 static boolean_t workqueue_lock_spin_is_acquired_kdp(struct workqueue *wq) {
1728 return kdp_lck_spin_is_acquired(&wq->wq_lock);
1729 }
1730
1731 static void
1732 workqueue_lock_spin(struct workqueue *wq)
1733 {
1734 assert(ml_get_interrupts_enabled() == TRUE);
1735 lck_spin_lock(&wq->wq_lock);
1736 }
1737
1738 static bool
1739 workqueue_lock_try(struct workqueue *wq)
1740 {
1741 return lck_spin_try_lock(&wq->wq_lock);
1742 }
1743
1744 static void
1745 workqueue_unlock(struct workqueue *wq)
1746 {
1747 lck_spin_unlock(&wq->wq_lock);
1748 }
1749
1750 #pragma mark workqueue add timer
1751
1752 /**
1753 * Sets up the timer which will call out to workqueue_add_timer
1754 */
1755 static void
1756 workqueue_interval_timer_start(struct workqueue *wq)
1757 {
1758 uint64_t deadline;
1759
1760 /* n.b. wq_timer_interval is reset to 0 in workqueue_add_timer if the
1761 WQ_ATIMER_DELAYED_RUNNING flag is not present. The net effect here is that if a
1762 sequence of threads is required, we'll double the time before we give out
1763 the next one. */
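/*
 * For illustration, with hypothetical tunable values of
 * wq_stalled_window_usecs = 200 and wq_max_timer_interval_usecs = 10000,
 * successive arms of the delayed timer without an intervening reset would
 * use intervals of 200, 400, 800, ... microseconds, capped at 10000.
 */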
1764 if (wq->wq_timer_interval == 0) {
1765 wq->wq_timer_interval = wq_stalled_window_usecs;
1766
1767 } else {
1768 wq->wq_timer_interval = wq->wq_timer_interval * 2;
1769
1770 if (wq->wq_timer_interval > wq_max_timer_interval_usecs) {
1771 wq->wq_timer_interval = wq_max_timer_interval_usecs;
1772 }
1773 }
1774 clock_interval_to_deadline(wq->wq_timer_interval, 1000, &deadline);
1775
1776 PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1777 _wq_flags(wq), wq->wq_timer_interval, 0);
1778
1779 thread_call_t call = wq->wq_atimer_delayed_call;
1780 if (thread_call_enter1_delayed(call, call, deadline)) {
1781 panic("delayed_call was already enqueued");
1782 }
1783 }
1784
1785 /**
1786 * Immediately trigger the workqueue_add_timer
1787 */
1788 static void
1789 workqueue_interval_timer_trigger(struct workqueue *wq)
1790 {
1791 PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1792 _wq_flags(wq), 0, 0);
1793
1794 thread_call_t call = wq->wq_atimer_immediate_call;
1795 if (thread_call_enter1(call, call)) {
1796 panic("immediate_call was already enqueued");
1797 }
1798 }
1799
1800 /**
1801 * returns whether lastblocked_tsp is within wq_stalled_window_usecs of cur_ts
1802 */
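/*
 * For example (hypothetical numbers): if wq_stalled_window_usecs were 200
 * and the thread last blocked 150us before cur_ts, it is still reported as
 * busy; once more than 200us have elapsed it no longer counts as busy.
 */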
1803 static boolean_t
1804 wq_thread_is_busy(uint64_t cur_ts, _Atomic uint64_t *lastblocked_tsp)
1805 {
1806 clock_sec_t secs;
1807 clock_usec_t usecs;
1808 uint64_t lastblocked_ts;
1809 uint64_t elapsed;
1810
1811 lastblocked_ts = atomic_load_explicit(lastblocked_tsp, memory_order_relaxed);
1812 if (lastblocked_ts >= cur_ts) {
1813 /*
1814 * because the update of the timestamp when a thread blocks isn't
1815 * serialized against us looking at it (i.e. we don't hold the workq lock)
1816 * it's possible to have a timestamp that matches the current time or
1817 * that even looks to be in the future relative to when we grabbed the current
1818 * time... just treat this as a busy thread since it must have just blocked.
1819 */
1820 return (TRUE);
1821 }
1822 elapsed = cur_ts - lastblocked_ts;
1823
1824 pthread_kern->absolutetime_to_microtime(elapsed, &secs, &usecs);
1825
1826 return (secs == 0 && usecs < wq_stalled_window_usecs);
1827 }
1828
1829 /**
1830 * handler function for the timer
1831 */
1832 static void
1833 workqueue_add_timer(struct workqueue *wq, thread_call_t thread_call_self)
1834 {
1835 proc_t p = wq->wq_proc;
1836
1837 workqueue_lock_spin(wq);
1838
1839 PTHREAD_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_START, wq,
1840 _wq_flags(wq), wq->wq_nthreads, wq->wq_thidlecount, 0);
1841
1842 /*
1843 * There are two tricky issues here.
1844 *
1845 * First issue: we start the thread_calls that invoke this routine without
1846 * the workqueue lock held. The scheduler callback needs to trigger
1847 * reevaluation of the number of running threads but shouldn't take that
1848 * lock, so we can't use it to synchronize state around the thread_call.
1849 * As a result, it might re-enter the thread_call while this routine is
1850 * already running. This could cause it to fire a second time and we'll
1851 * have two add_timers running at once. Obviously, we don't want that to
1852 * keep stacking, so we need to keep it at two timers.
1853 *
1854 * Solution: use wq_flags (accessed via atomic CAS) to synchronize the
1855 * enqueue of the thread_call itself. When a thread needs to trigger the
1856 * add_timer, it checks for ATIMER_DELAYED_RUNNING and, when not set, sets
1857 * the flag then does a thread_call_enter. We'll then remove that flag
1858 * only once we've got the lock and it's safe for the thread_call to be
1859 * entered again.
1860 *
1861 * Second issue: we need to make sure that the two timers don't execute this
1862 * routine concurrently. We can't use the workqueue lock for this because
1863 * we'll need to drop it during our execution.
1864 *
1865 * Solution: use WQL_ATIMER_BUSY as a condition variable to indicate that
1866 * we are currently executing the routine and the next thread should wait.
1867 *
1868 * After all that, we arrive at the following four possible states:
1869 * !WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY: no pending timer, no active timer
1870 * !WQ_ATIMER_DELAYED_RUNNING && WQL_ATIMER_BUSY: no pending timer, 1 active timer
1871 * WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY: 1 pending timer, no active timer
1872 * WQ_ATIMER_DELAYED_RUNNING && WQL_ATIMER_BUSY: 1 pending timer, 1 active timer
1873 *
1874 * A further complication: sometimes we need to trigger this function to run
1875 * without delay. Because we aren't under a lock between setting
1876 * WQ_ATIMER_DELAYED_RUNNING and calling thread_call_enter, we can't simply
1877 * re-enter the thread call: if thread_call_enter() returned false, we
1878 * wouldn't be able to distinguish the case where the thread_call had
1879 * already fired from the case where it hadn't been entered yet from the
1880 * other thread. So, we use a separate thread_call for immediate
1881 * invocations, and a separate RUNNING flag, WQ_ATIMER_IMMEDIATE_RUNNING.
1882 */
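/*
 * For reference, the arming side of this protocol looks roughly like the
 * pattern used elsewhere in this file (see workqueue_callback and
 * may_start_constrained_thread):
 *
 *	if (WQ_TIMER_DELAYED_NEEDED(wq)) {
 *		workqueue_interval_timer_start(wq);
 *	}
 *
 * where WQ_TIMER_DELAYED_NEEDED() is expected to set WQ_ATIMER_DELAYED_RUNNING
 * via atomic CAS only when it isn't already set (and WQ_EXITING isn't set),
 * so at most one delayed thread_call is ever enqueued.
 */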
1883
1884 while (wq->wq_lflags & WQL_ATIMER_BUSY) {
1885 wq->wq_lflags |= WQL_ATIMER_WAITING;
1886
1887 assert_wait((caddr_t)wq, (THREAD_UNINT));
1888 workqueue_unlock(wq);
1889
1890 thread_block(THREAD_CONTINUE_NULL);
1891
1892 workqueue_lock_spin(wq);
1893 }
1894 /*
1895 * Prevent _workqueue_mark_exiting() from tearing the workqueue down underneath us
1896 */
1897 wq->wq_lflags |= WQL_ATIMER_BUSY;
1898
1899 /*
1900 * Decide which timer we are and remove the RUNNING flag.
1901 */
1902 if (thread_call_self == wq->wq_atimer_delayed_call) {
1903 uint64_t wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_DELAYED_RUNNING);
1904 if ((wq_flags & WQ_ATIMER_DELAYED_RUNNING) == 0) {
1905 panic("workqueue_add_timer(delayed) w/o WQ_ATIMER_DELAYED_RUNNING");
1906 }
1907 } else if (thread_call_self == wq->wq_atimer_immediate_call) {
1908 uint64_t wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_IMMEDIATE_RUNNING);
1909 if ((wq_flags & WQ_ATIMER_IMMEDIATE_RUNNING) == 0) {
1910 panic("workqueue_add_timer(immediate) w/o WQ_ATIMER_IMMEDIATE_RUNNING");
1911 }
1912 } else {
1913 panic("workqueue_add_timer can't figure out which timer it is");
1914 }
1915
1916 int ret = WQ_RUN_TR_THREAD_STARTED;
1917 while (ret == WQ_RUN_TR_THREAD_STARTED && wq->wq_reqcount) {
1918 ret = workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
1919
1920 workqueue_lock_spin(wq);
1921 }
1922 _threadreq_copy_prepare(wq);
1923
1924 /*
1925 * If WQ_TIMER_DELAYED_NEEDED was used above to re-arm the delayed timer, then
1926 * WQ_ATIMER_DELAYED_RUNNING will be set again. If so, we let the timer interval grow.
1927 * Otherwise, we reset it back to 0.
1928 */
1929 uint32_t wq_flags = _wq_flags(wq);
1930 if (!(wq_flags & WQ_ATIMER_DELAYED_RUNNING)) {
1931 wq->wq_timer_interval = 0;
1932 }
1933
1934 wq->wq_lflags &= ~WQL_ATIMER_BUSY;
1935
1936 if ((wq_flags & WQ_EXITING) || (wq->wq_lflags & WQL_ATIMER_WAITING)) {
1937 /*
1938 * wakeup the thread hung up in _workqueue_mark_exiting or
1939 * workqueue_add_timer waiting for this timer to finish getting out of
1940 * the way
1941 */
1942 wq->wq_lflags &= ~WQL_ATIMER_WAITING;
1943 wakeup(wq);
1944 }
1945
1946 PTHREAD_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_END, wq, 0, wq->wq_nthreads, wq->wq_thidlecount, 0);
1947
1948 workqueue_unlock(wq);
1949 }
1950
1951 #pragma mark thread state tracking
1952
1953 // called by spinlock code when trying to yield to lock owner
1954 void
1955 _workqueue_thread_yielded(void)
1956 {
1957 }
1958
1959 static void
1960 workqueue_callback(int type, thread_t thread)
1961 {
1962 struct uthread *uth = pthread_kern->get_bsdthread_info(thread);
1963 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
1964 struct workqueue *wq = tl->th_workq;
1965 uint32_t old_count, req_qos, qos = tl->th_priority;
1966 wq_thactive_t old_thactive;
1967
1968 switch (type) {
1969 case SCHED_CALL_BLOCK: {
1970 bool start_timer = false;
1971
1972 old_thactive = _wq_thactive_dec(wq, tl->th_priority);
1973 req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
1974 old_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
1975 qos, NULL, NULL);
1976
1977 if (old_count == wq_max_concurrency[tl->th_priority]) {
1978 /*
1979 * The number of active threads at this priority has fallen below
1980 * the maximum number of concurrent threads that are allowed to run
1981 *
1982 * if we collide with another thread trying to update the
1983 * last_blocked (really unlikely since another thread would have to
1984 * get scheduled and then block after we start down this path), it's
1985 * not a problem. Either timestamp is adequate, so no need to retry
1986 */
1987 atomic_store_explicit(&wq->wq_lastblocked_ts[qos],
1988 mach_absolute_time(), memory_order_relaxed);
1989 }
1990
1991 if (req_qos == WORKQUEUE_EVENT_MANAGER_BUCKET || qos > req_qos) {
1992 /*
1993 * The blocking thread is at a lower QoS than the highest currently
1994 * pending constrained request, so nothing has to be redriven.
1995 */
1996 } else {
1997 uint32_t max_busycount, old_req_count;
1998 old_req_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
1999 req_qos, NULL, &max_busycount);
2000 /*
2001 * It is possible that may_start_constrained_thread refused admission
2002 * because we were over the max concurrency, in which case we may need
2003 * to spin up a new thread.
2004 *
2005 * We take into account the maximum number of busy threads that could
2006 * have affected may_start_constrained_thread, because looking at the
2007 * actual number it will see is racy.
2008 *
2009 * IOW at NCPU = 4, for IN (req_qos = 1), if the old req count is
2010 * between NCPU (4) and NCPU - 2 (2) we need to redrive.
2011 */
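/*
 * Worked example with hypothetical numbers: if wq_max_concurrency[req_qos]
 * is 4 and max_busycount is 2, we redrive when old_req_count is 2, 3 or 4,
 * i.e. whenever 4 <= old_req_count + 2 and old_req_count <= 4.
 */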
2012 if (wq_max_concurrency[req_qos] <= old_req_count + max_busycount &&
2013 old_req_count <= wq_max_concurrency[req_qos]) {
2014 if (WQ_TIMER_DELAYED_NEEDED(wq)) {
2015 start_timer = true;
2016 workqueue_interval_timer_start(wq);
2017 }
2018 }
2019 }
2020
2021 PTHREAD_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_START, wq,
2022 old_count - 1, qos | (req_qos << 8),
2023 wq->wq_reqcount << 1 | start_timer, 0);
2024 break;
2025 }
2026 case SCHED_CALL_UNBLOCK: {
2027 /*
2028 * we cannot take the workqueue_lock here...
2029 * an UNBLOCK can occur from a timer event which
2030 * is run from an interrupt context... if the workqueue_lock
2031 * is already held by this processor, we'll deadlock...
2032 * the thread lock for the thread being UNBLOCKED
2033 * is also held
2034 */
2035 old_thactive = _wq_thactive_inc(wq, qos);
2036 if (pthread_debug_tracing) {
2037 req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
2038 old_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
2039 qos, NULL, NULL);
2040 PTHREAD_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_END, wq,
2041 old_count + 1, qos | (req_qos << 8),
2042 wq->wq_threads_scheduled, 0);
2043 }
2044 break;
2045 }
2046 }
2047 }
2048
2049 sched_call_t
2050 _workqueue_get_sched_callback(void)
2051 {
2052 return workqueue_callback;
2053 }
2054
2055 #pragma mark thread addition/removal
2056
2057 static mach_vm_size_t
2058 _workqueue_allocsize(struct workqueue *wq)
2059 {
2060 proc_t p = wq->wq_proc;
2061 mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);
2062 mach_vm_size_t pthread_size =
2063 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(wq->wq_map));
2064 return guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
2065 }
2066
2067 /**
2068 * pop goes the thread
2069 *
2070 * If fromexit is set, the call is from workqueue_exit(),
2071 * so some cleanups are to be avoided.
2072 */
2073 static void
2074 workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use)
2075 {
2076 struct uthread * uth;
2077 struct workqueue * wq = tl->th_workq;
2078
2079 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET){
2080 TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
2081 } else {
2082 TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
2083 }
2084
2085 if (fromexit == 0) {
2086 assert(wq->wq_nthreads && wq->wq_thidlecount);
2087 wq->wq_nthreads--;
2088 wq->wq_thidlecount--;
2089 }
2090
2091 /*
2092 * Clear the threadlist pointer in uthread so that
2093 * a blocked thread, on wakeup for termination, will
2094 * not access the thread list as it is going to be
2095 * freed.
2096 */
2097 pthread_kern->thread_sched_call(tl->th_thread, NULL);
2098
2099 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
2100 if (uth != (struct uthread *)0) {
2101 pthread_kern->uthread_set_threadlist(uth, NULL);
2102 }
2103 if (fromexit == 0) {
2104 /* during exit the lock is not held */
2105 workqueue_unlock(wq);
2106 }
2107
2108 if ((tl->th_flags & TH_LIST_NEW) || first_use) {
2109 /*
2110 * thread was created, but never used...
2111 * need to clean up the stack and port ourselves
2112 * since we're not going to spin up through the
2113 * normal exit path triggered from Libc
2114 */
2115 if (fromexit == 0) {
2116 /* vm map is already deallocated when this is called from exit */
2117 (void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, _workqueue_allocsize(wq));
2118 }
2119 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(wq->wq_task), tl->th_thport);
2120 }
2121 /*
2122 * drop our ref on the thread
2123 */
2124 thread_deallocate(tl->th_thread);
2125
2126 zfree(pthread_zone_threadlist, tl);
2127 }
2128
2129
2130 /**
2131 * Try to add a new workqueue thread.
2132 *
2133 * - called with workq lock held
2134 * - dropped and retaken around thread creation
2135 * - return with workq lock held
2136 */
2137 static bool
2138 workqueue_addnewthread(proc_t p, struct workqueue *wq)
2139 {
2140 kern_return_t kret;
2141
2142 wq->wq_nthreads++;
2143
2144 workqueue_unlock(wq);
2145
2146 struct threadlist *tl = zalloc(pthread_zone_threadlist);
2147 bzero(tl, sizeof(struct threadlist));
2148
2149 thread_t th;
2150 kret = pthread_kern->thread_create_workq_waiting(wq->wq_task, wq_unpark_continue, tl, &th);
2151 if (kret != KERN_SUCCESS) {
2152 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 0, 0, 0);
2153 goto fail_free;
2154 }
2155
2156 mach_vm_offset_t stackaddr = pthread_kern->proc_get_stack_addr_hint(p);
2157
2158 mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);
2159 mach_vm_size_t pthread_size =
2160 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(wq->wq_map));
2161 mach_vm_size_t th_allocsize = guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
2162
2163 kret = mach_vm_map(wq->wq_map, &stackaddr,
2164 th_allocsize, page_size-1,
2165 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE, NULL,
2166 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
2167 VM_INHERIT_DEFAULT);
2168
2169 if (kret != KERN_SUCCESS) {
2170 kret = mach_vm_allocate(wq->wq_map,
2171 &stackaddr, th_allocsize,
2172 VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE);
2173 }
2174
2175 if (kret != KERN_SUCCESS) {
2176 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 1, 0, 0);
2177 goto fail_terminate;
2178 }
2179
2180 /*
2181 * The guard page is at the lowest address
2182 * The stack base is the highest address
2183 */
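/*
 * Roughly, the region allocated above is laid out from lowest to highest
 * address as: a VM_PROT_NONE guard page of guardsize bytes at stackaddr,
 * then the PTH_DEFAULT_STACKSIZE stack (growing down toward the guard
 * page), then the pthread_size area for the pthread_t.
 */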
2184 kret = mach_vm_protect(wq->wq_map, stackaddr, guardsize, FALSE, VM_PROT_NONE);
2185 if (kret != KERN_SUCCESS) {
2186 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 2, 0, 0);
2187 goto fail_vm_deallocate;
2188 }
2189
2190
2191 pthread_kern->thread_set_tag(th, THREAD_TAG_PTHREAD | THREAD_TAG_WORKQUEUE);
2192 pthread_kern->thread_static_param(th, TRUE);
2193
2194 /*
2195 * convert_thread_to_port() consumes a reference
2196 */
2197 thread_reference(th);
2198 void *sright = (void *)pthread_kern->convert_thread_to_port(th);
2199 tl->th_thport = pthread_kern->ipc_port_copyout_send(sright,
2200 pthread_kern->task_get_ipcspace(wq->wq_task));
2201
2202 tl->th_flags = TH_LIST_INITED | TH_LIST_NEW;
2203 tl->th_thread = th;
2204 tl->th_workq = wq;
2205 tl->th_stackaddr = stackaddr;
2206 tl->th_priority = WORKQUEUE_NUM_BUCKETS;
2207
2208 struct uthread *uth;
2209 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
2210
2211 workqueue_lock_spin(wq);
2212
2213 void *current_tl = pthread_kern->uthread_get_threadlist(uth);
2214 if (current_tl == NULL) {
2215 pthread_kern->uthread_set_threadlist(uth, tl);
2216 TAILQ_INSERT_TAIL(&wq->wq_thidlelist, tl, th_entry);
2217 wq->wq_thidlecount++;
2218 } else if (current_tl == WQ_THREADLIST_EXITING_POISON) {
2219 /*
2220 * Failed thread creation race: The thread already woke up and has exited.
2221 */
2222 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 3, 0, 0);
2223 goto fail_unlock;
2224 } else {
2225 panic("Unexpected initial threadlist value");
2226 }
2227
2228 PTHREAD_TRACE_WQ(TRACE_wq_thread_create | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
2229
2230 return (TRUE);
2231
2232 fail_unlock:
2233 workqueue_unlock(wq);
2234 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(wq->wq_task),
2235 tl->th_thport);
2236
2237 fail_vm_deallocate:
2238 (void) mach_vm_deallocate(wq->wq_map, stackaddr, th_allocsize);
2239
2240 fail_terminate:
2241 if (pthread_kern->thread_will_park_or_terminate) {
2242 pthread_kern->thread_will_park_or_terminate(th);
2243 }
2244 (void)thread_terminate(th);
2245 thread_deallocate(th);
2246
2247 fail_free:
2248 zfree(pthread_zone_threadlist, tl);
2249
2250 workqueue_lock_spin(wq);
2251 wq->wq_nthreads--;
2252
2253 return (FALSE);
2254 }
2255
2256 /**
2257 * Setup per-process state for the workqueue.
2258 */
2259 int
2260 _workq_open(struct proc *p, __unused int32_t *retval)
2261 {
2262 struct workqueue * wq;
2263 char * ptr;
2264 uint32_t num_cpus;
2265 int error = 0;
2266
2267 if (pthread_kern->proc_get_register(p) == 0) {
2268 return EINVAL;
2269 }
2270
2271 num_cpus = pthread_kern->ml_get_max_cpus();
2272
2273 if (wq_init_constrained_limit) {
2274 uint32_t limit;
2275 /*
2276 * Set up the limit for the constrained pool.
2277 * This is a virtual pool in that we don't
2278 * maintain it on a separate idle and run list.
2279 */
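/*
 * For example, on a machine reporting 8 CPUs (a hypothetical value), the
 * limit below becomes 8 * WORKQUEUE_CONSTRAINED_FACTOR, and
 * wq_max_constrained_threads is only raised to that limit if its current
 * value is smaller.
 */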
2280 limit = num_cpus * WORKQUEUE_CONSTRAINED_FACTOR;
2281
2282 if (limit > wq_max_constrained_threads)
2283 wq_max_constrained_threads = limit;
2284
2285 wq_init_constrained_limit = 0;
2286
2287 if (wq_max_threads > WQ_THACTIVE_BUCKET_HALF) {
2288 wq_max_threads = WQ_THACTIVE_BUCKET_HALF;
2289 }
2290 if (wq_max_threads > pthread_kern->config_thread_max - 20) {
2291 wq_max_threads = pthread_kern->config_thread_max - 20;
2292 }
2293 }
2294
2295 if (pthread_kern->proc_get_wqptr(p) == NULL) {
2296 if (pthread_kern->proc_init_wqptr_or_wait(p) == FALSE) {
2297 assert(pthread_kern->proc_get_wqptr(p) != NULL);
2298 goto out;
2299 }
2300
2301 ptr = (char *)zalloc(pthread_zone_workqueue);
2302 bzero(ptr, sizeof(struct workqueue));
2303
2304 wq = (struct workqueue *)ptr;
2305 wq->wq_proc = p;
2306 wq->wq_task = current_task();
2307 wq->wq_map = pthread_kern->current_map();
2308
2309 // Start the event manager at the priority hinted at by the policy engine
2310 int mgr_priority_hint = pthread_kern->task_get_default_manager_qos(current_task());
2311 wq->wq_event_manager_priority = (uint32_t)thread_qos_get_pthread_priority(mgr_priority_hint) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2312
2313 TAILQ_INIT(&wq->wq_thrunlist);
2314 TAILQ_INIT(&wq->wq_thidlelist);
2315 for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
2316 TAILQ_INIT(&wq->wq_overcommit_reqlist[i]);
2317 TAILQ_INIT(&wq->wq_reqlist[i]);
2318 }
2319
2320 wq->wq_atimer_delayed_call =
2321 thread_call_allocate_with_priority((thread_call_func_t)workqueue_add_timer,
2322 (thread_call_param_t)wq, THREAD_CALL_PRIORITY_KERNEL);
2323 wq->wq_atimer_immediate_call =
2324 thread_call_allocate_with_priority((thread_call_func_t)workqueue_add_timer,
2325 (thread_call_param_t)wq, THREAD_CALL_PRIORITY_KERNEL);
2326
2327 lck_spin_init(&wq->wq_lock, pthread_lck_grp, pthread_lck_attr);
2328
2329 wq->wq_cached_threadreq = zalloc(pthread_zone_threadreq);
2330 *(wq_thactive_t *)&wq->wq_thactive =
2331 (wq_thactive_t)WQ_THACTIVE_NO_PENDING_REQUEST <<
2332 WQ_THACTIVE_QOS_SHIFT;
2333
2334 pthread_kern->proc_set_wqptr(p, wq);
2335
2336 }
2337 out:
2338
2339 return(error);
2340 }
2341
2342 /*
2343 * Routine: workqueue_mark_exiting
2344 *
2345 * Function: Mark the work queue such that new threads will not be added to the
2346 * work queue after we return.
2347 *
2348 * Conditions: Called against the current process.
2349 */
2350 void
2351 _workqueue_mark_exiting(struct proc *p)
2352 {
2353 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
2354 if (!wq) return;
2355
2356 PTHREAD_TRACE_WQ(TRACE_wq_pthread_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);
2357
2358 workqueue_lock_spin(wq);
2359
2360 /*
2361 * We arm the add timer without holding the workqueue lock so we need
2362 * to synchronize with any running or soon to be running timers.
2363 *
2364 * Threads that intend to arm the timer atomically OR
2365 * WQ_ATIMER_{DELAYED,IMMEDIATE}_RUNNING into the wq_flags, only if
2366 * WQ_EXITING is not present. So, once we have set WQ_EXITING, we can
2367 * be sure that no new RUNNING flags will be set, but still need to
2368 * wait for the already running timers to complete.
2369 *
2370 * We always hold the workq lock when dropping the WQ_ATIMER_*_RUNNING
2371 * flags, so the check-and-sleep-until-clear sequence below is protected.
2372 */
2373 uint64_t wq_flags = _wq_flags_or_orig(wq, WQ_EXITING);
2374
2375 if (wq_flags & WQ_ATIMER_DELAYED_RUNNING) {
2376 if (thread_call_cancel(wq->wq_atimer_delayed_call) == TRUE) {
2377 wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_DELAYED_RUNNING);
2378 }
2379 }
2380 if (wq_flags & WQ_ATIMER_IMMEDIATE_RUNNING) {
2381 if (thread_call_cancel(wq->wq_atimer_immediate_call) == TRUE) {
2382 wq_flags = _wq_flags_and_orig(wq, ~WQ_ATIMER_IMMEDIATE_RUNNING);
2383 }
2384 }
2385 while ((_wq_flags(wq) & (WQ_ATIMER_DELAYED_RUNNING | WQ_ATIMER_IMMEDIATE_RUNNING)) ||
2386 (wq->wq_lflags & WQL_ATIMER_BUSY)) {
2387 assert_wait((caddr_t)wq, (THREAD_UNINT));
2388 workqueue_unlock(wq);
2389
2390 thread_block(THREAD_CONTINUE_NULL);
2391
2392 workqueue_lock_spin(wq);
2393 }
2394
2395 /*
2396 * Save off pending requests, will complete/free them below after unlocking
2397 */
2398 TAILQ_HEAD(, threadreq) local_list = TAILQ_HEAD_INITIALIZER(local_list);
2399
2400 for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
2401 TAILQ_CONCAT(&local_list, &wq->wq_overcommit_reqlist[i], tr_entry);
2402 TAILQ_CONCAT(&local_list, &wq->wq_reqlist[i], tr_entry);
2403 }
2404
2405 /*
2406 * XXX: We can't do a deferred cancel of the event manager request, so just smash it.
2407 */
2408 assert((wq->wq_event_manager_threadreq.tr_flags & TR_FLAG_WORKLOOP) == 0);
2409 wq->wq_event_manager_threadreq.tr_state = TR_STATE_DEAD;
2410
2411 workqueue_unlock(wq);
2412
2413 struct threadreq *tr, *tr_temp;
2414 TAILQ_FOREACH_SAFE(tr, &local_list, tr_entry, tr_temp) {
2415 _threadreq_cancel(wq, tr);
2416 }
2417 PTHREAD_TRACE(TRACE_wq_pthread_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
2418 }
2419
2420 /*
2421 * Routine: workqueue_exit
2422 *
2423 * Function: clean up the work queue structure(s) now that there are no threads
2424 * left running inside the work queue (except possibly current_thread).
2425 *
2426 * Conditions: Called by the last thread in the process.
2427 * Called against current process.
2428 */
2429 void
2430 _workqueue_exit(struct proc *p)
2431 {
2432 struct workqueue * wq;
2433 struct threadlist * tl, *tlist;
2434 struct uthread *uth;
2435
2436 wq = pthread_kern->proc_get_wqptr(p);
2437 if (wq != NULL) {
2438
2439 PTHREAD_TRACE_WQ(TRACE_wq_workqueue_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);
2440
2441 pthread_kern->proc_set_wqptr(p, NULL);
2442
2443 /*
2444 * Clean up workqueue data structures for threads that exited and
2445 * didn't get a chance to clean up after themselves.
2446 */
2447 TAILQ_FOREACH_SAFE(tl, &wq->wq_thrunlist, th_entry, tlist) {
2448 assert((tl->th_flags & TH_LIST_RUNNING) != 0);
2449
2450 pthread_kern->thread_sched_call(tl->th_thread, NULL);
2451
2452 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
2453 if (uth != (struct uthread *)0) {
2454 pthread_kern->uthread_set_threadlist(uth, NULL);
2455 }
2456 TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
2457
2458 /*
2459 * drop our last ref on the thread
2460 */
2461 thread_deallocate(tl->th_thread);
2462
2463 zfree(pthread_zone_threadlist, tl);
2464 }
2465 TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlelist, th_entry, tlist) {
2466 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
2467 assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
2468 workqueue_removethread(tl, true, false);
2469 }
2470 TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlemgrlist, th_entry, tlist) {
2471 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
2472 assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
2473 workqueue_removethread(tl, true, false);
2474 }
2475 if (wq->wq_cached_threadreq) {
2476 zfree(pthread_zone_threadreq, wq->wq_cached_threadreq);
2477 }
2478 thread_call_free(wq->wq_atimer_delayed_call);
2479 thread_call_free(wq->wq_atimer_immediate_call);
2480 lck_spin_destroy(&wq->wq_lock, pthread_lck_grp);
2481
2482 for (int i = 0; i < WORKQUEUE_EVENT_MANAGER_BUCKET; i++) {
2483 assert(TAILQ_EMPTY(&wq->wq_overcommit_reqlist[i]));
2484 assert(TAILQ_EMPTY(&wq->wq_reqlist[i]));
2485 }
2486
2487 zfree(pthread_zone_workqueue, wq);
2488
2489 PTHREAD_TRACE(TRACE_wq_workqueue_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
2490 }
2491 }
2492
2493
2494 #pragma mark workqueue thread manipulation
2495
2496
2497 /**
2498 * Entry point for libdispatch to ask for threads
2499 */
2500 static int
2501 wqops_queue_reqthreads(struct proc *p, int reqcount,
2502 pthread_priority_t priority)
2503 {
2504 bool overcommit = _pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
2505 bool event_manager = _pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2506 int class = event_manager ? WORKQUEUE_EVENT_MANAGER_BUCKET :
2507 pthread_priority_get_class_index(priority);
2508
2509 if ((reqcount <= 0) || (class < 0) || (class >= WORKQUEUE_NUM_BUCKETS) ||
2510 (overcommit && event_manager)) {
2511 return EINVAL;
2512 }
2513
2514 struct workqueue *wq;
2515 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2516 return EINVAL;
2517 }
2518
2519 workqueue_lock_spin(wq);
2520 _threadreq_copy_prepare(wq);
2521
2522 PTHREAD_TRACE_WQ(TRACE_wq_wqops_reqthreads | DBG_FUNC_NONE, wq, reqcount, priority, 0, 0);
2523
2524 int tr_flags = 0;
2525 if (overcommit) tr_flags |= TR_FLAG_OVERCOMMIT;
2526 if (reqcount > 1) {
2527 /*
2528 * When libdispatch asks for more than one thread, it wants to achieve
2529 * parallelism. Pacing would be detrimental to that goal, so treat
2530 * these requests specially and skip the pacing admission check.
2531 */
2532 tr_flags |= TR_FLAG_NO_PACING;
2533 }
2534
2535 while (reqcount-- && !_wq_exiting(wq)) {
2536 struct threadreq req;
2537 _threadreq_init_stack(&req, class, tr_flags);
2538
2539 workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, true);
2540
2541 workqueue_lock_spin(wq); /* reacquire */
2542 _threadreq_copy_prepare(wq);
2543 }
2544
2545 workqueue_unlock(wq);
2546
2547 return 0;
2548 }
2549
2550 /*
2551 * Used by the kevent system to request threads.
2552 *
2553 * Currently count is ignored and we always return one thread per invocation.
2554 */
2555 static thread_t
2556 _workq_kevent_reqthreads(struct proc *p, pthread_priority_t priority,
2557 bool no_emergency)
2558 {
2559 int wq_run_tr = WQ_RUN_TR_THROTTLED;
2560 bool emergency_thread = false;
2561 struct threadreq req;
2562
2563
2564 struct workqueue *wq;
2565 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2566 return THREAD_NULL;
2567 }
2568
2569 int class = pthread_priority_get_class_index(priority);
2570
2571 workqueue_lock_spin(wq);
2572 bool has_threadreq = _threadreq_copy_prepare_noblock(wq);
2573
2574 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, NULL, priority, 0, 0);
2575
2576 /*
2577 * Skip straight to event manager if that's what was requested
2578 */
2579 if ((_pthread_priority_get_qos_newest(priority) == QOS_CLASS_UNSPECIFIED) ||
2580 (_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)){
2581 goto event_manager;
2582 }
2583
2584 bool will_pace = _wq_should_pace_priority(wq, class);
2585 if ((wq->wq_thidlecount == 0 || will_pace) && has_threadreq == false) {
2586 /*
2587 * We'll need to persist the request and can't, so return the emergency
2588 * thread instead, which has a persistent request object.
2589 */
2590 emergency_thread = true;
2591 goto event_manager;
2592 }
2593
2594 /*
2595 * Handle overcommit requests
2596 */
2597 if ((_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0){
2598 _threadreq_init_stack(&req, class, TR_FLAG_KEVENT | TR_FLAG_OVERCOMMIT);
2599 wq_run_tr = workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, false);
2600 goto done;
2601 }
2602
2603 /*
2604 * Handle constrained requests
2605 */
2606 boolean_t may_start = may_start_constrained_thread(wq, class, NULL, false);
2607 if (may_start || no_emergency) {
2608 _threadreq_init_stack(&req, class, TR_FLAG_KEVENT);
2609 wq_run_tr = workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, false);
2610 goto done;
2611 } else {
2612 emergency_thread = true;
2613 }
2614
2615
2616 event_manager:
2617 _threadreq_init_stack(&req, WORKQUEUE_EVENT_MANAGER_BUCKET, TR_FLAG_KEVENT);
2618 wq_run_tr = workqueue_run_threadreq_and_unlock(p, wq, NULL, &req, false);
2619
2620 done:
2621 if (wq_run_tr == WQ_RUN_TR_THREAD_NEEDED && WQ_TIMER_IMMEDIATE_NEEDED(wq)) {
2622 workqueue_interval_timer_trigger(wq);
2623 }
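/*
 * We return (thread_t)-1 when we fell back to the persistent event-manager
 * ("emergency") request above, and 0 otherwise.
 */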
2624 return emergency_thread ? (void*)-1 : 0;
2625 }
2626
2627 thread_t
2628 _workq_reqthreads(struct proc *p, __assert_only int requests_count,
2629 workq_reqthreads_req_t request)
2630 {
2631 assert(requests_count == 1);
2632
2633 pthread_priority_t priority = request->priority;
2634 bool no_emergency = request->count & WORKQ_REQTHREADS_NOEMERGENCY;
2635
2636 return _workq_kevent_reqthreads(p, priority, no_emergency);
2637 }
2638
2639
2640 int
2641 workq_kern_threadreq(struct proc *p, workq_threadreq_t _req,
2642 enum workq_threadreq_type type, unsigned long priority, int flags)
2643 {
2644 struct workqueue *wq;
2645 int ret;
2646
2647 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2648 return EINVAL;
2649 }
2650
2651 switch (type) {
2652 case WORKQ_THREADREQ_KEVENT: {
2653 bool no_emergency = flags & WORKQ_THREADREQ_FLAG_NOEMERGENCY;
2654 (void)_workq_kevent_reqthreads(p, priority, no_emergency);
2655 return 0;
2656 }
2657 case WORKQ_THREADREQ_WORKLOOP:
2658 case WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL: {
2659 struct threadreq *req = (struct threadreq *)_req;
2660 int req_class = pthread_priority_get_class_index(priority);
2661 int req_flags = TR_FLAG_WORKLOOP;
2662 if ((_pthread_priority_get_flags(priority) &
2663 _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0){
2664 req_flags |= TR_FLAG_OVERCOMMIT;
2665 }
2666
2667 thread_t thread = current_thread();
2668 struct threadlist *tl = util_get_thread_threadlist_entry(thread);
2669
2670 if (tl && tl != WQ_THREADLIST_EXITING_POISON &&
2671 (tl->th_flags & TH_LIST_UNBINDING)) {
2672 /*
2673 * we're called back synchronously from the context of
2674 * kevent_qos_internal_unbind from within wqops_thread_return()
2675 * we can try to match up this thread with this request !
2676 */
2677 } else {
2678 tl = NULL;
2679 }
2680
2681 _threadreq_init_alloced(req, req_class, req_flags);
2682 workqueue_lock_spin(wq);
2683 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, req, priority, 1, 0);
2684 ret = workqueue_run_threadreq_and_unlock(p, wq, tl, req, false);
2685 if (ret == WQ_RUN_TR_EXITING) {
2686 return ECANCELED;
2687 }
2688 if (ret == WQ_RUN_TR_THREAD_NEEDED) {
2689 if (type == WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL) {
2690 return EAGAIN;
2691 }
2692 if (WQ_TIMER_IMMEDIATE_NEEDED(wq)) {
2693 workqueue_interval_timer_trigger(wq);
2694 }
2695 }
2696 return 0;
2697 }
2698 case WORKQ_THREADREQ_REDRIVE:
2699 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, 0, 0, 4, 0);
2700 workqueue_lock_spin(wq);
2701 ret = workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
2702 if (ret == WQ_RUN_TR_EXITING) {
2703 return ECANCELED;
2704 }
2705 return 0;
2706 default:
2707 return ENOTSUP;
2708 }
2709 }
2710
2711 int
2712 workq_kern_threadreq_modify(struct proc *p, workq_threadreq_t _req,
2713 enum workq_threadreq_op operation, unsigned long arg1,
2714 unsigned long __unused arg2)
2715 {
2716 struct threadreq *req = (struct threadreq *)_req;
2717 struct workqueue *wq;
2718 int priclass, ret = 0, wq_tr_rc = WQ_RUN_TR_THROTTLED;
2719
2720 if (req == NULL || (wq = pthread_kern->proc_get_wqptr(p)) == NULL) {
2721 return EINVAL;
2722 }
2723
2724 workqueue_lock_spin(wq);
2725
2726 if (_wq_exiting(wq)) {
2727 ret = ECANCELED;
2728 goto out_unlock;
2729 }
2730
2731 /*
2732 * Find/validate the referenced request structure
2733 */
2734 if (req->tr_state != TR_STATE_WAITING) {
2735 ret = EINVAL;
2736 goto out_unlock;
2737 }
2738 assert(req->tr_priority < WORKQUEUE_EVENT_MANAGER_BUCKET);
2739 assert(req->tr_flags & TR_FLAG_WORKLOOP);
2740
2741 switch (operation) {
2742 case WORKQ_THREADREQ_CHANGE_PRI:
2743 case WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL:
2744 priclass = pthread_priority_get_class_index(arg1);
2745 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, req, arg1, 2, 0);
2746 if (req->tr_priority == priclass) {
2747 goto out_unlock;
2748 }
2749 _threadreq_dequeue(wq, req);
2750 req->tr_priority = priclass;
2751 req->tr_state = TR_STATE_NEW; // what was old is new again
2752 wq_tr_rc = workqueue_run_threadreq_and_unlock(p, wq, NULL, req, false);
2753 goto out;
2754
2755 case WORKQ_THREADREQ_CANCEL:
2756 PTHREAD_TRACE_WQ_REQ(TRACE_wq_kevent_reqthreads | DBG_FUNC_NONE, wq, req, 0, 3, 0);
2757 _threadreq_dequeue(wq, req);
2758 req->tr_state = TR_STATE_DEAD;
2759 break;
2760
2761 default:
2762 ret = ENOTSUP;
2763 break;
2764 }
2765
2766 out_unlock:
2767 workqueue_unlock(wq);
2768 out:
2769 if (wq_tr_rc == WQ_RUN_TR_THREAD_NEEDED) {
2770 if (operation == WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL) {
2771 ret = EAGAIN;
2772 } else if (WQ_TIMER_IMMEDIATE_NEEDED(wq)) {
2773 workqueue_interval_timer_trigger(wq);
2774 }
2775 }
2776 return ret;
2777 }
2778
2779
2780 static int
2781 wqops_thread_return(struct proc *p, struct workqueue *wq)
2782 {
2783 thread_t th = current_thread();
2784 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
2785 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
2786
2787 /* reset signal mask on the workqueue thread to default state */
2788 if (pthread_kern->uthread_get_sigmask(uth) != (sigset_t)(~workq_threadmask)) {
2789 pthread_kern->proc_lock(p);
2790 pthread_kern->uthread_set_sigmask(uth, ~workq_threadmask);
2791 pthread_kern->proc_unlock(p);
2792 }
2793
2794 if (wq == NULL || !tl) {
2795 return EINVAL;
2796 }
2797
2798 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_START, tl->th_workq, 0, 0, 0, 0);
2799
2800 /*
2801 * This squash call has neat semantics: it removes the specified overrides,
2802 * replacing the current requested QoS with the previous effective QoS from
2803 * those overrides. This means we won't be preempted due to having our QoS
2804 * lowered. Of course, now our understanding of the thread's QoS is wrong,
2805 * so we'll adjust below.
2806 */
2807 bool was_manager = (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
2808 int new_qos;
2809
2810 if (!was_manager) {
2811 new_qos = pthread_kern->proc_usynch_thread_qos_squash_override_for_resource(th,
2812 THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD,
2813 THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE);
2814 }
2815
2816 PTHREAD_TRACE_WQ(TRACE_wq_runitem | DBG_FUNC_END, wq, tl->th_priority, 0, 0, 0);
2817
2818 workqueue_lock_spin(wq);
2819
2820 if (tl->th_flags & TH_LIST_KEVENT_BOUND) {
2821 unsigned int flags = KEVENT_FLAG_WORKQ;
2822 if (was_manager) {
2823 flags |= KEVENT_FLAG_WORKQ_MANAGER;
2824 }
2825
2826 tl->th_flags |= TH_LIST_UNBINDING;
2827 workqueue_unlock(wq);
2828 kevent_qos_internal_unbind(p, class_index_get_thread_qos(tl->th_priority), th, flags);
2829 if (!(tl->th_flags & TH_LIST_UNBINDING)) {
2830 _setup_wqthread(p, th, wq, tl, WQ_SETUP_CLEAR_VOUCHER);
2831 pthread_kern->unix_syscall_return(EJUSTRETURN);
2832 __builtin_unreachable();
2833 }
2834 workqueue_lock_spin(wq);
2835 tl->th_flags &= ~(TH_LIST_KEVENT_BOUND | TH_LIST_UNBINDING);
2836 }
2837
2838 if (!was_manager) {
2839 /* Fix up counters from the squash operation. */
2840 uint8_t old_bucket = tl->th_priority;
2841 uint8_t new_bucket = thread_qos_get_class_index(new_qos);
2842
2843 if (old_bucket != new_bucket) {
2844 _wq_thactive_move(wq, old_bucket, new_bucket);
2845 wq->wq_thscheduled_count[old_bucket]--;
2846 wq->wq_thscheduled_count[new_bucket]++;
2847
2848 PTHREAD_TRACE_WQ(TRACE_wq_thread_squash | DBG_FUNC_NONE, wq, tl->th_priority, new_bucket, 0, 0);
2849 tl->th_priority = new_bucket;
2850 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_END, tl->th_workq, new_qos, 0, 0, 0);
2851 }
2852 }
2853
2854 workqueue_run_threadreq_and_unlock(p, wq, tl, NULL, false);
2855 return 0;
2856 }
2857
2858 /**
2859 * Multiplexed call to interact with the workqueue mechanism
2860 */
2861 int
2862 _workq_kernreturn(struct proc *p,
2863 int options,
2864 user_addr_t item,
2865 int arg2,
2866 int arg3,
2867 int32_t *retval)
2868 {
2869 struct workqueue *wq;
2870 int error = 0;
2871
2872 if (pthread_kern->proc_get_register(p) == 0) {
2873 return EINVAL;
2874 }
2875
2876 switch (options) {
2877 case WQOPS_QUEUE_NEWSPISUPP: {
2878 /*
2879 * arg2 = offset of serialno into dispatch queue
2880 * arg3 = kevent support
2881 */
2882 int offset = arg2;
2883 if (arg3 & 0x01){
2884 // If we get here, then userspace has indicated support for kevent delivery.
2885 }
2886
2887 pthread_kern->proc_set_dispatchqueue_serialno_offset(p, (uint64_t)offset);
2888 break;
2889 }
2890 case WQOPS_QUEUE_REQTHREADS: {
2891 /*
2892 * arg2 = number of threads to start
2893 * arg3 = priority
2894 */
2895 error = wqops_queue_reqthreads(p, arg2, arg3);
2896 break;
2897 }
2898 case WQOPS_SET_EVENT_MANAGER_PRIORITY: {
2899 /*
2900 * arg2 = priority for the manager thread
2901 *
2902 * if _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG is set, the
2903 * bits outside _PTHREAD_PRIORITY_FLAGS_MASK contain a scheduling priority
2904 * instead of a QoS value
2905 */
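/*
 * For example (hypothetical values): userspace can pass
 * (47 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)
 * to request raw scheduling priority 47 for the manager, or an ordinary
 * QoS-encoded pthread_priority_t, in which case only the manager's QoS is
 * (possibly) raised below.
 */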
2906 pthread_priority_t pri = arg2;
2907
2908 wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2909 if (wq == NULL) {
2910 error = EINVAL;
2911 break;
2912 }
2913 workqueue_lock_spin(wq);
2914 if (pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){
2915 /*
2916 * If userspace passes a scheduling priority, that takes precedence
2917 * over any QoS. (So, userspace should take care not to accidentally
2918 * lower the priority this way.)
2919 */
2920 uint32_t sched_pri = pri & _PTHREAD_PRIORITY_SCHED_PRI_MASK;
2921 if (wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){
2922 wq->wq_event_manager_priority = MAX(sched_pri, wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_MASK)
2923 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2924 } else {
2925 wq->wq_event_manager_priority = sched_pri
2926 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2927 }
2928 } else if ((wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){
2929 int cur_qos = pthread_priority_get_thread_qos(wq->wq_event_manager_priority);
2930 int new_qos = pthread_priority_get_thread_qos(pri);
2931 wq->wq_event_manager_priority = (uint32_t)thread_qos_get_pthread_priority(MAX(cur_qos, new_qos)) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2932 }
2933 workqueue_unlock(wq);
2934 break;
2935 }
2936 case WQOPS_THREAD_KEVENT_RETURN:
2937 case WQOPS_THREAD_WORKLOOP_RETURN:
2938 wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2939 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_END, wq, options, 0, 0, 0);
2940 if (item != 0 && arg2 != 0) {
2941 int32_t kevent_retval;
2942 int ret;
2943 if (options == WQOPS_THREAD_KEVENT_RETURN) {
2944 ret = kevent_qos_internal(p, -1, item, arg2, item, arg2, NULL, NULL,
2945 KEVENT_FLAG_WORKQ | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS,
2946 &kevent_retval);
2947 } else /* options == WQOPS_THREAD_WORKLOOP_RETURN */ {
2948 kqueue_id_t kevent_id = -1;
2949 ret = kevent_id_internal(p, &kevent_id, item, arg2, item, arg2,
2950 NULL, NULL,
2951 KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS,
2952 &kevent_retval);
2953 }
2954 /*
2955 * We shouldn't be getting more errors out than events we put in, so
2956 * reusing the input buffer should always provide enough space. But,
2957 * the assert is commented out since we get errors in edge cases in the
2958 * process lifecycle.
2959 */
2960 //assert(ret == KERN_SUCCESS && kevent_retval >= 0);
2961 if (ret != KERN_SUCCESS){
2962 error = ret;
2963 break;
2964 } else if (kevent_retval > 0){
2965 assert(kevent_retval <= arg2);
2966 *retval = kevent_retval;
2967 error = 0;
2968 break;
2969 }
2970 }
2971 goto thread_return;
2972
2973 case WQOPS_THREAD_RETURN:
2974 wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2975 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_END, wq, options, 0, 0, 0);
2976 thread_return:
2977 error = wqops_thread_return(p, wq);
2978 // NOT REACHED except in case of error
2979 assert(error);
2980 break;
2981
2982 case WQOPS_SHOULD_NARROW: {
2983 /*
2984 * arg2 = priority to test
2985 * arg3 = unused
2986 */
2987 pthread_priority_t priority = arg2;
2988 thread_t th = current_thread();
2989 struct threadlist *tl = util_get_thread_threadlist_entry(th);
2990
2991 if (tl == NULL || (tl->th_flags & TH_LIST_CONSTRAINED) == 0) {
2992 error = EINVAL;
2993 break;
2994 }
2995
2996 int class = pthread_priority_get_class_index(priority);
2997 wq = tl->th_workq;
2998 workqueue_lock_spin(wq);
2999 bool should_narrow = !may_start_constrained_thread(wq, class, tl, false);
3000 workqueue_unlock(wq);
3001
3002 *retval = should_narrow;
3003 break;
3004 }
3005 default:
3006 error = EINVAL;
3007 break;
3008 }
3009
3010 switch (options) {
3011 case WQOPS_THREAD_KEVENT_RETURN:
3012 case WQOPS_THREAD_WORKLOOP_RETURN:
3013 case WQOPS_THREAD_RETURN:
3014 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START, wq, options, 0, 0, 0);
3015 break;
3016 }
3017 return (error);
3018 }
3019
3020 /*
3021 * We have no work to do, so park ourselves on the idle list.
3022 *
3023 * Consumes the workqueue lock and does not return.
3024 */
3025 static void __dead2
3026 parkit(struct workqueue *wq, struct threadlist *tl, thread_t thread)
3027 {
3028 assert(thread == tl->th_thread);
3029 assert(thread == current_thread());
3030
3031 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_START, wq, 0, 0, 0, 0);
3032
3033 uint32_t us_to_wait = 0;
3034
3035 TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
3036
3037 tl->th_flags &= ~TH_LIST_RUNNING;
3038 tl->th_flags &= ~TH_LIST_KEVENT;
3039 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
3040
3041 if (tl->th_flags & TH_LIST_CONSTRAINED) {
3042 wq->wq_constrained_threads_scheduled--;
3043 tl->th_flags &= ~TH_LIST_CONSTRAINED;
3044 }
3045
3046 _wq_thactive_dec(wq, tl->th_priority);
3047 wq->wq_thscheduled_count[tl->th_priority]--;
3048 wq->wq_threads_scheduled--;
3049 uint32_t thidlecount = ++wq->wq_thidlecount;
3050
3051 pthread_kern->thread_sched_call(thread, NULL);
3052
3053 /*
3054 * We'd like to always have one manager thread parked so that we can have
3055 * low latency when we need to bring a manager thread up. If that idle
3056 * thread list is empty, make this thread a manager thread.
3057 *
3058 * XXX: This doesn't check that there's not a manager thread outstanding,
3059 * so it's based on the assumption that most manager callouts will change
3060 * their QoS before parking. If that stops being true, this may end up
3061 * costing us more than we gain.
3062 */
3063 if (TAILQ_EMPTY(&wq->wq_thidlemgrlist) &&
3064 tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET){
3065 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority | DBG_FUNC_NONE,
3066 wq, thread_tid(thread),
3067 (tl->th_priority << 16) | WORKQUEUE_EVENT_MANAGER_BUCKET, 2, 0);
3068 reset_priority(tl, pthread_priority_from_wq_class_index(wq, WORKQUEUE_EVENT_MANAGER_BUCKET));
3069 tl->th_priority = WORKQUEUE_EVENT_MANAGER_BUCKET;
3070 }
3071
3072 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET){
3073 TAILQ_INSERT_HEAD(&wq->wq_thidlemgrlist, tl, th_entry);
3074 } else {
3075 TAILQ_INSERT_HEAD(&wq->wq_thidlelist, tl, th_entry);
3076 }
3077
3078 /*
3079 * When we remove the voucher from the thread, we may lose our importance
3080 * causing us to get preempted, so we do this after putting the thread on
3081 * the idle list. That way, when we get our importance back we'll be able
3082 * to use this thread from e.g. the kevent call out to deliver a boosting
3083 * message.
3084 */
3085 tl->th_flags |= TH_LIST_REMOVING_VOUCHER;
3086 workqueue_unlock(wq);
3087 if (pthread_kern->thread_will_park_or_terminate) {
3088 pthread_kern->thread_will_park_or_terminate(tl->th_thread);
3089 }
3090 __assert_only kern_return_t kr;
3091 kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
3092 assert(kr == KERN_SUCCESS);
3093 workqueue_lock_spin(wq);
3094 tl->th_flags &= ~(TH_LIST_REMOVING_VOUCHER);
3095
3096 if ((tl->th_flags & TH_LIST_RUNNING) == 0) {
3097 if (thidlecount < 101) {
3098 us_to_wait = wq_reduce_pool_window_usecs - ((thidlecount-2) * (wq_reduce_pool_window_usecs / 100));
3099 } else {
3100 us_to_wait = wq_reduce_pool_window_usecs / 100;
3101 }
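/*
 * The park timeout shrinks linearly as the idle pool grows. For example,
 * if wq_reduce_pool_window_usecs were (hypothetically) 5000000: with 2 idle
 * threads we wait the full 5s, with 52 about 2.5s, and with 101 or more
 * only 1% of the window (50ms).
 */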
3102
3103 thread_set_pending_block_hint(thread, kThreadWaitParkedWorkQueue);
3104 assert_wait_timeout_with_leeway((caddr_t)tl, (THREAD_INTERRUPTIBLE),
3105 TIMEOUT_URGENCY_SYS_BACKGROUND|TIMEOUT_URGENCY_LEEWAY, us_to_wait,
3106 wq_reduce_pool_window_usecs/10, NSEC_PER_USEC);
3107
3108 workqueue_unlock(wq);
3109
3110 thread_block(wq_unpark_continue);
3111 panic("thread_block(wq_unpark_continue) returned!");
3112 } else {
3113 workqueue_unlock(wq);
3114
3115 /*
3116 * While we had the lock dropped to unset our voucher, someone came
3117 * around and made us runnable. But because we weren't waiting on the
3118 * event their wakeup() was ineffectual. To correct for that, we just
3119 * run the continuation ourselves.
3120 */
3121 wq_unpark_continue(NULL, THREAD_AWAKENED);
3122 }
3123 }
3124
3125 static bool
3126 may_start_constrained_thread(struct workqueue *wq, uint32_t at_priclass,
3127 struct threadlist *tl, bool may_start_timer)
3128 {
3129 uint32_t req_qos = _wq_thactive_best_constrained_req_qos(wq);
3130 wq_thactive_t thactive;
3131
3132 if (may_start_timer && at_priclass < req_qos) {
3133 /*
3134 * When called from workqueue_run_threadreq_and_unlock(), pre-post the
3135 * new, higher priority into the thactive state so that
3136 * workqueue_callback() makes the right decision.
3137 *
3138 * If the admission check passes, workqueue_run_threadreq_and_unlock
3139 * will reset this value before running the request.
3140 */
3141 thactive = _wq_thactive_set_best_constrained_req_qos(wq, req_qos,
3142 at_priclass);
3143 #ifdef __LP64__
3144 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update, 1, (uint64_t)thactive,
3145 (uint64_t)(thactive >> 64), 0, 0);
3146 #endif
3147 } else {
3148 thactive = _wq_thactive(wq);
3149 }
3150
3151 uint32_t constrained_threads = wq->wq_constrained_threads_scheduled;
3152 if (tl && (tl->th_flags & TH_LIST_CONSTRAINED)) {
3153 /*
3154 * don't count the current thread as scheduled
3155 */
3156 constrained_threads--;
3157 }
3158 if (constrained_threads >= wq_max_constrained_threads) {
3159 PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 1,
3160 wq->wq_constrained_threads_scheduled,
3161 wq_max_constrained_threads, 0);
3162 /*
3163 * we need 1 or more constrained threads to return to the kernel before
3164 * we can dispatch additional work
3165 */
3166 return false;
3167 }
3168
3169 /*
3170 * Compute a metric for how many threads are active. We find the
3171 * highest priority request outstanding and then add up the number of
3172 * active threads in that and all higher-priority buckets. We'll also add
3173 * any "busy" threads which are not active but blocked recently enough that
3174 * we can't be sure they've gone idle yet. We'll then compare this metric
3175 * to our max concurrency to decide whether to add a new thread.
3176 */
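/*
 * For example (hypothetical numbers): if wq_max_concurrency[at_priclass] is
 * 4, then 3 active threads at this QoS and above plus 1 recently-blocked
 * "busy" thread add up to 4, which is not < 4, so the request is refused
 * (and the delayed timer may be re-armed below to retry once the busy
 * thread is known to be idle or running again).
 */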
3177
3178 uint32_t busycount, thactive_count;
3179
3180 thactive_count = _wq_thactive_aggregate_downto_qos(wq, thactive,
3181 at_priclass, &busycount, NULL);
3182
3183 if (tl && tl->th_priority <= at_priclass) {
3184 /*
3185 * don't count this thread as currently active
3186 */
3187 assert(thactive_count > 0);
3188 thactive_count--;
3189 }
3190
3191 if (thactive_count + busycount < wq_max_concurrency[at_priclass]) {
3192 PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 2,
3193 thactive_count, busycount, 0);
3194 return true;
3195 } else {
3196 PTHREAD_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 3,
3197 thactive_count, busycount, 0);
3198 }
3199
3200 if (busycount && may_start_timer) {
3201 /*
3202 * If this is called from the add timer, we won't have another timer
3203 * fire when the thread exits the "busy" state, so rearm the timer.
3204 */
3205 if (WQ_TIMER_DELAYED_NEEDED(wq)) {
3206 workqueue_interval_timer_start(wq);
3207 }
3208 }
3209
3210 return false;
3211 }
3212
3213 static struct threadlist *
3214 pop_from_thidlelist(struct workqueue *wq, uint32_t priclass)
3215 {
3216 assert(wq->wq_thidlecount);
3217
3218 struct threadlist *tl = NULL;
3219
3220 if (!TAILQ_EMPTY(&wq->wq_thidlemgrlist) &&
3221 (priclass == WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlelist))){
3222 tl = TAILQ_FIRST(&wq->wq_thidlemgrlist);
3223 TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
3224 assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
3225 } else if (!TAILQ_EMPTY(&wq->wq_thidlelist) &&
3226 (priclass != WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlemgrlist))){
3227 tl = TAILQ_FIRST(&wq->wq_thidlelist);
3228 TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
3229 assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
3230 } else {
3231 panic("pop_from_thidlelist called with no threads available");
3232 }
3233 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
3234
3235 assert(wq->wq_thidlecount);
3236 wq->wq_thidlecount--;
3237
3238 TAILQ_INSERT_TAIL(&wq->wq_thrunlist, tl, th_entry);
3239
3240 tl->th_flags |= TH_LIST_RUNNING | TH_LIST_BUSY;
3241
3242 wq->wq_threads_scheduled++;
3243 wq->wq_thscheduled_count[priclass]++;
3244 _wq_thactive_inc(wq, priclass);
3245 return tl;
3246 }
3247
3248 static pthread_priority_t
3249 pthread_priority_from_wq_class_index(struct workqueue *wq, int index)
3250 {
3251 if (index == WORKQUEUE_EVENT_MANAGER_BUCKET){
3252 return wq->wq_event_manager_priority;
3253 } else {
3254 return class_index_get_pthread_priority(index);
3255 }
3256 }
3257
3258 static void
3259 reset_priority(struct threadlist *tl, pthread_priority_t pri)
3260 {
3261 kern_return_t ret;
3262 thread_t th = tl->th_thread;
3263
3264 if ((pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){
3265 ret = pthread_kern->thread_set_workq_qos(th, pthread_priority_get_thread_qos(pri), 0);
3266 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3267
3268 if (tl->th_flags & TH_LIST_EVENT_MGR_SCHED_PRI) {
3269
3270 /* Reset priority to default (masked by QoS) */
3271
3272 ret = pthread_kern->thread_set_workq_pri(th, 31, POLICY_TIMESHARE);
3273 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3274
3275 tl->th_flags &= ~TH_LIST_EVENT_MGR_SCHED_PRI;
3276 }
3277 } else {
3278 ret = pthread_kern->thread_set_workq_qos(th, THREAD_QOS_UNSPECIFIED, 0);
3279 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3280 ret = pthread_kern->thread_set_workq_pri(th, (pri & (~_PTHREAD_PRIORITY_FLAGS_MASK)), POLICY_TIMESHARE);
3281 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
3282
3283 tl->th_flags |= TH_LIST_EVENT_MGR_SCHED_PRI;
3284 }
3285 }
3286
3287 /*
3288 * Picks the best request to run, and returns the best overcommit fallback
3289 * if the best pick is non overcommit and risks failing its admission check.
3290 */
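/*
 * The scan below starts with the event manager request, then walks the
 * buckets from index 0 upward: the first overcommit request found ends the
 * scan, either as the pick itself or as *fallback when a higher-priority
 * constrained request was already picked.
 */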
3291 static struct threadreq *
3292 workqueue_best_threadreqs(struct workqueue *wq, struct threadlist *tl,
3293 struct threadreq **fallback)
3294 {
3295 struct threadreq *req, *best_req = NULL;
3296 int priclass, prilimit;
3297
3298 if ((wq->wq_event_manager_threadreq.tr_state == TR_STATE_WAITING) &&
3299 ((wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0) ||
3300 (tl && tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET))) {
3301 /*
3302 * There's an event manager request and either:
3303 * - no event manager currently running
3304 * - we are re-using the event manager
3305 */
3306 req = &wq->wq_event_manager_threadreq;
3307 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select | DBG_FUNC_NONE, wq, req, 1, 0, 0);
3308 return req;
3309 }
3310
3311 if (tl) {
3312 prilimit = WORKQUEUE_EVENT_MANAGER_BUCKET;
3313 } else {
3314 prilimit = _wq_highest_paced_priority(wq);
3315 }
3316 for (priclass = 0; priclass < prilimit; priclass++) {
3317 req = TAILQ_FIRST(&wq->wq_overcommit_reqlist[priclass]);
3318 if (req) {
3319 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select | DBG_FUNC_NONE, wq, req, 2, 0, 0);
3320 if (best_req) {
3321 *fallback = req;
3322 } else {
3323 best_req = req;
3324 }
3325 break;
3326 }
3327 if (!best_req) {
3328 best_req = TAILQ_FIRST(&wq->wq_reqlist[priclass]);
3329 if (best_req) {
3330 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_req_select | DBG_FUNC_NONE, wq, best_req, 3, 0, 0);
3331 }
3332 }
3333 }
3334 return best_req;
3335 }
3336
3337 /**
3338 * Runs a thread request on a thread
3339 *
3340 * - if thread is THREAD_NULL, will find a thread and run the request there.
3341 * Otherwise, the thread must be the current thread.
3342 *
3343 * - if req is NULL, will find the highest priority request and run that. If
3344 * it is not NULL, it must be a threadreq object in state NEW. If it can not
3345 * be run immediately, it will be enqueued and moved to state WAITING.
3346 *
3347 * Either way, the thread request object serviced will be moved to state
3348 * PENDING and attached to the threadlist.
3349 *
3350 * Should be called with the workqueue lock held. Will drop it.
3351 *
3352 * WARNING: _workq_kevent_reqthreads needs to be able to preflight any
3353 * admission checks in this function. If you are changing this function,
3354 * keep that one up-to-date.
3355 *
3356 * - if parking_tl is non-NULL, then the current thread is parking. This will
3357 * try to reuse that thread for a request. If no match is found, it will be
3358 * parked.
3359 */
3360 static int
3361 workqueue_run_threadreq_and_unlock(proc_t p, struct workqueue *wq,
3362 struct threadlist *parking_tl, struct threadreq *req,
3363 bool may_add_new_thread)
3364 {
3365 struct threadreq *incoming_req = req;
3366
3367 struct threadlist *tl = parking_tl;
3368 int rc = WQ_RUN_TR_THROTTLED;
3369
3370 assert(tl == NULL || tl->th_thread == current_thread());
3371 assert(req == NULL || req->tr_state == TR_STATE_NEW);
3372 assert(!may_add_new_thread || !tl);
3373
3374 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq | DBG_FUNC_START, wq, req,
3375 tl ? thread_tid(tl->th_thread) : 0,
3376 req ? (req->tr_priority << 16 | req->tr_flags) : 0, 0);
3377
3378 /*
3379 * Special cases when provided an event manager request
3380 */
3381 if (req && req->tr_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
3382 // Clients must not rely on identity of event manager requests
3383 assert(req->tr_flags & TR_FLAG_ONSTACK);
3384 // You can't be both overcommit and event manager
3385 assert((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0);
3386
3387 /*
3388 * We can only ever have one event manager request, so coalesce them if
3389 * there's already one outstanding.
3390 */
3391 if (wq->wq_event_manager_threadreq.tr_state == TR_STATE_WAITING) {
3392 PTHREAD_TRACE_WQ_REQ(TRACE_wq_run_threadreq_mgr_merge | DBG_FUNC_NONE, wq, req, 0, 0, 0);
3393
3394 struct threadreq *existing_req = &wq->wq_event_manager_threadreq;
3395 if (req->tr_flags & TR_FLAG_KEVENT) {
3396 existing_req->tr_flags |= TR_FLAG_KEVENT;
3397 }
3398
3399 req = existing_req;
3400 incoming_req = NULL;
3401 }
3402
3403 if (wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] &&
3404 (!tl || tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET)){
3405 /*
3406 * There can only be one event manager running at a time.
3407 */
3408 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 1, 0, 0, 0);
3409 goto done;
3410 }
3411 }
3412
3413 again: // Start again after creating a thread
3414
3415 if (_wq_exiting(wq)) {
3416 rc = WQ_RUN_TR_EXITING;
3417 goto exiting;
3418 }
3419
3420 /*
3421 * Thread request selection and admission control
3422 */
3423 struct threadreq *fallback = NULL;
3424 if (req) {
3425 if ((req->tr_flags & TR_FLAG_NO_PACING) == 0 &&
3426 _wq_should_pace_priority(wq, req->tr_priority)) {
3427 /*
3428 * If a request fails the pacing admission check, then thread
3429 * requests are redriven when the pacing thread is finally scheduled
3430 * and calls _wq_pacing_end() in wq_unpark_continue().
3431 */
3432 goto done;
3433 }
3434 } else if (wq->wq_reqcount == 0) {
3435 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 2, 0, 0, 0);
3436 goto done;
3437 } else if ((req = workqueue_best_threadreqs(wq, tl, &fallback)) == NULL) {
3438 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 3, 0, 0, 0);
3439 goto done;
3440 }
3441
3442 if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0 &&
3443 (req->tr_priority < WORKQUEUE_EVENT_MANAGER_BUCKET)) {
3444 if (!may_start_constrained_thread(wq, req->tr_priority, parking_tl, true)) {
3445 if (!fallback) {
3446 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 4, 0, 0, 0);
3447 goto done;
3448 }
3449 assert(req->tr_state == TR_STATE_WAITING);
3450 req = fallback;
3451 }
3452 }
3453
3454 /*
3455 * Thread selection.
3456 */
3457 if (parking_tl) {
3458 if (tl->th_priority != req->tr_priority) {
3459 _wq_thactive_move(wq, tl->th_priority, req->tr_priority);
3460 wq->wq_thscheduled_count[tl->th_priority]--;
3461 wq->wq_thscheduled_count[req->tr_priority]++;
3462 }
3463 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select | DBG_FUNC_NONE,
3464 wq, 1, thread_tid(tl->th_thread), 0, 0);
3465 } else if (wq->wq_thidlecount) {
3466 tl = pop_from_thidlelist(wq, req->tr_priority);
3467 /*
3468 * This call will update wq_thscheduled_count and wq_thactive_count for
3469 * the provided priority. It will not set the returned thread to that
3470 * priority. This matches the behavior of the parking_tl clause above.
3471 */
3472 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select | DBG_FUNC_NONE,
3473 wq, 2, thread_tid(tl->th_thread), 0, 0);
3474 } else /* no idle threads */ {
3475 if (!may_add_new_thread || wq->wq_nthreads >= wq_max_threads) {
3476 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 5,
3477 may_add_new_thread, wq->wq_nthreads, 0);
3478 if (wq->wq_nthreads < wq_max_threads) {
3479 rc = WQ_RUN_TR_THREAD_NEEDED;
3480 }
3481 goto done;
3482 }
3483
3484 bool added_thread = workqueue_addnewthread(p, wq);
3485 /*
3486 * workqueue_addnewthread will drop and re-take the lock, so we
3487 * need to ensure we still have a cached request.
3488 *
3489 * It also means we have to pick a new request, since our old pick may
3490 * not be valid anymore.
3491 */
3492 req = incoming_req;
3493 if (req && (req->tr_flags & TR_FLAG_ONSTACK)) {
3494 _threadreq_copy_prepare(wq);
3495 }
3496
3497 if (added_thread) {
3498 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq_thread_select | DBG_FUNC_NONE,
3499 wq, 3, 0, 0, 0);
3500 goto again;
3501 } else if (_wq_exiting(wq)) {
3502 rc = WQ_RUN_TR_EXITING;
3503 goto exiting;
3504 } else {
3505 PTHREAD_TRACE_WQ(TRACE_wq_run_threadreq | DBG_FUNC_END, wq, 6, 0, 0, 0);
3506 /*
3507 * Something caused thread creation to fail. Kick off the timer in
3508 * the hope that it'll succeed next time.
3509 */
3510 if (WQ_TIMER_DELAYED_NEEDED(wq)) {
3511 workqueue_interval_timer_start(wq);
3512 }
3513 goto done;
3514 }
3515 }
3516
3517 /*
3518 * Setup thread, mark request as complete and run with it.
3519 */
3520 if (req->tr_state == TR_STATE_WAITING) {
3521 _threadreq_dequeue(wq, req);
3522 }
3523 if (tl->th_priority != req->tr_priority) {
3524 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority | DBG_FUNC_NONE,
3525 wq, thread_tid(tl->th_thread),
3526 (tl->th_priority << 16) | req->tr_priority, 1, 0);
3527 reset_priority(tl, pthread_priority_from_wq_class_index(wq, req->tr_priority));
3528 tl->th_priority = (uint8_t)req->tr_priority;
3529 }
3530 if (req->tr_flags & TR_FLAG_OVERCOMMIT) {
3531 if ((tl->th_flags & TH_LIST_CONSTRAINED) != 0) {
3532 tl->th_flags &= ~TH_LIST_CONSTRAINED;
3533 wq->wq_constrained_threads_scheduled--;
3534 }
3535 } else {
3536 if ((tl->th_flags & TH_LIST_CONSTRAINED) == 0) {
3537 tl->th_flags |= TH_LIST_CONSTRAINED;
3538 wq->wq_constrained_threads_scheduled++;
3539 }
3540 }
3541
3542 if (!parking_tl && !(req->tr_flags & TR_FLAG_NO_PACING)) {
3543 _wq_pacing_start(wq, tl);
3544 }
3545 if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) {
3546 uint32_t old_qos, new_qos;
3547
3548 /*
3549 * If we are scheduling a constrained thread request, we may need to
3550 * update the best constrained qos in the thactive atomic state.
3551 */
3552 for (new_qos = 0; new_qos < WQ_THACTIVE_NO_PENDING_REQUEST; new_qos++) {
3553 if (TAILQ_FIRST(&wq->wq_reqlist[new_qos]))
3554 break;
3555 }
3556 old_qos = _wq_thactive_best_constrained_req_qos(wq);
3557 if (old_qos != new_qos) {
3558 wq_thactive_t v = _wq_thactive_set_best_constrained_req_qos(wq,
3559 old_qos, new_qos);
3560 #ifdef __LP64__
3561 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update, 2, (uint64_t)v,
3562 (uint64_t)(v >> 64), 0, 0);
3563 #else
3564 PTHREAD_TRACE_WQ(TRACE_wq_thactive_update, 2, v, 0, 0, 0);
3565 #endif
3566 }
3567 }
3568 {
3569 uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI;
3570 if (req->tr_flags & TR_FLAG_OVERCOMMIT)
3571 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
3572 if (req->tr_flags & TR_FLAG_KEVENT)
3573 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
3574 if (req->tr_flags & TR_FLAG_WORKLOOP)
3575 upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
3576 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET)
3577 upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
3578 tl->th_upcall_flags = upcall_flags >> WQ_FLAG_THREAD_PRIOSHIFT;
3579 }
3580 if (req->tr_flags & TR_FLAG_KEVENT) {
3581 tl->th_flags |= TH_LIST_KEVENT;
3582 } else {
3583 tl->th_flags &= ~TH_LIST_KEVENT;
3584 }
3585 return _threadreq_complete_and_unlock(p, wq, req, tl);
3586
3587 done:
3588 if (incoming_req) {
3589 _threadreq_enqueue(wq, incoming_req);
3590 }
3591
3592 exiting:
3593
3594 if (parking_tl && !(parking_tl->th_flags & TH_LIST_UNBINDING)) {
3595 parkit(wq, parking_tl, parking_tl->th_thread);
3596 __builtin_unreachable();
3597 }
3598
3599 workqueue_unlock(wq);
3600
3601 return rc;
3602 }
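/*
 * Minimal caller sketch (hypothetical; `req` stands for a threadreq already
 * prepared in state NEW). The workqueue lock must be held on entry and is
 * always dropped by the call, whatever the return code:
 *
 *	workqueue_lock_spin(wq);
 *	int rc = workqueue_run_threadreq_and_unlock(p, wq, NULL, req, false);
 *	// the lock is no longer held here
 *	if (rc == WQ_RUN_TR_EXITING) {
 *		// the workqueue is being torn down; the request was not enqueued
 *	} else if (rc == WQ_RUN_TR_THREAD_NEEDED) {
 *		// no idle thread and this call was not allowed to create one
 *	} else {
 *		// WQ_RUN_TR_THROTTLED (request parked as WAITING) or the
 *		// request was handed off to a thread
 *	}
 */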
3603
3604 /**
3605 * parked thread wakes up
3606 */
3607 static void __dead2
3608 wq_unpark_continue(void* __unused ptr, wait_result_t wait_result)
3609 {
3610 boolean_t first_use = false;
3611 thread_t th = current_thread();
3612 proc_t p = current_proc();
3613
3614 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
3615 if (uth == NULL) goto done;
3616
3617 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
3618 if (wq == NULL) goto done;
3619
3620 workqueue_lock_spin(wq);
3621
3622 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
3623 assert(tl != WQ_THREADLIST_EXITING_POISON);
3624 if (tl == NULL) {
3625 /*
3626 * We woke up before addnewthread() was finished setting us up. Go
3627 * ahead and exit, but before we do, poison the threadlist variable so
3628 * that addnewthread() doesn't still think we are valid.
3629 */
3630 pthread_kern->uthread_set_threadlist(uth, WQ_THREADLIST_EXITING_POISON);
3631 workqueue_unlock(wq);
3632 goto done;
3633 }
3634
3635 assert(tl->th_flags & TH_LIST_INITED);
3636
3637 if ((tl->th_flags & TH_LIST_NEW)){
3638 tl->th_flags &= ~(TH_LIST_NEW);
3639 first_use = true;
3640 }
3641
3642 if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) {
3643 /*
3644 * The normal wakeup path.
3645 */
3646 goto return_to_user;
3647 }
3648
3649 if ((tl->th_flags & TH_LIST_RUNNING) == 0 &&
3650 wait_result == THREAD_TIMED_OUT &&
3651 tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET &&
3652 TAILQ_FIRST(&wq->wq_thidlemgrlist) == tl &&
3653 TAILQ_NEXT(tl, th_entry) == NULL){
3654 /*
3655 * If we are the only idle manager and we popped for self-destruction,
3656 * then don't actually exit. Instead, free our stack to save some
3657 * memory and re-park.
3658 */
3659
3660 workqueue_unlock(wq);
3661
3662 vm_map_t vmap = wq->wq_map;
3663
3664 // Keep this in sync with _setup_wqthread()
3665 const vm_size_t guardsize = vm_map_page_size(vmap);
3666 const user_addr_t freeaddr = (user_addr_t)tl->th_stackaddr + guardsize;
3667 const vm_map_offset_t freesize = vm_map_trunc_page_mask((PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET) - 1, vm_map_page_mask(vmap)) - guardsize;
3668
3669 int kr;
3670 kr = mach_vm_behavior_set(vmap, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
3671 assert(kr == KERN_SUCCESS || kr == KERN_INVALID_ADDRESS);
3672
3673 workqueue_lock_spin(wq);
3674
3675 if ( !(tl->th_flags & TH_LIST_RUNNING)) {
3676 thread_set_pending_block_hint(th, kThreadWaitParkedWorkQueue);
3677 assert_wait((caddr_t)tl, (THREAD_INTERRUPTIBLE));
3678
3679 workqueue_unlock(wq);
3680
3681 thread_block(wq_unpark_continue);
3682 __builtin_unreachable();
3683 }
3684 }
3685
3686 if ((tl->th_flags & TH_LIST_RUNNING) == 0) {
3687 assert((tl->th_flags & TH_LIST_BUSY) == 0);
3688 if (!first_use) {
3689 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_END, wq, 0, 0, 0, 0);
3690 }
3691 /*
3692 * We were set running, but not for the purposes of actually running.
3693 * This could be because the timer elapsed. Or it could be because the
3694 * thread aborted. Either way, we need to return to userspace to exit.
3695 *
3696 * The call to workqueue_removethread will consume the lock.
3697 */
3698
3699 if (!first_use &&
3700 (tl->th_priority < qos_class_get_class_index(WQ_THREAD_CLEANUP_QOS) ||
3701 (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET))) {
3702 // Reset the QoS to something low for the pthread cleanup
3703 PTHREAD_TRACE_WQ(TRACE_wq_thread_reset_priority | DBG_FUNC_NONE,
3704 wq, thread_tid(th),
3705 (tl->th_priority << 16) | qos_class_get_class_index(WQ_THREAD_CLEANUP_QOS), 3, 0);
3706 pthread_priority_t cleanup_pri = _pthread_priority_make_newest(WQ_THREAD_CLEANUP_QOS, 0, 0);
3707 reset_priority(tl, cleanup_pri);
3708 }
3709
3710 workqueue_removethread(tl, 0, first_use);
3711
3712 if (first_use){
3713 pthread_kern->thread_bootstrap_return();
3714 } else {
3715 pthread_kern->unix_syscall_return(0);
3716 }
3717 __builtin_unreachable();
3718 }
3719
3720 /*
3721 * The timer woke us up or the thread was aborted. However, we have
3722 * already started to make this a runnable thread. Wait for that to
3723 * finish, then continue to userspace.
3724 */
3725 while ((tl->th_flags & TH_LIST_BUSY)) {
3726 assert_wait((caddr_t)tl, (THREAD_UNINT));
3727
3728 workqueue_unlock(wq);
3729
3730 thread_block(THREAD_CONTINUE_NULL);
3731
3732 workqueue_lock_spin(wq);
3733 }
3734
3735 return_to_user:
3736 if (!first_use) {
3737 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_END, wq, 0, 0, 0, 0);
3738 }
3739 if (_wq_pacing_end(wq, tl) && wq->wq_reqcount) {
3740 workqueue_run_threadreq_and_unlock(p, wq, NULL, NULL, true);
3741 } else {
3742 workqueue_unlock(wq);
3743 }
3744 _setup_wqthread(p, th, wq, tl, first_use ? WQ_SETUP_FIRST_USE : 0);
3745 pthread_kern->thread_sched_call(th, workqueue_callback);
3746 done:
3747 if (first_use){
3748 pthread_kern->thread_bootstrap_return();
3749 } else {
3750 pthread_kern->unix_syscall_return(EJUSTRETURN);
3751 }
3752 panic("Our attempt to return to userspace failed...");
3753 }
3754
3755 /**
3756 * Configures the initial thread stack/registers to jump into:
3757 * _pthread_wqthread(pthread_t self, mach_port_t kport, void *stackaddr, void *keventlist, int upcall_flags, int nkevents);
3758 * To get there we jump through assembly stubs in pthread_asm.s. Those
3759 * routines set up a stack frame, using the current stack pointer, and marshal
3760 * arguments from registers to the stack as required by the ABI.
3761 *
3762 * One odd thing we do here is to start the pthread_t 4k below what would be the
3763 * top of the stack otherwise. This is because usually only the first 4k of the
3764 * pthread_t will be used and so we want to put it on the same 16k page as the
3765 * top of the stack to save memory.
3766 *
3767 * When we are done the stack will look like:
3768 * |-----------| th_stackaddr + th_allocsize
3769 * |pthread_t | th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET
3770 * |kevent list| optionally - at most WQ_KEVENT_LIST_LEN events
3771 * |kevent data| optionally - at most WQ_KEVENT_DATA_SIZE bytes
3772 * |stack gap | bottom aligned to 16 bytes, and at least as big as stack_gap_min
3773 * | STACK |
3774 * | ⇓ |
3775 * | |
3776 * |guard page | guardsize
3777 * |-----------| th_stackaddr
3778 */
3779 void
3780 _setup_wqthread(proc_t p, thread_t th, struct workqueue *wq,
3781 struct threadlist *tl, int setup_flags)
3782 {
3783 int error;
3784 if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) {
3785 /*
3786 * For preemption reasons, we want to reset the voucher as late as
3787 * possible, so we do it in two places:
3788 * - Just before parking (i.e. in parkit())
3789 * - Prior to doing the setup for the next workitem (i.e. here)
3790 *
3791 * Those two places are sufficient to ensure we always reset it before
3792 * it goes back out to user space, but be careful to not break that
3793 * guarantee.
3794 */
3795 __assert_only kern_return_t kr;
3796 kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
3797 assert(kr == KERN_SUCCESS);
3798 }
3799
3800 uint32_t upcall_flags = tl->th_upcall_flags << WQ_FLAG_THREAD_PRIOSHIFT;
3801 if (!(setup_flags & WQ_SETUP_FIRST_USE)) {
3802 upcall_flags |= WQ_FLAG_THREAD_REUSE;
3803 }
3804
3805 /*
3806 * Put the QoS class value into the lower bits of the reuse_thread register; this is where
3807 * the thread priority used to be stored anyway.
3808 */
3809 pthread_priority_t priority = pthread_priority_from_wq_class_index(wq, tl->th_priority);
3810 upcall_flags |= (_pthread_priority_get_qos_newest(priority) & WQ_FLAG_THREAD_PRIOMASK);
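/*
 * For illustration (assuming the userspace side of this contract): the
 * thread recovers its QoS with (upcall_flags & WQ_FLAG_THREAD_PRIOMASK),
 * while the WQ_FLAG_THREAD_* bits proper all sit at or above
 * WQ_FLAG_THREAD_PRIOSHIFT, which is why th_upcall_flags could be stored
 * shifted down in workqueue_run_threadreq_and_unlock().
 */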
3811
3812 const vm_size_t guardsize = vm_map_page_size(tl->th_workq->wq_map);
3813 const vm_size_t stack_gap_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_REDZONE_LEN;
3814 const vm_size_t stack_align_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_STK_ALIGN;
3815
3816 user_addr_t pthread_self_addr = (user_addr_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET);
3817 user_addr_t stack_top_addr = (user_addr_t)((pthread_self_addr - stack_gap_min) & -stack_align_min);
3818 user_addr_t stack_bottom_addr = (user_addr_t)(tl->th_stackaddr + guardsize);
3819
3820 user_addr_t wqstart_fnptr = pthread_kern->proc_get_wqthread(p);
3821 if (!wqstart_fnptr) {
3822 panic("workqueue thread start function pointer is NULL");
3823 }
3824
3825 if (setup_flags & WQ_SETUP_FIRST_USE) {
3826 uint32_t tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
3827 if (tsd_offset) {
3828 mach_vm_offset_t th_tsd_base = (mach_vm_offset_t)pthread_self_addr + tsd_offset;
3829 kern_return_t kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
3830 if (kret == KERN_SUCCESS) {
3831 upcall_flags |= WQ_FLAG_THREAD_TSD_BASE_SET;
3832 }
3833 }
3834
3835 /*
3836 * Pre-fault the first page of the new thread's stack and the page that will
3837 * contain the pthread_t structure.
3838 */
3839 vm_map_t vmap = pthread_kern->current_map();
3840 if (vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)) !=
3841 vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap))){
3842 vm_fault( vmap,
3843 vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)),
3844 VM_PROT_READ | VM_PROT_WRITE,
3845 FALSE,
3846 THREAD_UNINT, NULL, 0);
3847 }
3848 vm_fault( vmap,
3849 vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap)),
3850 VM_PROT_READ | VM_PROT_WRITE,
3851 FALSE,
3852 THREAD_UNINT, NULL, 0);
3853 }
3854
3855 user_addr_t kevent_list = NULL;
3856 int kevent_count = 0;
3857 if (upcall_flags & WQ_FLAG_THREAD_KEVENT){
3858 bool workloop = upcall_flags & WQ_FLAG_THREAD_WORKLOOP;
3859
3860 kevent_list = pthread_self_addr - WQ_KEVENT_LIST_LEN * sizeof(struct kevent_qos_s);
3861 kevent_count = WQ_KEVENT_LIST_LEN;
3862
3863 user_addr_t kevent_id_addr = kevent_list;
3864 if (workloop) {
3865 /*
3866 * The kevent ID goes just below the kevent list. Sufficiently new
3867 * userspace will know to look there. Old userspace will just
3868 * ignore it.
3869 */
3870 kevent_id_addr -= sizeof(kqueue_id_t);
3871 }
3872
3873 user_addr_t kevent_data_buf = kevent_id_addr - WQ_KEVENT_DATA_SIZE;
3874 user_size_t kevent_data_available = WQ_KEVENT_DATA_SIZE;
3875
3876 int32_t events_out = 0;
3877
3878 assert(tl->th_flags & TH_LIST_KEVENT_BOUND);
3879 unsigned int flags = KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE;
3880 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
3881 flags |= KEVENT_FLAG_WORKQ_MANAGER;
3882 }
3883 int ret = 0;
3884 if (workloop) {
3885 flags |= KEVENT_FLAG_WORKLOOP;
3886 kqueue_id_t kevent_id = -1;
3887 ret = kevent_id_internal(p, &kevent_id,
3888 NULL, 0, kevent_list, kevent_count,
3889 kevent_data_buf, &kevent_data_available,
3890 flags, &events_out);
3891 copyout(&kevent_id, kevent_id_addr, sizeof(kevent_id));
3892 } else {
3893 flags |= KEVENT_FLAG_WORKQ;
3894 ret = kevent_qos_internal(p,
3895 class_index_get_thread_qos(tl->th_priority),
3896 NULL, 0, kevent_list, kevent_count,
3897 kevent_data_buf, &kevent_data_available,
3898 flags, &events_out);
3899 }
3900
3901 // squash any errors into just empty output
3902 if (ret != KERN_SUCCESS || events_out == -1){
3903 events_out = 0;
3904 kevent_data_available = WQ_KEVENT_DATA_SIZE;
3905 }
3906
3907 // We shouldn't get data out if there aren't events available
3908 assert(events_out != 0 || kevent_data_available == WQ_KEVENT_DATA_SIZE);
3909
3910 if (events_out > 0){
3911 if (kevent_data_available == WQ_KEVENT_DATA_SIZE){
3912 stack_top_addr = (kevent_id_addr - stack_gap_min) & -stack_align_min;
3913 } else {
3914 stack_top_addr = (kevent_data_buf + kevent_data_available - stack_gap_min) & -stack_align_min;
3915 }
3916
3917 kevent_count = events_out;
3918 } else {
3919 kevent_list = NULL;
3920 kevent_count = 0;
3921 }
3922 }
3923
3924 PTHREAD_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START, wq, 0, 0, 0, 0);
3925
3926 #if defined(__i386__) || defined(__x86_64__)
3927 if (proc_is64bit(p) == 0) {
3928 x86_thread_state32_t state = {
3929 .eip = (unsigned int)wqstart_fnptr,
3930 .eax = /* arg0 */ (unsigned int)pthread_self_addr,
3931 .ebx = /* arg1 */ (unsigned int)tl->th_thport,
3932 .ecx = /* arg2 */ (unsigned int)stack_bottom_addr,
3933 .edx = /* arg3 */ (unsigned int)kevent_list,
3934 .edi = /* arg4 */ (unsigned int)upcall_flags,
3935 .esi = /* arg5 */ (unsigned int)kevent_count,
3936
3937 .esp = (int)((vm_offset_t)stack_top_addr),
3938 };
3939
3940 error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
3941 if (error != KERN_SUCCESS) {
3942 panic("%s: thread_set_wq_state failed: %d", __func__, error);
3943 }
3944 } else {
3945 x86_thread_state64_t state64 = {
3946 // x86-64 already passes all the arguments in registers, so we just put them in their final place here
3947 .rip = (uint64_t)wqstart_fnptr,
3948 .rdi = (uint64_t)pthread_self_addr,
3949 .rsi = (uint64_t)tl->th_thport,
3950 .rdx = (uint64_t)stack_bottom_addr,
3951 .rcx = (uint64_t)kevent_list,
3952 .r8 = (uint64_t)upcall_flags,
3953 .r9 = (uint64_t)kevent_count,
3954
3955 .rsp = (uint64_t)(stack_top_addr)
3956 };
3957
3958 error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
3959 if (error != KERN_SUCCESS) {
3960 panic("%s: thread_set_wq_state failed: %d", __func__, error);
3961 }
3962 }
3963 #else
3964 #error setup_wqthread not defined for this architecture
3965 #endif
3966 }
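/*
 * Worked example of the address arithmetic above (a sketch only; the
 * concrete guardsize, PTH_DEFAULT_STACKSIZE and PTHREAD_T_OFFSET values are
 * platform dependent):
 *
 *	stack_bottom_addr = th_stackaddr + guardsize
 *	pthread_self_addr = th_stackaddr + PTH_DEFAULT_STACKSIZE
 *	                                 + guardsize + PTHREAD_T_OFFSET
 *	kevent_list       = pthread_self_addr
 *	                    - WQ_KEVENT_LIST_LEN * sizeof(struct kevent_qos_s)
 *	kevent_data_buf   = kevent_list [- sizeof(kqueue_id_t)] - WQ_KEVENT_DATA_SIZE
 *	stack_top_addr    = (pthread_self_addr - stack_gap_min) & -stack_align_min,
 *	                    lowered below the kevent area when events are returned
 *
 * i.e. the pthread_t sits just above the initial stack pointer, and the
 * kevent list/data are carved out of the same region immediately below it.
 */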
3967
3968 #if DEBUG
3969 static int wq_kevent_test SYSCTL_HANDLER_ARGS {
3970 //(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
3971 #pragma unused(oidp, arg1, arg2)
3972 int error;
3973 struct workq_reqthreads_req_s requests[64] = {};
3974
3975 if (req->newlen > sizeof(requests) || req->newlen < sizeof(struct workq_reqthreads_req_s))
3976 return EINVAL;
3977
3978 error = copyin(req->newptr, requests, req->newlen);
3979 if (error) return error;
3980
3981 _workq_reqthreads(req->p, (int)(req->newlen / sizeof(struct workq_reqthreads_req_s)), requests);
3982
3983 return 0;
3984 }
3985 #endif // DEBUG
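/*
 * Hedged usage sketch for the DEBUG-only handler above (the
 * "debug.wq_kevent_test" name is assumed from the sysctl__debug_wq_kevent_test
 * OID registered in _pthread_init() below):
 *
 *	struct workq_reqthreads_req_s reqs[2] = { };  // fill in the desired requests
 *	(void)sysctlbyname("debug.wq_kevent_test", NULL, NULL, reqs, sizeof(reqs));
 *	// newlen must be between one and 64 requests' worth of bytes
 */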
3986
3987 #pragma mark - Misc
3988
3989 int
3990 _fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
3991 {
3992 struct workqueue * wq;
3993 int error = 0;
3994 int activecount;
3995
3996 if ((wq = pthread_kern->proc_get_wqptr(p)) == NULL) {
3997 return EINVAL;
3998 }
3999
4000 /*
4001 * This is sometimes called from interrupt context by the kperf sampler.
4002 * In that case, it's not safe to spin trying to take the lock since we
4003 * might already hold it. So, we just try-lock it and error out if it's
4004 * already held. Since this is just a debugging aid, and all our callers
4005 * are able to handle an error, that's fine.
4006 */
4007 bool locked = workqueue_lock_try(wq);
4008 if (!locked) {
4009 return EBUSY;
4010 }
4011
4012 activecount = _wq_thactive_aggregate_downto_qos(wq, _wq_thactive(wq),
4013 WORKQUEUE_NUM_BUCKETS - 1, NULL, NULL);
4014 pwqinfo->pwq_nthreads = wq->wq_nthreads;
4015 pwqinfo->pwq_runthreads = activecount;
4016 pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
4017 pwqinfo->pwq_state = 0;
4018
4019 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
4020 pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
4021 }
4022
4023 if (wq->wq_nthreads >= wq_max_threads) {
4024 pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
4025 }
4026
4027 workqueue_unlock(wq);
4028 return(error);
4029 }
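/*
 * Illustrative caller sketch (hypothetical): because of the try-lock above,
 * callers must treat EBUSY as "no data this time" rather than retrying.
 *
 *	struct proc_workqueueinfo pwqinfo;
 *	bzero(&pwqinfo, sizeof(pwqinfo));
 *	if (_fill_procworkqueue(p, &pwqinfo) == 0) {
 *		// pwq_nthreads / pwq_runthreads / pwq_blockedthreads / pwq_state
 *		// form a consistent snapshot taken under the workqueue lock
 *	}
 */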
4030
4031 uint32_t
4032 _get_pwq_state_kdp(proc_t p)
4033 {
4034 if (p == NULL) {
4035 return 0;
4036 }
4037
4038 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
4039
4040 if (wq == NULL || workqueue_lock_spin_is_acquired_kdp(wq)) {
4041 return 0;
4042 }
4043
4044 uint32_t pwq_state = WQ_FLAGS_AVAILABLE;
4045
4046 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
4047 pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
4048 }
4049
4050 if (wq->wq_nthreads >= wq_max_threads) {
4051 pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
4052 }
4053
4054 return pwq_state;
4055 }
4056
4057 int
4058 _thread_selfid(__unused struct proc *p, uint64_t *retval)
4059 {
4060 thread_t thread = current_thread();
4061 *retval = thread_tid(thread);
4062 return KERN_SUCCESS;
4063 }
4064
4065 void
4066 _pthread_init(void)
4067 {
4068 pthread_lck_grp_attr = lck_grp_attr_alloc_init();
4069 pthread_lck_grp = lck_grp_alloc_init("pthread", pthread_lck_grp_attr);
4070
4071 /*
4072 * allocate the lock attribute for pthread synchronizers
4073 */
4074 pthread_lck_attr = lck_attr_alloc_init();
4075
4076 pthread_list_mlock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);
4077
4078 pth_global_hashinit();
4079 psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL);
4080 psynch_zoneinit();
4081
4082 pthread_zone_workqueue = zinit(sizeof(struct workqueue),
4083 1024 * sizeof(struct workqueue), 8192, "pthread.workqueue");
4084 pthread_zone_threadlist = zinit(sizeof(struct threadlist),
4085 1024 * sizeof(struct threadlist), 8192, "pthread.threadlist");
4086 pthread_zone_threadreq = zinit(sizeof(struct threadreq),
4087 1024 * sizeof(struct threadreq), 8192, "pthread.threadreq");
4088
4089 /*
4090 * register sysctls
4091 */
4092 sysctl_register_oid(&sysctl__kern_wq_stalled_window_usecs);
4093 sysctl_register_oid(&sysctl__kern_wq_reduce_pool_window_usecs);
4094 sysctl_register_oid(&sysctl__kern_wq_max_timer_interval_usecs);
4095 sysctl_register_oid(&sysctl__kern_wq_max_threads);
4096 sysctl_register_oid(&sysctl__kern_wq_max_constrained_threads);
4097 sysctl_register_oid(&sysctl__kern_pthread_debug_tracing);
4098
4099 #if DEBUG
4100 sysctl_register_oid(&sysctl__debug_wq_kevent_test);
4101 #endif
4102
4103 for (int i = 0; i < WORKQUEUE_NUM_BUCKETS; i++) {
4104 uint32_t thread_qos = _wq_bucket_to_thread_qos(i);
4105 wq_max_concurrency[i] = pthread_kern->qos_max_parallelism(thread_qos,
4106 QOS_PARALLELISM_COUNT_LOGICAL);
4107 }
4108 wq_max_concurrency[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1;
4109 }