1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995-2005 Apple Computer, Inc. All Rights Reserved */
29 /*
30 * pthread_synch.c
31 */
32
33 #pragma mark - Front Matter
34
35 #define _PTHREAD_CONDATTR_T
36 #define _PTHREAD_COND_T
37 #define _PTHREAD_MUTEXATTR_T
38 #define _PTHREAD_MUTEX_T
39 #define _PTHREAD_RWLOCKATTR_T
40 #define _PTHREAD_RWLOCK_T
41
42 #undef pthread_mutexattr_t
43 #undef pthread_mutex_t
44 #undef pthread_condattr_t
45 #undef pthread_cond_t
46 #undef pthread_rwlockattr_t
47 #undef pthread_rwlock_t
48
49 #include <sys/cdefs.h>
50
51 // <rdar://problem/26158937> panic() should be marked noreturn
52 extern void panic(const char *string, ...) __printflike(1,2) __dead2;
53
54 #include <sys/param.h>
55 #include <sys/queue.h>
56 #include <sys/resourcevar.h>
57 //#include <sys/proc_internal.h>
58 #include <sys/kauth.h>
59 #include <sys/systm.h>
60 #include <sys/timeb.h>
61 #include <sys/times.h>
62 #include <sys/acct.h>
63 #include <sys/kernel.h>
64 #include <sys/wait.h>
65 #include <sys/signalvar.h>
66 #include <sys/sysctl.h>
67 #include <sys/syslog.h>
68 #include <sys/stat.h>
69 #include <sys/lock.h>
70 #include <sys/kdebug.h>
71 //#include <sys/sysproto.h>
72 #include <sys/vm.h>
73 #include <sys/user.h> /* for coredump */
74 #include <sys/proc_info.h> /* for fill_procworkqueue */
75
76 #include <mach/mach_port.h>
77 #include <mach/mach_types.h>
78 #include <mach/semaphore.h>
79 #include <mach/sync_policy.h>
80 #include <mach/task.h>
81 #include <mach/vm_prot.h>
82 #include <kern/kern_types.h>
83 #include <kern/task.h>
84 #include <kern/clock.h>
85 #include <mach/kern_return.h>
86 #include <kern/thread.h>
87 #include <kern/sched_prim.h>
88 #include <kern/kalloc.h>
89 #include <kern/sched_prim.h> /* for thread_exception_return */
90 #include <kern/processor.h>
91 #include <kern/assert.h>
92 #include <mach/mach_vm.h>
93 #include <mach/mach_param.h>
94 #include <mach/thread_status.h>
95 #include <mach/thread_policy.h>
96 #include <mach/message.h>
97 #include <mach/port.h>
98 //#include <vm/vm_protos.h>
99 #include <vm/vm_fault.h>
100 #include <vm/vm_map.h>
101 #include <mach/thread_act.h> /* for thread_resume */
102 #include <machine/machine_routines.h>
103 #include <mach/shared_region.h>
104
105 #include <libkern/OSAtomic.h>
106 #include <libkern/libkern.h>
107
108 #include <sys/pthread_shims.h>
109 #include "kern_internal.h"
110
111 // XXX: Dirty import for sys/signalvar.h that's wrapped in BSD_KERNEL_PRIVATE
112 #define sigcantmask (sigmask(SIGKILL) | sigmask(SIGSTOP))
113
114 // XXX: Ditto for thread tags from kern/thread.h
115 #define THREAD_TAG_MAINTHREAD 0x1
116 #define THREAD_TAG_PTHREAD 0x10
117 #define THREAD_TAG_WORKQUEUE 0x20
118
119 lck_grp_attr_t *pthread_lck_grp_attr;
120 lck_grp_t *pthread_lck_grp;
121 lck_attr_t *pthread_lck_attr;
122
123 extern void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64);
124 extern void workqueue_thread_yielded(void);
125
126 enum run_nextreq_mode {
127 RUN_NEXTREQ_DEFAULT,
128 RUN_NEXTREQ_DEFAULT_KEVENT,
129 RUN_NEXTREQ_OVERCOMMIT,
130 RUN_NEXTREQ_OVERCOMMIT_KEVENT,
131 RUN_NEXTREQ_DEFERRED_OVERCOMMIT,
132 RUN_NEXTREQ_UNCONSTRAINED,
133 RUN_NEXTREQ_EVENT_MANAGER,
134 RUN_NEXTREQ_ADD_TIMER
135 };
136 static thread_t workqueue_run_nextreq(proc_t p, struct workqueue *wq, thread_t th,
137 enum run_nextreq_mode mode, pthread_priority_t prio,
138 bool kevent_bind_via_return);
139
140 static boolean_t workqueue_run_one(proc_t p, struct workqueue *wq, boolean_t overcommit, pthread_priority_t priority);
141
142 static void wq_runreq(proc_t p, thread_t th, struct workqueue *wq,
143 struct threadlist *tl, boolean_t return_directly, boolean_t deferred_kevent);
144
145 static void _setup_wqthread(proc_t p, thread_t th, struct workqueue *wq, struct threadlist *tl, bool first_use);
146
147 static void reset_priority(struct threadlist *tl, pthread_priority_t pri);
148 static pthread_priority_t pthread_priority_from_wq_class_index(struct workqueue *wq, int index);
149
150 static void wq_unpark_continue(void* ptr, wait_result_t wait_result) __dead2;
151
152 static boolean_t workqueue_addnewthread(struct workqueue *wq, boolean_t ignore_constrained_thread_limit);
153
154 static void workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use);
155 static void workqueue_lock_spin(struct workqueue *);
156 static void workqueue_unlock(struct workqueue *);
157
158 static boolean_t may_start_constrained_thread(struct workqueue *wq, uint32_t at_priclass, uint32_t my_priclass, boolean_t *start_timer);
159
160 static mach_vm_offset_t stack_addr_hint(proc_t p, vm_map_t vmap);
161
162 int proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc);
163 int proc_setalltargetconc(pid_t pid, int32_t * targetconcp);
164
165 #define WQ_MAXPRI_MIN 0 /* low prio queue num */
166 #define WQ_MAXPRI_MAX 2 /* max prio queuenum */
167 #define WQ_PRI_NUM 3 /* number of prio work queues */
168
169 #define C_32_STK_ALIGN 16
170 #define C_64_STK_ALIGN 16
171 #define C_64_REDZONE_LEN 128
172
173 #define PTHREAD_T_OFFSET 0
174
175 /*
176 * Flags field passed to bsdthread_create and back in pthread_start
177 31 <---------------------------------> 0
178 _________________________________________
179 | flags(8) | policy(8) | importance(16) |
180 -----------------------------------------
181 */
182
183 #define PTHREAD_START_CUSTOM 0x01000000
184 #define PTHREAD_START_SETSCHED 0x02000000
185 #define PTHREAD_START_DETACHED 0x04000000
186 #define PTHREAD_START_QOSCLASS 0x08000000
187 #define PTHREAD_START_TSD_BASE_SET 0x10000000
188 #define PTHREAD_START_QOSCLASS_MASK 0x00ffffff
189 #define PTHREAD_START_POLICY_BITSHIFT 16
190 #define PTHREAD_START_POLICY_MASK 0xff
191 #define PTHREAD_START_IMPORTANCE_MASK 0xffff
192
193 #define SCHED_OTHER POLICY_TIMESHARE
194 #define SCHED_FIFO POLICY_FIFO
195 #define SCHED_RR POLICY_RR
196
197 #define BASEPRI_DEFAULT 31
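/*
 * Illustrative sketch (not part of the original source): how a caller such as
 * pthread's userspace layer might pack the flags word described in the layout
 * comment above. The `policy` and `importance` values here are hypothetical.
 *
 *     uint32_t flags = PTHREAD_START_SETSCHED
 *         | ((policy & PTHREAD_START_POLICY_MASK) << PTHREAD_START_POLICY_BITSHIFT)
 *         | (importance & PTHREAD_START_IMPORTANCE_MASK);
 *
 * _bsdthread_create() below reverses this: it extracts the policy with the
 * shift and mask, and converts the low 16 importance bits into a precedence
 * relative to BASEPRI_DEFAULT.
 */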
198
199 #pragma mark sysctls
200
201 uint32_t wq_yielded_threshold = WQ_YIELDED_THRESHOLD;
202 uint32_t wq_yielded_window_usecs = WQ_YIELDED_WINDOW_USECS;
203 uint32_t wq_stalled_window_usecs = WQ_STALLED_WINDOW_USECS;
204 uint32_t wq_reduce_pool_window_usecs = WQ_REDUCE_POOL_WINDOW_USECS;
205 uint32_t wq_max_timer_interval_usecs = WQ_MAX_TIMER_INTERVAL_USECS;
206 uint32_t wq_max_threads = WORKQUEUE_MAXTHREADS;
207 uint32_t wq_max_constrained_threads = WORKQUEUE_MAXTHREADS / 8;
208 uint32_t wq_max_concurrency = 1; // set to ncpus on load
209
210 SYSCTL_INT(_kern, OID_AUTO, wq_yielded_threshold, CTLFLAG_RW | CTLFLAG_LOCKED,
211 &wq_yielded_threshold, 0, "");
212
213 SYSCTL_INT(_kern, OID_AUTO, wq_yielded_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
214 &wq_yielded_window_usecs, 0, "");
215
216 SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
217 &wq_stalled_window_usecs, 0, "");
218
219 SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
220 &wq_reduce_pool_window_usecs, 0, "");
221
222 SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
223 &wq_max_timer_interval_usecs, 0, "");
224
225 SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
226 &wq_max_threads, 0, "");
227
228 SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
229 &wq_max_constrained_threads, 0, "");
230
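/*
 * Illustrative userspace sketch (not part of the original source): the
 * tunables above are exported under the "kern" sysctl namespace, so, for
 * example, the workqueue thread-pool ceiling can be inspected with
 * sysctlbyname(3) from a program that includes <sys/sysctl.h>:
 *
 *     uint32_t max_threads = 0;
 *     size_t len = sizeof(max_threads);
 *     if (sysctlbyname("kern.wq_max_threads", &max_threads, &len, NULL, 0) == 0) {
 *         printf("wq_max_threads = %u\n", max_threads);
 *     }
 */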
231 #ifdef DEBUG
232 SYSCTL_INT(_kern, OID_AUTO, wq_max_concurrency, CTLFLAG_RW | CTLFLAG_LOCKED,
233 &wq_max_concurrency, 0, "");
234
235 static int wq_kevent_test SYSCTL_HANDLER_ARGS;
236 SYSCTL_PROC(_debug, OID_AUTO, wq_kevent_test, CTLFLAG_MASKED | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLTYPE_OPAQUE, NULL, 0, wq_kevent_test, 0, "-");
237 #endif
238
239 static uint32_t wq_init_constrained_limit = 1;
240
241 uint32_t pthread_debug_tracing = 1;
242
243 SYSCTL_INT(_kern, OID_AUTO, pthread_debug_tracing, CTLFLAG_RW | CTLFLAG_LOCKED,
244 &pthread_debug_tracing, 0, "");
245
246
247 #pragma mark - Process/Thread Setup/Teardown syscalls
248
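/**
 * Compute a randomized base-address hint for pthread stack allocations.
 * The ASLR offset and the base it is applied to depend on the architecture
 * and on whether the process is 64-bit; the result is truncated to the
 * target map's page mask.
 */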
249 static mach_vm_offset_t
250 stack_addr_hint(proc_t p, vm_map_t vmap)
251 {
252 mach_vm_offset_t stackaddr;
253 mach_vm_offset_t aslr_offset;
254 bool proc64bit = proc_is64bit(p);
255
256 // We can't safely take random values % something unless it's a power of two
257 _Static_assert(powerof2(PTH_DEFAULT_STACKSIZE), "PTH_DEFAULT_STACKSIZE is a power-of-two");
258
259 #if defined(__i386__) || defined(__x86_64__)
260 if (proc64bit) {
261 // Matches vm_map_get_max_aslr_slide_pages's image shift in xnu
262 aslr_offset = random() % (1 << 28); // about 512 stacks
263 } else {
264 // Actually bigger than the image shift, we've got ~256MB to work with
265 aslr_offset = random() % (16 * PTH_DEFAULT_STACKSIZE);
266 }
267 aslr_offset = vm_map_trunc_page_mask(aslr_offset, vm_map_page_mask(vmap));
268 if (proc64bit) {
269 // Above nanomalloc range (see NANOZONE_SIGNATURE)
270 stackaddr = 0x700000000000 + aslr_offset;
271 } else {
272 stackaddr = SHARED_REGION_BASE_I386 + SHARED_REGION_SIZE_I386 + aslr_offset;
273 }
274 #elif defined(__arm__) || defined(__arm64__)
275 // vm_map_get_max_aslr_slide_pages ensures 1MB of slide; we do better
276 aslr_offset = random() % ((proc64bit ? 4 : 2) * PTH_DEFAULT_STACKSIZE);
277 aslr_offset = vm_map_trunc_page_mask((vm_map_offset_t)aslr_offset, vm_map_page_mask(vmap));
278 if (proc64bit) {
279 // 64 stacks below nanomalloc (see NANOZONE_SIGNATURE)
280 stackaddr = 0x170000000 - 64 * PTH_DEFAULT_STACKSIZE - aslr_offset;
281 } else {
282 // If you try to slide down from this point, you risk ending up in memory consumed by malloc
283 stackaddr = SHARED_REGION_BASE_ARM - 32 * PTH_DEFAULT_STACKSIZE + aslr_offset;
284 }
285 #else
286 #error Need to define a stack address hint for this architecture
287 #endif
288 return stackaddr;
289 }
290
291 /**
292 * bsdthread_create system call. Used by pthread_create.
293 */
294 int
295 _bsdthread_create(struct proc *p, user_addr_t user_func, user_addr_t user_funcarg, user_addr_t user_stack, user_addr_t user_pthread, uint32_t flags, user_addr_t *retval)
296 {
297 kern_return_t kret;
298 void * sright;
299 int error = 0;
300 int allocated = 0;
301 mach_vm_offset_t stackaddr;
302 mach_vm_size_t th_allocsize = 0;
303 mach_vm_size_t th_guardsize;
304 mach_vm_offset_t th_stack;
305 mach_vm_offset_t th_pthread;
306 mach_vm_offset_t th_tsd_base;
307 mach_port_name_t th_thport;
308 thread_t th;
309 vm_map_t vmap = pthread_kern->current_map();
310 task_t ctask = current_task();
311 unsigned int policy, importance;
312 uint32_t tsd_offset;
313
314 int isLP64 = 0;
315
316 if (pthread_kern->proc_get_register(p) == 0) {
317 return EINVAL;
318 }
319
320 PTHREAD_TRACE(TRACE_pthread_thread_create | DBG_FUNC_START, flags, 0, 0, 0, 0);
321
322 isLP64 = proc_is64bit(p);
323 th_guardsize = vm_map_page_size(vmap);
324
325 stackaddr = pthread_kern->proc_get_stack_addr_hint(p);
326 kret = pthread_kern->thread_create(ctask, &th);
327 if (kret != KERN_SUCCESS)
328 return(ENOMEM);
329 thread_reference(th);
330
331 pthread_kern->thread_set_tag(th, THREAD_TAG_PTHREAD);
332
333 sright = (void *)pthread_kern->convert_thread_to_port(th);
334 th_thport = pthread_kern->ipc_port_copyout_send(sright, pthread_kern->task_get_ipcspace(ctask));
335
336 if ((flags & PTHREAD_START_CUSTOM) == 0) {
337 mach_vm_size_t pthread_size =
338 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(vmap));
339 th_allocsize = th_guardsize + user_stack + pthread_size;
340 user_stack += PTHREAD_T_OFFSET;
341
342 kret = mach_vm_map(vmap, &stackaddr,
343 th_allocsize,
344 page_size-1,
345 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE , NULL,
346 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
347 VM_INHERIT_DEFAULT);
348 if (kret != KERN_SUCCESS){
349 kret = mach_vm_allocate(vmap,
350 &stackaddr, th_allocsize,
351 VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE);
352 }
353 if (kret != KERN_SUCCESS) {
354 error = ENOMEM;
355 goto out;
356 }
357
358 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, th_allocsize, stackaddr, 0, 2, 0);
359
360 allocated = 1;
361 /*
362 * The guard page is at the lowest address
363 * The stack base is the highest address
364 */
365 kret = mach_vm_protect(vmap, stackaddr, th_guardsize, FALSE, VM_PROT_NONE);
366
367 if (kret != KERN_SUCCESS) {
368 error = ENOMEM;
369 goto out1;
370 }
371
372 th_pthread = stackaddr + th_guardsize + user_stack;
373 th_stack = th_pthread;
374
375 /*
376 * Pre-fault the first page of the new thread's stack and the page that will
377 * contain the pthread_t structure.
378 */
379 if (vm_map_trunc_page_mask((vm_map_offset_t)(th_stack - C_64_REDZONE_LEN), vm_map_page_mask(vmap)) !=
380 vm_map_trunc_page_mask((vm_map_offset_t)th_pthread, vm_map_page_mask(vmap))){
381 vm_fault( vmap,
382 vm_map_trunc_page_mask((vm_map_offset_t)(th_stack - C_64_REDZONE_LEN), vm_map_page_mask(vmap)),
383 VM_PROT_READ | VM_PROT_WRITE,
384 FALSE,
385 THREAD_UNINT, NULL, 0);
386 }
387
388 vm_fault( vmap,
389 vm_map_trunc_page_mask((vm_map_offset_t)th_pthread, vm_map_page_mask(vmap)),
390 VM_PROT_READ | VM_PROT_WRITE,
391 FALSE,
392 THREAD_UNINT, NULL, 0);
393
394 } else {
395 th_stack = user_stack;
396 th_pthread = user_pthread;
397
398 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, 0, 0, 0, 3, 0);
399 }
400
401 tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
402 if (tsd_offset) {
403 th_tsd_base = th_pthread + tsd_offset;
404 kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
405 if (kret == KERN_SUCCESS) {
406 flags |= PTHREAD_START_TSD_BASE_SET;
407 }
408 }
409
410 #if defined(__i386__) || defined(__x86_64__)
411 /*
412 * Set up i386 registers & function call.
413 */
414 if (isLP64 == 0) {
415 x86_thread_state32_t state = {
416 .eip = (unsigned int)pthread_kern->proc_get_threadstart(p),
417 .eax = (unsigned int)th_pthread,
418 .ebx = (unsigned int)th_thport,
419 .ecx = (unsigned int)user_func,
420 .edx = (unsigned int)user_funcarg,
421 .edi = (unsigned int)user_stack,
422 .esi = (unsigned int)flags,
423 /*
424 * set stack pointer
425 */
426 .esp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN))
427 };
428
429 error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
430 if (error != KERN_SUCCESS) {
431 error = EINVAL;
432 goto out;
433 }
434 } else {
435 x86_thread_state64_t state64 = {
436 .rip = (uint64_t)pthread_kern->proc_get_threadstart(p),
437 .rdi = (uint64_t)th_pthread,
438 .rsi = (uint64_t)(th_thport),
439 .rdx = (uint64_t)user_func,
440 .rcx = (uint64_t)user_funcarg,
441 .r8 = (uint64_t)user_stack,
442 .r9 = (uint64_t)flags,
443 /*
444 * set stack pointer aligned to 16 byte boundary
445 */
446 .rsp = (uint64_t)(th_stack - C_64_REDZONE_LEN)
447 };
448
449 error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
450 if (error != KERN_SUCCESS) {
451 error = EINVAL;
452 goto out;
453 }
454
455 }
456 #elif defined(__arm__)
457 arm_thread_state_t state = {
458 .pc = (int)pthread_kern->proc_get_threadstart(p),
459 .r[0] = (unsigned int)th_pthread,
460 .r[1] = (unsigned int)th_thport,
461 .r[2] = (unsigned int)user_func,
462 .r[3] = (unsigned int)user_funcarg,
463 .r[4] = (unsigned int)user_stack,
464 .r[5] = (unsigned int)flags,
465
466 /* Set r7 & lr to 0 for better back tracing */
467 .r[7] = 0,
468 .lr = 0,
469
470 /*
471 * set stack pointer
472 */
473 .sp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN))
474 };
475
476 (void) pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
477
478 #else
479 #error bsdthread_create not defined for this architecture
480 #endif
481
482 if ((flags & PTHREAD_START_SETSCHED) != 0) {
483 /* Set scheduling parameters if needed */
484 thread_extended_policy_data_t extinfo;
485 thread_precedence_policy_data_t precedinfo;
486
487 importance = (flags & PTHREAD_START_IMPORTANCE_MASK);
488 policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;
489
490 if (policy == SCHED_OTHER) {
491 extinfo.timeshare = 1;
492 } else {
493 extinfo.timeshare = 0;
494 }
495
496 thread_policy_set(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT);
497
498 precedinfo.importance = (importance - BASEPRI_DEFAULT);
499 thread_policy_set(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT);
500 } else if ((flags & PTHREAD_START_QOSCLASS) != 0) {
501 /* Set thread QoS class if requested. */
502 pthread_priority_t priority = (pthread_priority_t)(flags & PTHREAD_START_QOSCLASS_MASK);
503
504 thread_qos_policy_data_t qos;
505 qos.qos_tier = pthread_priority_get_thread_qos(priority);
506 qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 :
507 _pthread_priority_get_relpri(priority);
508
509 pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
510 }
511
512 kret = pthread_kern->thread_resume(th);
513 if (kret != KERN_SUCCESS) {
514 error = EINVAL;
515 goto out1;
516 }
517 thread_deallocate(th); /* drop the creator reference */
518
519 PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_END, error, th_pthread, 0, 0, 0);
520
521 // cast required as mach_vm_offset_t is always 64 bits even on 32-bit platforms
522 *retval = (user_addr_t)th_pthread;
523
524 return(0);
525
526 out1:
527 if (allocated != 0) {
528 (void)mach_vm_deallocate(vmap, stackaddr, th_allocsize);
529 }
530 out:
531 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(ctask), th_thport);
532 (void)thread_terminate(th);
533 (void)thread_deallocate(th);
534 return(error);
535 }
536
537 /**
538 * bsdthread_terminate system call. Used by pthread_terminate
539 */
540 int
541 _bsdthread_terminate(__unused struct proc *p,
542 user_addr_t stackaddr,
543 size_t size,
544 uint32_t kthport,
545 uint32_t sem,
546 __unused int32_t *retval)
547 {
548 mach_vm_offset_t freeaddr;
549 mach_vm_size_t freesize;
550 kern_return_t kret;
551 thread_t th = current_thread();
552
553 freeaddr = (mach_vm_offset_t)stackaddr;
554 freesize = size;
555
556 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_START, freeaddr, freesize, kthport, 0xff, 0);
557
558 if ((freesize != (mach_vm_size_t)0) && (freeaddr != (mach_vm_offset_t)0)) {
559 if (pthread_kern->thread_get_tag(th) & THREAD_TAG_MAINTHREAD){
560 vm_map_t user_map = pthread_kern->current_map();
561 freesize = vm_map_trunc_page_mask((vm_map_offset_t)freesize - 1, vm_map_page_mask(user_map));
562 kret = mach_vm_behavior_set(user_map, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
563 assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
564 kret = kret ? kret : mach_vm_protect(user_map, freeaddr, freesize, FALSE, VM_PROT_NONE);
565 assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
566 } else {
567 kret = mach_vm_deallocate(pthread_kern->current_map(), freeaddr, freesize);
568 if (kret != KERN_SUCCESS) {
569 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);
570 return(EINVAL);
571 }
572 }
573 }
574
575 (void) thread_terminate(th);
576 if (sem != MACH_PORT_NULL) {
577 kret = pthread_kern->semaphore_signal_internal_trap(sem);
578 if (kret != KERN_SUCCESS) {
579 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);
580 return(EINVAL);
581 }
582 }
583
584 if (kthport != MACH_PORT_NULL) {
585 pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(current_task()), kthport);
586 }
587
588 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0, 0, 0, 0);
589
590 pthread_kern->thread_exception_return();
591 panic("bsdthread_terminate: still running\n");
592
593 PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0xff, 0, 0, 0);
594
595 return(0);
596 }
597
598 /**
599 * bsdthread_register system call. Performs per-process setup. Responsible for
600 * returning capability bits to userspace and receiving userspace function addresses.
601 */
602 int
603 _bsdthread_register(struct proc *p,
604 user_addr_t threadstart,
605 user_addr_t wqthread,
606 int pthsize,
607 user_addr_t pthread_init_data,
608 user_addr_t pthread_init_data_size,
609 uint64_t dispatchqueue_offset,
610 int32_t *retval)
611 {
612 /* We have to do this first so that it resets after fork */
613 pthread_kern->proc_set_stack_addr_hint(p, (user_addr_t)stack_addr_hint(p, pthread_kern->current_map()));
614
615 /* prevent multiple registrations */
616 if (pthread_kern->proc_get_register(p) != 0) {
617 return(EINVAL);
618 }
619 /* syscall randomizer test can pass bogus values */
620 if (pthsize < 0 || pthsize > MAX_PTHREAD_SIZE) {
621 return(EINVAL);
622 }
623 pthread_kern->proc_set_threadstart(p, threadstart);
624 pthread_kern->proc_set_wqthread(p, wqthread);
625 pthread_kern->proc_set_pthsize(p, pthsize);
626 pthread_kern->proc_set_register(p);
627
628 /* if we have pthread_init_data, then we use that and target_concptr (which is an offset) to get the data. */
629 if (pthread_init_data != 0) {
630 thread_qos_policy_data_t qos;
631
632 struct _pthread_registration_data data = {};
633 size_t pthread_init_sz = MIN(sizeof(struct _pthread_registration_data), (size_t)pthread_init_data_size);
634
635 kern_return_t kr = copyin(pthread_init_data, &data, pthread_init_sz);
636 if (kr != KERN_SUCCESS) {
637 return EINVAL;
638 }
639
640 /* Incoming data from the data structure */
641 pthread_kern->proc_set_dispatchqueue_offset(p, data.dispatch_queue_offset);
642 if (data.version > offsetof(struct _pthread_registration_data, tsd_offset)
643 && data.tsd_offset < (uint32_t)pthsize) {
644 pthread_kern->proc_set_pthread_tsd_offset(p, data.tsd_offset);
645 }
646
647 /* Outgoing data that userspace expects as a reply */
648 data.version = sizeof(struct _pthread_registration_data);
649 if (pthread_kern->qos_main_thread_active()) {
650 mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
651 boolean_t gd = FALSE;
652
653 kr = pthread_kern->thread_policy_get(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
654 if (kr != KERN_SUCCESS || qos.qos_tier == THREAD_QOS_UNSPECIFIED) {
655 /* An unspecified QoS means the kernel wants us to impose the legacy QoS class on the thread. */
656 qos.qos_tier = THREAD_QOS_LEGACY;
657 qos.tier_importance = 0;
658
659 kr = pthread_kern->thread_policy_set_internal(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
660 }
661
662 if (kr == KERN_SUCCESS) {
663 data.main_qos = thread_qos_get_pthread_priority(qos.qos_tier);
664 } else {
665 data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);
666 }
667 } else {
668 data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);
669 }
670
671 kr = copyout(&data, pthread_init_data, pthread_init_sz);
672 if (kr != KERN_SUCCESS) {
673 return EINVAL;
674 }
675 } else {
676 pthread_kern->proc_set_dispatchqueue_offset(p, dispatchqueue_offset);
677 }
678
679 /* return the supported feature set as the return value. */
680 *retval = PTHREAD_FEATURE_SUPPORTED;
681
682 return(0);
683 }
684
685 #pragma mark - QoS Manipulation
686
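/**
 * bsdthread_ctl(BSDTHREAD_CTL_SET_QOS): copy the requested pthread_priority_t
 * in from the thread's TSD slot and apply it through the SET_SELF path.
 * Only the calling thread may be targeted.
 */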
687 int
688 _bsdthread_ctl_set_qos(struct proc *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t tsd_priority_addr, user_addr_t arg3, int *retval)
689 {
690 kern_return_t kr;
691 thread_t th;
692
693 pthread_priority_t priority;
694
695 /* Unused parameters must be zero. */
696 if (arg3 != 0) {
697 return EINVAL;
698 }
699
700 /* QoS is stored in a given slot in the pthread TSD. We need to copy that in and set our QoS based on it. */
701 if (proc_is64bit(p)) {
702 uint64_t v;
703 kr = copyin(tsd_priority_addr, &v, sizeof(v));
704 if (kr != KERN_SUCCESS) {
705 return kr;
706 }
707 priority = (int)(v & 0xffffffff);
708 } else {
709 uint32_t v;
710 kr = copyin(tsd_priority_addr, &v, sizeof(v));
711 if (kr != KERN_SUCCESS) {
712 return kr;
713 }
714 priority = v;
715 }
716
717 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
718 return ESRCH;
719 }
720
721 /* <rdar://problem/16211829> Disable pthread_set_qos_class_np() on threads other than pthread_self */
722 if (th != current_thread()) {
723 thread_deallocate(th);
724 return EPERM;
725 }
726
727 int rv = _bsdthread_ctl_set_self(p, 0, priority, 0, _PTHREAD_SET_SELF_QOS_FLAG, retval);
728
729 /* Static-param the thread: we just set QoS on it, so it's stuck in QoS land now. */
730 /* pthread_kern->thread_static_param(th, TRUE); */ // see <rdar://problem/16433744>, for details
731
732 thread_deallocate(th);
733
734 return rv;
735 }
736
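/**
 * Return the workqueue threadlist entry for a thread, or NULL if the thread
 * is not a workqueue thread.
 */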
737 static inline struct threadlist *
738 util_get_thread_threadlist_entry(thread_t th)
739 {
740 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
741 if (uth) {
742 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
743 return tl;
744 }
745 return NULL;
746 }
747
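/**
 * bsdthread_ctl(BSDTHREAD_CTL_SET_SELF): apply some combination of QoS class,
 * voucher, and fixed-priority/timeshare changes to the calling thread, as
 * selected by flags. Each sub-operation records its own error; they are
 * combined at the end.
 */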
748 int
749 _bsdthread_ctl_set_self(struct proc *p, user_addr_t __unused cmd, pthread_priority_t priority, mach_port_name_t voucher, _pthread_set_flags_t flags, int __unused *retval)
750 {
751 thread_qos_policy_data_t qos;
752 mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
753 boolean_t gd = FALSE;
754 bool was_manager_thread = false;
755 thread_t th = current_thread();
756 struct workqueue *wq = NULL;
757 struct threadlist *tl = NULL;
758
759 kern_return_t kr;
760 int qos_rv = 0, voucher_rv = 0, fixedpri_rv = 0;
761
762 if ((flags & _PTHREAD_SET_SELF_WQ_KEVENT_UNBIND) != 0) {
763 tl = util_get_thread_threadlist_entry(th);
764 if (tl) {
765 wq = tl->th_workq;
766 } else {
767 goto qos;
768 }
769
770 workqueue_lock_spin(wq);
771 if (tl->th_flags & TH_LIST_KEVENT_BOUND) {
772 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
773 unsigned int kevent_flags = KEVENT_FLAG_WORKQ;
774 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
775 kevent_flags |= KEVENT_FLAG_WORKQ_MANAGER;
776 }
777
778 workqueue_unlock(wq);
779 kevent_qos_internal_unbind(p, class_index_get_thread_qos(tl->th_priority), th, kevent_flags);
780 } else {
781 workqueue_unlock(wq);
782 }
783 }
784
785 qos:
786 if ((flags & _PTHREAD_SET_SELF_QOS_FLAG) != 0) {
787 kr = pthread_kern->thread_policy_get(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
788 if (kr != KERN_SUCCESS) {
789 qos_rv = EINVAL;
790 goto voucher;
791 }
792
793 /* If we have main-thread QoS then we don't allow a thread to come out of QOS_CLASS_UNSPECIFIED. */
794 if (pthread_kern->qos_main_thread_active() && qos.qos_tier == THREAD_QOS_UNSPECIFIED) {
795 qos_rv = EPERM;
796 goto voucher;
797 }
798
799 /* Get the work queue for tracing, and also the threadlist for bucket manipulation. */
800 if (!tl) {
801 tl = util_get_thread_threadlist_entry(th);
802 if (tl) wq = tl->th_workq;
803 }
804
805 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self | DBG_FUNC_START, wq, qos.qos_tier, qos.tier_importance, 0, 0);
806
807 qos.qos_tier = pthread_priority_get_thread_qos(priority);
808 qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 : _pthread_priority_get_relpri(priority);
809
810 if (qos.qos_tier == QOS_CLASS_UNSPECIFIED) {
811 qos_rv = EINVAL;
812 goto voucher;
813 }
814
815 /* If we're a workqueue thread, the threadlist item priority needs adjusting, along with the bucket we were running in. */
816 if (tl) {
817 workqueue_lock_spin(wq);
818 bool now_under_constrained_limit = false;
819
820 assert(!(tl->th_flags & TH_LIST_KEVENT_BOUND));
821
822 kr = pthread_kern->thread_set_workq_qos(th, qos.qos_tier, qos.tier_importance);
823 assert(kr == KERN_SUCCESS || kr == KERN_TERMINATED);
824
825 /* Fix up counters. */
826 uint8_t old_bucket = tl->th_priority;
827 uint8_t new_bucket = pthread_priority_get_class_index(priority);
828 if (old_bucket == WORKQUEUE_EVENT_MANAGER_BUCKET) {
829 was_manager_thread = true;
830 }
831
832 uint32_t old_active = OSAddAtomic(-1, &wq->wq_thactive_count[old_bucket]);
833 OSAddAtomic(1, &wq->wq_thactive_count[new_bucket]);
834
835 wq->wq_thscheduled_count[old_bucket]--;
836 wq->wq_thscheduled_count[new_bucket]++;
837
838 bool old_overcommit = !(tl->th_flags & TH_LIST_CONSTRAINED);
839 bool new_overcommit = priority & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
840 if (!old_overcommit && new_overcommit) {
841 wq->wq_constrained_threads_scheduled--;
842 tl->th_flags &= ~TH_LIST_CONSTRAINED;
843 if (wq->wq_constrained_threads_scheduled == wq_max_constrained_threads - 1) {
844 now_under_constrained_limit = true;
845 }
846 } else if (old_overcommit && !new_overcommit) {
847 wq->wq_constrained_threads_scheduled++;
848 tl->th_flags |= TH_LIST_CONSTRAINED;
849 }
850
851 tl->th_priority = new_bucket;
852
853 /* If we were at the ceiling of threads for a given bucket, we have
854 * to reevaluate whether we should start more work.
855 */
856 if (old_active == wq->wq_reqconc[old_bucket] || now_under_constrained_limit) {
857 /* workqueue_run_nextreq will drop the workqueue lock in all exit paths. */
858 (void)workqueue_run_nextreq(p, wq, THREAD_NULL, RUN_NEXTREQ_DEFAULT, 0, false);
859 } else {
860 workqueue_unlock(wq);
861 }
862 } else {
863 kr = pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
864 if (kr != KERN_SUCCESS) {
865 qos_rv = EINVAL;
866 }
867 }
868
869 PTHREAD_TRACE_WQ(TRACE_pthread_set_qos_self | DBG_FUNC_END, wq, qos.qos_tier, qos.tier_importance, 0, 0);
870 }
871
872 voucher:
873 if ((flags & _PTHREAD_SET_SELF_VOUCHER_FLAG) != 0) {
874 kr = pthread_kern->thread_set_voucher_name(voucher);
875 if (kr != KERN_SUCCESS) {
876 voucher_rv = ENOENT;
877 goto fixedpri;
878 }
879 }
880
881 fixedpri:
882 if (qos_rv) goto done;
883 if ((flags & _PTHREAD_SET_SELF_FIXEDPRIORITY_FLAG) != 0) {
884 thread_extended_policy_data_t extpol = {.timeshare = 0};
885
886 if (!tl) tl = util_get_thread_threadlist_entry(th);
887 if (tl) {
888 /* Not allowed on workqueue threads */
889 fixedpri_rv = ENOTSUP;
890 goto done;
891 }
892
893 kr = pthread_kern->thread_policy_set_internal(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
894 if (kr != KERN_SUCCESS) {
895 fixedpri_rv = EINVAL;
896 goto done;
897 }
898 } else if ((flags & _PTHREAD_SET_SELF_TIMESHARE_FLAG) != 0) {
899 thread_extended_policy_data_t extpol = {.timeshare = 1};
900
901 if (!tl) tl = util_get_thread_threadlist_entry(th);
902 if (tl) {
903 /* Not allowed on workqueue threads */
904 fixedpri_rv = ENOTSUP;
905 goto done;
906 }
907
908 kr = pthread_kern->thread_policy_set_internal(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
909 if (kr != KERN_SUCCESS) {
910 fixedpri_rv = EINVAL;
911 goto done;
912 }
913 }
914
915 done:
916 if (qos_rv && voucher_rv) {
917 /* Both failed, give that a unique error. */
918 return EBADMSG;
919 }
920
921 if (qos_rv) {
922 return qos_rv;
923 }
924
925 if (voucher_rv) {
926 return voucher_rv;
927 }
928
929 if (fixedpri_rv) {
930 return fixedpri_rv;
931 }
932
933 return 0;
934 }
935
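/**
 * bsdthread_ctl(BSDTHREAD_CTL_QOS_OVERRIDE_START): add an explicit pthread
 * QoS override on the thread named by kport for the given resource.
 */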
936 int
937 _bsdthread_ctl_qos_override_start(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval)
938 {
939 thread_t th;
940 int rv = 0;
941
942 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
943 return ESRCH;
944 }
945
946 int override_qos = pthread_priority_get_thread_qos(priority);
947
948 struct threadlist *tl = util_get_thread_threadlist_entry(th);
949 if (tl) {
950 PTHREAD_TRACE_WQ(TRACE_wq_override_start | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0);
951 }
952
953 /* The only failure case here would be passing a tid and having it look up the thread; since we pass the thread itself, this always succeeds. */
954 pthread_kern->proc_usynch_thread_qos_add_override_for_resource_check_owner(th, override_qos, TRUE,
955 resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE, USER_ADDR_NULL, MACH_PORT_NULL);
956 thread_deallocate(th);
957 return rv;
958 }
959
960 int
961 _bsdthread_ctl_qos_override_end(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t resource, user_addr_t arg3, int __unused *retval)
962 {
963 thread_t th;
964 int rv = 0;
965
966 if (arg3 != 0) {
967 return EINVAL;
968 }
969
970 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
971 return ESRCH;
972 }
973
974 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
975
976 struct threadlist *tl = util_get_thread_threadlist_entry(th);
977 if (tl) {
978 PTHREAD_TRACE_WQ(TRACE_wq_override_end | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 0, 0, 0);
979 }
980
981 pthread_kern->proc_usynch_thread_qos_remove_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);
982
983 thread_deallocate(th);
984 return rv;
985 }
986
987 static int
988 _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, user_addr_t ulock_addr)
989 {
990 thread_t th;
991 int rv = 0;
992
993 if ((th = port_name_to_thread(kport)) == THREAD_NULL) {
994 return ESRCH;
995 }
996
997 int override_qos = pthread_priority_get_thread_qos(priority);
998
999 struct threadlist *tl = util_get_thread_threadlist_entry(th);
1000 if (!tl) {
1001 thread_deallocate(th);
1002 return EPERM;
1003 }
1004
1005 PTHREAD_TRACE_WQ(TRACE_wq_override_dispatch | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0);
1006
1007 rv = pthread_kern->proc_usynch_thread_qos_add_override_for_resource_check_owner(th, override_qos, TRUE,
1008 resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE, ulock_addr, kport);
1009
1010 thread_deallocate(th);
1011 return rv;
1012 }
1013
1014 int _bsdthread_ctl_qos_dispatch_asynchronous_override_add(struct proc __unused *p, user_addr_t __unused cmd,
1015 mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval)
1016 {
1017 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport, priority, resource, USER_ADDR_NULL);
1018 }
1019
1020 int
1021 _bsdthread_ctl_qos_override_dispatch(struct proc *p __unused, user_addr_t cmd __unused, mach_port_name_t kport, pthread_priority_t priority, user_addr_t ulock_addr, int __unused *retval)
1022 {
1023 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add_internal(kport, priority, USER_ADDR_NULL, ulock_addr);
1024 }
1025
1026 int
1027 _bsdthread_ctl_qos_override_reset(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval)
1028 {
1029 if (arg1 != 0 || arg2 != 0 || arg3 != 0) {
1030 return EINVAL;
1031 }
1032
1033 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, 1 /* reset_all */, 0, 0, retval);
1034 }
1035
1036 int
1037 _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(struct proc __unused *p, user_addr_t __unused cmd, int reset_all, user_addr_t resource, user_addr_t arg3, int __unused *retval)
1038 {
1039 if ((reset_all && (resource != 0)) || arg3 != 0) {
1040 return EINVAL;
1041 }
1042
1043 thread_t th = current_thread();
1044 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
1045 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
1046
1047 if (!tl) {
1048 return EPERM;
1049 }
1050
1051 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_NONE, tl->th_workq, 0, 0, 0, 0);
1052
1053 resource = reset_all ? THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD : resource;
1054 pthread_kern->proc_usynch_thread_qos_reset_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE);
1055
1056 return 0;
1057 }
1058
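/**
 * Demux for the bsdthread_ctl system call: dispatch to the handler selected
 * by cmd.
 */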
1059 int
1060 _bsdthread_ctl(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval)
1061 {
1062 switch (cmd) {
1063 case BSDTHREAD_CTL_SET_QOS:
1064 return _bsdthread_ctl_set_qos(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
1065 case BSDTHREAD_CTL_QOS_OVERRIDE_START:
1066 return _bsdthread_ctl_qos_override_start(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1067 case BSDTHREAD_CTL_QOS_OVERRIDE_END:
1068 return _bsdthread_ctl_qos_override_end(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
1069 case BSDTHREAD_CTL_QOS_OVERRIDE_RESET:
1070 return _bsdthread_ctl_qos_override_reset(p, cmd, arg1, arg2, arg3, retval);
1071 case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH:
1072 return _bsdthread_ctl_qos_override_dispatch(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1073 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD:
1074 return _bsdthread_ctl_qos_dispatch_asynchronous_override_add(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
1075 case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET:
1076 return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, (int)arg1, arg2, arg3, retval);
1077 case BSDTHREAD_CTL_SET_SELF:
1078 return _bsdthread_ctl_set_self(p, cmd, (pthread_priority_t)arg1, (mach_port_name_t)arg2, (_pthread_set_flags_t)arg3, retval);
1079 default:
1080 return EINVAL;
1081 }
1082 }
1083
1084 #pragma mark - Workqueue Implementation
1085 #pragma mark workqueue lock
1086
1087 static boolean_t workqueue_lock_spin_is_acquired_kdp(struct workqueue *wq) {
1088 return kdp_lck_spin_is_acquired(&wq->wq_lock);
1089 }
1090
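/**
 * The workqueue lock is a spinlock taken with interrupts disabled; the
 * previous interrupt state is stashed in the workqueue so that
 * workqueue_unlock() can restore it.
 */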
1091 static void
1092 workqueue_lock_spin(struct workqueue *wq)
1093 {
1094 boolean_t interrupt_state = ml_set_interrupts_enabled(FALSE);
1095 lck_spin_lock(&wq->wq_lock);
1096 wq->wq_interrupt_state = interrupt_state;
1097 }
1098
1099 static void
1100 workqueue_unlock(struct workqueue *wq)
1101 {
1102 boolean_t interrupt_state = wq->wq_interrupt_state;
1103 lck_spin_unlock(&wq->wq_lock);
1104 ml_set_interrupts_enabled(interrupt_state);
1105 }
1106
1107 #pragma mark workqueue add timer
1108
1109 /**
1110 * Sets up the timer which will call out to workqueue_add_timer
1111 */
1112 static void
1113 workqueue_interval_timer_start(struct workqueue *wq)
1114 {
1115 uint64_t deadline;
1116
1117 /* n.b. wq_timer_interval is reset to 0 in workqueue_add_timer if the
1118 ATIMER_RUNNING flag is not present. The net effect here is that if a
1119 sequence of threads is required, we'll double the time before we give out
1120 the next one. */
1121 if (wq->wq_timer_interval == 0) {
1122 wq->wq_timer_interval = wq_stalled_window_usecs;
1123
1124 } else {
1125 wq->wq_timer_interval = wq->wq_timer_interval * 2;
1126
1127 if (wq->wq_timer_interval > wq_max_timer_interval_usecs) {
1128 wq->wq_timer_interval = wq_max_timer_interval_usecs;
1129 }
1130 }
1131 clock_interval_to_deadline(wq->wq_timer_interval, 1000, &deadline);
1132
1133 PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount, wq->wq_flags, wq->wq_timer_interval, 0);
1134
1135 boolean_t ret = thread_call_enter1_delayed(wq->wq_atimer_delayed_call, wq->wq_atimer_delayed_call, deadline);
1136 if (ret) {
1137 panic("delayed_call was already enqueued");
1138 }
1139 }
1140
1141 /**
1142 * Immediately trigger the workqueue_add_timer
1143 */
1144 static void
1145 workqueue_interval_timer_trigger(struct workqueue *wq)
1146 {
1147 PTHREAD_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount, wq->wq_flags, 0, 0);
1148
1149 boolean_t ret = thread_call_enter1(wq->wq_atimer_immediate_call, wq->wq_atimer_immediate_call);
1150 if (ret) {
1151 panic("immediate_call was already enqueued");
1152 }
1153 }
1154
1155 /**
1156 * returns whether lastblocked_tsp is within wq_stalled_window_usecs of cur_ts
1157 */
1158 static boolean_t
1159 wq_thread_is_busy(uint64_t cur_ts, uint64_t *lastblocked_tsp)
1160 {
1161 clock_sec_t secs;
1162 clock_usec_t usecs;
1163 uint64_t lastblocked_ts;
1164 uint64_t elapsed;
1165
1166 /*
1167 * the timestamp is updated atomically w/o holding the workqueue lock
1168 * so we need to do an atomic read of the 64 bits so that we don't see
1169 * a mismatched pair of 32 bit reads... we accomplish this in an architecturally
1170 * independent fashion by using OSCompareAndSwap64 to write back the
1171 * value we grabbed... if it succeeds, then we have a good timestamp to
1172 * evaluate... if it fails, we straddled grabbing the timestamp while it
1173 * was being updated... treat a failed update as a busy thread since
1174 * it implies we are about to see a really fresh timestamp anyway
1175 */
1176 lastblocked_ts = *lastblocked_tsp;
1177
1178 if ( !OSCompareAndSwap64((UInt64)lastblocked_ts, (UInt64)lastblocked_ts, lastblocked_tsp))
1179 return (TRUE);
1180
1181 if (lastblocked_ts >= cur_ts) {
1182 /*
1183 * because the update of the timestamp when a thread blocks isn't
1184 * serialized against us looking at it (i.e. we don't hold the workq lock)
1185 * it's possible to have a timestamp that matches the current time or
1186 * that even looks to be in the future relative to when we grabbed the current
1187 * time... just treat this as a busy thread since it must have just blocked.
1188 */
1189 return (TRUE);
1190 }
1191 elapsed = cur_ts - lastblocked_ts;
1192
1193 pthread_kern->absolutetime_to_microtime(elapsed, &secs, &usecs);
1194
1195 if (secs == 0 && usecs < wq_stalled_window_usecs)
1196 return (TRUE);
1197 return (FALSE);
1198 }
1199
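/**
 * Atomically set WQ_ATIMER_DELAYED_RUNNING unless the workqueue is exiting or
 * the flag is already set; returns true if the caller won the race and should
 * enqueue the delayed thread_call. WQ_TIMER_IMMEDIATE_NEEDED below implements
 * the same protocol for the immediate thread_call.
 */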
1200 static inline bool
1201 WQ_TIMER_DELAYED_NEEDED(struct workqueue *wq)
1202 {
1203 int oldflags;
1204 retry:
1205 oldflags = wq->wq_flags;
1206 if ( !(oldflags & (WQ_EXITING | WQ_ATIMER_DELAYED_RUNNING))) {
1207 if (OSCompareAndSwap(oldflags, oldflags | WQ_ATIMER_DELAYED_RUNNING, (UInt32 *)&wq->wq_flags)) {
1208 return true;
1209 } else {
1210 goto retry;
1211 }
1212 }
1213 return false;
1214 }
1215
1216 static inline bool
1217 WQ_TIMER_IMMEDIATE_NEEDED(struct workqueue *wq)
1218 {
1219 int oldflags;
1220 retry:
1221 oldflags = wq->wq_flags;
1222 if ( !(oldflags & (WQ_EXITING | WQ_ATIMER_IMMEDIATE_RUNNING))) {
1223 if (OSCompareAndSwap(oldflags, oldflags | WQ_ATIMER_IMMEDIATE_RUNNING, (UInt32 *)&wq->wq_flags)) {
1224 return true;
1225 } else {
1226 goto retry;
1227 }
1228 }
1229 return false;
1230 }
1231
1232 /**
1233 * handler function for the timer
1234 */
1235 static void
1236 workqueue_add_timer(struct workqueue *wq, thread_call_t thread_call_self)
1237 {
1238 proc_t p;
1239 boolean_t start_timer = FALSE;
1240 boolean_t retval;
1241
1242 PTHREAD_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_START, wq, wq->wq_flags, wq->wq_nthreads, wq->wq_thidlecount, 0);
1243
1244 p = wq->wq_proc;
1245
1246 workqueue_lock_spin(wq);
1247
1248 /*
1249 * There are two tricky issues here.
1250 *
1251 * First issue: we start the thread_call's that invoke this routine without
1252 * the workqueue lock held. The scheduler callback needs to trigger
1253 * reevaluation of the number of running threads but shouldn't take that
1254 * lock, so we can't use it to synchronize state around the thread_call.
1255 * As a result, it might re-enter the thread_call while this routine is
1256 * already running. This could cause it to fire a second time and we'll
1257 * have two add_timers running at once. Obviously, we don't want that to
1258 * keep stacking, so we need to keep it at two timers.
1259 *
1260 * Solution: use wq_flags (accessed via atomic CAS) to synchronize the
1261 * enqueue of the thread_call itself. When a thread needs to trigger the
1262 * add_timer, it checks for ATIMER_DELAYED_RUNNING and, when not set, sets
1263 * the flag then does a thread_call_enter. We'll then remove that flag
1264 * only once we've got the lock and it's safe for the thread_call to be
1265 * entered again.
1266 *
1267 * Second issue: we need to make sure that the two timers don't execute this
1268 * routine concurrently. We can't use the workqueue lock for this because
1269 * we'll need to drop it during our execution.
1270 *
1271 * Solution: use WQL_ATIMER_BUSY as a condition variable to indicate that
1272 * we are currently executing the routine and the next thread should wait.
1273 *
1274 * After all that, we arrive at the following four possible states:
1275 * !WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY no pending timer, no active timer
1276 * !WQ_ATIMER_DELAYED_RUNNING && WQL_ATIMER_BUSY no pending timer, 1 active timer
1277 * WQ_ATIMER_DELAYED_RUNNING && !WQL_ATIMER_BUSY 1 pending timer, no active timer
1278 * WQ_ATIMER_DELAYED_RUNNING && WQL_ATIMER_BUSY 1 pending timer, 1 active timer
1279 *
1280 * A further complication: sometimes we need to trigger this function to run
1281 * without delay. Because we aren't under a lock between setting
1282 * WQ_ATIMER_DELAYED_RUNNING and calling thread_call_enter, we can't simply
1283 * re-enter the thread call: if thread_call_enter() returned false, we
1284 * wouldn't be able to distinguish the case where the thread_call had
1285 * already fired from the case where it hadn't been entered yet from the
1286 * other thread. So, we use a separate thread_call for immediate
1287 * invocations, and a separate RUNNING flag, WQ_ATIMER_IMMEDIATE_RUNNING.
1288 */
1289
1290 while (wq->wq_lflags & WQL_ATIMER_BUSY) {
1291 wq->wq_lflags |= WQL_ATIMER_WAITING;
1292
1293 assert_wait((caddr_t)wq, (THREAD_UNINT));
1294 workqueue_unlock(wq);
1295
1296 thread_block(THREAD_CONTINUE_NULL);
1297
1298 workqueue_lock_spin(wq);
1299 }
1300 wq->wq_lflags |= WQL_ATIMER_BUSY;
1301
1302 /*
1303 * Decide which timer we are and remove the RUNNING flag.
1304 */
1305 if (thread_call_self == wq->wq_atimer_delayed_call) {
1306 if ((wq->wq_flags & WQ_ATIMER_DELAYED_RUNNING) == 0) {
1307 panic("workqueue_add_timer is the delayed timer but the delayed running flag isn't set");
1308 }
1309 WQ_UNSETFLAG(wq, WQ_ATIMER_DELAYED_RUNNING);
1310 } else if (thread_call_self == wq->wq_atimer_immediate_call) {
1311 if ((wq->wq_flags & WQ_ATIMER_IMMEDIATE_RUNNING) == 0) {
1312 panic("workqueue_add_timer is the immediate timer but the immediate running flag isn't set");
1313 }
1314 WQ_UNSETFLAG(wq, WQ_ATIMER_IMMEDIATE_RUNNING);
1315 } else {
1316 panic("workqueue_add_timer can't figure out which timer it is");
1317 }
1318
1319 again:
1320 retval = TRUE;
1321 if ( !(wq->wq_flags & WQ_EXITING)) {
1322 boolean_t add_thread = FALSE;
1323 /*
1324 * check to see if the stall frequency was beyond our tolerance
1325 * or we have work on the queue, but haven't scheduled any
1326 * new work within our acceptable time interval because
1327 * there were no idle threads left to schedule
1328 */
1329 if (wq->wq_reqcount) {
1330 uint32_t priclass = 0;
1331 uint32_t thactive_count = 0;
1332 uint64_t curtime = mach_absolute_time();
1333 uint64_t busycount = 0;
1334
1335 if (wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] &&
1336 wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0){
1337 priclass = WORKQUEUE_EVENT_MANAGER_BUCKET;
1338 } else {
1339 for (priclass = 0; priclass < WORKQUEUE_NUM_BUCKETS; priclass++) {
1340 if (wq->wq_requests[priclass])
1341 break;
1342 }
1343 }
1344
1345 if (priclass < WORKQUEUE_EVENT_MANAGER_BUCKET){
1346 /*
1347 * Compute a metric for how many threads are active. We
1348 * find the highest priority request outstanding and then add up
1349 * the number of active threads in that and all higher-priority
1350 * buckets. We'll also add any "busy" threads which are not
1351 * active but blocked recently enough that we can't be sure
1352 * they've gone idle yet. We'll then compare this metric to our
1353 * max concurrency to decide whether to add a new thread.
1354 */
1355 for (uint32_t i = 0; i <= priclass; i++) {
1356 thactive_count += wq->wq_thactive_count[i];
1357
1358 if (wq->wq_thscheduled_count[i] < wq->wq_thactive_count[i]) {
1359 if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i]))
1360 busycount++;
1361 }
1362 }
1363 }
1364
1365 if (thactive_count + busycount < wq->wq_max_concurrency ||
1366 priclass == WORKQUEUE_EVENT_MANAGER_BUCKET) {
1367
1368 if (wq->wq_thidlecount == 0) {
1369 /*
1370 * if we have no idle threads, try to add one
1371 */
1372 retval = workqueue_addnewthread(wq, priclass == WORKQUEUE_EVENT_MANAGER_BUCKET);
1373 }
1374 add_thread = TRUE;
1375 }
1376
1377 if (wq->wq_reqcount) {
1378 /*
1379 * as long as we have threads to schedule, and we successfully
1380 * scheduled new work, keep trying
1381 */
1382 while (wq->wq_thidlecount && !(wq->wq_flags & WQ_EXITING)) {
1383 /*
1384 * workqueue_run_nextreq is responsible for
1385 * dropping the workqueue lock in all cases
1386 */
1387 retval = (workqueue_run_nextreq(p, wq, THREAD_NULL, RUN_NEXTREQ_ADD_TIMER, 0, false) != THREAD_NULL);
1388 workqueue_lock_spin(wq);
1389
1390 if (retval == FALSE)
1391 break;
1392 }
1393 if ( !(wq->wq_flags & WQ_EXITING) && wq->wq_reqcount) {
1394
1395 if (wq->wq_thidlecount == 0 && retval == TRUE && add_thread == TRUE)
1396 goto again;
1397
1398 if (wq->wq_thidlecount == 0 || busycount) {
1399 start_timer = WQ_TIMER_DELAYED_NEEDED(wq);
1400 }
1401
1402 PTHREAD_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_NONE, wq, wq->wq_reqcount, wq->wq_thidlecount, busycount, 0);
1403 }
1404 }
1405 }
1406 }
1407
1408 /*
1409 * If we called WQ_TIMER_DELAYED_NEEDED above, then this flag will be set if that
1410 * call marked the timer running. If so, we let the timer interval grow.
1411 * Otherwise, we reset it back to 0.
1412 */
1413 if (!(wq->wq_flags & WQ_ATIMER_DELAYED_RUNNING)) {
1414 wq->wq_timer_interval = 0;
1415 }
1416
1417 wq->wq_lflags &= ~WQL_ATIMER_BUSY;
1418
1419 if ((wq->wq_flags & WQ_EXITING) || (wq->wq_lflags & WQL_ATIMER_WAITING)) {
1420 /*
1421 * wakeup the thread hung up in _workqueue_mark_exiting or workqueue_add_timer waiting for this timer
1422 * to finish getting out of the way
1423 */
1424 wq->wq_lflags &= ~WQL_ATIMER_WAITING;
1425 wakeup(wq);
1426 }
1427
1428 PTHREAD_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_END, wq, start_timer, wq->wq_nthreads, wq->wq_thidlecount, 0);
1429
1430 workqueue_unlock(wq);
1431
1432 if (start_timer == TRUE)
1433 workqueue_interval_timer_start(wq);
1434 }
1435
1436 #pragma mark thread state tracking
1437
1438 // called by spinlock code when trying to yield to lock owner
1439 void
1440 _workqueue_thread_yielded(void)
1441 {
1442 struct workqueue *wq;
1443 proc_t p;
1444
1445 p = current_proc();
1446
1447 if ((wq = pthread_kern->proc_get_wqptr(p)) == NULL || wq->wq_reqcount == 0)
1448 return;
1449
1450 workqueue_lock_spin(wq);
1451
1452 if (wq->wq_reqcount) {
1453 uint64_t curtime;
1454 uint64_t elapsed;
1455 clock_sec_t secs;
1456 clock_usec_t usecs;
1457
1458 if (wq->wq_thread_yielded_count++ == 0)
1459 wq->wq_thread_yielded_timestamp = mach_absolute_time();
1460
1461 if (wq->wq_thread_yielded_count < wq_yielded_threshold) {
1462 workqueue_unlock(wq);
1463 return;
1464 }
1465
1466 PTHREAD_TRACE_WQ(TRACE_wq_thread_yielded | DBG_FUNC_START, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 0, 0);
1467
1468 wq->wq_thread_yielded_count = 0;
1469
1470 curtime = mach_absolute_time();
1471 elapsed = curtime - wq->wq_thread_yielded_timestamp;
1472 pthread_kern->absolutetime_to_microtime(elapsed, &secs, &usecs);
1473
1474 if (secs == 0 && usecs < wq_yielded_window_usecs) {
1475
1476 if (wq->wq_thidlecount == 0) {
1477 workqueue_addnewthread(wq, TRUE);
1478 /*
1479 * 'workqueue_addnewthread' drops the workqueue lock
1480 * when creating the new thread and then retakes it before
1481 * returning... this window allows other threads to process
1482 * requests, so we need to recheck for available work.
1483 * If none is found, we just return... the newly created thread
1484 * will eventually get used (if it hasn't already)...
1485 */
1486 if (wq->wq_reqcount == 0) {
1487 workqueue_unlock(wq);
1488 return;
1489 }
1490 }
1491 if (wq->wq_thidlecount) {
1492 (void)workqueue_run_nextreq(p, wq, THREAD_NULL, RUN_NEXTREQ_UNCONSTRAINED, 0, false);
1493 /*
1494 * workqueue_run_nextreq is responsible for
1495 * dropping the workqueue lock in all cases
1496 */
1497 PTHREAD_TRACE_WQ(TRACE_wq_thread_yielded | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 1, 0);
1498
1499 return;
1500 }
1501 }
1502 PTHREAD_TRACE_WQ(TRACE_wq_thread_yielded | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 2, 0);
1503 }
1504 workqueue_unlock(wq);
1505 }
1506
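/**
 * Scheduler callback invoked when a workqueue thread blocks or unblocks.
 * Maintains the per-bucket active-thread counts; on block, it also records
 * the last-blocked timestamp and starts the delayed add timer if there is
 * still queued work.
 */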
1507 static void
1508 workqueue_callback(int type, thread_t thread)
1509 {
1510 struct uthread *uth;
1511 struct threadlist *tl;
1512 struct workqueue *wq;
1513
1514 uth = pthread_kern->get_bsdthread_info(thread);
1515 tl = pthread_kern->uthread_get_threadlist(uth);
1516 wq = tl->th_workq;
1517
1518 switch (type) {
1519 case SCHED_CALL_BLOCK: {
1520 uint32_t old_activecount;
1521 boolean_t start_timer = FALSE;
1522
1523 old_activecount = OSAddAtomic(-1, &wq->wq_thactive_count[tl->th_priority]);
1524
1525 /*
1526 * If we blocked and were at the requested concurrency previously, we may
1527 * need to spin up a new thread. Of course, if it's the event manager
1528 * then that's moot, so ignore that case.
1529 */
1530 if (old_activecount == wq->wq_reqconc[tl->th_priority] &&
1531 tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET) {
1532 uint64_t curtime;
1533 UInt64 *lastblocked_ptr;
1534
1535 /*
1536 * the number of active threads at this priority
1537 * has fallen below the maximum number of concurrent
1538 * threads that we're allowed to run
1539 */
1540 lastblocked_ptr = (UInt64 *)&wq->wq_lastblocked_ts[tl->th_priority];
1541 curtime = mach_absolute_time();
1542
1543 /*
1544 * if we collide with another thread trying to update the last_blocked (really unlikely
1545 * since another thread would have to get scheduled and then block after we start down
1546 * this path), it's not a problem. Either timestamp is adequate, so no need to retry
1547 */
1548
1549 OSCompareAndSwap64(*lastblocked_ptr, (UInt64)curtime, lastblocked_ptr);
1550
1551 if (wq->wq_reqcount) {
1552 /*
1553 * We have work to do so start up the timer if it's not
1554 * running; it'll sort out whether we need to start another
1555 * thread
1556 */
1557 start_timer = WQ_TIMER_DELAYED_NEEDED(wq);
1558 }
1559
1560 if (start_timer == TRUE) {
1561 workqueue_interval_timer_start(wq);
1562 }
1563 }
1564 PTHREAD_TRACE1_WQ(TRACE_wq_thread_block | DBG_FUNC_START, wq, old_activecount, tl->th_priority, start_timer, thread_tid(thread));
1565 break;
1566 }
1567 case SCHED_CALL_UNBLOCK:
1568 /*
1569 * we cannot take the workqueue_lock here...
1570 * an UNBLOCK can occur from a timer event which
1571 * is run from an interrupt context... if the workqueue_lock
1572 * is already held by this processor, we'll deadlock...
1573 * the thread lock for the thread being UNBLOCKED
1574 * is also held
1575 */
1576 OSAddAtomic(1, &wq->wq_thactive_count[tl->th_priority]);
1577
1578 PTHREAD_TRACE1_WQ(TRACE_wq_thread_block | DBG_FUNC_END, wq, wq->wq_threads_scheduled, tl->th_priority, 0, thread_tid(thread));
1579
1580 break;
1581 }
1582 }
1583
1584 sched_call_t
1585 _workqueue_get_sched_callback(void)
1586 {
1587 return workqueue_callback;
1588 }
1589
1590 #pragma mark thread addition/removal
1591
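/**
 * Total VM reservation for a workqueue thread: guard page, default stack,
 * and the page-rounded pthread_t area.
 */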
1592 static mach_vm_size_t
1593 _workqueue_allocsize(struct workqueue *wq)
1594 {
1595 proc_t p = wq->wq_proc;
1596 mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);
1597 mach_vm_size_t pthread_size =
1598 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(wq->wq_map));
1599 return guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
1600 }
1601
1602 /**
1603 * pop goes the thread
1604 *
1605 * If fromexit is set, the call is from workqueue_exit(),
1606 * so some cleanups are to be avoided.
1607 */
1608 static void
1609 workqueue_removethread(struct threadlist *tl, bool fromexit, bool first_use)
1610 {
1611 struct uthread * uth;
1612 struct workqueue * wq = tl->th_workq;
1613
1614 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET){
1615 TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
1616 } else {
1617 TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
1618 }
1619
1620 if (fromexit == 0) {
1621 assert(wq->wq_nthreads && wq->wq_thidlecount);
1622 wq->wq_nthreads--;
1623 wq->wq_thidlecount--;
1624 }
1625
1626 /*
1627 * Clear the threadlist pointer in uthread so
1628 * blocked thread on wakeup for termination will
1629 * not access the thread list as it is going to be
1630 * freed.
1631 */
1632 pthread_kern->thread_sched_call(tl->th_thread, NULL);
1633
1634 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
1635 if (uth != (struct uthread *)0) {
1636 pthread_kern->uthread_set_threadlist(uth, NULL);
1637 }
1638 if (fromexit == 0) {
1639 /* during exit the lock is not held */
1640 workqueue_unlock(wq);
1641 }
1642
1643 if ( (tl->th_flags & TH_LIST_NEW) || first_use ) {
1644 /*
1645 * thread was created, but never used...
1646 * need to clean up the stack and port ourselves
1647 * since we're not going to spin up through the
1648 * normal exit path triggered from Libc
1649 */
1650 if (fromexit == 0) {
1651 /* vm map is already deallocated when this is called from exit */
1652 (void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, _workqueue_allocsize(wq));
1653 }
1654 (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(wq->wq_task), tl->th_thport);
1655
1656 } else {
1657
1658 PTHREAD_TRACE1_WQ(TRACE_wq_thread_park | DBG_FUNC_END, wq, (uintptr_t)thread_tid(current_thread()), wq->wq_nthreads, 0xdead, thread_tid(tl->th_thread));
1659 }
1660 /*
1661 * drop our ref on the thread
1662 */
1663 thread_deallocate(tl->th_thread);
1664
1665 kfree(tl, sizeof(struct threadlist));
1666 }
1667
1668
1669 /**
1670 * Try to add a new workqueue thread.
1671 *
1672 * - called with workq lock held
1673 * - dropped and retaken around thread creation
1674 * - return with workq lock held
1675 */
1676 static boolean_t
1677 workqueue_addnewthread(struct workqueue *wq, boolean_t ignore_constrained_thread_limit)
1678 {
1679 struct threadlist *tl;
1680 struct uthread *uth;
1681 kern_return_t kret;
1682 thread_t th;
1683 proc_t p;
1684 void *sright;
1685 mach_vm_offset_t stackaddr;
1686
1687 if ((wq->wq_flags & WQ_EXITING) == WQ_EXITING) {
1688 PTHREAD_TRACE_WQ(TRACE_wq_thread_add_during_exit | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
1689 return (FALSE);
1690 }
1691
1692 if (wq->wq_nthreads >= wq_max_threads) {
1693 PTHREAD_TRACE_WQ(TRACE_wq_thread_limit_exceeded | DBG_FUNC_NONE, wq, wq->wq_nthreads, wq_max_threads, 0, 0);
1694 return (FALSE);
1695 }
1696
1697 if (ignore_constrained_thread_limit == FALSE &&
1698 wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
1699 /*
1700 * If we're not creating this thread to service an overcommit or
1701 * event manager request, then we check to see if we are over our
1702 * constrained thread limit, in which case we error out.
1703 */
1704 PTHREAD_TRACE_WQ(TRACE_wq_thread_constrained_maxed | DBG_FUNC_NONE, wq, wq->wq_constrained_threads_scheduled,
1705 wq_max_constrained_threads, 0, 0);
1706 return (FALSE);
1707 }
1708
1709 wq->wq_nthreads++;
1710
1711 p = wq->wq_proc;
1712 workqueue_unlock(wq);
1713
1714 tl = kalloc(sizeof(struct threadlist));
1715 bzero(tl, sizeof(struct threadlist));
1716
1717 kret = pthread_kern->thread_create_workq_waiting(wq->wq_task, wq_unpark_continue, tl, &th);
1718 if (kret != KERN_SUCCESS) {
1719 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 0, 0, 0);
1720 kfree(tl, sizeof(struct threadlist));
1721 goto failed;
1722 }
1723
1724 stackaddr = pthread_kern->proc_get_stack_addr_hint(p);
1725
1726 mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map);
1727 mach_vm_size_t pthread_size =
1728 vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(wq->wq_map));
1729 mach_vm_size_t th_allocsize = guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
1730
1731 kret = mach_vm_map(wq->wq_map, &stackaddr,
1732 th_allocsize, page_size-1,
1733 VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE, NULL,
1734 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
1735 VM_INHERIT_DEFAULT);
1736
1737 if (kret != KERN_SUCCESS) {
1738 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 1, 0, 0);
1739
1740 kret = mach_vm_allocate(wq->wq_map,
1741 &stackaddr, th_allocsize,
1742 VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE);
1743 }
1744 if (kret == KERN_SUCCESS) {
1745 /*
1746 * The guard page is at the lowest address
1747 * The stack base is the highest address
1748 */
1749 kret = mach_vm_protect(wq->wq_map, stackaddr, guardsize, FALSE, VM_PROT_NONE);
1750
1751 if (kret != KERN_SUCCESS) {
1752 (void) mach_vm_deallocate(wq->wq_map, stackaddr, th_allocsize);
1753 PTHREAD_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 2, 0, 0);
1754 }
1755 }
1756 if (kret != KERN_SUCCESS) {
1757 (void) thread_terminate(th);
1758 thread_deallocate(th);
1759
1760 kfree(tl, sizeof(struct threadlist));
1761 goto failed;
1762 }
1763 thread_reference(th);
1764
1765 pthread_kern->thread_set_tag(th, THREAD_TAG_PTHREAD | THREAD_TAG_WORKQUEUE);
1766
1767 sright = (void *)pthread_kern->convert_thread_to_port(th);
1768 tl->th_thport = pthread_kern->ipc_port_copyout_send(sright, pthread_kern->task_get_ipcspace(wq->wq_task));
1769
1770 pthread_kern->thread_static_param(th, TRUE);
1771
1772 tl->th_flags = TH_LIST_INITED | TH_LIST_NEW;
1773
1774 tl->th_thread = th;
1775 tl->th_workq = wq;
1776 tl->th_stackaddr = stackaddr;
1777 tl->th_priority = WORKQUEUE_NUM_BUCKETS;
1778
1779 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
1780
1781 workqueue_lock_spin(wq);
1782
1783 pthread_kern->uthread_set_threadlist(uth, tl);
1784 TAILQ_INSERT_TAIL(&wq->wq_thidlelist, tl, th_entry);
1785
1786 wq->wq_thidlecount++;
1787
1788 PTHREAD_TRACE_WQ(TRACE_wq_thread_create | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
1789
1790 return (TRUE);
1791
1792 failed:
1793 workqueue_lock_spin(wq);
1794 wq->wq_nthreads--;
1795
1796 return (FALSE);
1797 }
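
/*
 * The failure handling above is a reservation pattern: bump wq_nthreads while
 * holding the lock, drop the lock for the slow thread/stack setup, and give
 * the slot back under the lock if anything fails.  A reduced sketch of that
 * pattern (illustrative only, not compiled); example_try_create() is a
 * hypothetical stand-in for the thread_create_workq_waiting/mach_vm_map steps.
 */
#if 0
static boolean_t
example_add_with_reservation(struct workqueue *wq)
{
	wq->wq_nthreads++;			/* reserve a slot so concurrent adds still see the limit */
	workqueue_unlock(wq);

	boolean_t ok = example_try_create(wq);	/* thread + stack + guard page setup */

	workqueue_lock_spin(wq);
	if (!ok) {
		wq->wq_nthreads--;		/* undo the reservation on failure */
	}
	return ok;
}
#endif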
1798
1799 /**
1800 * Setup per-process state for the workqueue.
1801 */
1802 int
1803 _workq_open(struct proc *p, __unused int32_t *retval)
1804 {
1805 struct workqueue * wq;
1806 int wq_size;
1807 char * ptr;
1808 uint32_t i;
1809 uint32_t num_cpus;
1810 int error = 0;
1811
1812 if (pthread_kern->proc_get_register(p) == 0) {
1813 return EINVAL;
1814 }
1815
1816 num_cpus = pthread_kern->ml_get_max_cpus();
1817
1818 if (wq_init_constrained_limit) {
1819 uint32_t limit;
1820 /*
1821 * set up the limit for the constrained pool
1822 * this is a virtual pool in that we don't
1823 * maintain it on a separate idle and run list
1824 */
1825 limit = num_cpus * WORKQUEUE_CONSTRAINED_FACTOR;
1826
1827 if (limit > wq_max_constrained_threads)
1828 wq_max_constrained_threads = limit;
1829
1830 wq_init_constrained_limit = 0;
1831
1832 if (wq_max_threads > pthread_kern->config_thread_max - 20) {
1833 wq_max_threads = pthread_kern->config_thread_max - 20;
1834 }
1835 }
1836
1837 if (pthread_kern->proc_get_wqptr(p) == NULL) {
1838 if (pthread_kern->proc_init_wqptr_or_wait(p) == FALSE) {
1839 assert(pthread_kern->proc_get_wqptr(p) != NULL);
1840 goto out;
1841 }
1842
1843 wq_size = sizeof(struct workqueue);
1844
1845 ptr = (char *)kalloc(wq_size);
1846 bzero(ptr, wq_size);
1847
1848 wq = (struct workqueue *)ptr;
1849 wq->wq_flags = WQ_LIST_INITED;
1850 wq->wq_proc = p;
1851 wq->wq_max_concurrency = wq_max_concurrency;
1852 wq->wq_task = current_task();
1853 wq->wq_map = pthread_kern->current_map();
1854
1855 for (i = 0; i < WORKQUEUE_NUM_BUCKETS; i++)
1856 wq->wq_reqconc[i] = (uint16_t)wq->wq_max_concurrency;
1857
1858 // The event manager bucket is special, so it gets a concurrency of 1,
1859 // though we shouldn't ever read this value for that bucket.
1860 wq->wq_reqconc[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1;
1861
1862 // Start the event manager at the priority hinted at by the policy engine
1863 int mgr_priority_hint = pthread_kern->task_get_default_manager_qos(current_task());
1864 wq->wq_event_manager_priority = (uint32_t)thread_qos_get_pthread_priority(mgr_priority_hint) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
1865
1866 TAILQ_INIT(&wq->wq_thrunlist);
1867 TAILQ_INIT(&wq->wq_thidlelist);
1868
1869 wq->wq_atimer_delayed_call = thread_call_allocate((thread_call_func_t)workqueue_add_timer, (thread_call_param_t)wq);
1870 wq->wq_atimer_immediate_call = thread_call_allocate((thread_call_func_t)workqueue_add_timer, (thread_call_param_t)wq);
1871
1872 lck_spin_init(&wq->wq_lock, pthread_lck_grp, pthread_lck_attr);
1873
1874 pthread_kern->proc_set_wqptr(p, wq);
1875
1876 }
1877 out:
1878
1879 return(error);
1880 }
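
/*
 * Worked example of the one-time limit setup above, under assumed values:
 * with 8 CPUs and a constrained factor of 5 the constrained pool allows 40
 * threads, and wq_max_threads is kept 20 below the system-wide thread limit
 * so workqueues can never exhaust it.  Illustrative sketch only, not
 * compiled; the numeric values are assumptions.
 */
#if 0
static void
example_init_limits(uint32_t num_cpus, uint32_t factor, uint32_t config_thread_max,
    uint32_t *max_constrained, uint32_t *max_threads)
{
	uint32_t limit = num_cpus * factor;		/* e.g. 8 * 5 = 40 */

	if (limit > *max_constrained)
		*max_constrained = limit;

	if (*max_threads > config_thread_max - 20)
		*max_threads = config_thread_max - 20;	/* leave headroom for the rest of the system */
}
#endif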
1881
1882 /*
1883 * Routine: workqueue_mark_exiting
1884 *
1885 * Function: Mark the work queue such that new threads will not be added to the
1886 * work queue after we return.
1887 *
1888 * Conditions: Called against the current process.
1889 */
1890 void
1891 _workqueue_mark_exiting(struct proc *p)
1892 {
1893 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
1894
1895 if (wq != NULL) {
1896
1897 PTHREAD_TRACE_WQ(TRACE_wq_pthread_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);
1898
1899 workqueue_lock_spin(wq);
1900
1901 /*
1902 * We arm the add timer without holding the workqueue lock so we need
1903 * to synchronize with any running or soon to be running timers.
1904 *
1905 * Threads that intend to arm the timer atomically OR
1906 * WQ_ATIMER_{DELAYED,IMMEDIATE}_RUNNING into the wq_flags, only if
1907 * WQ_EXITING is not present. So, once we have set WQ_EXITING, we can
1908 * be sure that no new RUNNING flags will be set, but still need to
1909 * wait for the already running timers to complete.
1910 *
1911 * We always hold the workq lock when dropping WQ_ATIMER_RUNNING, so
1912 * the check for and sleep until clear is protected.
1913 */
1914 WQ_SETFLAG(wq, WQ_EXITING);
1915
1916 if (wq->wq_flags & WQ_ATIMER_DELAYED_RUNNING) {
1917 if (thread_call_cancel(wq->wq_atimer_delayed_call) == TRUE) {
1918 WQ_UNSETFLAG(wq, WQ_ATIMER_DELAYED_RUNNING);
1919 }
1920 }
1921 if (wq->wq_flags & WQ_ATIMER_IMMEDIATE_RUNNING) {
1922 if (thread_call_cancel(wq->wq_atimer_immediate_call) == TRUE) {
1923 WQ_UNSETFLAG(wq, WQ_ATIMER_IMMEDIATE_RUNNING);
1924 }
1925 }
1926 while (wq->wq_flags & (WQ_ATIMER_DELAYED_RUNNING | WQ_ATIMER_IMMEDIATE_RUNNING) ||
1927 (wq->wq_lflags & WQL_ATIMER_BUSY)) {
1928 assert_wait((caddr_t)wq, (THREAD_UNINT));
1929 workqueue_unlock(wq);
1930
1931 thread_block(THREAD_CONTINUE_NULL);
1932
1933 workqueue_lock_spin(wq);
1934 }
1935 workqueue_unlock(wq);
1936
1937 PTHREAD_TRACE(TRACE_wq_pthread_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
1938 }
1939 }
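
/*
 * The shutdown handshake above depends on the arming side of the add timers:
 * a RUNNING flag is only set while WQ_EXITING is absent, so once WQ_EXITING
 * is published the exit path only has to cancel or drain timers already in
 * flight.  A reduced sketch of that arming side (illustrative only, not
 * compiled); the real code expresses this through the
 * WQ_TIMER_DELAYED_NEEDED() / workqueue_interval_timer_start() pair.
 */
#if 0
static boolean_t
example_try_arm_delayed_timer(struct workqueue *wq, uint64_t deadline)
{
	uint32_t flags = wq->wq_flags;

	if (flags & (WQ_EXITING | WQ_ATIMER_DELAYED_RUNNING))
		return FALSE;		/* exiting, or a delayed timer is already armed */

	/* atomically publish "a delayed timer is in flight" */
	if (!OSCompareAndSwap(flags, flags | WQ_ATIMER_DELAYED_RUNNING, (UInt32 *)&wq->wq_flags))
		return FALSE;		/* raced with another updater; caller may retry */

	thread_call_enter_delayed(wq->wq_atimer_delayed_call, deadline);
	return TRUE;
}
#endif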
1940
1941 /*
1942 * Routine: workqueue_exit
1943 *
1944 * Function: clean up the work queue structure(s) now that there are no threads
1945 * left running inside the work queue (except possibly current_thread).
1946 *
1947 * Conditions: Called by the last thread in the process.
1948 * Called against current process.
1949 */
1950 void
1951 _workqueue_exit(struct proc *p)
1952 {
1953 struct workqueue * wq;
1954 struct threadlist * tl, *tlist;
1955 struct uthread *uth;
1956 size_t wq_size = sizeof(struct workqueue);
1957
1958 wq = pthread_kern->proc_get_wqptr(p);
1959 if (wq != NULL) {
1960
1961 PTHREAD_TRACE_WQ(TRACE_wq_workqueue_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);
1962
1963 pthread_kern->proc_set_wqptr(p, NULL);
1964
1965 /*
1966 * Clean up workqueue data structures for threads that exited and
1967 * didn't get a chance to clean up after themselves.
1968 */
1969 TAILQ_FOREACH_SAFE(tl, &wq->wq_thrunlist, th_entry, tlist) {
1970 assert((tl->th_flags & TH_LIST_RUNNING) != 0);
1971
1972 pthread_kern->thread_sched_call(tl->th_thread, NULL);
1973
1974 uth = pthread_kern->get_bsdthread_info(tl->th_thread);
1975 if (uth != (struct uthread *)0) {
1976 pthread_kern->uthread_set_threadlist(uth, NULL);
1977 }
1978 TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
1979
1980 /*
1981 * drop our last ref on the thread
1982 */
1983 thread_deallocate(tl->th_thread);
1984
1985 kfree(tl, sizeof(struct threadlist));
1986 }
1987 TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlelist, th_entry, tlist) {
1988 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
1989 assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
1990 workqueue_removethread(tl, true, false);
1991 }
1992 TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlemgrlist, th_entry, tlist) {
1993 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
1994 assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
1995 workqueue_removethread(tl, true, false);
1996 }
1997 thread_call_free(wq->wq_atimer_delayed_call);
1998 thread_call_free(wq->wq_atimer_immediate_call);
1999 lck_spin_destroy(&wq->wq_lock, pthread_lck_grp);
2000
2001 kfree(wq, wq_size);
2002
2003 PTHREAD_TRACE(TRACE_wq_workqueue_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
2004 }
2005 }
2006
2007
2008 #pragma mark workqueue thread manipulation
2009
2010 /**
2011 * Entry point for libdispatch to ask for threads
2012 */
2013 static int wqops_queue_reqthreads(struct proc *p, int reqcount, pthread_priority_t priority){
2014 struct workqueue *wq;
2015 boolean_t start_timer = FALSE;
2016
2017 boolean_t overcommit = (_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0;
2018 int class = pthread_priority_get_class_index(priority);
2019
2020 boolean_t event_manager = (_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) != 0;
2021 if (event_manager){
2022 class = WORKQUEUE_EVENT_MANAGER_BUCKET;
2023 }
2024
2025 if ((reqcount <= 0) || (class < 0) || (class >= WORKQUEUE_NUM_BUCKETS) || (overcommit && event_manager)) {
2026 return EINVAL;
2027 }
2028
2029
2030 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2031 return EINVAL;
2032 }
2033
2034 workqueue_lock_spin(wq);
2035
2036 if (overcommit == 0 && event_manager == 0) {
2037 wq->wq_reqcount += reqcount;
2038 wq->wq_requests[class] += reqcount;
2039
2040 PTHREAD_TRACE_WQ(TRACE_wq_req_threads | DBG_FUNC_NONE, wq, priority, wq->wq_requests[class], reqcount, 0);
2041
2042 while (wq->wq_reqcount) {
2043 if (!workqueue_run_one(p, wq, overcommit, 0))
2044 break;
2045 }
2046 } else if (overcommit) {
2047 PTHREAD_TRACE_WQ(TRACE_wq_req_octhreads | DBG_FUNC_NONE, wq, priority, wq->wq_ocrequests[class], reqcount, 0);
2048
2049 while (reqcount) {
2050 if (!workqueue_run_one(p, wq, overcommit, priority))
2051 break;
2052 reqcount--;
2053 }
2054 if (reqcount) {
2055 /*
2056 * We need to delay starting some of the overcommit requests.
2057 * We'll record the request here and as existing threads return to
2058 * the kernel, we'll notice the ocrequests and spin them back to
2059 * user space as the overcommit variety.
2060 */
2061 wq->wq_reqcount += reqcount;
2062 wq->wq_requests[class] += reqcount;
2063 wq->wq_ocrequests[class] += reqcount;
2064
2065 PTHREAD_TRACE_WQ(TRACE_wq_delay_octhreads | DBG_FUNC_NONE, wq, priority, wq->wq_ocrequests[class], reqcount, 0);
2066
2067 /*
2068 * If we delayed this thread coming up but we're not constrained
2069 * or at max threads then we need to start the timer so we don't
2070 * risk dropping this request on the floor.
2071 */
2072 if ((wq->wq_constrained_threads_scheduled < wq_max_constrained_threads) &&
2073 (wq->wq_nthreads < wq_max_threads)){
2074 start_timer = WQ_TIMER_DELAYED_NEEDED(wq);
2075 }
2076 }
2077 } else if (event_manager) {
2078 PTHREAD_TRACE_WQ(TRACE_wq_req_event_manager | DBG_FUNC_NONE, wq, wq->wq_event_manager_priority, wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET], wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET], 0);
2079
2080 if (wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0){
2081 wq->wq_reqcount += 1;
2082 wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1;
2083 }
2084
2085 // We've recorded the request for an event manager thread above. We'll
2086 // let the timer pick it up as we would for a kernel callout. We can
2087 // do a direct add/wakeup when that support is added for the kevent path.
2088 if (wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0){
2089 start_timer = WQ_TIMER_DELAYED_NEEDED(wq);
2090 }
2091 }
2092
2093 if (start_timer) {
2094 workqueue_interval_timer_start(wq);
2095 }
2096
2097 workqueue_unlock(wq);
2098
2099 return 0;
2100 }
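
/*
 * The request accounting above, gathered in one place (illustrative sketch
 * only, not compiled).  Every pending request is counted in wq_reqcount and
 * wq_requests[class]; the overcommit and kevent variants are additionally
 * counted in their own arrays, which is why workqueue_run_nextreq() can
 * assert that
 *
 *     wq_ocrequests[c] + wq_kevent_requests[c] + wq_kevent_ocrequests[c]
 *         <= wq_requests[c]
 */
#if 0
static void
example_record_request(struct workqueue *wq, int class, boolean_t overcommit, boolean_t kevent)
{
	wq->wq_reqcount += 1;
	wq->wq_requests[class] += 1;

	if (overcommit && kevent)
		wq->wq_kevent_ocrequests[class] += 1;
	else if (overcommit)
		wq->wq_ocrequests[class] += 1;
	else if (kevent)
		wq->wq_kevent_requests[class] += 1;
}
#endif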
2101
2102 /*
2103 * Used by the kevent system to request threads.
2104 *
2105 * Currently count is ignored and we always return one thread per invocation.
2106 */
2107 thread_t _workq_reqthreads(struct proc *p, int requests_count, workq_reqthreads_req_t requests){
2108 thread_t th = THREAD_NULL;
2109 boolean_t do_thread_call = FALSE;
2110 boolean_t emergency_thread = FALSE;
2111 assert(requests_count > 0);
2112
2113 #if DEBUG
2114 // Make sure that the requests array is sorted, highest priority first
2115 if (requests_count > 1){
2116 __assert_only qos_class_t priority = _pthread_priority_get_qos_newest(requests[0].priority);
2117 __assert_only unsigned long flags = ((_pthread_priority_get_flags(requests[0].priority) & (_PTHREAD_PRIORITY_OVERCOMMIT_FLAG|_PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)) != 0);
2118 for (int i = 1; i < requests_count; i++){
2119 if (requests[i].count == 0) continue;
2120 __assert_only qos_class_t next_priority = _pthread_priority_get_qos_newest(requests[i].priority);
2121 __assert_only unsigned long next_flags = ((_pthread_priority_get_flags(requests[i].priority) & (_PTHREAD_PRIORITY_OVERCOMMIT_FLAG|_PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)) != 0);
2122 if (next_flags != flags){
2123 flags = next_flags;
2124 priority = next_priority;
2125 } else {
2126 assert(next_priority <= priority);
2127 }
2128 }
2129 }
2130 #endif // DEBUG
2131
2132 struct workqueue *wq;
2133 if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
2134 return THREAD_NULL;
2135 }
2136
2137 workqueue_lock_spin(wq);
2138
2139 PTHREAD_TRACE_WQ(TRACE_wq_kevent_req_threads | DBG_FUNC_START, wq, requests_count, 0, 0, 0);
2140
2141 // Look for overcommit or event-manager-only requests.
2142 boolean_t have_overcommit = FALSE;
2143 pthread_priority_t priority = 0;
2144 for (int i = 0; i < requests_count; i++){
2145 if (requests[i].count == 0)
2146 continue;
2147 priority = requests[i].priority;
2148 if (_pthread_priority_get_qos_newest(priority) == QOS_CLASS_UNSPECIFIED){
2149 priority |= _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2150 }
2151 if ((_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) != 0){
2152 goto event_manager;
2153 }
2154 if ((_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0){
2155 have_overcommit = TRUE;
2156 break;
2157 }
2158 }
2159
2160 if (have_overcommit){
2161 if (wq->wq_thidlecount){
2162 th = workqueue_run_nextreq(p, wq, THREAD_NULL, RUN_NEXTREQ_OVERCOMMIT_KEVENT, priority, true);
2163 if (th != THREAD_NULL){
2164 goto out;
2165 } else {
2166 workqueue_lock_spin(wq); // reacquire lock
2167 }
2168 }
2169
2170 int class = pthread_priority_get_class_index(priority);
2171 wq->wq_reqcount += 1;
2172 wq->wq_requests[class] += 1;
2173 wq->wq_kevent_ocrequests[class] += 1;
2174
2175 do_thread_call = WQ_TIMER_IMMEDIATE_NEEDED(wq);
2176 goto deferred;
2177 }
2178
2179 // Having no overcommit requests, try to find any request that can start
2180 // There's no TOCTTOU since we hold the workqueue lock
2181 for (int i = 0; i < requests_count; i++){
2182 workq_reqthreads_req_t req = requests + i;
2183 priority = req->priority;
2184 int class = pthread_priority_get_class_index(priority);
2185
2186 if (req->count == 0)
2187 continue;
2188
2189 if (!may_start_constrained_thread(wq, class, WORKQUEUE_NUM_BUCKETS, NULL))
2190 continue;
2191
2192 wq->wq_reqcount += 1;
2193 wq->wq_requests[class] += 1;
2194 wq->wq_kevent_requests[class] += 1;
2195
2196 PTHREAD_TRACE_WQ(TRACE_wq_req_kevent_threads | DBG_FUNC_NONE, wq, priority, wq->wq_kevent_requests[class], 1, 0);
2197
2198 if (wq->wq_thidlecount){
2199 th = workqueue_run_nextreq(p, wq, THREAD_NULL, RUN_NEXTREQ_DEFAULT_KEVENT, priority, true);
2200 goto out;
2201 } else {
2202 do_thread_call = WQ_TIMER_IMMEDIATE_NEEDED(wq);
2203 goto deferred;
2204 }
2205 }
2206
2207 // Okay, here's the fun case: we can't spin up any of the non-overcommit threads
2208 // that we've seen a request for, so we kick this over to the event manager thread
2209 emergency_thread = TRUE;
2210
2211 event_manager:
2212 if (wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0){
2213 wq->wq_reqcount += 1;
2214 wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1;
2215 PTHREAD_TRACE_WQ(TRACE_wq_req_event_manager | DBG_FUNC_NONE, wq, 0, wq->wq_kevent_requests[WORKQUEUE_EVENT_MANAGER_BUCKET], 1, 0);
2216 } else {
2217 PTHREAD_TRACE_WQ(TRACE_wq_req_event_manager | DBG_FUNC_NONE, wq, 0, wq->wq_kevent_requests[WORKQUEUE_EVENT_MANAGER_BUCKET], 0, 0);
2218 }
2219 wq->wq_kevent_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1;
2220
2221 if (wq->wq_thidlecount && wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0){
2222 th = workqueue_run_nextreq(p, wq, THREAD_NULL, RUN_NEXTREQ_EVENT_MANAGER, 0, true);
2223 assert(th != THREAD_NULL);
2224 goto out;
2225 }
2226 do_thread_call = WQ_TIMER_IMMEDIATE_NEEDED(wq);
2227
2228 deferred:
2229 workqueue_unlock(wq);
2230
2231 if (do_thread_call == TRUE){
2232 workqueue_interval_timer_trigger(wq);
2233 }
2234
2235 out:
2236 PTHREAD_TRACE_WQ(TRACE_wq_kevent_req_threads | DBG_FUNC_END, wq, do_thread_call, 0, 0, 0);
2237
2238 return emergency_thread ? (void*)-1 : th;
2239 }
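
/*
 * The return value above is overloaded, so a caller has three cases to
 * handle.  A hypothetical caller-side sketch (illustrative only, not
 * compiled); WORKQ_EMERGENCY_THREAD is an invented name for the (void *)-1
 * sentinel, not a constant defined by this file.
 */
#if 0
#define WORKQ_EMERGENCY_THREAD	((thread_t)-1)	/* hypothetical name for the sentinel */

static void
example_handle_reqthreads_result(thread_t th)
{
	if (th == THREAD_NULL) {
		/* no thread could be produced right now; the immediate thread call will create one */
	} else if (th == WORKQ_EMERGENCY_THREAD) {
		/* the request was redirected to the event manager thread */
	} else {
		/* a specific idle thread was woken and kevent-bound for this request */
	}
}
#endif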
2240
2241
2242 static int wqops_thread_return(struct proc *p){
2243 thread_t th = current_thread();
2244 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
2245 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
2246
2247 /* reset signal mask on the workqueue thread to default state */
2248 if (pthread_kern->uthread_get_sigmask(uth) != (sigset_t)(~workq_threadmask)) {
2249 pthread_kern->proc_lock(p);
2250 pthread_kern->uthread_set_sigmask(uth, ~workq_threadmask);
2251 pthread_kern->proc_unlock(p);
2252 }
2253
2254 struct workqueue *wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2255 if (wq == NULL || !tl) {
2256 return EINVAL;
2257 }
2258
2259 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_START, tl->th_workq, 0, 0, 0, 0);
2260
2261 /*
2262 * This squash call has neat semantics: it removes the specified overrides,
2263 * replacing the current requested QoS with the previous effective QoS from
2264 * those overrides. This means we won't be preempted due to having our QoS
2265 * lowered. Of course, now our understanding of the thread's QoS is wrong,
2266 * so we'll adjust below.
2267 */
2268 int new_qos =
2269 pthread_kern->proc_usynch_thread_qos_squash_override_for_resource(th,
2270 THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD,
2271 THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE);
2272
2273 workqueue_lock_spin(wq);
2274
2275 if (tl->th_flags & TH_LIST_KEVENT_BOUND) {
2276 unsigned int flags = KEVENT_FLAG_WORKQ;
2277 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
2278 flags |= KEVENT_FLAG_WORKQ_MANAGER;
2279 }
2280
2281 workqueue_unlock(wq);
2282 kevent_qos_internal_unbind(p, class_index_get_thread_qos(tl->th_priority), th, flags);
2283 workqueue_lock_spin(wq);
2284
2285 tl->th_flags &= ~TH_LIST_KEVENT_BOUND;
2286 }
2287
2288 /* Fix up counters from the squash operation. */
2289 uint8_t old_bucket = tl->th_priority;
2290 uint8_t new_bucket = thread_qos_get_class_index(new_qos);
2291
2292 if (old_bucket != new_bucket) {
2293 OSAddAtomic(-1, &wq->wq_thactive_count[old_bucket]);
2294 OSAddAtomic(1, &wq->wq_thactive_count[new_bucket]);
2295
2296 wq->wq_thscheduled_count[old_bucket]--;
2297 wq->wq_thscheduled_count[new_bucket]++;
2298
2299 tl->th_priority = new_bucket;
2300 }
2301
2302 PTHREAD_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_END, tl->th_workq, new_qos, 0, 0, 0);
2303
2304 PTHREAD_TRACE_WQ(TRACE_wq_runitem | DBG_FUNC_END, wq, 0, 0, 0, 0);
2305
2306 (void)workqueue_run_nextreq(p, wq, th, RUN_NEXTREQ_DEFAULT, 0, false);
2307 /*
2308 * workqueue_run_nextreq is responsible for
2309 * dropping the workqueue lock in all cases
2310 */
2311 return 0;
2312 }
2313
2314 /**
2315 * Multiplexed call to interact with the workqueue mechanism
2316 */
2317 int
2318 _workq_kernreturn(struct proc *p,
2319 int options,
2320 user_addr_t item,
2321 int arg2,
2322 int arg3,
2323 int32_t *retval)
2324 {
2325 int error = 0;
2326
2327 if (pthread_kern->proc_get_register(p) == 0) {
2328 return EINVAL;
2329 }
2330
2331 switch (options) {
2332 case WQOPS_QUEUE_NEWSPISUPP: {
2333 /*
2334 * arg2 = offset of serialno into dispatch queue
2335 * arg3 = kevent support
2336 */
2337 int offset = arg2;
2338 if (arg3 & 0x01){
2339 // If we get here, then userspace has indicated support for kevent delivery.
2340 }
2341
2342 pthread_kern->proc_set_dispatchqueue_serialno_offset(p, (uint64_t)offset);
2343 break;
2344 }
2345 case WQOPS_QUEUE_REQTHREADS: {
2346 /*
2347 * arg2 = number of threads to start
2348 * arg3 = priority
2349 */
2350 error = wqops_queue_reqthreads(p, arg2, arg3);
2351 break;
2352 }
2353 case WQOPS_SET_EVENT_MANAGER_PRIORITY: {
2354 /*
2355 * arg2 = priority for the manager thread
2356 *
2357 * if _PTHREAD_PRIORITY_SCHED_PRI_FLAG is set, the bits outside
2358 * _PTHREAD_PRIORITY_FLAGS_MASK contain a scheduling priority instead
2359 * of a QoS value
2360 */
2361 pthread_priority_t pri = arg2;
2362
2363 struct workqueue *wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p);
2364 if (wq == NULL) {
2365 error = EINVAL;
2366 break;
2367 }
2368 workqueue_lock_spin(wq);
2369 if (pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){
2370 // If userspace passes a scheduling priority, that takes precedence
2371 // over any QoS. (So, userspace should take care not to accidentally
2372 // lower the priority this way.)
2373 uint32_t sched_pri = pri & (~_PTHREAD_PRIORITY_FLAGS_MASK);
2374 if (wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){
2375 wq->wq_event_manager_priority = MAX(sched_pri, wq->wq_event_manager_priority & (~_PTHREAD_PRIORITY_FLAGS_MASK))
2376 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2377 } else {
2378 wq->wq_event_manager_priority = sched_pri
2379 | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2380 }
2381 } else if ((wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){
2382 int cur_qos = pthread_priority_get_thread_qos(wq->wq_event_manager_priority);
2383 int new_qos = pthread_priority_get_thread_qos(pri);
2384 wq->wq_event_manager_priority = (uint32_t)thread_qos_get_pthread_priority(MAX(cur_qos, new_qos)) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
2385 }
2386 workqueue_unlock(wq);
2387 break;
2388 }
2389 case WQOPS_THREAD_KEVENT_RETURN:
2390 if (item != 0) {
2391 int32_t kevent_retval;
2392 int ret = kevent_qos_internal(p, -1, item, arg2, item, arg2, NULL, NULL, KEVENT_FLAG_WORKQ | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS, &kevent_retval);
2393 // We shouldn't be getting more errors out than events we put in, so
2394 // reusing the input buffer should always provide enough space. But,
2395 // the assert is commented out since we get errors in edge cases in the
2396 // process lifecycle.
2397 //assert(ret == KERN_SUCCESS && kevent_retval >= 0);
2398 if (ret != KERN_SUCCESS){
2399 error = ret;
2400 break;
2401 } else if (kevent_retval > 0){
2402 assert(kevent_retval <= arg2);
2403 *retval = kevent_retval;
2404 error = 0;
2405 break;
2406 }
2407 }
2408 // FALLTHRU
2409 case WQOPS_THREAD_RETURN:
2410 error = wqops_thread_return(p);
2411 // NOT REACHED except in case of error
2412 assert(error);
2413 break;
2414 default:
2415 error = EINVAL;
2416 break;
2417 }
2418 return (error);
2419 }
2420
2421
2422 static boolean_t
2423 workqueue_run_one(proc_t p, struct workqueue *wq, boolean_t overcommit, pthread_priority_t priority)
2424 {
2425 boolean_t ran_one;
2426
2427 if (wq->wq_thidlecount == 0) {
2428 if (overcommit == FALSE) {
2429 if (wq->wq_constrained_threads_scheduled < wq->wq_max_concurrency)
2430 workqueue_addnewthread(wq, overcommit);
2431 } else {
2432 workqueue_addnewthread(wq, overcommit);
2433
2434 if (wq->wq_thidlecount == 0)
2435 return (FALSE);
2436 }
2437 }
2438 ran_one = (workqueue_run_nextreq(p, wq, THREAD_NULL, overcommit ? RUN_NEXTREQ_OVERCOMMIT : RUN_NEXTREQ_DEFAULT, priority, false) != THREAD_NULL);
2439 /*
2440 * workqueue_run_nextreq is responsible for
2441 * dropping the workqueue lock in all cases
2442 */
2443 workqueue_lock_spin(wq);
2444
2445 return (ran_one);
2446 }
2447
2448 /*
2449 * We have no work to do, park ourselves on the idle list.
2450 *
2451 * Consumes the workqueue lock and does not return.
2452 */
2453 static void __dead2
2454 parkit(struct workqueue *wq, struct threadlist *tl, thread_t thread)
2455 {
2456 assert(thread == tl->th_thread);
2457 assert(thread == current_thread());
2458
2459 uint32_t us_to_wait = 0;
2460
2461 TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
2462
2463 tl->th_flags &= ~TH_LIST_RUNNING;
2464 tl->th_flags &= ~TH_LIST_KEVENT;
2465 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
2466
2467 if (tl->th_flags & TH_LIST_CONSTRAINED) {
2468 wq->wq_constrained_threads_scheduled--;
2469 tl->th_flags &= ~TH_LIST_CONSTRAINED;
2470 }
2471
2472 OSAddAtomic(-1, &wq->wq_thactive_count[tl->th_priority]);
2473 wq->wq_thscheduled_count[tl->th_priority]--;
2474 wq->wq_threads_scheduled--;
2475 uint32_t thidlecount = ++wq->wq_thidlecount;
2476
2477 pthread_kern->thread_sched_call(thread, NULL);
2478
2479 /*
2480 * We'd like to always have one manager thread parked so that we can have
2481 * low latency when we need to bring a manager thread up. If the idle
2482 * manager list is empty, make this thread a manager thread.
2483 *
2484 * XXX: This doesn't check that there's not a manager thread outstanding,
2485 * so it's based on the assumption that most manager callouts will change
2486 * their QoS before parking. If that stops being true, this may end up
2487 * costing us more than we gain.
2488 */
2489 if (TAILQ_EMPTY(&wq->wq_thidlemgrlist) &&
2490 tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET){
2491 reset_priority(tl, pthread_priority_from_wq_class_index(wq, WORKQUEUE_EVENT_MANAGER_BUCKET));
2492 tl->th_priority = WORKQUEUE_EVENT_MANAGER_BUCKET;
2493 }
2494
2495 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET){
2496 TAILQ_INSERT_HEAD(&wq->wq_thidlemgrlist, tl, th_entry);
2497 } else {
2498 TAILQ_INSERT_HEAD(&wq->wq_thidlelist, tl, th_entry);
2499 }
2500
2501 PTHREAD_TRACE_WQ(TRACE_wq_thread_park | DBG_FUNC_START, wq,
2502 wq->wq_threads_scheduled, wq->wq_thidlecount, us_to_wait, 0);
2503
2504 /*
2505 * When we remove the voucher from the thread, we may lose our importance
2506 * causing us to get preempted, so we do this after putting the thread on
2507 * the idle list. That way, when we get our importance back we'll be able
2508 * to use this thread from e.g. the kevent call out to deliver a boosting
2509 * message.
2510 */
2511 workqueue_unlock(wq);
2512 kern_return_t kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
2513 assert(kr == KERN_SUCCESS);
2514 workqueue_lock_spin(wq);
2515
2516 if ((tl->th_flags & TH_LIST_RUNNING) == 0) {
2517 if (thidlecount < 101) {
2518 us_to_wait = wq_reduce_pool_window_usecs - ((thidlecount-2) * (wq_reduce_pool_window_usecs / 100));
2519 } else {
2520 us_to_wait = wq_reduce_pool_window_usecs / 100;
2521 }
2522
2523 assert_wait_timeout_with_leeway((caddr_t)tl, (THREAD_INTERRUPTIBLE),
2524 TIMEOUT_URGENCY_SYS_BACKGROUND|TIMEOUT_URGENCY_LEEWAY, us_to_wait,
2525 wq_reduce_pool_window_usecs/10, NSEC_PER_USEC);
2526
2527 workqueue_unlock(wq);
2528
2529 thread_block(wq_unpark_continue);
2530 panic("thread_block(wq_unpark_continue) returned!");
2531 } else {
2532 workqueue_unlock(wq);
2533
2534 /*
2535 * While we'd dropped the lock to unset our voucher, someone came
2536 * around and made us runnable. But because we weren't waiting on the
2537 * event, their wakeup() was ineffectual. To correct for that, we just
2538 * run the continuation ourselves.
2539 */
2540 wq_unpark_continue(NULL, THREAD_AWAKENED);
2541 }
2542 }
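
/*
 * Worked example of the parking timeout chosen above, assuming a reduce-pool
 * window of 5,000,000us (5 seconds): the 2nd idle thread waits the full
 * window, the 51st roughly half of it, and the 101st and beyond only 1/100th
 * of it (50ms), so an oversized idle pool drains quickly while the last few
 * idle threads linger.  Illustrative sketch only, not compiled; the window
 * value is an assumption.
 */
#if 0
static uint32_t
example_park_timeout_usecs(uint32_t thidlecount, uint32_t window_usecs)
{
	if (thidlecount < 101) {
		return window_usecs - ((thidlecount - 2) * (window_usecs / 100));
	}
	return window_usecs / 100;
}
#endif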
2543
2544 static boolean_t may_start_constrained_thread(struct workqueue *wq, uint32_t at_priclass, uint32_t my_priclass, boolean_t *start_timer){
2545 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
2546 /*
2547 * we need 1 or more constrained threads to return to the kernel before
2548 * we can dispatch additional work
2549 */
2550 return FALSE;
2551 }
2552
2553 uint32_t busycount = 0;
2554 uint32_t thactive_count = wq->wq_thactive_count[at_priclass];
2555
2556 // Has our most recently blocked thread blocked recently enough that we
2557 // should still consider it busy?
2558 if (wq->wq_thscheduled_count[at_priclass] > wq->wq_thactive_count[at_priclass]) {
2559 if (wq_thread_is_busy(mach_absolute_time(), &wq->wq_lastblocked_ts[at_priclass])) {
2560 busycount++;
2561 }
2562 }
2563
2564 if (my_priclass < WORKQUEUE_NUM_BUCKETS && my_priclass == at_priclass){
2565 /*
2566 * don't count this thread as currently active
2567 */
2568 thactive_count--;
2569 }
2570
2571 if (thactive_count + busycount >= wq->wq_max_concurrency) {
2572 if (busycount && start_timer) {
2573 /*
2574 * we found at least 1 thread in the
2575 * 'busy' state... make sure we start
2576 * the timer because if they are the only
2577 * threads keeping us from scheduling
2578 * this work request, we won't get a callback
2579 * to kick off the timer... we need to
2580 * start it now...
2581 */
2582 *start_timer = WQ_TIMER_DELAYED_NEEDED(wq);
2583 }
2584
2585 PTHREAD_TRACE_WQ(TRACE_wq_overcommitted|DBG_FUNC_NONE, wq, ((start_timer && *start_timer) ? 1 << _PTHREAD_PRIORITY_FLAGS_SHIFT : 0) | class_index_get_pthread_priority(at_priclass), thactive_count, busycount, 0);
2586
2587 return FALSE;
2588 }
2589 return TRUE;
2590 }
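
/*
 * The admission test above, reduced to its core (illustrative sketch only,
 * not compiled): a constrained request may start only while the active
 * threads at its class, plus any thread that blocked recently enough to
 * still count as busy, remain below the allowed concurrency.  A thread
 * asking on its own behalf does not count itself as active.
 */
#if 0
static boolean_t
example_may_start_constrained(uint32_t thactive, uint32_t busycount,
    uint32_t max_concurrency, boolean_t requester_is_active)
{
	if (requester_is_active)
		thactive--;			/* don't count the requesting thread itself */

	return (thactive + busycount < max_concurrency) ? TRUE : FALSE;
}
#endif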
2591
2592 static struct threadlist *
2593 pop_from_thidlelist(struct workqueue *wq, uint32_t priclass)
2594 {
2595 assert(wq->wq_thidlecount);
2596
2597 struct threadlist *tl = NULL;
2598
2599 if (!TAILQ_EMPTY(&wq->wq_thidlemgrlist) &&
2600 (priclass == WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlelist))){
2601 tl = TAILQ_FIRST(&wq->wq_thidlemgrlist);
2602 TAILQ_REMOVE(&wq->wq_thidlemgrlist, tl, th_entry);
2603 assert(tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET);
2604 } else if (!TAILQ_EMPTY(&wq->wq_thidlelist) &&
2605 (priclass != WORKQUEUE_EVENT_MANAGER_BUCKET || TAILQ_EMPTY(&wq->wq_thidlemgrlist))){
2606 tl = TAILQ_FIRST(&wq->wq_thidlelist);
2607 TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
2608 assert(tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET);
2609 } else {
2610 panic("pop_from_thidlelist called with no threads available");
2611 }
2612 assert((tl->th_flags & TH_LIST_RUNNING) == 0);
2613
2614 assert(wq->wq_thidlecount);
2615 wq->wq_thidlecount--;
2616
2617 TAILQ_INSERT_TAIL(&wq->wq_thrunlist, tl, th_entry);
2618
2619 tl->th_flags |= TH_LIST_RUNNING | TH_LIST_BUSY;
2620
2621 wq->wq_threads_scheduled++;
2622 wq->wq_thscheduled_count[priclass]++;
2623 OSAddAtomic(1, &wq->wq_thactive_count[priclass]);
2624
2625 return tl;
2626 }
2627
2628 static pthread_priority_t
2629 pthread_priority_from_wq_class_index(struct workqueue *wq, int index){
2630 if (index == WORKQUEUE_EVENT_MANAGER_BUCKET){
2631 return wq->wq_event_manager_priority;
2632 } else {
2633 return class_index_get_pthread_priority(index);
2634 }
2635 }
2636
2637 static void
2638 reset_priority(struct threadlist *tl, pthread_priority_t pri){
2639 kern_return_t ret;
2640 thread_t th = tl->th_thread;
2641
2642 if ((pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){
2643 ret = pthread_kern->thread_set_workq_qos(th, pthread_priority_get_thread_qos(pri), 0);
2644 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
2645
2646 if (tl->th_flags & TH_LIST_EVENT_MGR_SCHED_PRI) {
2647
2648 /* Reset priority to default (masked by QoS) */
2649
2650 ret = pthread_kern->thread_set_workq_pri(th, 31, POLICY_TIMESHARE);
2651 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
2652
2653 tl->th_flags &= ~TH_LIST_EVENT_MGR_SCHED_PRI;
2654 }
2655 } else {
2656 ret = pthread_kern->thread_set_workq_qos(th, THREAD_QOS_UNSPECIFIED, 0);
2657 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
2658 ret = pthread_kern->thread_set_workq_pri(th, (pri & (~_PTHREAD_PRIORITY_FLAGS_MASK)), POLICY_TIMESHARE);
2659 assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED);
2660
2661 tl->th_flags |= TH_LIST_EVENT_MGR_SCHED_PRI;
2662 }
2663 }
2664
2665 /**
2666 * grabs a thread for a request
2667 *
2668 * - called with the workqueue lock held...
2669 * - responsible for dropping it in all cases
2670 * - if provided mode is for overcommit, doesn't consume a reqcount
2671 *
2672 */
2673 static thread_t
2674 workqueue_run_nextreq(proc_t p, struct workqueue *wq, thread_t thread,
2675 enum run_nextreq_mode mode, pthread_priority_t prio,
2676 bool kevent_bind_via_return)
2677 {
2678 thread_t th_to_run = THREAD_NULL;
2679 uint32_t upcall_flags = 0;
2680 uint32_t priclass;
2681 struct threadlist *tl = NULL;
2682 struct uthread *uth = NULL;
2683 boolean_t start_timer = FALSE;
2684
2685 if (mode == RUN_NEXTREQ_ADD_TIMER) {
2686 mode = RUN_NEXTREQ_DEFAULT;
2687 }
2688
2689 // valid modes to call this function with
2690 assert(mode == RUN_NEXTREQ_DEFAULT || mode == RUN_NEXTREQ_DEFAULT_KEVENT ||
2691 mode == RUN_NEXTREQ_OVERCOMMIT || mode == RUN_NEXTREQ_UNCONSTRAINED ||
2692 mode == RUN_NEXTREQ_EVENT_MANAGER || mode == RUN_NEXTREQ_OVERCOMMIT_KEVENT);
2693 // may only have a priority if in OVERCOMMIT or DEFAULT_KEVENT mode
2694 assert(mode == RUN_NEXTREQ_OVERCOMMIT || mode == RUN_NEXTREQ_OVERCOMMIT_KEVENT ||
2695 mode == RUN_NEXTREQ_DEFAULT_KEVENT || prio == 0);
2696 // thread == thread_null means "please spin up a new workqueue thread, we can't reuse this"
2697 // thread != thread_null is thread reuse, and must be the current thread
2698 assert(thread == THREAD_NULL || thread == current_thread());
2699
2700 PTHREAD_TRACE_WQ(TRACE_wq_run_nextitem|DBG_FUNC_START, wq, thread_tid(thread), wq->wq_thidlecount, wq->wq_reqcount, 0);
2701
2702 if (thread != THREAD_NULL) {
2703 uth = pthread_kern->get_bsdthread_info(thread);
2704
2705 if ((tl = pthread_kern->uthread_get_threadlist(uth)) == NULL) {
2706 panic("wq thread with no threadlist");
2707 }
2708 }
2709
2710 /*
2711 * from here until we drop the workq lock we can't be pre-empted since we
2712 * hold the lock in spin mode... this is important since we have to
2713 * independently update the priority that the thread is associated with and
2714 * the priority-based counters that "workqueue_callback" also changes and
2715 * bases decisions on.
2716 */
2717
2718 /*
2719 * This giant monstrosity does three things:
2720 *
2721 * - adjusts the mode, if required
2722 * - selects the priclass that we'll be servicing
2723 * - sets any mode-specific upcall flags
2724 *
2725 * When possible special-cases should be handled here and converted into
2726 * non-special cases.
2727 */
2728 if (mode == RUN_NEXTREQ_OVERCOMMIT) {
2729 priclass = pthread_priority_get_class_index(prio);
2730 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
2731 } else if (mode == RUN_NEXTREQ_OVERCOMMIT_KEVENT){
2732 priclass = pthread_priority_get_class_index(prio);
2733 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
2734 } else if (mode == RUN_NEXTREQ_EVENT_MANAGER){
2735 assert(wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0);
2736 priclass = WORKQUEUE_EVENT_MANAGER_BUCKET;
2737 upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
2738 if (wq->wq_kevent_requests[WORKQUEUE_EVENT_MANAGER_BUCKET]){
2739 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
2740 }
2741 } else if (wq->wq_reqcount == 0){
2742 // no work to do. we'll check again when new work arrives.
2743 goto done;
2744 } else if (mode == RUN_NEXTREQ_DEFAULT_KEVENT) {
2745 assert(kevent_bind_via_return);
2746
2747 priclass = pthread_priority_get_class_index(prio);
2748 assert(priclass < WORKQUEUE_EVENT_MANAGER_BUCKET);
2749 assert(wq->wq_kevent_requests[priclass] > 0);
2750
2751 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
2752 mode = RUN_NEXTREQ_DEFAULT;
2753 } else if (wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] &&
2754 ((wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0) ||
2755 (thread != THREAD_NULL && tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET))){
2756 // There's an event manager request and either:
2757 // - no event manager currently running
2758 // - we are re-using the event manager
2759 mode = RUN_NEXTREQ_EVENT_MANAGER;
2760 priclass = WORKQUEUE_EVENT_MANAGER_BUCKET;
2761 upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
2762 if (wq->wq_kevent_requests[WORKQUEUE_EVENT_MANAGER_BUCKET]){
2763 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
2764 }
2765 } else {
2766 // Find highest priority and check for special request types
2767 for (priclass = 0; priclass < WORKQUEUE_EVENT_MANAGER_BUCKET; priclass++) {
2768 if (wq->wq_requests[priclass])
2769 break;
2770 }
2771 if (priclass == WORKQUEUE_EVENT_MANAGER_BUCKET){
2772 // the only pending request must be the event manager's (it isn't tracked in a
2773 // regular bucket), but we couldn't handle it since an event manager is already
2774 // running, so we fell through to this case
2775 assert(wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] == 1 &&
2776 wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 1 &&
2777 wq->wq_reqcount == 1);
2778 goto done;
2779 }
2780
2781 if (wq->wq_kevent_ocrequests[priclass]){
2782 mode = RUN_NEXTREQ_DEFERRED_OVERCOMMIT;
2783 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
2784 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
2785 } else if (wq->wq_ocrequests[priclass]){
2786 mode = RUN_NEXTREQ_DEFERRED_OVERCOMMIT;
2787 upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
2788 } else if (wq->wq_kevent_requests[priclass]){
2789 upcall_flags |= WQ_FLAG_THREAD_KEVENT;
2790 }
2791 }
2792
2793 assert(mode != RUN_NEXTREQ_EVENT_MANAGER || priclass == WORKQUEUE_EVENT_MANAGER_BUCKET);
2794 assert(mode == RUN_NEXTREQ_EVENT_MANAGER || priclass != WORKQUEUE_EVENT_MANAGER_BUCKET);
2795
2796 if (mode == RUN_NEXTREQ_DEFAULT /* non-overcommit */){
2797 uint32_t my_priclass = (thread != THREAD_NULL) ? tl->th_priority : WORKQUEUE_NUM_BUCKETS;
2798 if (may_start_constrained_thread(wq, priclass, my_priclass, &start_timer) == FALSE){
2799 // per policy, we won't start another constrained thread
2800 goto done;
2801 }
2802 }
2803
2804 if (thread != THREAD_NULL) {
2805 /*
2806 * thread is non-NULL here when we return from userspace
2807 * in workq_kernreturn, rather than trying to find a thread
2808 * we pick up new work for this specific thread.
2809 */
2810 th_to_run = thread;
2811 upcall_flags |= WQ_FLAG_THREAD_REUSE;
2812 } else if (wq->wq_thidlecount == 0) {
2813 /*
2814 * we have no additional threads waiting to pick up
2815 * work, however, there is additional work to do.
2816 */
2817 start_timer = WQ_TIMER_DELAYED_NEEDED(wq);
2818
2819 PTHREAD_TRACE_WQ(TRACE_wq_stalled, wq, wq->wq_nthreads, start_timer, 0, 0);
2820
2821 goto done;
2822 } else {
2823 // there is both work available and an idle thread, so activate a thread
2824 tl = pop_from_thidlelist(wq, priclass);
2825 th_to_run = tl->th_thread;
2826 }
2827
2828 // Adjust counters and thread flags AKA consume the request
2829 // TODO: It would be lovely if OVERCOMMIT consumed reqcount
2830 switch (mode) {
2831 case RUN_NEXTREQ_DEFAULT:
2832 case RUN_NEXTREQ_DEFAULT_KEVENT: /* actually mapped to DEFAULT above */
2833 case RUN_NEXTREQ_ADD_TIMER: /* actually mapped to DEFAULT above */
2834 case RUN_NEXTREQ_UNCONSTRAINED:
2835 wq->wq_reqcount--;
2836 wq->wq_requests[priclass]--;
2837
2838 if (mode == RUN_NEXTREQ_DEFAULT){
2839 if (!(tl->th_flags & TH_LIST_CONSTRAINED)) {
2840 wq->wq_constrained_threads_scheduled++;
2841 tl->th_flags |= TH_LIST_CONSTRAINED;
2842 }
2843 } else if (mode == RUN_NEXTREQ_UNCONSTRAINED){
2844 if (tl->th_flags & TH_LIST_CONSTRAINED) {
2845 wq->wq_constrained_threads_scheduled--;
2846 tl->th_flags &= ~TH_LIST_CONSTRAINED;
2847 }
2848 }
2849 if (upcall_flags & WQ_FLAG_THREAD_KEVENT){
2850 wq->wq_kevent_requests[priclass]--;
2851 }
2852 break;
2853
2854 case RUN_NEXTREQ_EVENT_MANAGER:
2855 wq->wq_reqcount--;
2856 wq->wq_requests[priclass]--;
2857
2858 if (tl->th_flags & TH_LIST_CONSTRAINED) {
2859 wq->wq_constrained_threads_scheduled--;
2860 tl->th_flags &= ~TH_LIST_CONSTRAINED;
2861 }
2862 if (upcall_flags & WQ_FLAG_THREAD_KEVENT){
2863 wq->wq_kevent_requests[priclass]--;
2864 }
2865 break;
2866
2867 case RUN_NEXTREQ_DEFERRED_OVERCOMMIT:
2868 wq->wq_reqcount--;
2869 wq->wq_requests[priclass]--;
2870 if (upcall_flags & WQ_FLAG_THREAD_KEVENT){
2871 wq->wq_kevent_ocrequests[priclass]--;
2872 } else {
2873 wq->wq_ocrequests[priclass]--;
2874 }
2875 /* FALLTHROUGH */
2876 case RUN_NEXTREQ_OVERCOMMIT:
2877 case RUN_NEXTREQ_OVERCOMMIT_KEVENT:
2878 if (tl->th_flags & TH_LIST_CONSTRAINED) {
2879 wq->wq_constrained_threads_scheduled--;
2880 tl->th_flags &= ~TH_LIST_CONSTRAINED;
2881 }
2882 break;
2883 }
2884
2885 // Confirm we've maintained our counter invariants
2886 assert(wq->wq_requests[priclass] < UINT16_MAX);
2887 assert(wq->wq_ocrequests[priclass] < UINT16_MAX);
2888 assert(wq->wq_kevent_requests[priclass] < UINT16_MAX);
2889 assert(wq->wq_kevent_ocrequests[priclass] < UINT16_MAX);
2890 assert(wq->wq_ocrequests[priclass] + wq->wq_kevent_requests[priclass] +
2891 wq->wq_kevent_ocrequests[priclass] <=
2892 wq->wq_requests[priclass]);
2893
2894 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
2895 if (upcall_flags & WQ_FLAG_THREAD_KEVENT) {
2896 tl->th_flags |= TH_LIST_KEVENT;
2897 } else {
2898 tl->th_flags &= ~TH_LIST_KEVENT;
2899 }
2900
2901 uint32_t orig_class = tl->th_priority;
2902 tl->th_priority = (uint8_t)priclass;
2903
2904 if ((thread != THREAD_NULL) && (orig_class != priclass)) {
2905 /*
2906 * we need to adjust these counters based on this
2907 * thread's new disposition w/r to priority
2908 */
2909 OSAddAtomic(-1, &wq->wq_thactive_count[orig_class]);
2910 OSAddAtomic(1, &wq->wq_thactive_count[priclass]);
2911
2912 wq->wq_thscheduled_count[orig_class]--;
2913 wq->wq_thscheduled_count[priclass]++;
2914 }
2915 wq->wq_thread_yielded_count = 0;
2916
2917 pthread_priority_t outgoing_priority = pthread_priority_from_wq_class_index(wq, tl->th_priority);
2918 PTHREAD_TRACE_WQ(TRACE_wq_reset_priority | DBG_FUNC_START, wq, thread_tid(tl->th_thread), outgoing_priority, 0, 0);
2919 reset_priority(tl, outgoing_priority);
2920 PTHREAD_TRACE_WQ(TRACE_wq_reset_priority | DBG_FUNC_END, wq, thread_tid(tl->th_thread), outgoing_priority, 0, 0);
2921
2922 /*
2923 * persist upcall_flags so that it can be retrieved in _setup_wqthread
2924 */
2925 tl->th_upcall_flags = upcall_flags >> WQ_FLAG_THREAD_PRIOSHIFT;
2926
2927 /*
2928 * if the current thread is reused for this work request, wq_runreq() does not return here; it goes straight back to user space
2929 */
2930 wq_runreq(p, th_to_run, wq, tl, (thread == th_to_run),
2931 (upcall_flags & WQ_FLAG_THREAD_KEVENT) && !kevent_bind_via_return);
2932
2933 PTHREAD_TRACE_WQ(TRACE_wq_run_nextitem|DBG_FUNC_END, wq, thread_tid(th_to_run), mode == RUN_NEXTREQ_OVERCOMMIT, 1, 0);
2934
2935 assert(!kevent_bind_via_return || (upcall_flags & WQ_FLAG_THREAD_KEVENT));
2936 if (kevent_bind_via_return && (upcall_flags & WQ_FLAG_THREAD_KEVENT)) {
2937 tl->th_flags |= TH_LIST_KEVENT_BOUND;
2938 }
2939
2940 workqueue_unlock(wq);
2941
2942 return th_to_run;
2943
2944 done:
2945 if (start_timer)
2946 workqueue_interval_timer_start(wq);
2947
2948 PTHREAD_TRACE_WQ(TRACE_wq_run_nextitem | DBG_FUNC_END, wq, thread_tid(thread), start_timer, 3, 0);
2949
2950 if (thread != THREAD_NULL){
2951 parkit(wq, tl, thread);
2952 /* NOT REACHED */
2953 }
2954
2955 workqueue_unlock(wq);
2956
2957 return THREAD_NULL;
2958 }
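
/*
 * The priority selection inside workqueue_run_nextreq() ultimately reduces
 * to a scan of the request buckets, which are ordered from the highest
 * priority class downward; the event manager bucket sits outside that
 * ordering and is handled by the special cases above.  A reduced sketch
 * (illustrative only, not compiled).
 */
#if 0
static int
example_highest_pending_bucket(struct workqueue *wq)
{
	for (int priclass = 0; priclass < WORKQUEUE_EVENT_MANAGER_BUCKET; priclass++) {
		if (wq->wq_requests[priclass])
			return priclass;	/* first non-empty bucket is the highest priority one */
	}
	return -1;				/* nothing pending outside the event manager bucket */
}
#endif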
2959
2960 /**
2961 * parked thread wakes up
2962 */
2963 static void __dead2
2964 wq_unpark_continue(void* __unused ptr, wait_result_t wait_result)
2965 {
2966 boolean_t first_use = false;
2967 thread_t th = current_thread();
2968 proc_t p = current_proc();
2969
2970 struct uthread *uth = pthread_kern->get_bsdthread_info(th);
2971 if (uth == NULL) goto done;
2972
2973 struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
2974 if (tl == NULL) goto done;
2975
2976 struct workqueue *wq = tl->th_workq;
2977
2978 workqueue_lock_spin(wq);
2979
2980 assert(tl->th_flags & TH_LIST_INITED);
2981
2982 if ((tl->th_flags & TH_LIST_NEW)){
2983 tl->th_flags &= ~(TH_LIST_NEW);
2984 first_use = true;
2985 }
2986
2987 if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) {
2988 /*
2989 * The normal wakeup path.
2990 */
2991 goto return_to_user;
2992 }
2993
2994 if ((tl->th_flags & TH_LIST_RUNNING) == 0 &&
2995 wait_result == THREAD_TIMED_OUT &&
2996 tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET &&
2997 TAILQ_FIRST(&wq->wq_thidlemgrlist) == tl &&
2998 TAILQ_NEXT(tl, th_entry) == NULL){
2999 /*
3000 * If we are the only idle manager and we popped for self-destruction,
3001 * then don't actually exit. Instead, free our stack to save some
3002 * memory and re-park.
3003 */
3004
3005 workqueue_unlock(wq);
3006
3007 vm_map_t vmap = wq->wq_map;
3008
3009 // Keep this in sync with _setup_wqthread()
3010 const vm_size_t guardsize = vm_map_page_size(vmap);
3011 const user_addr_t freeaddr = (user_addr_t)tl->th_stackaddr + guardsize;
3012 const vm_map_offset_t freesize = vm_map_trunc_page_mask((PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET) - 1, vm_map_page_mask(vmap)) - guardsize;
3013
3014 int kr;
3015 kr = mach_vm_behavior_set(vmap, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
3016 assert(kr == KERN_SUCCESS || kr == KERN_INVALID_ADDRESS);
3017
3018 workqueue_lock_spin(wq);
3019
3020 if ( !(tl->th_flags & TH_LIST_RUNNING)) {
3021 assert_wait((caddr_t)tl, (THREAD_INTERRUPTIBLE));
3022
3023 workqueue_unlock(wq);
3024
3025 thread_block(wq_unpark_continue);
3026 /* NOT REACHED */
3027 }
3028 }
3029
3030 if ((tl->th_flags & TH_LIST_RUNNING) == 0) {
3031 assert((tl->th_flags & TH_LIST_BUSY) == 0);
3032 /*
3033 * We were set running, but not for the purposes of actually running.
3034 * This could be because the timer elapsed. Or it could be because the
3035 * thread aborted. Either way, we need to return to userspace to exit.
3036 *
3037 * The call to workqueue_removethread will consume the lock.
3038 */
3039
3040 if (!first_use &&
3041 tl->th_priority != qos_class_get_class_index(WQ_THREAD_CLEANUP_QOS)) {
3042 // Reset the QoS to something low for the pthread cleanup
3043 pthread_priority_t cleanup_pri = _pthread_priority_make_newest(WQ_THREAD_CLEANUP_QOS, 0, 0);
3044 reset_priority(tl, cleanup_pri);
3045 }
3046
3047 workqueue_removethread(tl, 0, first_use);
3048
3049 if (first_use){
3050 pthread_kern->thread_bootstrap_return();
3051 } else {
3052 pthread_kern->unix_syscall_return(0);
3053 }
3054 /* NOT REACHED */
3055 }
3056
3057 /*
3058 * The timer woke us up or the thread was aborted. However, we have
3059 * already started to make this a runnable thread. Wait for that to
3060 * finish, then continue to userspace.
3061 */
3062 while ((tl->th_flags & TH_LIST_BUSY)) {
3063 assert_wait((caddr_t)tl, (THREAD_UNINT));
3064
3065 workqueue_unlock(wq);
3066
3067 thread_block(THREAD_CONTINUE_NULL);
3068
3069 workqueue_lock_spin(wq);
3070 }
3071
3072 return_to_user:
3073 workqueue_unlock(wq);
3074 _setup_wqthread(p, th, wq, tl, first_use);
3075 pthread_kern->thread_sched_call(th, workqueue_callback);
3076 done:
3077 if (first_use){
3078 pthread_kern->thread_bootstrap_return();
3079 } else {
3080 pthread_kern->unix_syscall_return(EJUSTRETURN);
3081 }
3082 panic("Our attempt to return to userspace failed...");
3083 }
3084
3085 /* called with workqueue lock held */
3086 static void
3087 wq_runreq(proc_t p, thread_t th, struct workqueue *wq, struct threadlist *tl,
3088 boolean_t return_directly, boolean_t needs_kevent_bind)
3089 {
3090 PTHREAD_TRACE1_WQ(TRACE_wq_runitem | DBG_FUNC_START, tl->th_workq, 0, 0, thread_tid(current_thread()), thread_tid(th));
3091
3092 unsigned int kevent_flags = KEVENT_FLAG_WORKQ;
3093 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
3094 kevent_flags |= KEVENT_FLAG_WORKQ_MANAGER;
3095 }
3096
3097 if (return_directly) {
3098 if (needs_kevent_bind) {
3099 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
3100 tl->th_flags |= TH_LIST_KEVENT_BOUND;
3101 }
3102
3103 workqueue_unlock(wq);
3104
3105 if (needs_kevent_bind) {
3106 kevent_qos_internal_bind(p, class_index_get_thread_qos(tl->th_priority), th, kevent_flags);
3107 }
3108
3109 /*
3110 * For preemption reasons, we want to reset the voucher as late as
3111 * possible, so we do it in two places:
3112 * - Just before parking (i.e. in parkit())
3113 * - Prior to doing the setup for the next workitem (i.e. here)
3114 *
3115 * Those two places are sufficient to ensure we always reset it before
3116 * it goes back out to user space, but be careful to not break that
3117 * guarantee.
3118 */
3119 kern_return_t kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
3120 assert(kr == KERN_SUCCESS);
3121
3122 _setup_wqthread(p, th, wq, tl, false);
3123
3124 PTHREAD_TRACE_WQ(TRACE_wq_run_nextitem|DBG_FUNC_END, tl->th_workq, 0, 0, 4, 0);
3125
3126 pthread_kern->unix_syscall_return(EJUSTRETURN);
3127 /* NOT REACHED */
3128 }
3129
3130 if (needs_kevent_bind) {
3131 // Leave TH_LIST_BUSY set so that the thread can't beat us to calling kevent
3132 workqueue_unlock(wq);
3133 assert((tl->th_flags & TH_LIST_KEVENT_BOUND) == 0);
3134 kevent_qos_internal_bind(p, class_index_get_thread_qos(tl->th_priority), th, kevent_flags);
3135 tl->th_flags |= TH_LIST_KEVENT_BOUND;
3136 workqueue_lock_spin(wq);
3137 }
3138 tl->th_flags &= ~(TH_LIST_BUSY);
3139 thread_wakeup_thread(tl, th);
3140 }
3141
3142 #define KEVENT_LIST_LEN 16 // WORKQ_KEVENT_EVENT_BUFFER_LEN
3143 #define KEVENT_DATA_SIZE (32 * 1024)
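
/*
 * The block comment below describes the final stack layout; this sketch
 * shows the corresponding address arithmetic for the 64-bit case, where the
 * stack gap is the 128-byte red zone (C_64_REDZONE_LEN) and alignment is 16
 * bytes (C_64_STK_ALIGN).  When kevents are delivered, _setup_wqthread()
 * simply moves the stack top down below the kevent list (or below the kevent
 * data buffer, if out-of-line data was copied out).  Illustrative sketch
 * only, not compiled; the two constants are assumptions.
 */
#if 0
static user_addr_t
example_stack_top(user_addr_t stackaddr, vm_size_t guardsize,
    vm_size_t stacksize, vm_size_t pthread_t_offset)
{
	const vm_size_t gap = 128;	/* assumed C_64_REDZONE_LEN */
	const vm_size_t align = 16;	/* assumed C_64_STK_ALIGN */

	/* the pthread_t sits above the stack; the usable stack top is just below it */
	user_addr_t pthread_self_addr = stackaddr + stacksize + guardsize + pthread_t_offset;
	return (pthread_self_addr - gap) & -align;
}
#endif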
3144
3145 /**
3146 * configures initial thread stack/registers to jump into:
3147 * _pthread_wqthread(pthread_t self, mach_port_t kport, void *stackaddr, void *keventlist, int upcall_flags, int nkevents);
3148 * to get there we jump through assembly stubs in pthread_asm.s. Those
3149 * routines set up a stack frame, using the current stack pointer, and marshal
3150 * arguments from registers to the stack as required by the ABI.
3151 *
3152 * One odd thing we do here is to start the pthread_t 4k below what would be the
3153 * top of the stack otherwise. This is because usually only the first 4k of the
3154 * pthread_t will be used and so we want to put it on the same 16k page as the
3155 * top of the stack to save memory.
3156 *
3157 * When we are done the stack will look like:
3158 * |-----------| th_stackaddr + th_allocsize
3159 * |pthread_t | th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET
3160 * |kevent list| optionally - at most KEVENT_LIST_LEN events
3161 * |kevent data| optionally - at most KEVENT_DATA_SIZE bytes
3162 * |stack gap | bottom aligned to 16 bytes, and at least as big as stack_gap_min
3163 * | STACK |
3164 * | ⇓ |
3165 * | |
3166 * |guard page | guardsize
3167 * |-----------| th_stackaddr
3168 */
3169 void
3170 _setup_wqthread(proc_t p, thread_t th, struct workqueue *wq, struct threadlist *tl,
3171 bool first_use)
3172 {
3173 int error;
3174 uint32_t upcall_flags;
3175
3176 pthread_priority_t priority = pthread_priority_from_wq_class_index(wq, tl->th_priority);
3177
3178 const vm_size_t guardsize = vm_map_page_size(tl->th_workq->wq_map);
3179 const vm_size_t stack_gap_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_REDZONE_LEN;
3180 const vm_size_t stack_align_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_STK_ALIGN;
3181
3182 user_addr_t pthread_self_addr = (user_addr_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET);
3183 user_addr_t stack_top_addr = (user_addr_t)((pthread_self_addr - stack_gap_min) & -stack_align_min);
3184 user_addr_t stack_bottom_addr = (user_addr_t)(tl->th_stackaddr + guardsize);
3185
3186 user_addr_t wqstart_fnptr = pthread_kern->proc_get_wqthread(p);
3187 if (!wqstart_fnptr) {
3188 panic("workqueue thread start function pointer is NULL");
3189 }
3190
3191 /* Put the QoS class value into the lower bits of the reuse_thread register; this is where
3192 * the thread priority used to be stored anyway.
3193 */
3194 upcall_flags = tl->th_upcall_flags << WQ_FLAG_THREAD_PRIOSHIFT;
3195 upcall_flags |= (_pthread_priority_get_qos_newest(priority) & WQ_FLAG_THREAD_PRIOMASK);
3196
3197 upcall_flags |= WQ_FLAG_THREAD_NEWSPI;
3198
3199 uint32_t tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
3200 if (tsd_offset) {
3201 mach_vm_offset_t th_tsd_base = (mach_vm_offset_t)pthread_self_addr + tsd_offset;
3202 kern_return_t kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
3203 if (kret == KERN_SUCCESS) {
3204 upcall_flags |= WQ_FLAG_THREAD_TSD_BASE_SET;
3205 }
3206 }
3207
3208 if (first_use) {
3209 /*
3210 * Pre-fault the first page of the new thread's stack and the page that will
3211 * contain the pthread_t structure.
3212 */
3213 vm_map_t vmap = pthread_kern->current_map();
3214 if (vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)) !=
3215 vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap))){
3216 vm_fault( vmap,
3217 vm_map_trunc_page_mask((vm_map_offset_t)(stack_top_addr - C_64_REDZONE_LEN), vm_map_page_mask(vmap)),
3218 VM_PROT_READ | VM_PROT_WRITE,
3219 FALSE,
3220 THREAD_UNINT, NULL, 0);
3221 }
3222 vm_fault( vmap,
3223 vm_map_trunc_page_mask((vm_map_offset_t)pthread_self_addr, vm_map_page_mask(vmap)),
3224 VM_PROT_READ | VM_PROT_WRITE,
3225 FALSE,
3226 THREAD_UNINT, NULL, 0);
3227 } else {
3228 upcall_flags |= WQ_FLAG_THREAD_REUSE;
3229 }
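/*
 * Illustrative note on the pre-fault logic above (hypothetical numbers, reusing
 * the worked example preceding this function): with a 16KB VM page,
 * stack_top_addr - C_64_REDZONE_LEN = 0x10084f00 and pthread_self_addr = 0x10085000
 * both truncate to the same page base 0x10084000, so only the second vm_fault()
 * runs and a single page covers both the hot end of the stack and the start of
 * the pthread_t. When the two addresses fall on different pages, both faults are
 * taken up front so the new thread does not stall on page-in the first time it
 * touches its stack in user space.
 */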
3230
3231 user_addr_t kevent_list = NULL;
3232 int kevent_count = 0;
3233 if (upcall_flags & WQ_FLAG_THREAD_KEVENT){
3234 kevent_list = pthread_self_addr - KEVENT_LIST_LEN * sizeof(struct kevent_qos_s);
3235 kevent_count = KEVENT_LIST_LEN;
3236
3237 user_addr_t kevent_data_buf = kevent_list - KEVENT_DATA_SIZE;
3238 user_size_t kevent_data_available = KEVENT_DATA_SIZE;
3239
3240 int32_t events_out = 0;
3241
3242 assert(tl->th_flags & TH_LIST_KEVENT_BOUND);
3243 unsigned int flags = KEVENT_FLAG_WORKQ | KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE;
3244 if (tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET) {
3245 flags |= KEVENT_FLAG_WORKQ_MANAGER;
3246 }
3247 int ret = kevent_qos_internal(p, class_index_get_thread_qos(tl->th_priority), NULL, 0, kevent_list, kevent_count,
3248 kevent_data_buf, &kevent_data_available,
3249 flags, &events_out);
3250
3251 // turns out there are a lot of edge cases where this will fail, so not enabled by default
3252 //assert((ret == KERN_SUCCESS && events_out != -1) || ret == KERN_ABORTED);
3253
3254 // squash any errors into just empty output
3255 if (ret != KERN_SUCCESS || events_out == -1){
3256 events_out = 0;
3257 kevent_data_available = KEVENT_DATA_SIZE;
3258 }
3259
3260 // We shouldn't get data out if there aren't events available
3261 assert(events_out != 0 || kevent_data_available == KEVENT_DATA_SIZE);
3262
3263 if (events_out > 0){
3264 if (kevent_data_available == KEVENT_DATA_SIZE){
3265 stack_top_addr = (kevent_list - stack_gap_min) & -stack_align_min;
3266 } else {
3267 stack_top_addr = (kevent_data_buf + kevent_data_available - stack_gap_min) & -stack_align_min;
3268 }
3269
3270 kevent_count = events_out;
3271 } else {
3272 kevent_list = NULL;
3273 kevent_count = 0;
3274 }
3275 }
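/*
 * Illustrative note on the kevent carve-out above (an interpretation of the
 * arithmetic, not authoritative): the list of up to KEVENT_LIST_LEN events sits
 * immediately below the pthread_t, with the KEVENT_DATA_SIZE (32KB) out-of-line
 * data window below that. If, say, kevent_qos_internal() returns events_out == 3
 * and leaves kevent_data_available == KEVENT_DATA_SIZE - 256, the 256 bytes of
 * event data occupy the top of that window and the thread's initial stack top is
 * re-aligned to just below kevent_data_buf + (KEVENT_DATA_SIZE - 256); with no
 * data used, the stack top lands just below kevent_list instead.
 */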
3276
3277 #if defined(__i386__) || defined(__x86_64__)
3278 if (proc_is64bit(p) == 0) {
3279 x86_thread_state32_t state = {
3280 .eip = (unsigned int)wqstart_fnptr,
3281 .eax = /* arg0 */ (unsigned int)pthread_self_addr,
3282 .ebx = /* arg1 */ (unsigned int)tl->th_thport,
3283 .ecx = /* arg2 */ (unsigned int)stack_bottom_addr,
3284 .edx = /* arg3 */ (unsigned int)kevent_list,
3285 .edi = /* arg4 */ (unsigned int)upcall_flags,
3286 .esi = /* arg5 */ (unsigned int)kevent_count,
3287
3288 .esp = (int)((vm_offset_t)stack_top_addr),
3289 };
3290
3291 error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
3292 if (error != KERN_SUCCESS) {
3293 panic("%s: thread_set_wq_state failed: %d", __func__, error);
3294 }
3295 } else {
3296 x86_thread_state64_t state64 = {
3297 // x86-64 already passes all the arguments in registers, so we just put them in their final place here
3298 .rip = (uint64_t)wqstart_fnptr,
3299 .rdi = (uint64_t)pthread_self_addr,
3300 .rsi = (uint64_t)tl->th_thport,
3301 .rdx = (uint64_t)stack_bottom_addr,
3302 .rcx = (uint64_t)kevent_list,
3303 .r8 = (uint64_t)upcall_flags,
3304 .r9 = (uint64_t)kevent_count,
3305
3306 .rsp = (uint64_t)(stack_top_addr)
3307 };
3308
3309 error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
3310 if (error != KERN_SUCCESS) {
3311 panic("%s: thread_set_wq_state failed: %d", __func__, error);
3312 }
3313 }
3314 #else
3315 #error setup_wqthread not defined for this architecture
3316 #endif
3317 }
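/*
 * Illustrative summary of the register setup above: on x86-64 the six arguments
 * of the entry point documented before _setup_wqthread(),
 *
 *   _pthread_wqthread(self, kport, stackaddr, keventlist, upcall_flags, nkevents)
 *
 * are pre-loaded straight into the System V AMD64 argument registers
 * %rdi, %rsi, %rdx, %rcx, %r8 and %r9 respectively, with %rip pointing at
 * wqstart_fnptr and %rsp at stack_top_addr, so resuming the thread looks like an
 * ordinary call into user space. The 32-bit path instead parks the values in
 * scratch registers and relies on the stubs in pthread_asm.s to build the stack
 * frame the IA-32 calling convention expects.
 */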
3318
3319 #if DEBUG
3320 static int wq_kevent_test SYSCTL_HANDLER_ARGS {
3321 //(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
3322 #pragma unused(oidp, arg1, arg2)
3323 int error;
3324 struct workq_reqthreads_req_s requests[64] = {};
3325
3326 if (req->newlen > sizeof(requests) || req->newlen < sizeof(struct workq_reqthreads_req_s))
3327 return EINVAL;
3328
3329 error = copyin(req->newptr, requests, req->newlen);
3330 if (error) return error;
3331
3332 _workq_reqthreads(req->p, (int)(req->newlen / sizeof(struct workq_reqthreads_req_s)), requests);
3333
3334 return 0;
3335 }
3336 #endif // DEBUG
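/*
 * Illustrative user-space sketch (not part of the original source): the DEBUG
 * handler above consumes an array of workq_reqthreads_req_s written through the
 * sysctl's new-value buffer. Assuming the OID registered below surfaces as
 * "debug.wq_kevent_test" (inferred from the symbol name, not documented here),
 * and that the struct definition comes from the project's private workqueue
 * header, a test harness could drive it roughly like this:
 */
#if 0
#include <sys/sysctl.h>

static int
drive_wq_kevent_test(struct workq_reqthreads_req_s *reqs, size_t nreqs)
{
	/* write-only sysctl: pass the requests as the new value, read nothing back */
	return sysctlbyname("debug.wq_kevent_test", NULL, NULL,
	    reqs, nreqs * sizeof(*reqs));
}
#endif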
3337
3338 #pragma mark - Misc
3339
3340 int
3341 _fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
3342 {
3343 struct workqueue * wq;
3344 int error = 0;
3345 int activecount;
3346 uint32_t pri;
3347
3348 if ((wq = pthread_kern->proc_get_wqptr(p)) == NULL) {
3349 return EINVAL;
3350 }
3351
3352 workqueue_lock_spin(wq);
3353 activecount = 0;
3354
3355 for (pri = 0; pri < WORKQUEUE_NUM_BUCKETS; pri++) {
3356 activecount += wq->wq_thactive_count[pri];
3357 }
3358 pwqinfo->pwq_nthreads = wq->wq_nthreads;
3359 pwqinfo->pwq_runthreads = activecount;
3360 pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
3361 pwqinfo->pwq_state = 0;
3362
3363 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
3364 pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
3365 }
3366
3367 if (wq->wq_nthreads >= wq_max_threads) {
3368 pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
3369 }
3370
3371 workqueue_unlock(wq);
3372 return(error);
3373 }
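/*
 * Illustrative user-space sketch (not part of the original source): the structure
 * filled in above is surfaced through proc_pidinfo()'s PROC_PIDWORKQUEUEINFO
 * flavor from <sys/proc_info.h>, so a minimal consumer might look like this
 * (error handling trimmed):
 */
#if 0
#include <sys/types.h>
#include <sys/proc_info.h>
#include <libproc.h>
#include <stdio.h>

static void
print_workqueue_info(pid_t pid)
{
	struct proc_workqueueinfo pwqinfo;
	int ret = proc_pidinfo(pid, PROC_PIDWORKQUEUEINFO, 0,
	    &pwqinfo, sizeof(pwqinfo));
	if (ret == (int)sizeof(pwqinfo)) {
		printf("threads=%u running=%u blocked=%u state=0x%x\n",
		    pwqinfo.pwq_nthreads, pwqinfo.pwq_runthreads,
		    pwqinfo.pwq_blockedthreads, pwqinfo.pwq_state);
	}
}
#endif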
3374
3375 uint32_t
3376 _get_pwq_state_kdp(proc_t p)
3377 {
3378 if (p == NULL) {
3379 return 0;
3380 }
3381
3382 struct workqueue *wq = pthread_kern->proc_get_wqptr(p);
3383
3384 if (wq == NULL || workqueue_lock_spin_is_acquired_kdp(wq)) {
3385 return 0;
3386 }
3387
3388 uint32_t pwq_state = WQ_FLAGS_AVAILABLE;
3389
3390 if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
3391 pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
3392 }
3393
3394 if (wq->wq_nthreads >= wq_max_threads) {
3395 pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
3396 }
3397
3398 return pwq_state;
3399 }
3400
3401 int
3402 _thread_selfid(__unused struct proc *p, uint64_t *retval)
3403 {
3404 thread_t thread = current_thread();
3405 *retval = thread_tid(thread);
3406 return KERN_SUCCESS;
3407 }
3408
3409 void
3410 _pthread_init(void)
3411 {
3412 pthread_lck_grp_attr = lck_grp_attr_alloc_init();
3413 pthread_lck_grp = lck_grp_alloc_init("pthread", pthread_lck_grp_attr);
3414
3415 /*
3416 * allocate the lock attribute for pthread synchronizers
3417 */
3418 pthread_lck_attr = lck_attr_alloc_init();
3419
3420 pthread_list_mlock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);
3421
3422 pth_global_hashinit();
3423 psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL);
3424 psynch_zoneinit();
3425
3426 /*
3427 * register sysctls
3428 */
3429 sysctl_register_oid(&sysctl__kern_wq_yielded_threshold);
3430 sysctl_register_oid(&sysctl__kern_wq_yielded_window_usecs);
3431 sysctl_register_oid(&sysctl__kern_wq_stalled_window_usecs);
3432 sysctl_register_oid(&sysctl__kern_wq_reduce_pool_window_usecs);
3433 sysctl_register_oid(&sysctl__kern_wq_max_timer_interval_usecs);
3434 sysctl_register_oid(&sysctl__kern_wq_max_threads);
3435 sysctl_register_oid(&sysctl__kern_wq_max_constrained_threads);
3436 sysctl_register_oid(&sysctl__kern_pthread_debug_tracing);
3437
3438 #if DEBUG
3439 sysctl_register_oid(&sysctl__kern_wq_max_concurrency);
3440 sysctl_register_oid(&sysctl__debug_wq_kevent_test);
3441 #endif
3442
3443 wq_max_concurrency = pthread_kern->ml_get_max_cpus();
3444
3445 }
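/*
 * Illustrative user-space sketch (not part of the original source): the OIDs
 * registered above surface as ordinary sysctls. Assuming the conventional
 * symbol-to-name mapping (sysctl__kern_wq_max_threads -> "kern.wq_max_threads";
 * an inference, not documented here), one of the knobs can be read like this:
 */
#if 0
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

static void
print_wq_max_threads(void)
{
	uint32_t max_threads = 0;
	size_t len = sizeof(max_threads);
	if (sysctlbyname("kern.wq_max_threads", &max_threads, &len, NULL, 0) == 0) {
		printf("kern.wq_max_threads = %u\n", max_threads);
	}
}
#endif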