/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995-2005 Apple Computer, Inc. All Rights Reserved */
#define _PTHREAD_CONDATTR_T
#define _PTHREAD_COND_T
#define _PTHREAD_MUTEXATTR_T
#define _PTHREAD_MUTEX_T
#define _PTHREAD_RWLOCKATTR_T
#define _PTHREAD_RWLOCK_T

#undef pthread_mutexattr_t
#undef pthread_mutex_t
#undef pthread_condattr_t
#undef pthread_cond_t
#undef pthread_rwlockattr_t
#undef pthread_rwlock_t
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
//#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/timeb.h>
#include <sys/times.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/kdebug.h>
//#include <sys/sysproto.h>
#include <sys/user.h>		/* for coredump */
#include <sys/proc_info.h>	/* for fill_procworkqueue */

#include <mach/mach_port.h>
#include <mach/mach_types.h>
#include <mach/semaphore.h>
#include <mach/sync_policy.h>
#include <mach/task.h>
#include <mach/vm_prot.h>
#include <kern/kern_types.h>
#include <kern/task.h>
#include <kern/clock.h>
#include <mach/kern_return.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/kalloc.h>
#include <kern/sched_prim.h>	/* for thread_exception_return */
#include <kern/processor.h>
#include <kern/assert.h>
#include <mach/mach_vm.h>
#include <mach/mach_param.h>
#include <mach/thread_status.h>
#include <mach/thread_policy.h>
#include <mach/message.h>
#include <mach/port.h>
//#include <vm/vm_protos.h>
#include <vm/vm_fault.h>
#include <vm/vm_map.h>
#include <mach/thread_act.h>	/* for thread_resume */
#include <machine/machine_routines.h>

#include <libkern/OSAtomic.h>

#include <sys/pthread_shims.h>
#include "kern_internal.h"
uint32_t pthread_debug_tracing = 0;

SYSCTL_INT(_kern, OID_AUTO, pthread_debug_tracing, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &pthread_debug_tracing, 0, "")
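/*
 * Usage sketch (illustrative, exercised from user space rather than from this
 * file): a CTLFLAG_RW knob registered this way under _kern can be read or
 * toggled with the sysctl(8) utility, e.g.
 *
 *	sysctl kern.pthread_debug_tracing
 *	sudo sysctl -w kern.pthread_debug_tracing=1
 */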
// XXX: Dirty import for sys/signalvar.h that's wrapped in BSD_KERNEL_PRIVATE
#define sigcantmask (sigmask(SIGKILL) | sigmask(SIGSTOP))
lck_grp_attr_t	*pthread_lck_grp_attr;
lck_grp_t	*pthread_lck_grp;
lck_attr_t	*pthread_lck_attr;

extern void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64);
extern void workqueue_thread_yielded(void);
static boolean_t workqueue_run_nextreq(proc_t p, struct workqueue *wq, thread_t th, boolean_t force_oc,
				       boolean_t overcommit, pthread_priority_t oc_prio);

static boolean_t workqueue_run_one(proc_t p, struct workqueue *wq, boolean_t overcommit, pthread_priority_t priority);

static void wq_runreq(proc_t p, boolean_t overcommit, pthread_priority_t priority, thread_t th, struct threadlist *tl,
		      int reuse_thread, int wake_thread, int return_directly);

static int _setup_wqthread(proc_t p, thread_t th, boolean_t overcommit, pthread_priority_t priority, int reuse_thread, struct threadlist *tl);

static void wq_unpark_continue(void);
static void wq_unsuspend_continue(void);

static boolean_t workqueue_addnewthread(struct workqueue *wq, boolean_t oc_thread);
static void workqueue_removethread(struct threadlist *tl, int fromexit);
static void workqueue_lock_spin(proc_t);
static void workqueue_unlock(proc_t);

int proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc);
int proc_setalltargetconc(pid_t pid, int32_t * targetconcp);
#define WQ_MAXPRI_MIN	0	/* low prio queue num */
#define WQ_MAXPRI_MAX	2	/* max prio queuenum */
#define WQ_PRI_NUM	3	/* number of prio work queues */

#define C_32_STK_ALIGN		16
#define C_64_STK_ALIGN		16
#define C_64_REDZONE_LEN	128
#define TRUNC_DOWN32(a,c)	((((uint32_t)a)-(c)) & ((uint32_t)(-(c))))
#define TRUNC_DOWN64(a,c)	((((uint64_t)a)-(c)) & ((uint64_t)(-(c))))
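/*
 * Worked example (illustrative only; the values below are not taken from this
 * file and assume 'c' is a power of two, as it is at the call sites): these
 * macros reserve at least 'c' bytes below 'a' and then round the result down
 * to a 'c'-byte boundary, e.g.
 *
 *	TRUNC_DOWN64(0x7fff5fc00008ULL, 16) == (0x7fff5fc00008 - 16) & ~15
 *	                                    == 0x7fff5fbffff0
 */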
/*
 * Flags field passed to bsdthread_create and back in pthread_start
 * 31  <---------------------------------> 0
 * _________________________________________
 * |  flags(8) | policy(8) | importance(16) |
 * -----------------------------------------
 */

#define PTHREAD_START_CUSTOM		0x01000000
#define PTHREAD_START_SETSCHED		0x02000000
#define PTHREAD_START_DETACHED		0x04000000
#define PTHREAD_START_QOSCLASS		0x08000000
#define PTHREAD_START_QOSCLASS_MASK	0xffffff
#define PTHREAD_START_POLICY_BITSHIFT	16
#define PTHREAD_START_POLICY_MASK	0xff
#define PTHREAD_START_IMPORTANCE_MASK	0xffff
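/*
 * Decoding example for the flags word laid out above (an illustrative sketch
 * using only the masks defined here; the local names are hypothetical and
 * mirror the decoding done in _bsdthread_create() below):
 *
 *	unsigned int policy     = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;
 *	unsigned int importance = flags & PTHREAD_START_IMPORTANCE_MASK;
 *	boolean_t    setsched   = (flags & PTHREAD_START_SETSCHED) != 0;
 */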
#define SCHED_OTHER	POLICY_TIMESHARE
#define SCHED_FIFO	POLICY_FIFO
#define SCHED_RR	POLICY_RR
_bsdthread_create(struct proc *p, user_addr_t user_func, user_addr_t user_funcarg, user_addr_t user_stack, user_addr_t user_pthread, uint32_t flags, user_addr_t *retval)

	mach_vm_offset_t stackaddr;
	mach_vm_size_t th_allocsize = 0;
	mach_vm_size_t user_stacksize;
	mach_vm_size_t th_stacksize;
	mach_vm_size_t th_guardsize;
	mach_vm_offset_t th_stackaddr;
	mach_vm_offset_t th_stack;
	mach_vm_offset_t th_pthread;
	mach_port_name_t th_thport;
	vm_map_t vmap = pthread_kern->current_map();
	task_t ctask = current_task();
	unsigned int policy, importance;

	if (pthread_kern->proc_get_register(p) == 0) {

	PTHREAD_TRACE(TRACE_pthread_thread_create | DBG_FUNC_START, flags, 0, 0, 0, 0);

	isLP64 = proc_is64bit(p);
	th_guardsize = vm_map_page_size(vmap);
#if defined(__i386__) || defined(__x86_64__)
	stackaddr = 0xB0000000;
#error Need to define a stack address hint for this architecture

	kret = pthread_kern->thread_create(ctask, &th);
	if (kret != KERN_SUCCESS)

	thread_reference(th);

	sright = (void *)pthread_kern->convert_thread_to_port(th);
	th_thport = pthread_kern->ipc_port_copyout_send(sright, pthread_kern->task_get_ipcspace(ctask));

	if ((flags & PTHREAD_START_CUSTOM) == 0) {
		th_stacksize = (mach_vm_size_t)user_stack;		/* if it is custom then it is stacksize */
		th_allocsize = th_stacksize + th_guardsize + pthread_kern->proc_get_pthsize(p);

		kret = mach_vm_map(vmap, &stackaddr,
				VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE, NULL,
				0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
				VM_INHERIT_DEFAULT);
		if (kret != KERN_SUCCESS)
			kret = mach_vm_allocate(vmap,
					&stackaddr, th_allocsize,
					VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE);
		if (kret != KERN_SUCCESS) {

		PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, th_allocsize, stackaddr, 0, 2, 0);

		th_stackaddr = stackaddr;

		/*
		 * The guard page is at the lowest address
		 * The stack base is the highest address
		 */
		kret = mach_vm_protect(vmap, stackaddr, th_guardsize, FALSE, VM_PROT_NONE);

		if (kret != KERN_SUCCESS) {

		th_stack = (stackaddr + th_stacksize + th_guardsize);
		th_pthread = (stackaddr + th_stacksize + th_guardsize);
		user_stacksize = th_stacksize;

		/*
		 * Pre-fault the first page of the new thread's stack and the page that will
		 * contain the pthread_t structure.
		 */
			vm_map_trunc_page_mask(th_stack - PAGE_SIZE_64, vm_map_page_mask(vmap)),
			VM_PROT_READ | VM_PROT_WRITE,
			THREAD_UNINT, NULL, 0);

			vm_map_trunc_page_mask(th_pthread, vm_map_page_mask(vmap)),
			VM_PROT_READ | VM_PROT_WRITE,
			THREAD_UNINT, NULL, 0);

		th_stack = user_stack;
		user_stacksize = user_stack;
		th_pthread = user_pthread;

		PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, 0, 0, 0, 3, 0);
#if defined(__i386__) || defined(__x86_64__)
	/*
	 * Set up i386 registers & function call.
	 */
		x86_thread_state32_t state;
		x86_thread_state32_t *ts = &state;

		ts->eip = (unsigned int)pthread_kern->proc_get_threadstart(p);
		ts->eax = (unsigned int)th_pthread;
		ts->ebx = (unsigned int)th_thport;
		ts->ecx = (unsigned int)user_func;
		ts->edx = (unsigned int)user_funcarg;
		ts->edi = (unsigned int)user_stacksize;
		ts->esi = (unsigned int)flags;

		ts->esp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN));

		error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)ts);
		if (error != KERN_SUCCESS) {

		x86_thread_state64_t state64;
		x86_thread_state64_t *ts64 = &state64;

		ts64->rip = (uint64_t)pthread_kern->proc_get_threadstart(p);
		ts64->rdi = (uint64_t)th_pthread;
		ts64->rsi = (uint64_t)(th_thport);
		ts64->rdx = (uint64_t)user_func;
		ts64->rcx = (uint64_t)user_funcarg;
		ts64->r8 = (uint64_t)user_stacksize;
		ts64->r9 = (uint64_t)flags;
		/*
		 * set stack pointer aligned to 16 byte boundary
		 */
		ts64->rsp = (uint64_t)(th_stack - C_64_REDZONE_LEN);

		error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)ts64);
		if (error != KERN_SUCCESS) {

#elif defined(__arm__)
	arm_thread_state_t state;
	arm_thread_state_t *ts = &state;

	ts->pc = (int)pthread_kern->proc_get_threadstart(p);
	ts->r[0] = (unsigned int)th_pthread;
	ts->r[1] = (unsigned int)th_thport;
	ts->r[2] = (unsigned int)user_func;
	ts->r[3] = (unsigned int)user_funcarg;
	ts->r[4] = (unsigned int)user_stacksize;
	ts->r[5] = (unsigned int)flags;

	/* Set r7 & lr to 0 for better back tracing */

	ts->sp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN));

	(void) pthread_kern->thread_set_wq_state32(th, (thread_state_t)ts);

#error bsdthread_create not defined for this architecture
	if ((flags & PTHREAD_START_SETSCHED) != 0) {
		/* Set scheduling parameters if needed */
		thread_extended_policy_data_t	extinfo;
		thread_precedence_policy_data_t	precedinfo;

		importance = (flags & PTHREAD_START_IMPORTANCE_MASK);
		policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;

		if (policy == SCHED_OTHER) {
			extinfo.timeshare = 1;
		} else {
			extinfo.timeshare = 0;
		}

		thread_policy_set(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT);

#define BASEPRI_DEFAULT 31
		precedinfo.importance = (importance - BASEPRI_DEFAULT);
		thread_policy_set(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT);
	} else if ((flags & PTHREAD_START_QOSCLASS) != 0) {
		/* Set thread QoS class if requested. */
		pthread_priority_t priority = (pthread_priority_t)(flags & PTHREAD_START_QOSCLASS_MASK);

		thread_qos_policy_data_t qos;
		qos.qos_tier = pthread_priority_get_qos_class(priority);
		qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 :
				_pthread_priority_get_relpri(priority);

		pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);

	kret = pthread_kern->thread_resume(th);
	if (kret != KERN_SUCCESS) {

	thread_deallocate(th);	/* drop the creator reference */

	PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_END, error, th_pthread, 0, 0, 0);

	*retval = th_pthread;

	if (allocated != 0) {
		(void)mach_vm_deallocate(vmap, stackaddr, th_allocsize);

	(void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(ctask), th_thport);
	(void)thread_terminate(th);
	(void)thread_deallocate(th);
_bsdthread_terminate(__unused struct proc *p,
		     user_addr_t stackaddr,
		     __unused int32_t *retval)

	mach_vm_offset_t freeaddr;
	mach_vm_size_t freesize;

	freeaddr = (mach_vm_offset_t)stackaddr;

	PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_START, freeaddr, freesize, kthport, 0xff, 0);

	if ((freesize != (mach_vm_size_t)0) && (freeaddr != (mach_vm_offset_t)0)) {
		kret = mach_vm_deallocate(pthread_kern->current_map(), freeaddr, freesize);
		if (kret != KERN_SUCCESS) {
			PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);

	(void) thread_terminate(current_thread());

	if (sem != MACH_PORT_NULL) {
		kret = pthread_kern->semaphore_signal_internal_trap(sem);
		if (kret != KERN_SUCCESS) {
			PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);

	if (kthport != MACH_PORT_NULL) {
		pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(current_task()), kthport);

	PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0, 0, 0, 0);

	pthread_kern->thread_exception_return();
	panic("bsdthread_terminate: still running\n");

	PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0xff, 0, 0, 0);
_bsdthread_register(struct proc *p,
		    user_addr_t threadstart,
		    user_addr_t wqthread,
		    user_addr_t pthread_init_data,
		    user_addr_t targetconc_ptr,
		    uint64_t dispatchqueue_offset,

	/* prevent multiple registrations */
	if (pthread_kern->proc_get_register(p) != 0) {

	/* syscall randomizer test can pass bogus values */
	if (pthsize < 0 || pthsize > MAX_PTHREAD_SIZE) {

	pthread_kern->proc_set_threadstart(p, threadstart);
	pthread_kern->proc_set_wqthread(p, wqthread);
	pthread_kern->proc_set_pthsize(p, pthsize);
	pthread_kern->proc_set_register(p);

	/* if we have pthread_init_data, then we use that and target_concptr (which is an offset) to get data. */
	if (pthread_init_data != 0) {
		thread_qos_policy_data_t qos;

		struct _pthread_registration_data data;
		size_t pthread_init_sz = MIN(sizeof(struct _pthread_registration_data), (size_t)targetconc_ptr);

		kern_return_t kr = copyin(pthread_init_data, &data, pthread_init_sz);
		if (kr != KERN_SUCCESS) {

		/* Incoming data from the data structure */
		pthread_kern->proc_set_dispatchqueue_offset(p, data.dispatch_queue_offset);

		/* Outgoing data that userspace expects as a reply */
		if (pthread_kern->qos_main_thread_active()) {
			mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
			boolean_t gd = FALSE;

			kr = pthread_kern->thread_policy_get(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
			if (kr != KERN_SUCCESS || qos.qos_tier == THREAD_QOS_UNSPECIFIED) {
				/* An unspecified tier means the kernel wants us to impose legacy QoS on the thread. */
				qos.qos_tier = THREAD_QOS_LEGACY;
				qos.tier_importance = 0;

				kr = pthread_kern->thread_policy_set_internal(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);

			if (kr == KERN_SUCCESS) {
				data.main_qos = pthread_qos_class_get_priority(qos.qos_tier);
			} else {
				data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);
			}
		} else {
			data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);
		}

		kr = copyout(&data, pthread_init_data, pthread_init_sz);
		if (kr != KERN_SUCCESS) {

		pthread_kern->proc_set_dispatchqueue_offset(p, dispatchqueue_offset);
		pthread_kern->proc_set_targconc(p, targetconc_ptr);

	/* return the supported feature set as the return value. */
	*retval = PTHREAD_FEATURE_SUPPORTED;
_bsdthread_ctl_set_qos(struct proc *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t tsd_priority_addr, user_addr_t arg3, int *retval)

	pthread_priority_t priority;

	/* Unused parameters must be zero. */

	/* QoS is stored in a given slot in the pthread TSD. We need to copy that in and set our QoS based on it. */
	if (proc_is64bit(p)) {
		kr = copyin(tsd_priority_addr, &v, sizeof(v));
		if (kr != KERN_SUCCESS) {

		priority = (int)(v & 0xffffffff);

		kr = copyin(tsd_priority_addr, &v, sizeof(v));
		if (kr != KERN_SUCCESS) {

	if ((th = port_name_to_thread(kport)) == THREAD_NULL) {

	/* <rdar://problem/16211829> Disable pthread_set_qos_class_np() on threads other than pthread_self */
	if (th != current_thread()) {
		thread_deallocate(th);

	int rv = _bsdthread_ctl_set_self(p, 0, priority, 0, _PTHREAD_SET_SELF_QOS_FLAG, retval);

	/* Static param the thread, we just set QoS on it, so it's stuck in QoS land now. */
	/* pthread_kern->thread_static_param(th, TRUE); */ // see <rdar://problem/16433744>, for details

	thread_deallocate(th);
static inline struct threadlist *
util_get_thread_threadlist_entry(thread_t th)

	struct uthread *uth = pthread_kern->get_bsdthread_info(th);

	struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
wq_thread_override_reset(thread_t th, user_addr_t resource)

	struct uthread *uth = pthread_kern->get_bsdthread_info(th);
	struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);

		/*
		 * Drop all outstanding overrides on this thread, done outside the wq lock
		 * because proc_usynch_thread_qos_remove_override_for_resource takes a spinlock that
		 * could cause us to panic.
		 */
		PTHREAD_TRACE(TRACE_wq_override_reset | DBG_FUNC_NONE, tl->th_workq, 0, 0, 0, 0);

		pthread_kern->proc_usynch_thread_qos_reset_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE);
_bsdthread_ctl_set_self(struct proc *p, user_addr_t __unused cmd, pthread_priority_t priority, mach_port_name_t voucher, _pthread_set_flags_t flags, int __unused *retval)

	thread_qos_policy_data_t qos;
	mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
	boolean_t gd = FALSE;

	int qos_rv = 0, voucher_rv = 0, fixedpri_rv = 0;

	if ((flags & _PTHREAD_SET_SELF_QOS_FLAG) != 0) {
		kr = pthread_kern->thread_policy_get(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
		if (kr != KERN_SUCCESS) {

		/* If we have main-thread QoS then we don't allow a thread to come out of QOS_CLASS_UNSPECIFIED. */
		if (pthread_kern->qos_main_thread_active() && qos.qos_tier == THREAD_QOS_UNSPECIFIED) {

		/* Get the work queue for tracing, also the threadlist for bucket manipulation. */
		struct workqueue *wq = NULL;
		struct threadlist *tl = util_get_thread_threadlist_entry(current_thread());

		PTHREAD_TRACE(TRACE_pthread_set_qos_self | DBG_FUNC_START, wq, qos.qos_tier, qos.tier_importance, 0, 0);

		qos.qos_tier = pthread_priority_get_qos_class(priority);
		qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 : _pthread_priority_get_relpri(priority);

		kr = pthread_kern->thread_policy_set_internal(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
		if (kr != KERN_SUCCESS) {

		/* If we're a workqueue, the threadlist item priority needs adjusting, along with the bucket we were running in. */
			workqueue_lock_spin(p);

			/* Fix up counters. */
			uint8_t old_bucket = tl->th_priority;
			uint8_t new_bucket = pthread_priority_get_class_index(priority);

			uint32_t old_active = OSAddAtomic(-1, &wq->wq_thactive_count[old_bucket]);
			OSAddAtomic(1, &wq->wq_thactive_count[new_bucket]);

			wq->wq_thscheduled_count[old_bucket]--;
			wq->wq_thscheduled_count[new_bucket]++;

			tl->th_priority = new_bucket;

			/* If we were at the ceiling of non-overcommitted threads for a given bucket, we have to
			 * reevaluate whether we should start more work.
			 */
			if (old_active == wq->wq_reqconc[old_bucket]) {
				/* workqueue_run_nextreq will drop the workqueue lock in all exit paths. */
				(void)workqueue_run_nextreq(p, wq, THREAD_NULL, FALSE, FALSE, 0);

		PTHREAD_TRACE(TRACE_pthread_set_qos_self | DBG_FUNC_END, wq, qos.qos_tier, qos.tier_importance, 0, 0);

	if ((flags & _PTHREAD_SET_SELF_VOUCHER_FLAG) != 0) {
		kr = pthread_kern->thread_set_voucher_name(voucher);
		if (kr != KERN_SUCCESS) {

	if ((flags & _PTHREAD_SET_SELF_FIXEDPRIORITY_FLAG) != 0) {
		thread_extended_policy_data_t extpol;
		thread_t thread = current_thread();

		extpol.timeshare = 0;

		struct threadlist *tl = util_get_thread_threadlist_entry(thread);
			/* Not allowed on workqueue threads, since there is no symmetric clear function */
			fixedpri_rv = ENOTSUP;

		kr = pthread_kern->thread_policy_set_internal(thread, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
		if (kr != KERN_SUCCESS) {
			fixedpri_rv = EINVAL;

	if (qos_rv && voucher_rv) {
		/* Both failed, give that a unique error. */
_bsdthread_ctl_qos_override_start(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval)

	if ((th = port_name_to_thread(kport)) == THREAD_NULL) {

	struct uthread *uth = pthread_kern->get_bsdthread_info(th);
	int override_qos = pthread_priority_get_qos_class(priority);

	struct threadlist *tl = util_get_thread_threadlist_entry(th);
		PTHREAD_TRACE(TRACE_wq_override_start | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0);

	/* The only failure case here is if we pass a tid and have it look up the thread; since we pass the uthread, this always succeeds. */
	pthread_kern->proc_usynch_thread_qos_add_override_for_resource(current_task(), uth, 0, override_qos, TRUE, resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);

	thread_deallocate(th);
_bsdthread_ctl_qos_override_end(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t resource, user_addr_t arg3, int __unused *retval)

	if ((th = port_name_to_thread(kport)) == THREAD_NULL) {

	struct uthread *uth = pthread_kern->get_bsdthread_info(th);

	struct threadlist *tl = util_get_thread_threadlist_entry(th);
		PTHREAD_TRACE(TRACE_wq_override_end | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 0, 0, 0);

	pthread_kern->proc_usynch_thread_qos_remove_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);

	thread_deallocate(th);
_bsdthread_ctl_qos_override_dispatch(struct proc *p, user_addr_t cmd, mach_port_name_t kport, pthread_priority_t priority, user_addr_t arg3, int *retval)

	return _bsdthread_ctl_qos_dispatch_asynchronous_override_add(p, cmd, kport, priority, USER_ADDR_NULL, retval);

_bsdthread_ctl_qos_dispatch_asynchronous_override_add(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval)

	if ((th = port_name_to_thread(kport)) == THREAD_NULL) {

	struct uthread *uth = pthread_kern->get_bsdthread_info(th);
	int override_qos = pthread_priority_get_qos_class(priority);

	struct threadlist *tl = util_get_thread_threadlist_entry(th);
		thread_deallocate(th);

	PTHREAD_TRACE(TRACE_wq_override_dispatch | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0);

	/* The only failure case here is if we pass a tid and have it look up the thread; since we pass the uthread, this always succeeds. */
	pthread_kern->proc_usynch_thread_qos_add_override_for_resource(current_task(), uth, 0, override_qos, TRUE, resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE);

	thread_deallocate(th);
_bsdthread_ctl_qos_override_reset(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval)

	if (arg1 != 0 || arg2 != 0 || arg3 != 0) {

	return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, 1 /* reset_all */, 0, 0, retval);

_bsdthread_ctl_qos_dispatch_asynchronous_override_reset(struct proc __unused *p, user_addr_t __unused cmd, int reset_all, user_addr_t resource, user_addr_t arg3, int __unused *retval)

	struct threadlist *tl;

	if ((reset_all && (resource != 0)) || arg3 != 0) {

	th = current_thread();
	tl = util_get_thread_threadlist_entry(th);

		wq_thread_override_reset(th, reset_all ? THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD : resource);
_bsdthread_ctl(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval)

	case BSDTHREAD_CTL_SET_QOS:
		return _bsdthread_ctl_set_qos(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
	case BSDTHREAD_CTL_QOS_OVERRIDE_START:
		return _bsdthread_ctl_qos_override_start(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
	case BSDTHREAD_CTL_QOS_OVERRIDE_END:
		return _bsdthread_ctl_qos_override_end(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
	case BSDTHREAD_CTL_QOS_OVERRIDE_RESET:
		return _bsdthread_ctl_qos_override_reset(p, cmd, arg1, arg2, arg3, retval);
	case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH:
		return _bsdthread_ctl_qos_override_dispatch(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
	case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD:
		return _bsdthread_ctl_qos_dispatch_asynchronous_override_add(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
	case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET:
		return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, (int)arg1, arg2, arg3, retval);
	case BSDTHREAD_CTL_SET_SELF:
		return _bsdthread_ctl_set_self(p, cmd, (pthread_priority_t)arg1, (mach_port_name_t)arg2, (_pthread_set_flags_t)arg3, retval);
uint32_t wq_yielded_threshold		= WQ_YIELDED_THRESHOLD;
uint32_t wq_yielded_window_usecs	= WQ_YIELDED_WINDOW_USECS;
uint32_t wq_stalled_window_usecs	= WQ_STALLED_WINDOW_USECS;
uint32_t wq_reduce_pool_window_usecs	= WQ_REDUCE_POOL_WINDOW_USECS;
uint32_t wq_max_timer_interval_usecs	= WQ_MAX_TIMER_INTERVAL_USECS;
uint32_t wq_max_threads			= WORKQUEUE_MAXTHREADS;
uint32_t wq_max_constrained_threads	= WORKQUEUE_MAXTHREADS / 8;
SYSCTL_INT(_kern, OID_AUTO, wq_yielded_threshold, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_yielded_threshold, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_yielded_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_yielded_window_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_stalled_window_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_reduce_pool_window_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_max_timer_interval_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_max_threads, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_max_constrained_threads, 0, "");
static uint32_t wq_init_constrained_limit = 1;
_workqueue_init_lock(proc_t p)

	lck_spin_init(pthread_kern->proc_get_wqlockptr(p), pthread_lck_grp, pthread_lck_attr);
	*(pthread_kern->proc_get_wqinitingptr(p)) = FALSE;

_workqueue_destroy_lock(proc_t p)

	lck_spin_destroy(pthread_kern->proc_get_wqlockptr(p), pthread_lck_grp);

workqueue_lock_spin(proc_t p)

	lck_spin_lock(pthread_kern->proc_get_wqlockptr(p));

workqueue_unlock(proc_t p)

	lck_spin_unlock(pthread_kern->proc_get_wqlockptr(p));
workqueue_interval_timer_start(struct workqueue *wq)

	if (wq->wq_timer_interval == 0) {
		wq->wq_timer_interval = wq_stalled_window_usecs;
	} else {
		wq->wq_timer_interval = wq->wq_timer_interval * 2;
	}
	if (wq->wq_timer_interval > wq_max_timer_interval_usecs) {
		wq->wq_timer_interval = wq_max_timer_interval_usecs;
	}
	clock_interval_to_deadline(wq->wq_timer_interval, 1000, &deadline);

	thread_call_enter_delayed(wq->wq_atimer_call, deadline);

	PTHREAD_TRACE(TRACE_wq_start_add_timer, wq, wq->wq_reqcount, wq->wq_flags, wq->wq_timer_interval, 0);
wq_thread_is_busy(uint64_t cur_ts, uint64_t *lastblocked_tsp)

	uint64_t lastblocked_ts;

	/*
	 * the timestamp is updated atomically w/o holding the workqueue lock
	 * so we need to do an atomic read of the 64 bits so that we don't see
	 * a mismatched pair of 32 bit reads... we accomplish this in an architecturally
	 * independent fashion by using OSCompareAndSwap64 to write back the
	 * value we grabbed... if it succeeds, then we have a good timestamp to
	 * evaluate... if it fails, we straddled grabbing the timestamp while it
	 * was being updated... treat a failed update as a busy thread since
	 * it implies we are about to see a really fresh timestamp anyway
	 */
	lastblocked_ts = *lastblocked_tsp;

	if ( !OSCompareAndSwap64((UInt64)lastblocked_ts, (UInt64)lastblocked_ts, lastblocked_tsp))

	if (lastblocked_ts >= cur_ts) {
		/*
		 * because the update of the timestamp when a thread blocks isn't
		 * serialized against us looking at it (i.e. we don't hold the workq lock)
		 * it's possible to have a timestamp that matches the current time or
		 * that even looks to be in the future relative to when we grabbed the current
		 * time... just treat this as a busy thread since it must have just blocked.
		 */

	elapsed = cur_ts - lastblocked_ts;

	pthread_kern->absolutetime_to_microtime(elapsed, &secs, &usecs);

	if (secs == 0 && usecs < wq_stalled_window_usecs)
#define WQ_TIMER_NEEDED(wq, start_timer) do {		\
	int oldflags = wq->wq_flags;			\
							\
	if ( !(oldflags & (WQ_EXITING | WQ_ATIMER_RUNNING))) {	\
		if (OSCompareAndSwap(oldflags, oldflags | WQ_ATIMER_RUNNING, (UInt32 *)&wq->wq_flags)) \
			start_timer = TRUE;		\
	}						\
} while (0)
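/*
 * Usage sketch (illustrative; it mirrors the call sites later in this file,
 * e.g. in workqueue_callback()):
 *
 *	boolean_t start_timer = FALSE;
 *
 *	WQ_TIMER_NEEDED(wq, start_timer);
 *	if (start_timer == TRUE)
 *		workqueue_interval_timer_start(wq);
 */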
workqueue_add_timer(struct workqueue *wq, __unused int param1)

	boolean_t start_timer = FALSE;
	boolean_t add_thread;

	PTHREAD_TRACE(TRACE_wq_add_timer | DBG_FUNC_START, wq, wq->wq_flags, wq->wq_nthreads, wq->wq_thidlecount, 0);

	workqueue_lock_spin(p);

	/*
	 * because workqueue_callback now runs w/o taking the workqueue lock
	 * we are unsynchronized w/r to a change in state of the running threads...
	 * to make sure we always evaluate that change, we allow it to start up
	 * a new timer if the current one is actively evaluating the state
	 * however, we do not need more than 2 timers fired up (1 active and 1 pending)
	 * and we certainly do not want 2 active timers evaluating the state
	 * simultaneously... so use WQL_ATIMER_BUSY to serialize the timers...
	 * note that WQL_ATIMER_BUSY is in a different flag word from WQ_ATIMER_RUNNING since
	 * it is always protected by the workq lock... WQ_ATIMER_RUNNING is evaluated
	 * and set atomically since the callback function needs to manipulate it
	 * w/o holding the workq lock...
	 *
	 * !WQ_ATIMER_RUNNING && !WQL_ATIMER_BUSY   ==   no pending timer,   no active timer
	 * !WQ_ATIMER_RUNNING &&  WQL_ATIMER_BUSY   ==   no pending timer,    1 active timer
	 *  WQ_ATIMER_RUNNING && !WQL_ATIMER_BUSY   ==    1 pending timer,   no active timer
	 *  WQ_ATIMER_RUNNING &&  WQL_ATIMER_BUSY   ==    1 pending timer,    1 active timer
	 */
	while (wq->wq_lflags & WQL_ATIMER_BUSY) {
		wq->wq_lflags |= WQL_ATIMER_WAITING;

		assert_wait((caddr_t)wq, (THREAD_UNINT));
		workqueue_unlock(p);

		thread_block(THREAD_CONTINUE_NULL);

		workqueue_lock_spin(p);

	wq->wq_lflags |= WQL_ATIMER_BUSY;

	/*
	 * the workq lock will protect us from seeing WQ_EXITING change state, but we
	 * still need to update this atomically in case someone else tries to start
	 * the timer just as we're releasing it
	 */
	while ( !(OSCompareAndSwap(wq->wq_flags, (wq->wq_flags & ~WQ_ATIMER_RUNNING), (UInt32 *)&wq->wq_flags)));

	if ( !(wq->wq_flags & WQ_EXITING)) {
		/*
		 * check to see if the stall frequency was beyond our tolerance
		 * or we have work on the queue, but haven't scheduled any
		 * new work within our acceptable time interval because
		 * there were no idle threads left to schedule
		 */
		if (wq->wq_reqcount) {
			uint32_t  thactive_count;

			for (priclass = 0; priclass < WORKQUEUE_NUM_BUCKETS; priclass++) {
				if (wq->wq_requests[priclass])

			assert(priclass < WORKQUEUE_NUM_BUCKETS);

			curtime = mach_absolute_time();

			/*
			 * check for conditions under which we would not add a thread, either
			 *   a) we've got as many running threads as we want in this priority
			 *      band and the priority bands above it
			 *
			 *   b) check to see if the priority group has blocked threads, if the
			 *      last blocked timestamp is old enough, we will have already passed
			 *      (a) where we would have stopped if we had enough active threads.
			 */
			for (i = 0; i <= priclass; i++) {

				thactive_count += wq->wq_thactive_count[i];

				if (wq->wq_thscheduled_count[i]) {
					if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i]))

			if (thactive_count + busycount < wq->wq_max_concurrency) {

				if (wq->wq_thidlecount == 0) {
					/*
					 * if we have no idle threads, try to add one
					 */
					retval = workqueue_addnewthread(wq, FALSE);

			if (wq->wq_reqcount) {
				/*
				 * as long as we have threads to schedule, and we successfully
				 * scheduled new work, keep trying
				 */
				while (wq->wq_thidlecount && !(wq->wq_flags & WQ_EXITING)) {
					/*
					 * workqueue_run_nextreq is responsible for
					 * dropping the workqueue lock in all cases
					 */
					retval = workqueue_run_nextreq(p, wq, THREAD_NULL, FALSE, FALSE, 0);
					workqueue_lock_spin(p);

					if (retval == FALSE)

				if ( !(wq->wq_flags & WQ_EXITING) && wq->wq_reqcount) {

					if (wq->wq_thidlecount == 0 && retval == TRUE && add_thread == TRUE)

					if (wq->wq_thidlecount == 0 || busycount)
						WQ_TIMER_NEEDED(wq, start_timer);

					PTHREAD_TRACE(TRACE_wq_add_timer | DBG_FUNC_NONE, wq, wq->wq_reqcount, wq->wq_thidlecount, busycount, 0);

	if ( !(wq->wq_flags & WQ_ATIMER_RUNNING))
		wq->wq_timer_interval = 0;

	wq->wq_lflags &= ~WQL_ATIMER_BUSY;

	if ((wq->wq_flags & WQ_EXITING) || (wq->wq_lflags & WQL_ATIMER_WAITING)) {
		/*
		 * wakeup the thread hung up in workqueue_exit or workqueue_add_timer waiting for this timer
		 * to finish getting out of the way
		 */
		wq->wq_lflags &= ~WQL_ATIMER_WAITING;

	PTHREAD_TRACE(TRACE_wq_add_timer | DBG_FUNC_END, wq, start_timer, wq->wq_nthreads, wq->wq_thidlecount, 0);

	workqueue_unlock(p);

	if (start_timer == TRUE)
		workqueue_interval_timer_start(wq);
_workqueue_thread_yielded(void)

	struct workqueue *wq;

	if ((wq = pthread_kern->proc_get_wqptr(p)) == NULL || wq->wq_reqcount == 0)

	workqueue_lock_spin(p);

	if (wq->wq_reqcount) {

		if (wq->wq_thread_yielded_count++ == 0)
			wq->wq_thread_yielded_timestamp = mach_absolute_time();

		if (wq->wq_thread_yielded_count < wq_yielded_threshold) {
			workqueue_unlock(p);

		PTHREAD_TRACE(TRACE_wq_thread_yielded | DBG_FUNC_START, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 0, 0);

		wq->wq_thread_yielded_count = 0;

		curtime = mach_absolute_time();
		elapsed = curtime - wq->wq_thread_yielded_timestamp;
		pthread_kern->absolutetime_to_microtime(elapsed, &secs, &usecs);

		if (secs == 0 && usecs < wq_yielded_window_usecs) {

			if (wq->wq_thidlecount == 0) {
				workqueue_addnewthread(wq, TRUE);
				/*
				 * 'workqueue_addnewthread' drops the workqueue lock
				 * when creating the new thread and then retakes it before
				 * returning... this window allows other threads to process
				 * requests, so we need to recheck for available work
				 * if none found, we just return... the newly created thread
				 * will eventually get used (if it hasn't already)...
				 */
				if (wq->wq_reqcount == 0) {
					workqueue_unlock(p);

			if (wq->wq_thidlecount) {

				boolean_t overcommit = FALSE;
				boolean_t force_oc = FALSE;

				for (priority = 0; priority < WORKQUEUE_NUM_BUCKETS; priority++) {
					if (wq->wq_requests[priority]) {

				assert(priority < WORKQUEUE_NUM_BUCKETS);

				wq->wq_requests[priority]--;

				if (wq->wq_ocrequests[priority]) {
					wq->wq_ocrequests[priority]--;

				(void)workqueue_run_nextreq(p, wq, THREAD_NULL, force_oc, overcommit, pthread_priority_from_class_index(priority));
				/*
				 * workqueue_run_nextreq is responsible for
				 * dropping the workqueue lock in all cases
				 */
				PTHREAD_TRACE(TRACE_wq_thread_yielded | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 1, 0);

		PTHREAD_TRACE(TRACE_wq_thread_yielded | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 2, 0);

	workqueue_unlock(p);
workqueue_callback(int type, thread_t thread)

	struct uthread    *uth;
	struct threadlist *tl;
	struct workqueue  *wq;

	uth = pthread_kern->get_bsdthread_info(thread);
	tl = pthread_kern->uthread_get_threadlist(uth);

	case SCHED_CALL_BLOCK: {
		uint32_t	old_activecount;
		boolean_t	start_timer = FALSE;

		old_activecount = OSAddAtomic(-1, &wq->wq_thactive_count[tl->th_priority]);

		if (old_activecount == wq->wq_reqconc[tl->th_priority]) {
			UInt64 *lastblocked_ptr;

			/*
			 * the number of active threads at this priority
			 * has fallen below the maximum number of concurrent
			 * threads that we're allowed to run
			 */
			lastblocked_ptr = (UInt64 *)&wq->wq_lastblocked_ts[tl->th_priority];
			curtime = mach_absolute_time();

			/*
			 * if we collide with another thread trying to update the last_blocked (really unlikely
			 * since another thread would have to get scheduled and then block after we start down
			 * this path), it's not a problem.  Either timestamp is adequate, so no need to retry
			 */
			OSCompareAndSwap64(*lastblocked_ptr, (UInt64)curtime, lastblocked_ptr);

			if (wq->wq_reqcount) {
				/*
				 * we have work to do so start up the timer
				 * if it's not running... we'll let it sort
				 * out whether we really need to start up
				 * another thread
				 */
				WQ_TIMER_NEEDED(wq, start_timer);

			if (start_timer == TRUE) {
				workqueue_interval_timer_start(wq);

		PTHREAD_TRACE1(TRACE_wq_thread_block | DBG_FUNC_START, wq, old_activecount, tl->th_priority, start_timer, thread_tid(thread));

	case SCHED_CALL_UNBLOCK:
		/*
		 * we cannot take the workqueue_lock here...
		 * an UNBLOCK can occur from a timer event which
		 * is run from an interrupt context... if the workqueue_lock
		 * is already held by this processor, we'll deadlock...
		 * the thread lock for the thread being UNBLOCKED
		 * is also held
		 */
		OSAddAtomic(1, &wq->wq_thactive_count[tl->th_priority]);

		PTHREAD_TRACE1(TRACE_wq_thread_block | DBG_FUNC_END, wq, wq->wq_threads_scheduled, tl->th_priority, 0, thread_tid(thread));

_workqueue_get_sched_callback(void)

	return workqueue_callback;
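/*
 * Attachment sketch (illustrative; not taken verbatim from this file): the
 * callback above is the scheduler call that gets attached to a workqueue
 * thread, mirroring the thread_sched_call() calls elsewhere in this file
 * that pass NULL to detach it, e.g.
 *
 *	pthread_kern->thread_sched_call(tl->th_thread, workqueue_callback);
 */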
workqueue_removethread(struct threadlist *tl, int fromexit)

	struct workqueue *wq;
	struct uthread * uth;

	/*
	 * If fromexit is set, the call is from workqueue_exit(),
	 * so some cleanups are to be avoided.
	 */
	TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);

	if (fromexit == 0) {
		wq->wq_thidlecount--;

	/*
	 * Clear the threadlist pointer in uthread so
	 * blocked thread on wakeup for termination will
	 * not access the thread list as it is going to be
	 * freed.
	 */
	pthread_kern->thread_sched_call(tl->th_thread, NULL);

	uth = pthread_kern->get_bsdthread_info(tl->th_thread);
	if (uth != (struct uthread *)0) {
		pthread_kern->uthread_set_threadlist(uth, NULL);

	if (fromexit == 0) {
		/* during exit the lock is not held */
		workqueue_unlock(wq->wq_proc);

	if ( (tl->th_flags & TH_LIST_SUSPENDED) ) {
		/*
		 * thread was created, but never used...
		 * need to clean up the stack and port ourselves
		 * since we're not going to spin up through the
		 * normal exit path triggered from Libc
		 */
		if (fromexit == 0) {
			/* vm map is already deallocated when this is called from exit */
			(void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, tl->th_allocsize);

		(void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(wq->wq_task), tl->th_thport);

		PTHREAD_TRACE1(TRACE_wq_thread_suspend | DBG_FUNC_END, wq, (uintptr_t)thread_tid(current_thread()), wq->wq_nthreads, 0xdead, thread_tid(tl->th_thread));

		PTHREAD_TRACE1(TRACE_wq_thread_park | DBG_FUNC_END, wq, (uintptr_t)thread_tid(current_thread()), wq->wq_nthreads, 0xdead, thread_tid(tl->th_thread));

	/*
	 * drop our ref on the thread
	 */
	thread_deallocate(tl->th_thread);

	kfree(tl, sizeof(struct threadlist));
/*
 * called with workq lock held
 * dropped and retaken around thread creation
 * return with workq lock held
 */
workqueue_addnewthread(struct workqueue *wq, boolean_t oc_thread)

	struct threadlist *tl;
	struct uthread	*uth;
	mach_vm_offset_t stackaddr;
	mach_vm_size_t guardsize;

	if ((wq->wq_flags & WQ_EXITING) == WQ_EXITING)

	if (wq->wq_nthreads >= wq_max_threads || wq->wq_nthreads >= (pthread_kern->config_thread_max - 20)) {
		wq->wq_lflags |= WQL_EXCEEDED_TOTAL_THREAD_LIMIT;

	wq->wq_lflags &= ~WQL_EXCEEDED_TOTAL_THREAD_LIMIT;

	if (oc_thread == FALSE && wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
		/*
		 * if we're not creating this thread to service an overcommit request,
		 * then check the size of the constrained thread pool...  if we've already
		 * reached our max for threads scheduled from this pool, don't create a new
		 * one... the callers of this function are prepared for failure.
		 */
		wq->wq_lflags |= WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT;

	if (wq->wq_constrained_threads_scheduled < wq_max_constrained_threads)
		wq->wq_lflags &= ~WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT;

	workqueue_unlock(p);

	kret = pthread_kern->thread_create_workq(wq->wq_task, (thread_continue_t)wq_unsuspend_continue, &th);
	if (kret != KERN_SUCCESS) {

	tl = kalloc(sizeof(struct threadlist));
	bzero(tl, sizeof(struct threadlist));

#if defined(__i386__) || defined(__x86_64__)
	stackaddr = 0xB0000000;
#error Need to define a stack address hint for this architecture

	guardsize = vm_map_page_size(wq->wq_map);
	tl->th_allocsize = PTH_DEFAULT_STACKSIZE + guardsize + pthread_kern->proc_get_pthsize(p);

	kret = mach_vm_map(wq->wq_map, &stackaddr,
			VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE, NULL,
			0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
			VM_INHERIT_DEFAULT);

	if (kret != KERN_SUCCESS) {
		kret = mach_vm_allocate(wq->wq_map,
				&stackaddr, tl->th_allocsize,
				VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE);

	if (kret == KERN_SUCCESS) {
		/*
		 * The guard page is at the lowest address
		 * The stack base is the highest address
		 */
		kret = mach_vm_protect(wq->wq_map, stackaddr, guardsize, FALSE, VM_PROT_NONE);

		if (kret != KERN_SUCCESS)
			(void) mach_vm_deallocate(wq->wq_map, stackaddr, tl->th_allocsize);

	if (kret != KERN_SUCCESS) {
		(void) thread_terminate(th);
		thread_deallocate(th);

		kfree(tl, sizeof(struct threadlist));

	thread_reference(th);

	sright = (void *)pthread_kern->convert_thread_to_port(th);
	tl->th_thport = pthread_kern->ipc_port_copyout_send(sright, pthread_kern->task_get_ipcspace(wq->wq_task));

	pthread_kern->thread_static_param(th, TRUE);

	tl->th_flags = TH_LIST_INITED | TH_LIST_SUSPENDED;

	tl->th_stackaddr = stackaddr;
	tl->th_priority = WORKQUEUE_NUM_BUCKETS;

	uth = pthread_kern->get_bsdthread_info(tl->th_thread);

	workqueue_lock_spin(p);

	pthread_kern->uthread_set_threadlist(uth, tl);
	TAILQ_INSERT_TAIL(&wq->wq_thidlelist, tl, th_entry);

	wq->wq_thidlecount++;

	PTHREAD_TRACE1(TRACE_wq_thread_suspend | DBG_FUNC_START, wq, wq->wq_nthreads, 0, thread_tid(current_thread()), thread_tid(tl->th_thread));

	workqueue_lock_spin(p);
_workq_open(struct proc *p, __unused int32_t *retval)

	struct workqueue * wq;
	boolean_t need_wakeup = FALSE;

	if (pthread_kern->proc_get_register(p) == 0) {

	num_cpus = pthread_kern->ml_get_max_cpus();

	if (wq_init_constrained_limit) {
		/*
		 * set up the limit for the constrained pool
		 * this is a virtual pool in that we don't
		 * maintain it on a separate idle and run list
		 */
		limit = num_cpus * WORKQUEUE_CONSTRAINED_FACTOR;

		if (limit > wq_max_constrained_threads)
			wq_max_constrained_threads = limit;

		wq_init_constrained_limit = 0;

	workqueue_lock_spin(p);

	if (pthread_kern->proc_get_wqptr(p) == NULL) {

		while (*pthread_kern->proc_get_wqinitingptr(p) == TRUE) {

			assert_wait((caddr_t)pthread_kern->proc_get_wqinitingptr(p), THREAD_UNINT);
			workqueue_unlock(p);

			thread_block(THREAD_CONTINUE_NULL);

			workqueue_lock_spin(p);

		if (pthread_kern->proc_get_wqptr(p) != NULL) {

		*(pthread_kern->proc_get_wqinitingptr(p)) = TRUE;

		workqueue_unlock(p);

		wq_size = sizeof(struct workqueue);

		ptr = (char *)kalloc(wq_size);
		bzero(ptr, wq_size);

		wq = (struct workqueue *)ptr;
		wq->wq_flags = WQ_LIST_INITED;

		wq->wq_max_concurrency = num_cpus;
		wq->wq_task = current_task();
		wq->wq_map  = pthread_kern->current_map();

		for (i = 0; i < WORKQUEUE_NUM_BUCKETS; i++)
			wq->wq_reqconc[i] = (uint16_t)wq->wq_max_concurrency;

		TAILQ_INIT(&wq->wq_thrunlist);
		TAILQ_INIT(&wq->wq_thidlelist);

		wq->wq_atimer_call = thread_call_allocate((thread_call_func_t)workqueue_add_timer, (thread_call_param_t)wq);

		workqueue_lock_spin(p);

		pthread_kern->proc_set_wqptr(p, wq);
		pthread_kern->proc_set_wqsize(p, wq_size);

		*(pthread_kern->proc_get_wqinitingptr(p)) = FALSE;

	workqueue_unlock(p);

	if (need_wakeup == TRUE) {
		wakeup(pthread_kern->proc_get_wqinitingptr(p));
_workq_kernreturn(struct proc *p,
		  __unused user_addr_t item,
		  __unused int32_t *retval)

	struct workqueue *wq;

	if (pthread_kern->proc_get_register(p) == 0) {

	case WQOPS_QUEUE_NEWSPISUPP: {
		/*
		 * arg2 = offset of serialno into dispatch queue
		 */
		pthread_kern->proc_set_dispatchqueue_serialno_offset(p, (uint64_t)offset);

	case WQOPS_QUEUE_REQTHREADS: {
		/*
		 * arg2 = number of threads to start
		 */
		boolean_t overcommit = FALSE;
		int reqcount = arg2;
		pthread_priority_t priority = arg3;

		overcommit = (_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0;
		class = pthread_priority_get_class_index(priority);

		if ((reqcount <= 0) || (class < 0) || (class >= WORKQUEUE_NUM_BUCKETS)) {

		workqueue_lock_spin(p);

		if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
			workqueue_unlock(p);

		wq->wq_reqcount += reqcount;
		wq->wq_requests[class] += reqcount;

		PTHREAD_TRACE(TRACE_wq_req_threads | DBG_FUNC_NONE, wq, priority, wq->wq_requests[class], reqcount, 0);

		while (wq->wq_reqcount) {
			if (!workqueue_run_one(p, wq, overcommit, priority))

		PTHREAD_TRACE(TRACE_wq_req_octhreads | DBG_FUNC_NONE, wq, priority, wq->wq_requests[class], reqcount, 0);

			if (!workqueue_run_one(p, wq, overcommit, priority))

			/*
			 * we need to delay starting some of the overcommit requests...
			 * we should only fail to create the overcommit threads if
			 * we're at the max thread limit... as existing threads
			 * return to the kernel, we'll notice the ocrequests
			 * and spin them back to user space as the overcommit variety
			 */
			wq->wq_reqcount += reqcount;
			wq->wq_requests[class] += reqcount;
			wq->wq_ocrequests[class] += reqcount;

			PTHREAD_TRACE(TRACE_wq_delay_octhreads | DBG_FUNC_NONE, wq, priority, wq->wq_requests[class], reqcount, 0);

		workqueue_unlock(p);

	case WQOPS_THREAD_RETURN: {
		thread_t th = current_thread();
		struct uthread *uth = pthread_kern->get_bsdthread_info(th);
		struct threadlist *tl = util_get_thread_threadlist_entry(th);

		/* reset signal mask on the workqueue thread to default state */
		if (pthread_kern->uthread_get_sigmask(uth) != (sigset_t)(~workq_threadmask)) {
			pthread_kern->proc_lock(p);
			pthread_kern->uthread_set_sigmask(uth, ~workq_threadmask);
			pthread_kern->proc_unlock(p);

		/* dropping WQ override counts has to be done outside the wq lock. */
		wq_thread_override_reset(th, THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD);

		workqueue_lock_spin(p);

		if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL || !tl) {
			workqueue_unlock(p);

		PTHREAD_TRACE(TRACE_wq_runitem | DBG_FUNC_END, wq, 0, 0, 0, 0);

		(void)workqueue_run_nextreq(p, wq, th, FALSE, FALSE, 0);
		/*
		 * workqueue_run_nextreq is responsible for
		 * dropping the workqueue lock in all cases
		 */
/*
 * Routine:	workqueue_mark_exiting
 *
 * Function:	Mark the work queue such that new threads will not be added to the
 *		work queue after we return.
 *
 * Conditions:	Called against the current process.
 */
_workqueue_mark_exiting(struct proc *p)

	struct workqueue *wq = pthread_kern->proc_get_wqptr(p);

		PTHREAD_TRACE(TRACE_wq_pthread_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);

		workqueue_lock_spin(p);

		/*
		 * we now arm the timer in the callback function w/o holding the workq lock...
		 * we do this by setting WQ_ATIMER_RUNNING via OSCompareAndSwap in order to
		 * ensure only a single timer is running and to notice that WQ_EXITING has
		 * been set (we don't want to start a timer once WQ_EXITING is posted)
		 *
		 * so once we have successfully set WQ_EXITING, we cannot fire up a new timer...
		 * therefore no need to clear the timer state atomically from the flags
		 *
		 * since we always hold the workq lock when dropping WQ_ATIMER_RUNNING
		 * the check for and sleep until clear is protected
		 */
		while (!(OSCompareAndSwap(wq->wq_flags, (wq->wq_flags | WQ_EXITING), (UInt32 *)&wq->wq_flags)));

		if (wq->wq_flags & WQ_ATIMER_RUNNING) {
			if (thread_call_cancel(wq->wq_atimer_call) == TRUE) {
				wq->wq_flags &= ~WQ_ATIMER_RUNNING;

		while ((wq->wq_flags & WQ_ATIMER_RUNNING) || (wq->wq_lflags & WQL_ATIMER_BUSY)) {
			assert_wait((caddr_t)wq, (THREAD_UNINT));
			workqueue_unlock(p);

			thread_block(THREAD_CONTINUE_NULL);

			workqueue_lock_spin(p);

		workqueue_unlock(p);

		PTHREAD_TRACE(TRACE_wq_pthread_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
/*
 * Routine:	workqueue_exit
 *
 * Function:	clean up the work queue structure(s) now that there are no threads
 *		left running inside the work queue (except possibly current_thread).
 *
 * Conditions:	Called by the last thread in the process.
 *		Called against current process.
 */
_workqueue_exit(struct proc *p)

	struct workqueue  * wq;
	struct threadlist * tl, *tlist;
	struct uthread	  *uth;

	wq = pthread_kern->proc_get_wqptr(p);

		PTHREAD_TRACE(TRACE_wq_workqueue_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);

		wq_size = pthread_kern->proc_get_wqsize(p);
		pthread_kern->proc_set_wqptr(p, NULL);
		pthread_kern->proc_set_wqsize(p, 0);

		/*
		 * Clean up workqueue data structures for threads that exited and
		 * didn't get a chance to clean up after themselves.
		 */
		TAILQ_FOREACH_SAFE(tl, &wq->wq_thrunlist, th_entry, tlist) {
			pthread_kern->thread_sched_call(tl->th_thread, NULL);

			uth = pthread_kern->get_bsdthread_info(tl->th_thread);
			if (uth != (struct uthread *)0) {
				pthread_kern->uthread_set_threadlist(uth, NULL);

			TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);

			/*
			 * drop our last ref on the thread
			 */
			thread_deallocate(tl->th_thread);

			kfree(tl, sizeof(struct threadlist));

		TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlelist, th_entry, tlist) {
			workqueue_removethread(tl, 1);

		thread_call_free(wq->wq_atimer_call);

		PTHREAD_TRACE(TRACE_wq_workqueue_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
workqueue_run_one(proc_t p, struct workqueue *wq, boolean_t overcommit, pthread_priority_t priority)

	if (wq->wq_thidlecount == 0) {
		if (overcommit == FALSE) {
			if (wq->wq_constrained_threads_scheduled < wq->wq_max_concurrency)
				workqueue_addnewthread(wq, overcommit);
		} else {
			workqueue_addnewthread(wq, overcommit);
		}
		if (wq->wq_thidlecount == 0)

	ran_one = workqueue_run_nextreq(p, wq, THREAD_NULL, FALSE, overcommit, priority);
	/*
	 * workqueue_run_nextreq is responsible for
	 * dropping the workqueue lock in all cases
	 */
	workqueue_lock_spin(p);
1892 * workqueue_run_nextreq:
1893 * called with the workqueue lock held...
1894 * responsible for dropping it in all cases
1897 workqueue_run_nextreq(proc_t p
, struct workqueue
*wq
, thread_t thread
,
1898 boolean_t force_oc
, boolean_t overcommit
, pthread_priority_t oc_prio
)
1900 thread_t th_to_run
= THREAD_NULL
;
1901 thread_t th_to_park
= THREAD_NULL
;
1902 int wake_thread
= 0;
1903 int reuse_thread
= WQ_FLAG_THREAD_REUSE
;
1904 uint32_t priclass
, orig_class
;
1905 uint32_t us_to_wait
;
1906 struct threadlist
*tl
= NULL
;
1907 struct uthread
*uth
= NULL
;
1908 boolean_t start_timer
= FALSE
;
1909 boolean_t adjust_counters
= TRUE
;
1911 uint32_t thactive_count
;
1914 PTHREAD_TRACE(TRACE_wq_run_nextitem
|DBG_FUNC_START
, wq
, thread
, wq
->wq_thidlecount
, wq
->wq_reqcount
, 0);
1916 if (thread
!= THREAD_NULL
) {
1917 uth
= pthread_kern
->get_bsdthread_info(thread
);
1919 if ((tl
= pthread_kern
->uthread_get_threadlist(uth
)) == NULL
) {
1920 panic("wq thread with no threadlist");
1925 * from here until we drop the workq lock
1926 * we can't be pre-empted since we hold
1927 * the lock in spin mode... this is important
1928 * since we have to independently update the priority that
1929 * the thread is associated with and the priorty based
1930 * counters that "workqueue_callback" also changes and bases
1933 dispatch_overcommit
:
1935 if (overcommit
|| force_oc
) {
1936 priclass
= pthread_priority_get_class_index(oc_prio
);
1938 if (thread
!= THREAD_NULL
) {
1942 goto grab_idle_thread
;
1944 if (wq
->wq_reqcount
) {
1945 for (priclass
= 0; priclass
< WORKQUEUE_NUM_BUCKETS
; priclass
++) {
1946 if (wq
->wq_requests
[priclass
])
1949 assert(priclass
< WORKQUEUE_NUM_BUCKETS
);
1951 if (wq
->wq_ocrequests
[priclass
] && (thread
!= THREAD_NULL
|| wq
->wq_thidlecount
)) {
1953 * handle delayed overcommit request...
1954 * they have priority over normal requests
1955 * within a given priority level
1958 wq
->wq_requests
[priclass
]--;
1959 wq
->wq_ocrequests
[priclass
]--;
1961 oc_prio
= pthread_priority_from_class_index(priclass
);
1964 goto dispatch_overcommit
;
	/*
	 * if we get here, the work should be handled by a constrained thread
	 */
	if (wq->wq_reqcount == 0 || wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
		/*
		 * no work to do, or we're already at or over the scheduling limit for
		 * constrained threads... just return or park the thread...
		 * do not start the timer for this condition... if we don't have any work,
		 * we'll check again when new work arrives... if we're over the limit, we need 1 or more
		 * constrained threads to return to the kernel before we can dispatch additional work
		 */
		if ((th_to_park = thread) == THREAD_NULL)
			goto out_of_work;
		goto parkit;
	}

	thactive_count = 0;
	busycount = 0;

	curtime = mach_absolute_time();

	thactive_count += wq->wq_thactive_count[priclass];

	if (wq->wq_thscheduled_count[priclass]) {
		if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[priclass])) {
			busycount++;
		}
	}
	if (thread != THREAD_NULL) {
		if (tl->th_priority == priclass) {
			/*
			 * don't count this thread as currently active
			 */
			thactive_count--;
		}
	}
	if (thactive_count + busycount >= wq->wq_max_concurrency) {
		if (busycount) {
			/*
			 * we found at least 1 thread in the
			 * 'busy' state... make sure we start
			 * the timer because if they are the only
			 * threads keeping us from scheduling
			 * this work request, we won't get a callback
			 * to kick off the timer... we need to
			 * start it now...
			 */
			WQ_TIMER_NEEDED(wq, start_timer);
		}

		PTHREAD_TRACE(TRACE_wq_overcommitted|DBG_FUNC_NONE, wq, (start_timer ? 1<<7 : 0) | pthread_priority_from_class_index(priclass), thactive_count, busycount, 0);

		if ((th_to_park = thread) == THREAD_NULL) {
			goto out_of_work;
		}
		goto parkit;
	}
	if (thread != THREAD_NULL) {
		/*
		 * thread is non-NULL here when we return from userspace
		 * in workq_kernreturn, rather than trying to find a thread
		 * we pick up new work for this specific thread.
		 */
		th_to_run = thread;
		goto pick_up_work;
	}

grab_idle_thread:
	if (wq->wq_thidlecount == 0) {
		/*
		 * we have no additional threads waiting to pick up
		 * work, however, there is additional work to do.
		 */
		WQ_TIMER_NEEDED(wq, start_timer);

		PTHREAD_TRACE(TRACE_wq_stalled, wq, wq->wq_nthreads, start_timer, 0, 0);

		goto no_thread_to_run;
	}

	/*
	 * we already know there is both work available
	 * and an idle thread, so activate a thread and then
	 * fall into the code that pulls a new work request...
	 */
	tl = TAILQ_FIRST(&wq->wq_thidlelist);
	TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
	wq->wq_thidlecount--;

	TAILQ_INSERT_TAIL(&wq->wq_thrunlist, tl, th_entry);

	if ((tl->th_flags & TH_LIST_SUSPENDED) == TH_LIST_SUSPENDED) {
		tl->th_flags &= ~TH_LIST_SUSPENDED;
		reuse_thread = 0;

	} else if ((tl->th_flags & TH_LIST_BLOCKED) == TH_LIST_BLOCKED) {
		tl->th_flags &= ~TH_LIST_BLOCKED;
		wake_thread = 1;
	}
	tl->th_flags |= TH_LIST_RUNNING | TH_LIST_BUSY;

	wq->wq_threads_scheduled++;
	wq->wq_thscheduled_count[priclass]++;
	OSAddAtomic(1, &wq->wq_thactive_count[priclass]);

	adjust_counters = FALSE;
	th_to_run = tl->th_thread;
pick_up_work:
	if (!overcommit && !force_oc) {
		wq->wq_reqcount--;
		wq->wq_requests[priclass]--;

		if ( !(tl->th_flags & TH_LIST_CONSTRAINED)) {
			wq->wq_constrained_threads_scheduled++;
			tl->th_flags |= TH_LIST_CONSTRAINED;
		}
	} else {
		if (tl->th_flags & TH_LIST_CONSTRAINED) {
			wq->wq_constrained_threads_scheduled--;
			tl->th_flags &= ~TH_LIST_CONSTRAINED;
		}
	}

	orig_class = tl->th_priority;
	tl->th_priority = (uint8_t)priclass;

	if (adjust_counters && (orig_class != priclass)) {
		/*
		 * we need to adjust these counters based on this
		 * thread's new disposition w/r to priority
		 */
		OSAddAtomic(-1, &wq->wq_thactive_count[orig_class]);
		OSAddAtomic(1, &wq->wq_thactive_count[priclass]);

		wq->wq_thscheduled_count[orig_class]--;
		wq->wq_thscheduled_count[priclass]++;
	}
	wq->wq_thread_yielded_count = 0;

	workqueue_unlock(p);

	if (orig_class != priclass) {
		pthread_priority_t pri = pthread_priority_from_class_index(priclass);

		thread_qos_policy_data_t qosinfo;

		/* Set the QoS tier on the thread, along with the ceiling of max importance for this class. */
		qosinfo.qos_tier = pthread_priority_get_qos_class(pri);
		qosinfo.tier_importance = 0;

		PTHREAD_TRACE(TRACE_wq_reset_priority | DBG_FUNC_START, wq, thread_tid(tl->th_thread), pthread_priority_from_class_index(orig_class), 0, 0);

		/* All the previous implementation here now boils down to setting the QoS policy on the thread. */
		pthread_kern->thread_policy_set_internal(th_to_run, THREAD_QOS_POLICY, (thread_policy_t)&qosinfo, THREAD_QOS_POLICY_COUNT);

		PTHREAD_TRACE(TRACE_wq_reset_priority | DBG_FUNC_END, wq, thread_tid(tl->th_thread), pthread_priority_from_class_index(priclass), qosinfo.qos_tier, 0);
	}

	/*
	 * if current thread is reused for work request, does not return via unix_syscall
	 */
	wq_runreq(p, overcommit, pthread_priority_from_class_index(priclass), th_to_run, tl, reuse_thread, wake_thread, (thread == th_to_run));

	PTHREAD_TRACE(TRACE_wq_run_nextitem|DBG_FUNC_END, wq, thread_tid(th_to_run), overcommit, 1, 0);

	return (TRUE);
out_of_work:
	/*
	 * we have no work to do or we are fully booked
	 * w/r to running threads...
	 */
no_thread_to_run:
	workqueue_unlock(p);

	if (start_timer)
		workqueue_interval_timer_start(wq);

	PTHREAD_TRACE(TRACE_wq_run_nextitem|DBG_FUNC_END, wq, thread_tid(thread), start_timer, 2, 0);

	return (FALSE);
parkit:
	/*
	 * this is a workqueue thread with no more
	 * work to do... park it for now
	 */
	TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
	tl->th_flags &= ~TH_LIST_RUNNING;

	tl->th_flags |= TH_LIST_BLOCKED;
	TAILQ_INSERT_HEAD(&wq->wq_thidlelist, tl, th_entry);

	pthread_kern->thread_sched_call(th_to_park, NULL);

	OSAddAtomic(-1, &wq->wq_thactive_count[tl->th_priority]);
	wq->wq_thscheduled_count[tl->th_priority]--;
	wq->wq_threads_scheduled--;

	if (tl->th_flags & TH_LIST_CONSTRAINED) {
		wq->wq_constrained_threads_scheduled--;
		wq->wq_lflags &= ~WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
		tl->th_flags &= ~TH_LIST_CONSTRAINED;
	}
	if (wq->wq_thidlecount < 100)
		us_to_wait = wq_reduce_pool_window_usecs - (wq->wq_thidlecount * (wq_reduce_pool_window_usecs / 100));
	else
		us_to_wait = wq_reduce_pool_window_usecs / 100;

	wq->wq_thidlecount++;
	wq->wq_lflags &= ~WQL_EXCEEDED_TOTAL_THREAD_LIMIT;

	assert_wait_timeout_with_leeway((caddr_t)tl, (THREAD_INTERRUPTIBLE),
			TIMEOUT_URGENCY_SYS_BACKGROUND|TIMEOUT_URGENCY_LEEWAY, us_to_wait,
			wq_reduce_pool_window_usecs, NSEC_PER_USEC);

	workqueue_unlock(p);

	if (start_timer)
		workqueue_interval_timer_start(wq);

	PTHREAD_TRACE1(TRACE_wq_thread_park | DBG_FUNC_START, wq, wq->wq_threads_scheduled, wq->wq_thidlecount, us_to_wait, thread_tid(th_to_park));
	PTHREAD_TRACE(TRACE_wq_run_nextitem | DBG_FUNC_END, wq, thread_tid(thread), 0, 3, 0);

	thread_block((thread_continue_t)wq_unpark_continue);
	/* NOT REACHED */

	return (FALSE);
}
static void
wq_unsuspend_continue(void)
{
	struct uthread *uth = NULL;
	thread_t th_to_unsuspend;
	struct threadlist *tl;
	proc_t p;

	th_to_unsuspend = current_thread();
	uth = pthread_kern->get_bsdthread_info(th_to_unsuspend);

	if (uth != NULL && (tl = pthread_kern->uthread_get_threadlist(uth)) != NULL) {

		if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) {
			/*
			 * most likely a normal resume of this thread occurred...
			 * it's also possible that the thread was aborted after we
			 * finished setting it up so that it could be dispatched... if
			 * so, thread_bootstrap_return will notice the abort and put
			 * the thread on the path to self-destruction
			 */
normal_resume_to_user:
			pthread_kern->thread_sched_call(th_to_unsuspend, workqueue_callback);
			pthread_kern->thread_bootstrap_return();
		}
		/*
		 * if we get here, it's because we've been resumed due to
		 * an abort of this thread (process is crashing)
		 */
		p = current_proc();

		workqueue_lock_spin(p);

		if (tl->th_flags & TH_LIST_SUSPENDED) {
			/*
			 * thread has been aborted while still on our idle
			 * queue... remove it from our domain...
			 * workqueue_removethread consumes the lock
			 */
			workqueue_removethread(tl, 0);
			pthread_kern->thread_bootstrap_return();
		}
		while ((tl->th_flags & TH_LIST_BUSY)) {
			/*
			 * this thread was aborted after we started making
			 * it runnable, but before we finished dispatching it...
			 * we need to wait for that process to finish,
			 * and we need to ask for a wakeup instead of a
			 * thread_resume since the abort has already resumed us
			 */
			tl->th_flags |= TH_LIST_NEED_WAKEUP;

			assert_wait((caddr_t)tl, (THREAD_UNINT));

			workqueue_unlock(p);
			thread_block(THREAD_CONTINUE_NULL);
			workqueue_lock_spin(p);
		}
		workqueue_unlock(p);
		/*
		 * we have finished setting up the thread's context...
		 * thread_bootstrap_return will take us through the abort path
		 * where the thread will self destruct
		 */
		goto normal_resume_to_user;
	}
	pthread_kern->thread_bootstrap_return();
}
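
/*
 * wq_unpark_continue:
 *   continuation for an idle (parked) workqueue thread.  A normal wakeup
 *   means new work was assigned, so the scheduler callback is restored and
 *   the thread returns to user space; if the park timeout fired and the
 *   thread is still on the idle list, it is removed and allowed to
 *   self-destruct.
 */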
static void
wq_unpark_continue(void)
{
	struct uthread *uth = NULL;
	struct threadlist *tl;
	thread_t th_to_unpark;
	proc_t p;

	th_to_unpark = current_thread();
	uth = pthread_kern->get_bsdthread_info(th_to_unpark);

	if ((tl = pthread_kern->uthread_get_threadlist(uth)) != NULL) {

		if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) {
			/*
			 * a normal wakeup of this thread occurred... no need
			 * for any synchronization with the timer and wq_runreq
			 */
normal_return_to_user:
			pthread_kern->thread_sched_call(th_to_unpark, workqueue_callback);

			PTHREAD_TRACE(0xefffd018 | DBG_FUNC_END, tl->th_workq, 0, 0, 0, 0);

			pthread_kern->thread_exception_return();
		}
		p = current_proc();

		workqueue_lock_spin(p);

		if ( !(tl->th_flags & TH_LIST_RUNNING)) {
			/*
			 * the timer popped us out and we've not
			 * been moved off of the idle list
			 * so we should now self-destruct
			 *
			 * workqueue_removethread consumes the lock
			 */
			workqueue_removethread(tl, 0);
			pthread_kern->thread_exception_return();
		}
		/*
		 * the timer woke us up, but we have already
		 * started to make this a runnable thread,
		 * but have not yet finished that process...
		 * so wait for the normal wakeup
		 */
		while ((tl->th_flags & TH_LIST_BUSY)) {

			assert_wait((caddr_t)tl, (THREAD_UNINT));

			workqueue_unlock(p);

			thread_block(THREAD_CONTINUE_NULL);

			workqueue_lock_spin(p);
		}
		/*
		 * we have finished setting up the thread's context
		 * now we can return as if we got a normal wakeup
		 */
		workqueue_unlock(p);

		goto normal_return_to_user;
	}
	pthread_kern->thread_exception_return();
}
static void
wq_runreq(proc_t p, boolean_t overcommit, pthread_priority_t priority, thread_t th, struct threadlist *tl,
	  int reuse_thread, int wake_thread, int return_directly)
{
	int ret = 0;
	boolean_t need_resume = FALSE;

	PTHREAD_TRACE1(TRACE_wq_runitem | DBG_FUNC_START, tl->th_workq, overcommit, priority, thread_tid(current_thread()), thread_tid(th));

	ret = _setup_wqthread(p, th, overcommit, priority, reuse_thread, tl);

	if (ret != 0)
		panic("setup_wqthread failed %x\n", ret);

	if (return_directly) {
		PTHREAD_TRACE(TRACE_wq_run_nextitem|DBG_FUNC_END, tl->th_workq, 0, 0, 4, 0);

		pthread_kern->thread_exception_return();
		panic("wq_runreq: thread_exception_return returned ...\n");
	}
	if (wake_thread) {
		workqueue_lock_spin(p);

		tl->th_flags &= ~TH_LIST_BUSY;
		wakeup(tl);

		workqueue_unlock(p);
	} else {
		PTHREAD_TRACE1(TRACE_wq_thread_suspend | DBG_FUNC_END, tl->th_workq, 0, 0, thread_tid(current_thread()), thread_tid(th));

		workqueue_lock_spin(p);

		if (tl->th_flags & TH_LIST_NEED_WAKEUP) {
			wakeup(tl);
		} else {
			need_resume = TRUE;
		}
		tl->th_flags &= ~(TH_LIST_BUSY | TH_LIST_NEED_WAKEUP);

		workqueue_unlock(p);

		if (need_resume) {
			/*
			 * need to do this outside of the workqueue spin lock
			 * since thread_resume locks the thread via a full mutex
			 */
			pthread_kern->thread_resume(th);
		}
	}
}
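
/*
 * _setup_wqthread:
 *   populate the initial user-space register state for a new or reused
 *   workqueue thread.  The entry point is the process's registered
 *   wqthread trampoline; the stack top, the thread's kernel port, the
 *   per-thread stack base and the flags word (reuse/overcommit/QoS bits)
 *   are passed in the argument registers, and the stack pointer is
 *   aligned per the 32-bit or 64-bit C ABI.
 */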
int
_setup_wqthread(proc_t p, thread_t th, boolean_t overcommit, pthread_priority_t priority, int reuse_thread, struct threadlist *tl)
{
	uint32_t flags = reuse_thread | WQ_FLAG_THREAD_NEWSPI;
	mach_vm_size_t guardsize = vm_map_page_size(tl->th_workq->wq_map);
	int error = 0;

	if (overcommit) {
		flags |= WQ_FLAG_THREAD_OVERCOMMIT;
	}

	/* Put the QoS class value into the lower bits of the reuse_thread register, this is where
	 * the thread priority used to be stored anyway.
	 */
	flags |= (_pthread_priority_get_qos_newest(priority) & WQ_FLAG_THREAD_PRIOMASK);

#if defined(__i386__) || defined(__x86_64__)
	int isLP64 = proc_is64bit(p);

	/*
	 * Set up i386 registers & function call.
	 */
	if (isLP64 == 0) {
		x86_thread_state32_t state;
		x86_thread_state32_t *ts = &state;

		ts->eip = (unsigned int)pthread_kern->proc_get_wqthread(p);
		ts->eax = (unsigned int)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize);
		ts->ebx = (unsigned int)tl->th_thport;
		ts->ecx = (unsigned int)(tl->th_stackaddr + guardsize);
		ts->edx = (unsigned int)0;
		ts->edi = (unsigned int)flags;
		ts->esi = (unsigned int)0;

		ts->esp = (int)((vm_offset_t)((tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize) - C_32_STK_ALIGN));

		(void)pthread_kern->thread_set_wq_state32(th, (thread_state_t)ts);

	} else {
		x86_thread_state64_t state64;
		x86_thread_state64_t *ts64 = &state64;

		ts64->rip = (uint64_t)pthread_kern->proc_get_wqthread(p);
		ts64->rdi = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize);
		ts64->rsi = (uint64_t)(tl->th_thport);
		ts64->rdx = (uint64_t)(tl->th_stackaddr + guardsize);
		ts64->rcx = (uint64_t)0;
		ts64->r8 = (uint64_t)flags;
		ts64->r9 = (uint64_t)0;

		/*
		 * set stack pointer aligned to 16 byte boundary
		 */
		ts64->rsp = (uint64_t)((tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize) - C_64_REDZONE_LEN);

		error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)ts64);
		if (error != KERN_SUCCESS) {
			error = EINVAL;
		}
	}
#else
#error setup_wqthread not defined for this architecture
#endif

	return error;
}
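
/*
 * _fill_procworkqueue:
 *   snapshot per-process workqueue statistics (thread counts, active vs.
 *   blocked threads, and limit-exceeded state) under the workqueue lock,
 *   for reporting through proc_info.
 */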
int
_fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
{
	struct workqueue * wq;
	int error = 0;
	int activecount;
	uint32_t pri;

	workqueue_lock_spin(p);
	if ((wq = pthread_kern->proc_get_wqptr(p)) == NULL) {
		error = EINVAL;
		goto out;
	}
	activecount = 0;

	for (pri = 0; pri < WORKQUEUE_NUM_BUCKETS; pri++) {
		activecount += wq->wq_thactive_count[pri];
	}
	pwqinfo->pwq_nthreads = wq->wq_nthreads;
	pwqinfo->pwq_runthreads = activecount;
	pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
	pwqinfo->pwq_state = 0;

	if (wq->wq_lflags & WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT) {
		pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
	}

	if (wq->wq_lflags & WQL_EXCEEDED_TOTAL_THREAD_LIMIT) {
		pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
	}
out:
	workqueue_unlock(p);

	return (error);
}
int
_thread_selfid(__unused struct proc *p, uint64_t *retval)
{
	thread_t thread = current_thread();
	*retval = thread_tid(thread);
	return KERN_SUCCESS;
}
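
/*
 * One-time initialization for the pthread kext: allocate the lock group
 * and attributes, set up the kernel task's workqueue lock and the global
 * psynch hash and cleanup thread call, then register the workqueue and
 * pthread debug sysctls.
 */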
	pthread_lck_grp_attr = lck_grp_attr_alloc_init();
	pthread_lck_grp = lck_grp_alloc_init("pthread", pthread_lck_grp_attr);

	/*
	 * allocate the lock attribute for pthread synchronizers
	 */
	pthread_lck_attr = lck_attr_alloc_init();

	_workqueue_init_lock((proc_t)get_bsdtask_info(kernel_task));
	pthread_list_mlock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);

	pth_global_hashinit();
	psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL);

	sysctl_register_oid(&sysctl__kern_wq_yielded_threshold);
	sysctl_register_oid(&sysctl__kern_wq_yielded_window_usecs);
	sysctl_register_oid(&sysctl__kern_wq_stalled_window_usecs);
	sysctl_register_oid(&sysctl__kern_wq_reduce_pool_window_usecs);
	sysctl_register_oid(&sysctl__kern_wq_max_timer_interval_usecs);
	sysctl_register_oid(&sysctl__kern_wq_max_threads);
	sysctl_register_oid(&sysctl__kern_wq_max_constrained_threads);
	sysctl_register_oid(&sysctl__kern_pthread_debug_tracing);