/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995-2005 Apple Computer, Inc. All Rights Reserved */
#define _PTHREAD_CONDATTR_T
#define _PTHREAD_COND_T
#define _PTHREAD_MUTEXATTR_T
#define _PTHREAD_MUTEX_T
#define _PTHREAD_RWLOCKATTR_T
#define _PTHREAD_RWLOCK_T

#undef pthread_mutexattr_t
#undef pthread_mutex_t
#undef pthread_condattr_t
#undef pthread_rwlockattr_t
#undef pthread_rwlock_t

#include <sys/param.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
//#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/timeb.h>
#include <sys/times.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/kdebug.h>
//#include <sys/sysproto.h>
#include <sys/user.h>		/* for coredump */
#include <sys/proc_info.h>	/* for fill_procworkqueue */

#include <mach/mach_port.h>
#include <mach/mach_types.h>
#include <mach/semaphore.h>
#include <mach/sync_policy.h>
#include <mach/task.h>
#include <mach/vm_prot.h>
#include <kern/kern_types.h>
#include <kern/task.h>
#include <kern/clock.h>
#include <mach/kern_return.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/kalloc.h>
#include <kern/sched_prim.h>	/* for thread_exception_return */
#include <kern/processor.h>
#include <kern/assert.h>
#include <mach/mach_vm.h>
#include <mach/mach_param.h>
#include <mach/thread_status.h>
#include <mach/thread_policy.h>
#include <mach/message.h>
#include <mach/port.h>
//#include <vm/vm_protos.h>
#include <vm/vm_fault.h>
#include <vm/vm_map.h>
#include <mach/thread_act.h>	/* for thread_resume */
#include <machine/machine_routines.h>

#include <libkern/OSAtomic.h>

#include <sys/pthread_shims.h>
#include "kern_internal.h"
uint32_t pthread_debug_tracing = 0;

SYSCTL_INT(_kern, OID_AUTO, pthread_debug_tracing, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &pthread_debug_tracing, 0, "")
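/*
 * The PTHREAD_TRACE/PTHREAD_TRACE1 kdebug points used throughout this file are
 * presumably gated on the sysctl declared above, which surfaces to userspace as
 * kern.pthread_debug_tracing.  Illustrative use only:
 *
 *	sysctl kern.pthread_debug_tracing		# read (0 = tracing off)
 *	sysctl -w kern.pthread_debug_tracing=1		# enable (requires root)
 */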
// XXX: Dirty import for sys/signalvar.h that's wrapped in BSD_KERNEL_PRIVATE
#define sigcantmask (sigmask(SIGKILL) | sigmask(SIGSTOP))

lck_grp_attr_t	*pthread_lck_grp_attr;
lck_grp_t	*pthread_lck_grp;
lck_attr_t	*pthread_lck_attr;

extern void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64);
extern void workqueue_thread_yielded(void);
static boolean_t workqueue_run_nextreq(proc_t p, struct workqueue *wq, thread_t th, boolean_t force_oc,
				       boolean_t overcommit, pthread_priority_t oc_prio);

static boolean_t workqueue_run_one(proc_t p, struct workqueue *wq, boolean_t overcommit, pthread_priority_t priority);

static void wq_runreq(proc_t p, boolean_t overcommit, pthread_priority_t priority, thread_t th, struct threadlist *tl,
		      int reuse_thread, int wake_thread, int return_directly);

static int _setup_wqthread(proc_t p, thread_t th, boolean_t overcommit, pthread_priority_t priority, int reuse_thread, struct threadlist *tl);

static void wq_unpark_continue(void);
static void wq_unsuspend_continue(void);

static boolean_t workqueue_addnewthread(struct workqueue *wq, boolean_t oc_thread);
static void workqueue_removethread(struct threadlist *tl, int fromexit);
static void workqueue_lock_spin(proc_t);
static void workqueue_unlock(proc_t);

int proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc);
int proc_setalltargetconc(pid_t pid, int32_t * targetconcp);
#define WQ_MAXPRI_MIN	0	/* low prio queue num */
#define WQ_MAXPRI_MAX	2	/* max prio queuenum */
#define WQ_PRI_NUM	3	/* number of prio work queues */

#define C_32_STK_ALIGN		16
#define C_64_STK_ALIGN		16
#define C_64_REDZONE_LEN	128

#define TRUNC_DOWN32(a,c)	((((uint32_t)a)-(c)) & ((uint32_t)(-(c))))
#define TRUNC_DOWN64(a,c)	((((uint64_t)a)-(c)) & ((uint64_t)(-(c))))
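/*
 * Worked example of the truncation macros above (illustrative only): the macro
 * steps down by the alignment and then masks to a multiple of it, so
 *
 *	TRUNC_DOWN32(0x1000, 16) == (0x1000 - 16) & 0xFFFFFFF0 == 0x0FF0
 *
 * i.e. the result is always at least 'c' bytes below 'a' and aligned to a
 * 'c'-byte boundary.
 */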
/*
 * Flags field passed to bsdthread_create and back in pthread_start
 * 31  <---------------------------------> 0
 * _________________________________________
 * | flags(8) | policy(8) | importance(16) |
 * -----------------------------------------
 */
#define PTHREAD_START_CUSTOM		0x01000000
#define PTHREAD_START_SETSCHED		0x02000000
#define PTHREAD_START_DETACHED		0x04000000
#define PTHREAD_START_QOSCLASS		0x08000000
#define PTHREAD_START_QOSCLASS_MASK	0xffffff
#define PTHREAD_START_POLICY_BITSHIFT	16
#define PTHREAD_START_POLICY_MASK	0xff
#define PTHREAD_START_IMPORTANCE_MASK	0xffff

#define SCHED_OTHER	POLICY_TIMESHARE
#define SCHED_FIFO	POLICY_FIFO
#define SCHED_RR	POLICY_RR
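/*
 * Decoding the flags word (illustrative sketch based on the masks above and on
 * how _bsdthread_create consumes them further down in this file):
 *
 *	policy     = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;
 *	importance =  flags & PTHREAD_START_IMPORTANCE_MASK;
 *
 * so, for example, a caller requesting PTHREAD_START_SETSCHED with SCHED_RR and
 * importance 10 would (hypothetically) pass
 *
 *	flags = PTHREAD_START_SETSCHED | (SCHED_RR << PTHREAD_START_POLICY_BITSHIFT) | 10;
 */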
int
_bsdthread_create(struct proc *p, user_addr_t user_func, user_addr_t user_funcarg, user_addr_t user_stack, user_addr_t user_pthread, uint32_t flags, user_addr_t *retval)
{
	mach_vm_offset_t stackaddr;
	mach_vm_size_t th_allocsize = 0;
	mach_vm_size_t user_stacksize;
	mach_vm_size_t th_stacksize;
	mach_vm_size_t th_guardsize;
	mach_vm_offset_t th_stackaddr;
	mach_vm_offset_t th_stack;
	mach_vm_offset_t th_pthread;
	mach_port_name_t th_thport;

	vm_map_t vmap = pthread_kern->current_map();
	task_t ctask = current_task();
	unsigned int policy, importance;

	if (pthread_kern->proc_get_register(p) == 0) {

	PTHREAD_TRACE(TRACE_pthread_thread_create | DBG_FUNC_START, flags, 0, 0, 0, 0);

	isLP64 = proc_is64bit(p);
	th_guardsize = vm_map_page_size(vmap);

#if defined(__i386__) || defined(__x86_64__)
	stackaddr = 0xB0000000;
#else
#error Need to define a stack address hint for this architecture
#endif

	kret = pthread_kern->thread_create(ctask, &th);
	if (kret != KERN_SUCCESS)
	thread_reference(th);

	sright = (void *)pthread_kern->convert_thread_to_port(th);
	th_thport = pthread_kern->ipc_port_copyout_send(sright, pthread_kern->task_get_ipcspace(ctask));

	if ((flags & PTHREAD_START_CUSTOM) == 0) {
		th_stacksize = (mach_vm_size_t)user_stack;	/* if it is custom then it is stacksize */
		th_allocsize = th_stacksize + th_guardsize + pthread_kern->proc_get_pthsize(p);

		kret = mach_vm_map(vmap, &stackaddr,
				   VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE, NULL,
				   0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
		if (kret != KERN_SUCCESS)
			kret = mach_vm_allocate(vmap,
						&stackaddr, th_allocsize,
						VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE);
		if (kret != KERN_SUCCESS) {

		PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, th_allocsize, stackaddr, 0, 2, 0);

		th_stackaddr = stackaddr;

		/*
		 * The guard page is at the lowest address
		 * The stack base is the highest address
		 */
		kret = mach_vm_protect(vmap, stackaddr, th_guardsize, FALSE, VM_PROT_NONE);

		if (kret != KERN_SUCCESS) {

		th_stack = (stackaddr + th_stacksize + th_guardsize);
		th_pthread = (stackaddr + th_stacksize + th_guardsize);
		user_stacksize = th_stacksize;
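		/*
		 * Sketch of the resulting allocation in the non-custom case above
		 * (low addresses on the left; sizes taken from the code just above):
		 *
		 *   stackaddr                                 th_stack == th_pthread
		 *   | guard page (th_guardsize) | thread stack (th_stacksize) | pthread_t + TSD (proc_get_pthsize) |
		 *
		 * the guard page is protected VM_PROT_NONE, the user stack grows
		 * down from th_stack, and the pthread_t structure sits immediately
		 * above it at that same address.
		 */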
		/*
		 * Pre-fault the first page of the new thread's stack and the page that will
		 * contain the pthread_t structure.
		 */
		vm_fault(vmap,
			 vm_map_trunc_page_mask(th_stack - PAGE_SIZE_64, vm_map_page_mask(vmap)),
			 VM_PROT_READ | VM_PROT_WRITE,
			 FALSE,
			 THREAD_UNINT, NULL, 0);

		vm_fault(vmap,
			 vm_map_trunc_page_mask(th_pthread, vm_map_page_mask(vmap)),
			 VM_PROT_READ | VM_PROT_WRITE,
			 FALSE,
			 THREAD_UNINT, NULL, 0);

		th_stack = user_stack;
		user_stacksize = user_stack;
		th_pthread = user_pthread;

		PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, 0, 0, 0, 3, 0);

#if defined(__i386__) || defined(__x86_64__)
	/*
	 * Set up i386 registers & function call.
	 */
	x86_thread_state32_t state;
	x86_thread_state32_t *ts = &state;

	ts->eip = (unsigned int)pthread_kern->proc_get_threadstart(p);
	ts->eax = (unsigned int)th_pthread;
	ts->ebx = (unsigned int)th_thport;
	ts->ecx = (unsigned int)user_func;
	ts->edx = (unsigned int)user_funcarg;
	ts->edi = (unsigned int)user_stacksize;
	ts->esi = (unsigned int)flags;

	ts->esp = (int)((vm_offset_t)(th_stack - C_32_STK_ALIGN));

	error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)ts);
	if (error != KERN_SUCCESS) {

	x86_thread_state64_t state64;
	x86_thread_state64_t *ts64 = &state64;

	ts64->rip = (uint64_t)pthread_kern->proc_get_threadstart(p);
	ts64->rdi = (uint64_t)th_pthread;
	ts64->rsi = (uint64_t)(th_thport);
	ts64->rdx = (uint64_t)user_func;
	ts64->rcx = (uint64_t)user_funcarg;
	ts64->r8 = (uint64_t)user_stacksize;
	ts64->r9 = (uint64_t)flags;

	/*
	 * set stack pointer aligned to 16 byte boundary
	 */
	ts64->rsp = (uint64_t)(th_stack - C_64_REDZONE_LEN);

	error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)ts64);
	if (error != KERN_SUCCESS) {

#elif defined(__arm__)
	arm_thread_state_t state;
	arm_thread_state_t *ts = &state;

	ts->pc = (int)pthread_kern->proc_get_threadstart(p);
	ts->r[0] = (unsigned int)th_pthread;
	ts->r[1] = (unsigned int)th_thport;
	ts->r[2] = (unsigned int)user_func;
	ts->r[3] = (unsigned int)user_funcarg;
	ts->r[4] = (unsigned int)user_stacksize;
	ts->r[5] = (unsigned int)flags;

	/* Set r7 & lr to 0 for better back tracing */

	ts->sp = (int)((vm_offset_t)(th_stack - C_32_STK_ALIGN));

	(void) pthread_kern->thread_set_wq_state32(th, (thread_state_t)ts);

#else
#error bsdthread_create not defined for this architecture
#endif
	if ((flags & PTHREAD_START_SETSCHED) != 0) {
		/* Set scheduling parameters if needed */
		thread_extended_policy_data_t extinfo;
		thread_precedence_policy_data_t precedinfo;

		importance = (flags & PTHREAD_START_IMPORTANCE_MASK);
		policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;

		if (policy == SCHED_OTHER) {
			extinfo.timeshare = 1;
			extinfo.timeshare = 0;

		thread_policy_set(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT);

#define BASEPRI_DEFAULT 31
		precedinfo.importance = (importance - BASEPRI_DEFAULT);
		thread_policy_set(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT);
	} else if ((flags & PTHREAD_START_QOSCLASS) != 0) {
		/* Set thread QoS class if requested. */
		pthread_priority_t priority = (pthread_priority_t)(flags & PTHREAD_START_QOSCLASS_MASK);

		thread_qos_policy_data_t qos;
		qos.qos_tier = pthread_priority_get_qos_class(priority);
		qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 :
			_pthread_priority_get_relpri(priority);

		pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);

	kret = pthread_kern->thread_resume(th);
	if (kret != KERN_SUCCESS) {

	thread_deallocate(th);	/* drop the creator reference */

	PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_END, error, th_pthread, 0, 0, 0);

	*retval = th_pthread;

	if (allocated != 0) {
		(void)mach_vm_deallocate(vmap, stackaddr, th_allocsize);

	(void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(ctask), th_thport);
	(void)thread_terminate(th);
	(void)thread_deallocate(th);
int
_bsdthread_terminate(__unused struct proc *p,
		     user_addr_t stackaddr,
		     __unused int32_t *retval)
{
	mach_vm_offset_t freeaddr;
	mach_vm_size_t freesize;

	freeaddr = (mach_vm_offset_t)stackaddr;

	PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_START, freeaddr, freesize, kthport, 0xff, 0);

	if ((freesize != (mach_vm_size_t)0) && (freeaddr != (mach_vm_offset_t)0)) {
		kret = mach_vm_deallocate(pthread_kern->current_map(), freeaddr, freesize);
		if (kret != KERN_SUCCESS) {
			PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);

	(void) thread_terminate(current_thread());

	if (sem != MACH_PORT_NULL) {
		kret = pthread_kern->semaphore_signal_internal_trap(sem);
		if (kret != KERN_SUCCESS) {
			PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0);

	if (kthport != MACH_PORT_NULL) {
		pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(current_task()), kthport);

	PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0, 0, 0, 0);

	pthread_kern->thread_exception_return();
	panic("bsdthread_terminate: still running\n");

	PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0xff, 0, 0, 0);
_bsdthread_register(struct proc *p,
		    user_addr_t threadstart,
		    user_addr_t wqthread,
		    user_addr_t pthread_init_data,
		    user_addr_t targetconc_ptr,
		    uint64_t dispatchqueue_offset,

	/* prevent multiple registrations */
	if (pthread_kern->proc_get_register(p) != 0) {

	/* syscall randomizer test can pass bogus values */
	if (pthsize < 0 || pthsize > MAX_PTHREAD_SIZE) {

	pthread_kern->proc_set_threadstart(p, threadstart);
	pthread_kern->proc_set_wqthread(p, wqthread);
	pthread_kern->proc_set_pthsize(p, pthsize);
	pthread_kern->proc_set_register(p);

	/* if we have pthread_init_data, then we use that and target_concptr (which is an offset) to get the data. */
	if (pthread_init_data != 0) {
		thread_qos_policy_data_t qos;

		struct _pthread_registration_data data;
		size_t pthread_init_sz = MIN(sizeof(struct _pthread_registration_data), (size_t)targetconc_ptr);

		kern_return_t kr = copyin(pthread_init_data, &data, pthread_init_sz);
		if (kr != KERN_SUCCESS) {

		/* Incoming data from the data structure */
		pthread_kern->proc_set_dispatchqueue_offset(p, data.dispatch_queue_offset);

		/* Outgoing data that userspace expects as a reply */
		if (pthread_kern->qos_main_thread_active()) {
			mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
			boolean_t gd = FALSE;

			kr = pthread_kern->thread_policy_get(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
			if (kr != KERN_SUCCESS || qos.qos_tier == THREAD_QOS_UNSPECIFIED) {
				/* An unspecified QoS means the kernel wants us to impose legacy QoS upon the thread. */
				qos.qos_tier = THREAD_QOS_LEGACY;
				qos.tier_importance = 0;

				kr = pthread_kern->thread_policy_set_internal(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);

			if (kr == KERN_SUCCESS) {
				data.main_qos = pthread_qos_class_get_priority(qos.qos_tier);

				data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);

			data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0);

		kr = copyout(&data, pthread_init_data, pthread_init_sz);
		if (kr != KERN_SUCCESS) {

		pthread_kern->proc_set_dispatchqueue_offset(p, dispatchqueue_offset);
		pthread_kern->proc_set_targconc(p, targetconc_ptr);

	/* return the supported feature set as the return value. */
	*retval = PTHREAD_FEATURE_SUPPORTED;
int
_bsdthread_ctl_set_qos(struct proc *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t tsd_priority_addr, user_addr_t arg3, int *retval)
{
	pthread_priority_t priority;

	/* Unused parameters must be zero. */

	/* QoS is stored in a given slot in the pthread TSD. We need to copy that in and set our QoS based on it. */
	if (proc_is64bit(p)) {
		kr = copyin(tsd_priority_addr, &v, sizeof(v));
		if (kr != KERN_SUCCESS) {
		priority = (int)(v & 0xffffffff);
		kr = copyin(tsd_priority_addr, &v, sizeof(v));
		if (kr != KERN_SUCCESS) {

	if ((th = port_name_to_thread(kport)) == THREAD_NULL) {

	/* <rdar://problem/16211829> Disable pthread_set_qos_class_np() on threads other than pthread_self */
	if (th != current_thread()) {
		thread_deallocate(th);

	int rv = _bsdthread_ctl_set_self(p, 0, priority, 0, _PTHREAD_SET_SELF_QOS_FLAG, retval);

	/* Static param the thread; we just set QoS on it, so it's stuck in QoS land now. */
	/* pthread_kern->thread_static_param(th, TRUE); */ // see <rdar://problem/16433744>, for details

	thread_deallocate(th);
static inline struct threadlist *
util_get_thread_threadlist_entry(thread_t th)
{
	struct uthread *uth = pthread_kern->get_bsdthread_info(th);
	struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);
static void
wq_thread_override_reset(thread_t th)
{
	struct uthread *uth = pthread_kern->get_bsdthread_info(th);
	struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth);

	/*
	 * Drop all outstanding overrides on this thread, done outside the wq lock
	 * because proc_usynch_thread_qos_remove_override takes a spinlock that
	 * could cause us to panic.
	 */
	uint32_t count = tl->th_dispatch_override_count;
	while (!OSCompareAndSwap(count, 0, &tl->th_dispatch_override_count)) {
		count = tl->th_dispatch_override_count;

	PTHREAD_TRACE(TRACE_wq_override_reset | DBG_FUNC_NONE, tl->th_workq, count, 0, 0, 0);

	for (int i = count; i > 0; i--) {
		pthread_kern->proc_usynch_thread_qos_remove_override(uth, 0);
int
_bsdthread_ctl_set_self(struct proc *p, user_addr_t __unused cmd, pthread_priority_t priority, mach_port_name_t voucher, _pthread_set_flags_t flags, int __unused *retval)
{
	thread_qos_policy_data_t qos;
	mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
	boolean_t gd = FALSE;

	int qos_rv = 0, voucher_rv = 0, fixedpri_rv = 0;

	if ((flags & _PTHREAD_SET_SELF_QOS_FLAG) != 0) {
		kr = pthread_kern->thread_policy_get(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
		if (kr != KERN_SUCCESS) {

		/* If we have main-thread QoS then we don't allow a thread to come out of QOS_CLASS_UNSPECIFIED. */
		if (pthread_kern->qos_main_thread_active() && qos.qos_tier == THREAD_QOS_UNSPECIFIED) {

		/* Get the work queue for tracing, also the threadlist for bucket manipulation. */
		struct workqueue *wq = NULL;
		struct threadlist *tl = util_get_thread_threadlist_entry(current_thread());

		PTHREAD_TRACE(TRACE_pthread_set_qos_self | DBG_FUNC_START, wq, qos.qos_tier, qos.tier_importance, 0, 0);

		qos.qos_tier = pthread_priority_get_qos_class(priority);
		qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 : _pthread_priority_get_relpri(priority);

		kr = pthread_kern->thread_policy_set_internal(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
		if (kr != KERN_SUCCESS) {

		/* If we're a workqueue thread, the threadlist item priority needs adjusting, along with the bucket we were running in. */
			workqueue_lock_spin(p);

			/* Fix up counters. */
			uint8_t old_bucket = tl->th_priority;
			uint8_t new_bucket = pthread_priority_get_class_index(priority);

			uint32_t old_active = OSAddAtomic(-1, &wq->wq_thactive_count[old_bucket]);
			OSAddAtomic(1, &wq->wq_thactive_count[new_bucket]);

			wq->wq_thscheduled_count[old_bucket]--;
			wq->wq_thscheduled_count[new_bucket]++;

			tl->th_priority = new_bucket;

			/* If we were at the ceiling of non-overcommitted threads for a given bucket, we have to
			 * reevaluate whether we should start more work.
			 */
			if (old_active == wq->wq_reqconc[old_bucket]) {
				/* workqueue_run_nextreq will drop the workqueue lock in all exit paths. */
				(void)workqueue_run_nextreq(p, wq, THREAD_NULL, FALSE, FALSE, 0);

		PTHREAD_TRACE(TRACE_pthread_set_qos_self | DBG_FUNC_END, wq, qos.qos_tier, qos.tier_importance, 0, 0);

	if ((flags & _PTHREAD_SET_SELF_VOUCHER_FLAG) != 0) {
		kr = pthread_kern->thread_set_voucher_name(voucher);
		if (kr != KERN_SUCCESS) {

	if ((flags & _PTHREAD_SET_SELF_FIXEDPRIORITY_FLAG) != 0) {
		thread_extended_policy_data_t extpol;
		thread_t thread = current_thread();

		extpol.timeshare = 0;

		struct threadlist *tl = util_get_thread_threadlist_entry(thread);

			/* Not allowed on workqueue threads, since there is no symmetric clear function */
			fixedpri_rv = ENOTSUP;

		kr = pthread_kern->thread_policy_set_internal(thread, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
		if (kr != KERN_SUCCESS) {
			fixedpri_rv = EINVAL;

	if (qos_rv && voucher_rv) {
		/* Both failed, give that a unique error. */
int
_bsdthread_ctl_qos_override_start(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, pthread_priority_t priority, user_addr_t arg3, int __unused *retval)
{
	if ((th = port_name_to_thread(kport)) == THREAD_NULL) {

	struct uthread *uth = pthread_kern->get_bsdthread_info(th);
	int override_qos = pthread_priority_get_qos_class(priority);

	struct threadlist *tl = util_get_thread_threadlist_entry(th);

		/* Workqueue threads count their overrides, so they can forcibly balance any outstanding
		 * overrides when they return to the kernel.
		 */
		uint32_t o = OSAddAtomic(1, &tl->th_override_count);
		PTHREAD_TRACE(TRACE_wq_override_start | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), o+1, priority, 0);

	/* The only failure case is when we pass a tid and it has to look up the thread; since we pass the uthread, this always succeeds. */
	pthread_kern->proc_usynch_thread_qos_add_override(uth, 0, override_qos, TRUE);

	thread_deallocate(th);

int
_bsdthread_ctl_qos_override_end(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t arg2, user_addr_t arg3, int __unused *retval)
{
	if (arg2 != 0 || arg3 != 0) {

	if ((th = port_name_to_thread(kport)) == THREAD_NULL) {

	struct uthread *uth = pthread_kern->get_bsdthread_info(th);

	struct threadlist *tl = util_get_thread_threadlist_entry(th);

		uint32_t o = OSAddAtomic(-1, &tl->th_override_count);

		PTHREAD_TRACE(TRACE_wq_override_end | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), o-1, 0, 0);

			thread_deallocate(th);

	pthread_kern->proc_usynch_thread_qos_remove_override(uth, 0);

	thread_deallocate(th);

int
_bsdthread_ctl_qos_override_dispatch(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, pthread_priority_t priority, user_addr_t arg3, int __unused *retval)
{
	if ((th = port_name_to_thread(kport)) == THREAD_NULL) {

	struct uthread *uth = pthread_kern->get_bsdthread_info(th);
	int override_qos = pthread_priority_get_qos_class(priority);

	struct threadlist *tl = util_get_thread_threadlist_entry(th);
		thread_deallocate(th);

	/* Workqueue threads count their overrides, so they can forcibly balance any outstanding
	 * overrides when they return to the kernel.
	 */
	uint32_t o = OSAddAtomic(1, &tl->th_dispatch_override_count);
	PTHREAD_TRACE(TRACE_wq_override_dispatch | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), o+1, priority, 0);

	/* The only failure case is when we pass a tid and it has to look up the thread; since we pass the uthread, this always succeeds. */
	pthread_kern->proc_usynch_thread_qos_add_override(uth, 0, override_qos, TRUE);

	thread_deallocate(th);

int
_bsdthread_ctl_qos_override_reset(struct proc __unused *p, user_addr_t __unused cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int __unused *retval)
{
	struct threadlist *tl;

	if (arg1 != 0 || arg2 != 0 || arg3 != 0) {

	th = current_thread();
	tl = util_get_thread_threadlist_entry(th);

		wq_thread_override_reset(th);
int
_bsdthread_ctl(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval)
{
	switch (cmd) {
	case BSDTHREAD_CTL_SET_QOS:
		return _bsdthread_ctl_set_qos(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
	case BSDTHREAD_CTL_QOS_OVERRIDE_START:
		return _bsdthread_ctl_qos_override_start(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
	case BSDTHREAD_CTL_QOS_OVERRIDE_END:
		return _bsdthread_ctl_qos_override_end(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval);
	case BSDTHREAD_CTL_QOS_OVERRIDE_RESET:
		return _bsdthread_ctl_qos_override_reset(p, cmd, arg1, arg2, arg3, retval);
	case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH:
		return _bsdthread_ctl_qos_override_dispatch(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval);
	case BSDTHREAD_CTL_SET_SELF:
		return _bsdthread_ctl_set_self(p, cmd, (pthread_priority_t)arg1, (mach_port_name_t)arg2, (_pthread_set_flags_t)arg3, retval);
uint32_t wq_yielded_threshold		= WQ_YIELDED_THRESHOLD;
uint32_t wq_yielded_window_usecs	= WQ_YIELDED_WINDOW_USECS;
uint32_t wq_stalled_window_usecs	= WQ_STALLED_WINDOW_USECS;
uint32_t wq_reduce_pool_window_usecs	= WQ_REDUCE_POOL_WINDOW_USECS;
uint32_t wq_max_timer_interval_usecs	= WQ_MAX_TIMER_INTERVAL_USECS;
uint32_t wq_max_threads			= WORKQUEUE_MAXTHREADS;
uint32_t wq_max_constrained_threads	= WORKQUEUE_MAXTHREADS / 8;

SYSCTL_INT(_kern, OID_AUTO, wq_yielded_threshold, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_yielded_threshold, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_yielded_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_yielded_window_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_stalled_window_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_reduce_pool_window_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_max_timer_interval_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_max_threads, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_max_constrained_threads, 0, "");
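/*
 * These tunables surface to userspace under the kern.* namespace via the
 * SYSCTL_INT declarations above; illustrative use only (values made up):
 *
 *	sysctl kern.wq_max_constrained_threads		# inspect the current limit
 *	sysctl -w kern.wq_stalled_window_usecs=300	# tune, as root
 *
 * They are read at runtime by the workqueue code below, e.g. the stall window
 * feeds wq_thread_is_busy() and the add-timer interval in
 * workqueue_interval_timer_start().
 */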
static uint32_t wq_init_constrained_limit = 1;
void
_workqueue_init_lock(proc_t p)
{
	lck_spin_init(pthread_kern->proc_get_wqlockptr(p), pthread_lck_grp, pthread_lck_attr);
	*(pthread_kern->proc_get_wqinitingptr(p)) = FALSE;
}

void
_workqueue_destroy_lock(proc_t p)
{
	lck_spin_destroy(pthread_kern->proc_get_wqlockptr(p), pthread_lck_grp);
}

static void
workqueue_lock_spin(proc_t p)
{
	lck_spin_lock(pthread_kern->proc_get_wqlockptr(p));
}

static void
workqueue_unlock(proc_t p)
{
	lck_spin_unlock(pthread_kern->proc_get_wqlockptr(p));
}

static void
workqueue_interval_timer_start(struct workqueue *wq)
{
	if (wq->wq_timer_interval == 0) {
		wq->wq_timer_interval = wq_stalled_window_usecs;
		wq->wq_timer_interval = wq->wq_timer_interval * 2;

		if (wq->wq_timer_interval > wq_max_timer_interval_usecs) {
			wq->wq_timer_interval = wq_max_timer_interval_usecs;

	clock_interval_to_deadline(wq->wq_timer_interval, 1000, &deadline);

	thread_call_enter_delayed(wq->wq_atimer_call, deadline);

	PTHREAD_TRACE(TRACE_wq_start_add_timer, wq, wq->wq_reqcount, wq->wq_flags, wq->wq_timer_interval, 0);
static boolean_t
wq_thread_is_busy(uint64_t cur_ts, uint64_t *lastblocked_tsp)
{
	uint64_t lastblocked_ts;

	/*
	 * the timestamp is updated atomically w/o holding the workqueue lock
	 * so we need to do an atomic read of the 64 bits so that we don't see
	 * a mismatched pair of 32 bit reads... we accomplish this in an architecturally
	 * independent fashion by using OSCompareAndSwap64 to write back the
	 * value we grabbed... if it succeeds, then we have a good timestamp to
	 * evaluate... if it fails, we straddled grabbing the timestamp while it
	 * was being updated... treat a failed update as a busy thread since
	 * it implies we are about to see a really fresh timestamp anyway
	 */
	lastblocked_ts = *lastblocked_tsp;

	if ( !OSCompareAndSwap64((UInt64)lastblocked_ts, (UInt64)lastblocked_ts, lastblocked_tsp))

	if (lastblocked_ts >= cur_ts) {
		/*
		 * because the update of the timestamp when a thread blocks isn't
		 * serialized against us looking at it (i.e. we don't hold the workq lock)
		 * it's possible to have a timestamp that matches the current time or
		 * that even looks to be in the future relative to when we grabbed the current
		 * time... just treat this as a busy thread since it must have just blocked.
		 */

	elapsed = cur_ts - lastblocked_ts;

	pthread_kern->absolutetime_to_microtime(elapsed, &secs, &usecs);

	if (secs == 0 && usecs < wq_stalled_window_usecs)
#define WQ_TIMER_NEEDED(wq, start_timer) do {						\
	int oldflags = wq->wq_flags;							\
											\
	if ( !(oldflags & (WQ_EXITING | WQ_ATIMER_RUNNING))) {				\
		if (OSCompareAndSwap(oldflags, oldflags | WQ_ATIMER_RUNNING, (UInt32 *)&wq->wq_flags)) \
			start_timer = TRUE;						\
	}										\
} while (0)
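/*
 * Usage pattern for WQ_TIMER_NEEDED, as it appears in workqueue_add_timer and
 * workqueue_callback below: the caller seeds a local start_timer = FALSE,
 * invokes the macro, and only arms the delayed thread call if it actually won
 * the OSCompareAndSwap on wq_flags, e.g.
 *
 *	boolean_t start_timer = FALSE;
 *	...
 *	WQ_TIMER_NEEDED(wq, start_timer);
 *	...
 *	if (start_timer == TRUE)
 *		workqueue_interval_timer_start(wq);
 *
 * so at most one pending timer exists per workqueue regardless of how many
 * threads notice the condition concurrently.
 */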
static void
workqueue_add_timer(struct workqueue *wq, __unused int param1)
{
	boolean_t start_timer = FALSE;
	boolean_t add_thread;

	PTHREAD_TRACE(TRACE_wq_add_timer | DBG_FUNC_START, wq, wq->wq_flags, wq->wq_nthreads, wq->wq_thidlecount, 0);

	workqueue_lock_spin(p);

	/*
	 * because workqueue_callback now runs w/o taking the workqueue lock
	 * we are unsynchronized w/r to a change in state of the running threads...
	 * to make sure we always evaluate that change, we allow it to start up
	 * a new timer if the current one is actively evaluating the state
	 * however, we do not need more than 2 timers fired up (1 active and 1 pending)
	 * and we certainly do not want 2 active timers evaluating the state
	 * simultaneously... so use WQL_ATIMER_BUSY to serialize the timers...
	 * note that WQL_ATIMER_BUSY is in a different flag word from WQ_ATIMER_RUNNING since
	 * it is always protected by the workq lock... WQ_ATIMER_RUNNING is evaluated
	 * and set atomically since the callback function needs to manipulate it
	 * w/o holding the workq lock...
	 *
	 * !WQ_ATIMER_RUNNING && !WQL_ATIMER_BUSY   ==   no pending timer, no active timer
	 * !WQ_ATIMER_RUNNING &&  WQL_ATIMER_BUSY   ==   no pending timer, 1 active timer
	 *  WQ_ATIMER_RUNNING && !WQL_ATIMER_BUSY   ==   1 pending timer, no active timer
	 *  WQ_ATIMER_RUNNING &&  WQL_ATIMER_BUSY   ==   1 pending timer, 1 active timer
	 */
	while (wq->wq_lflags & WQL_ATIMER_BUSY) {
		wq->wq_lflags |= WQL_ATIMER_WAITING;

		assert_wait((caddr_t)wq, (THREAD_UNINT));
		workqueue_unlock(p);

		thread_block(THREAD_CONTINUE_NULL);

		workqueue_lock_spin(p);

	wq->wq_lflags |= WQL_ATIMER_BUSY;

	/*
	 * the workq lock will protect us from seeing WQ_EXITING change state, but we
	 * still need to update this atomically in case someone else tries to start
	 * the timer just as we're releasing it
	 */
	while ( !(OSCompareAndSwap(wq->wq_flags, (wq->wq_flags & ~WQ_ATIMER_RUNNING), (UInt32 *)&wq->wq_flags)));

	if ( !(wq->wq_flags & WQ_EXITING)) {
		/*
		 * check to see if the stall frequency was beyond our tolerance
		 * or we have work on the queue, but haven't scheduled any
		 * new work within our acceptable time interval because
		 * there were no idle threads left to schedule
		 */
		if (wq->wq_reqcount) {
			uint32_t thactive_count;

			for (priclass = 0; priclass < WORKQUEUE_NUM_BUCKETS; priclass++) {
				if (wq->wq_requests[priclass])

			assert(priclass < WORKQUEUE_NUM_BUCKETS);

			curtime = mach_absolute_time();

			/*
			 * check for conditions under which we would not add a thread, either
			 *   a) we've got as many running threads as we want in this priority
			 *      band and the priority bands above it
			 *
			 *   b) check to see if the priority group has blocked threads, if the
			 *      last blocked timestamp is old enough, we will have already passed
			 *      (a) where we would have stopped if we had enough active threads.
			 */
			for (i = 0; i <= priclass; i++) {

				thactive_count += wq->wq_thactive_count[i];

				if (wq->wq_thscheduled_count[i]) {
					if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i]))

			if (thactive_count + busycount < wq->wq_max_concurrency) {

				if (wq->wq_thidlecount == 0) {
					/*
					 * if we have no idle threads, try to add one
					 */
					retval = workqueue_addnewthread(wq, FALSE);

			if (wq->wq_reqcount) {
				/*
				 * as long as we have threads to schedule, and we successfully
				 * scheduled new work, keep trying
				 */
				while (wq->wq_thidlecount && !(wq->wq_flags & WQ_EXITING)) {
					/*
					 * workqueue_run_nextreq is responsible for
					 * dropping the workqueue lock in all cases
					 */
					retval = workqueue_run_nextreq(p, wq, THREAD_NULL, FALSE, FALSE, 0);
					workqueue_lock_spin(p);

					if (retval == FALSE)

				if ( !(wq->wq_flags & WQ_EXITING) && wq->wq_reqcount) {

					if (wq->wq_thidlecount == 0 && retval == TRUE && add_thread == TRUE)

					if (wq->wq_thidlecount == 0 || busycount)
						WQ_TIMER_NEEDED(wq, start_timer);

					PTHREAD_TRACE(TRACE_wq_add_timer | DBG_FUNC_NONE, wq, wq->wq_reqcount, wq->wq_thidlecount, busycount, 0);

	if ( !(wq->wq_flags & WQ_ATIMER_RUNNING))
		wq->wq_timer_interval = 0;

	wq->wq_lflags &= ~WQL_ATIMER_BUSY;

	if ((wq->wq_flags & WQ_EXITING) || (wq->wq_lflags & WQL_ATIMER_WAITING)) {
		/*
		 * wakeup the thread hung up in workqueue_exit or workqueue_add_timer waiting for this timer
		 * to finish getting out of the way
		 */
		wq->wq_lflags &= ~WQL_ATIMER_WAITING;

	PTHREAD_TRACE(TRACE_wq_add_timer | DBG_FUNC_END, wq, start_timer, wq->wq_nthreads, wq->wq_thidlecount, 0);

	workqueue_unlock(p);

	if (start_timer == TRUE)
		workqueue_interval_timer_start(wq);
void
_workqueue_thread_yielded(void)
{
	struct workqueue *wq;

	if ((wq = pthread_kern->proc_get_wqptr(p)) == NULL || wq->wq_reqcount == 0)

	workqueue_lock_spin(p);

	if (wq->wq_reqcount) {

		if (wq->wq_thread_yielded_count++ == 0)
			wq->wq_thread_yielded_timestamp = mach_absolute_time();

		if (wq->wq_thread_yielded_count < wq_yielded_threshold) {
			workqueue_unlock(p);

		PTHREAD_TRACE(TRACE_wq_thread_yielded | DBG_FUNC_START, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 0, 0);

		wq->wq_thread_yielded_count = 0;

		curtime = mach_absolute_time();
		elapsed = curtime - wq->wq_thread_yielded_timestamp;
		pthread_kern->absolutetime_to_microtime(elapsed, &secs, &usecs);

		if (secs == 0 && usecs < wq_yielded_window_usecs) {

			if (wq->wq_thidlecount == 0) {
				workqueue_addnewthread(wq, TRUE);
				/*
				 * 'workqueue_addnewthread' drops the workqueue lock
				 * when creating the new thread and then retakes it before
				 * returning... this window allows other threads to process
				 * requests, so we need to recheck for available work
				 * if none found, we just return... the newly created thread
				 * will eventually get used (if it hasn't already)...
				 */
				if (wq->wq_reqcount == 0) {
					workqueue_unlock(p);

			if (wq->wq_thidlecount) {
				boolean_t overcommit = FALSE;
				boolean_t force_oc = FALSE;

				for (priority = 0; priority < WORKQUEUE_NUM_BUCKETS; priority++) {
					if (wq->wq_requests[priority]) {

				assert(priority < WORKQUEUE_NUM_BUCKETS);

				wq->wq_requests[priority]--;

				if (wq->wq_ocrequests[priority]) {
					wq->wq_ocrequests[priority]--;

				(void)workqueue_run_nextreq(p, wq, THREAD_NULL, force_oc, overcommit, pthread_priority_from_class_index(priority));
				/*
				 * workqueue_run_nextreq is responsible for
				 * dropping the workqueue lock in all cases
				 */
				PTHREAD_TRACE(TRACE_wq_thread_yielded | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 1, 0);

		PTHREAD_TRACE(TRACE_wq_thread_yielded | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 2, 0);

	workqueue_unlock(p);
static void
workqueue_callback(int type, thread_t thread)
{
	struct uthread *uth;
	struct threadlist *tl;
	struct workqueue *wq;

	uth = pthread_kern->get_bsdthread_info(thread);
	tl = pthread_kern->uthread_get_threadlist(uth);

	switch (type) {
	case SCHED_CALL_BLOCK: {
		uint32_t old_activecount;
		boolean_t start_timer = FALSE;

		old_activecount = OSAddAtomic(-1, &wq->wq_thactive_count[tl->th_priority]);

		if (old_activecount == wq->wq_reqconc[tl->th_priority]) {
			UInt64 *lastblocked_ptr;

			/*
			 * the number of active threads at this priority
			 * has fallen below the maximum number of concurrent
			 * threads that we're allowed to run
			 */
			lastblocked_ptr = (UInt64 *)&wq->wq_lastblocked_ts[tl->th_priority];
			curtime = mach_absolute_time();

			/*
			 * if we collide with another thread trying to update the last_blocked (really unlikely
			 * since another thread would have to get scheduled and then block after we start down
			 * this path), it's not a problem.  Either timestamp is adequate, so no need to retry
			 */
			OSCompareAndSwap64(*lastblocked_ptr, (UInt64)curtime, lastblocked_ptr);

			if (wq->wq_reqcount) {
				/*
				 * we have work to do so start up the timer
				 * if it's not running... we'll let it sort
				 * out whether we really need to start up
				 */
				WQ_TIMER_NEEDED(wq, start_timer);

		if (start_timer == TRUE) {
			workqueue_interval_timer_start(wq);

		PTHREAD_TRACE1(TRACE_wq_thread_block | DBG_FUNC_START, wq, old_activecount, tl->th_priority, start_timer, thread_tid(thread));

	case SCHED_CALL_UNBLOCK:
		/*
		 * we cannot take the workqueue_lock here...
		 * an UNBLOCK can occur from a timer event which
		 * is run from an interrupt context... if the workqueue_lock
		 * is already held by this processor, we'll deadlock...
		 * the thread lock for the thread being UNBLOCKED
		 */
		OSAddAtomic(1, &wq->wq_thactive_count[tl->th_priority]);

		PTHREAD_TRACE1(TRACE_wq_thread_block | DBG_FUNC_END, wq, wq->wq_threads_scheduled, tl->th_priority, 0, thread_tid(thread));

sched_call_t
_workqueue_get_sched_callback(void)
{
	return workqueue_callback;
static void
workqueue_removethread(struct threadlist *tl, int fromexit)
{
	struct workqueue *wq;
	struct uthread * uth;

	/*
	 * If fromexit is set, the call is from workqueue_exit(),
	 * so some cleanups are to be avoided.
	 */
	TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);

	if (fromexit == 0) {
		wq->wq_thidlecount--;

	/*
	 * Clear the threadlist pointer in uthread so
	 * blocked thread on wakeup for termination will
	 * not access the thread list as it is going to be
	 */
	pthread_kern->thread_sched_call(tl->th_thread, NULL);

	uth = pthread_kern->get_bsdthread_info(tl->th_thread);
	if (uth != (struct uthread *)0) {
		pthread_kern->uthread_set_threadlist(uth, NULL);

	if (fromexit == 0) {
		/* during exit the lock is not held */
		workqueue_unlock(wq->wq_proc);

	if ( (tl->th_flags & TH_LIST_SUSPENDED) ) {
		/*
		 * thread was created, but never used...
		 * need to clean up the stack and port ourselves
		 * since we're not going to spin up through the
		 * normal exit path triggered from Libc
		 */
		if (fromexit == 0) {
			/* vm map is already deallocated when this is called from exit */
			(void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, tl->th_allocsize);

		(void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(wq->wq_task), tl->th_thport);

		PTHREAD_TRACE1(TRACE_wq_thread_suspend | DBG_FUNC_END, wq, (uintptr_t)thread_tid(current_thread()), wq->wq_nthreads, 0xdead, thread_tid(tl->th_thread));

		PTHREAD_TRACE1(TRACE_wq_thread_park | DBG_FUNC_END, wq, (uintptr_t)thread_tid(current_thread()), wq->wq_nthreads, 0xdead, thread_tid(tl->th_thread));

	/*
	 * drop our ref on the thread
	 */
	thread_deallocate(tl->th_thread);

	kfree(tl, sizeof(struct threadlist));
/*
 * called with workq lock held
 * dropped and retaken around thread creation
 * return with workq lock held
 */
static boolean_t
workqueue_addnewthread(struct workqueue *wq, boolean_t oc_thread)
{
	struct threadlist *tl;
	struct uthread *uth;

	mach_vm_offset_t stackaddr;
	mach_vm_size_t guardsize;

	if ((wq->wq_flags & WQ_EXITING) == WQ_EXITING)

	if (wq->wq_nthreads >= wq_max_threads || wq->wq_nthreads >= (pthread_kern->config_thread_max - 20)) {
		wq->wq_lflags |= WQL_EXCEEDED_TOTAL_THREAD_LIMIT;
	wq->wq_lflags &= ~WQL_EXCEEDED_TOTAL_THREAD_LIMIT;

	if (oc_thread == FALSE && wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
		/*
		 * if we're not creating this thread to service an overcommit request,
		 * then check the size of the constrained thread pool... if we've already
		 * reached our max for threads scheduled from this pool, don't create a new
		 * one... the callers of this function are prepared for failure.
		 */
		wq->wq_lflags |= WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
	if (wq->wq_constrained_threads_scheduled < wq_max_constrained_threads)
		wq->wq_lflags &= ~WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT;

	workqueue_unlock(p);

	kret = pthread_kern->thread_create_workq(wq->wq_task, (thread_continue_t)wq_unsuspend_continue, &th);
	if (kret != KERN_SUCCESS) {

	tl = kalloc(sizeof(struct threadlist));
	bzero(tl, sizeof(struct threadlist));

#if defined(__i386__) || defined(__x86_64__)
	stackaddr = 0xB0000000;
#else
#error Need to define a stack address hint for this architecture
#endif

	guardsize = vm_map_page_size(wq->wq_map);
	tl->th_allocsize = PTH_DEFAULT_STACKSIZE + guardsize + pthread_kern->proc_get_pthsize(p);

	kret = mach_vm_map(wq->wq_map, &stackaddr,
			   VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE, NULL,
			   0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
			   VM_INHERIT_DEFAULT);

	if (kret != KERN_SUCCESS) {
		kret = mach_vm_allocate(wq->wq_map,
					&stackaddr, tl->th_allocsize,
					VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE);

	if (kret == KERN_SUCCESS) {
		/*
		 * The guard page is at the lowest address
		 * The stack base is the highest address
		 */
		kret = mach_vm_protect(wq->wq_map, stackaddr, guardsize, FALSE, VM_PROT_NONE);

		if (kret != KERN_SUCCESS)
			(void) mach_vm_deallocate(wq->wq_map, stackaddr, tl->th_allocsize);

	if (kret != KERN_SUCCESS) {
		(void) thread_terminate(th);
		thread_deallocate(th);

		kfree(tl, sizeof(struct threadlist));

	thread_reference(th);

	sright = (void *)pthread_kern->convert_thread_to_port(th);
	tl->th_thport = pthread_kern->ipc_port_copyout_send(sright, pthread_kern->task_get_ipcspace(wq->wq_task));

	pthread_kern->thread_static_param(th, TRUE);

	tl->th_flags = TH_LIST_INITED | TH_LIST_SUSPENDED;

	tl->th_stackaddr = stackaddr;
	tl->th_priority = WORKQUEUE_NUM_BUCKETS;

	uth = pthread_kern->get_bsdthread_info(tl->th_thread);

	workqueue_lock_spin(p);

	pthread_kern->uthread_set_threadlist(uth, tl);
	TAILQ_INSERT_TAIL(&wq->wq_thidlelist, tl, th_entry);

	wq->wq_thidlecount++;

	PTHREAD_TRACE1(TRACE_wq_thread_suspend | DBG_FUNC_START, wq, wq->wq_nthreads, 0, thread_tid(current_thread()), thread_tid(tl->th_thread));

	workqueue_lock_spin(p);
int
_workq_open(struct proc *p, __unused int32_t *retval)
{
	struct workqueue * wq;
	boolean_t need_wakeup = FALSE;

	if (pthread_kern->proc_get_register(p) == 0) {

	num_cpus = pthread_kern->ml_get_max_cpus();

	if (wq_init_constrained_limit) {
		/*
		 * set up the limit for the constrained pool
		 * this is a virtual pool in that we don't
		 * maintain it on a separate idle and run list
		 */
		limit = num_cpus * WORKQUEUE_CONSTRAINED_FACTOR;

		if (limit > wq_max_constrained_threads)
			wq_max_constrained_threads = limit;

		wq_init_constrained_limit = 0;

	workqueue_lock_spin(p);

	if (pthread_kern->proc_get_wqptr(p) == NULL) {

		while (*pthread_kern->proc_get_wqinitingptr(p) == TRUE) {

			assert_wait((caddr_t)pthread_kern->proc_get_wqinitingptr(p), THREAD_UNINT);
			workqueue_unlock(p);

			thread_block(THREAD_CONTINUE_NULL);

			workqueue_lock_spin(p);

		if (pthread_kern->proc_get_wqptr(p) != NULL) {

		*(pthread_kern->proc_get_wqinitingptr(p)) = TRUE;

		workqueue_unlock(p);

		wq_size = sizeof(struct workqueue);

		ptr = (char *)kalloc(wq_size);
		bzero(ptr, wq_size);

		wq = (struct workqueue *)ptr;
		wq->wq_flags = WQ_LIST_INITED;

		wq->wq_max_concurrency = num_cpus;
		wq->wq_task = current_task();
		wq->wq_map  = pthread_kern->current_map();

		for (i = 0; i < WORKQUEUE_NUM_BUCKETS; i++)
			wq->wq_reqconc[i] = (uint16_t)wq->wq_max_concurrency;

		TAILQ_INIT(&wq->wq_thrunlist);
		TAILQ_INIT(&wq->wq_thidlelist);

		wq->wq_atimer_call = thread_call_allocate((thread_call_func_t)workqueue_add_timer, (thread_call_param_t)wq);

		workqueue_lock_spin(p);

		pthread_kern->proc_set_wqptr(p, wq);
		pthread_kern->proc_set_wqsize(p, wq_size);

		*(pthread_kern->proc_get_wqinitingptr(p)) = FALSE;

	workqueue_unlock(p);

	if (need_wakeup == TRUE) {
		wakeup(pthread_kern->proc_get_wqinitingptr(p));
int
_workq_kernreturn(struct proc *p,
		  __unused user_addr_t item,
		  __unused int32_t *retval)
{
	struct workqueue *wq;

	if (pthread_kern->proc_get_register(p) == 0) {

	case WQOPS_QUEUE_NEWSPISUPP: {
		/*
		 * arg2 = offset of serialno into dispatch queue
		 */
		pthread_kern->proc_set_dispatchqueue_serialno_offset(p, (uint64_t)offset);

	case WQOPS_QUEUE_REQTHREADS: {
		/*
		 * arg2 = number of threads to start
		 */
		boolean_t overcommit = FALSE;
		int reqcount = arg2;
		pthread_priority_t priority = arg3;

		overcommit = (_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0;
		class = pthread_priority_get_class_index(priority);

		if ((reqcount <= 0) || (class < 0) || (class >= WORKQUEUE_NUM_BUCKETS)) {

		workqueue_lock_spin(p);

		if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) {
			workqueue_unlock(p);

		wq->wq_reqcount += reqcount;
		wq->wq_requests[class] += reqcount;

		PTHREAD_TRACE(TRACE_wq_req_threads | DBG_FUNC_NONE, wq, priority, wq->wq_requests[class], reqcount, 0);

		while (wq->wq_reqcount) {
			if (!workqueue_run_one(p, wq, overcommit, priority))

		PTHREAD_TRACE(TRACE_wq_req_octhreads | DBG_FUNC_NONE, wq, priority, wq->wq_requests[class], reqcount, 0);

			if (!workqueue_run_one(p, wq, overcommit, priority))

		/*
		 * we need to delay starting some of the overcommit requests...
		 * we should only fail to create the overcommit threads if
		 * we're at the max thread limit... as existing threads
		 * return to the kernel, we'll notice the ocrequests
		 * and spin them back to user space as the overcommit variety
		 */
		wq->wq_reqcount += reqcount;
		wq->wq_requests[class] += reqcount;
		wq->wq_ocrequests[class] += reqcount;

		PTHREAD_TRACE(TRACE_wq_delay_octhreads | DBG_FUNC_NONE, wq, priority, wq->wq_requests[class], reqcount, 0);

		workqueue_unlock(p);

	case WQOPS_THREAD_RETURN: {
		thread_t th = current_thread();
		struct uthread *uth = pthread_kern->get_bsdthread_info(th);
		struct threadlist *tl = util_get_thread_threadlist_entry(th);

		/* reset signal mask on the workqueue thread to default state */
		if (pthread_kern->uthread_get_sigmask(uth) != (sigset_t)(~workq_threadmask)) {
			pthread_kern->proc_lock(p);
			pthread_kern->uthread_set_sigmask(uth, ~workq_threadmask);
			pthread_kern->proc_unlock(p);

		/* dropping WQ override counts has to be done outside the wq lock. */
		wq_thread_override_reset(th);

		workqueue_lock_spin(p);

		if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL || !tl) {
			workqueue_unlock(p);

		PTHREAD_TRACE(TRACE_wq_runitem | DBG_FUNC_END, wq, 0, 0, 0, 0);

		(void)workqueue_run_nextreq(p, wq, th, FALSE, FALSE, 0);
		/*
		 * workqueue_run_nextreq is responsible for
		 * dropping the workqueue lock in all cases
		 */
/*
 * Routine:	workqueue_mark_exiting
 *
 * Function:	Mark the work queue such that new threads will not be added to the
 *		work queue after we return.
 *
 * Conditions:	Called against the current process.
 */
void
_workqueue_mark_exiting(struct proc *p)
{
	struct workqueue *wq = pthread_kern->proc_get_wqptr(p);

		PTHREAD_TRACE(TRACE_wq_pthread_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);

		workqueue_lock_spin(p);

		/*
		 * we now arm the timer in the callback function w/o holding the workq lock...
		 * we do this by setting WQ_ATIMER_RUNNING via OSCompareAndSwap in order to
		 * ensure only a single timer is running and to notice that WQ_EXITING has
		 * been set (we don't want to start a timer once WQ_EXITING is posted)
		 *
		 * so once we have successfully set WQ_EXITING, we cannot fire up a new timer...
		 * therefore no need to clear the timer state atomically from the flags
		 *
		 * since we always hold the workq lock when dropping WQ_ATIMER_RUNNING
		 * the check for and sleep until clear is protected
		 */
		while (!(OSCompareAndSwap(wq->wq_flags, (wq->wq_flags | WQ_EXITING), (UInt32 *)&wq->wq_flags)));

		if (wq->wq_flags & WQ_ATIMER_RUNNING) {
			if (thread_call_cancel(wq->wq_atimer_call) == TRUE) {
				wq->wq_flags &= ~WQ_ATIMER_RUNNING;

		while ((wq->wq_flags & WQ_ATIMER_RUNNING) || (wq->wq_lflags & WQL_ATIMER_BUSY)) {
			assert_wait((caddr_t)wq, (THREAD_UNINT));
			workqueue_unlock(p);

			thread_block(THREAD_CONTINUE_NULL);

			workqueue_lock_spin(p);

		workqueue_unlock(p);

	PTHREAD_TRACE(TRACE_wq_pthread_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
/*
 * Routine:	workqueue_exit
 *
 * Function:	clean up the work queue structure(s) now that there are no threads
 *		left running inside the work queue (except possibly current_thread).
 *
 * Conditions:	Called by the last thread in the process.
 *		Called against current process.
 */
void
_workqueue_exit(struct proc *p)
{
	struct workqueue * wq;
	struct threadlist * tl, *tlist;
	struct uthread *uth;

	wq = pthread_kern->proc_get_wqptr(p);

		PTHREAD_TRACE(TRACE_wq_workqueue_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);

		wq_size = pthread_kern->proc_get_wqsize(p);
		pthread_kern->proc_set_wqptr(p, NULL);
		pthread_kern->proc_set_wqsize(p, 0);

		/*
		 * Clean up workqueue data structures for threads that exited and
		 * didn't get a chance to clean up after themselves.
		 */
		TAILQ_FOREACH_SAFE(tl, &wq->wq_thrunlist, th_entry, tlist) {
			pthread_kern->thread_sched_call(tl->th_thread, NULL);

			uth = pthread_kern->get_bsdthread_info(tl->th_thread);
			if (uth != (struct uthread *)0) {
				pthread_kern->uthread_set_threadlist(uth, NULL);

			TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);

			/*
			 * drop our last ref on the thread
			 */
			thread_deallocate(tl->th_thread);

			kfree(tl, sizeof(struct threadlist));

		TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlelist, th_entry, tlist) {
			workqueue_removethread(tl, 1);

		thread_call_free(wq->wq_atimer_call);

	PTHREAD_TRACE(TRACE_wq_workqueue_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
static boolean_t
workqueue_run_one(proc_t p, struct workqueue *wq, boolean_t overcommit, pthread_priority_t priority)
{
	if (wq->wq_thidlecount == 0) {
		if (overcommit == FALSE) {
			if (wq->wq_constrained_threads_scheduled < wq->wq_max_concurrency)
				workqueue_addnewthread(wq, overcommit);
			workqueue_addnewthread(wq, overcommit);

		if (wq->wq_thidlecount == 0)

	ran_one = workqueue_run_nextreq(p, wq, THREAD_NULL, FALSE, overcommit, priority);
	/*
	 * workqueue_run_nextreq is responsible for
	 * dropping the workqueue lock in all cases
	 */
	workqueue_lock_spin(p);
1899 * workqueue_run_nextreq:
1900 * called with the workqueue lock held...
1901 * responsible for dropping it in all cases
1904 workqueue_run_nextreq(proc_t p
, struct workqueue
*wq
, thread_t thread
,
1905 boolean_t force_oc
, boolean_t overcommit
, pthread_priority_t oc_prio
)
1907 thread_t th_to_run
= THREAD_NULL
;
1908 thread_t th_to_park
= THREAD_NULL
;
1909 int wake_thread
= 0;
1910 int reuse_thread
= WQ_FLAG_THREAD_REUSE
;
1911 uint32_t priclass
, orig_class
;
1912 uint32_t us_to_wait
;
1913 struct threadlist
*tl
= NULL
;
1914 struct uthread
*uth
= NULL
;
1915 boolean_t start_timer
= FALSE
;
1916 boolean_t adjust_counters
= TRUE
;
1918 uint32_t thactive_count
;
1921 PTHREAD_TRACE(TRACE_wq_run_nextitem
|DBG_FUNC_START
, wq
, thread
, wq
->wq_thidlecount
, wq
->wq_reqcount
, 0);
1923 if (thread
!= THREAD_NULL
) {
1924 uth
= pthread_kern
->get_bsdthread_info(thread
);
1926 if ((tl
= pthread_kern
->uthread_get_threadlist(uth
)) == NULL
) {
1927 panic("wq thread with no threadlist");
1932 * from here until we drop the workq lock
1933 * we can't be pre-empted since we hold
1934 * the lock in spin mode... this is important
1935 * since we have to independently update the priority that
1936 * the thread is associated with and the priorty based
1937 * counters that "workqueue_callback" also changes and bases
1940 dispatch_overcommit
:
1942 if (overcommit
|| force_oc
) {
1943 priclass
= pthread_priority_get_class_index(oc_prio
);
1945 if (thread
!= THREAD_NULL
) {
1949 goto grab_idle_thread
;
1951 if (wq
->wq_reqcount
) {
1952 for (priclass
= 0; priclass
< WORKQUEUE_NUM_BUCKETS
; priclass
++) {
1953 if (wq
->wq_requests
[priclass
])
1956 assert(priclass
< WORKQUEUE_NUM_BUCKETS
);
1958 if (wq
->wq_ocrequests
[priclass
] && (thread
!= THREAD_NULL
|| wq
->wq_thidlecount
)) {
1960 * handle delayed overcommit request...
1961 * they have priority over normal requests
1962 * within a given priority level
1965 wq
->wq_requests
[priclass
]--;
1966 wq
->wq_ocrequests
[priclass
]--;
1968 oc_prio
= pthread_priority_from_class_index(priclass
);
1971 goto dispatch_overcommit
;
1975 * if we get here, the work should be handled by a constrained thread
1977 if (wq
->wq_reqcount
== 0 || wq
->wq_constrained_threads_scheduled
>= wq_max_constrained_threads
) {
1979 * no work to do, or we're already at or over the scheduling limit for
1980 * constrained threads... just return or park the thread...
1981 * do not start the timer for this condition... if we don't have any work,
1982 * we'll check again when new work arrives... if we're over the limit, we need 1 or more
1983 * constrained threads to return to the kernel before we can dispatch additional work
1985 if ((th_to_park
= thread
) == THREAD_NULL
)
1993 curtime
= mach_absolute_time();
1995 thactive_count
+= wq
->wq_thactive_count
[priclass
];
1997 if (wq
->wq_thscheduled_count
[priclass
]) {
1998 if (wq_thread_is_busy(curtime
, &wq
->wq_lastblocked_ts
[priclass
])) {
2003 if (thread
!= THREAD_NULL
) {
2004 if (tl
->th_priority
== priclass
) {
2006 * dont't count this thread as currently active
2011 if (thactive_count
+ busycount
>= wq
->wq_max_concurrency
) {
2014 * we found at least 1 thread in the
2015 * 'busy' state... make sure we start
2016 * the timer because if they are the only
2017 * threads keeping us from scheduling
2018 * this work request, we won't get a callback
2019 * to kick off the timer... we need to
2022 WQ_TIMER_NEEDED(wq
, start_timer
);
2025 PTHREAD_TRACE(TRACE_wq_overcommitted
|DBG_FUNC_NONE
, wq
, (start_timer
? 1<<7 : 0) | pthread_priority_from_class_index(priclass
), thactive_count
, busycount
, 0);
2027 if ((th_to_park
= thread
) == THREAD_NULL
) {
	if (thread != THREAD_NULL) {
		/*
		 * thread is non-NULL here when we return from userspace
		 * in workq_kernreturn, rather than trying to find a thread
		 * we pick up new work for this specific thread.
		 */
		th_to_run = thread;
		goto pick_up_work;
	}

grab_idle_thread:
	if (wq->wq_thidlecount == 0) {
		/*
		 * we have no additional threads waiting to pick up
		 * work, however, there is additional work to do.
		 */
		WQ_TIMER_NEEDED(wq, start_timer);

		PTHREAD_TRACE(TRACE_wq_stalled, wq, wq->wq_nthreads, start_timer, 0, 0);

		goto no_thread_to_run;
	}

	/*
	 * we already know there is both work available
	 * and an idle thread, so activate a thread and then
	 * fall into the code that pulls a new work request...
	 */
	tl = TAILQ_FIRST(&wq->wq_thidlelist);
	TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
	wq->wq_thidlecount--;

	TAILQ_INSERT_TAIL(&wq->wq_thrunlist, tl, th_entry);

	if ((tl->th_flags & TH_LIST_SUSPENDED) == TH_LIST_SUSPENDED) {
		tl->th_flags &= ~TH_LIST_SUSPENDED;
		reuse_thread = 0;

	} else if ((tl->th_flags & TH_LIST_BLOCKED) == TH_LIST_BLOCKED) {
		tl->th_flags &= ~TH_LIST_BLOCKED;
		wake_thread = 1;
	}
	tl->th_flags |= TH_LIST_RUNNING | TH_LIST_BUSY;

	wq->wq_threads_scheduled++;
	wq->wq_thscheduled_count[priclass]++;
	OSAddAtomic(1, &wq->wq_thactive_count[priclass]);

	adjust_counters = FALSE;
	th_to_run = tl->th_thread;
pick_up_work:
	if (!overcommit && !force_oc) {
		wq->wq_reqcount--;
		wq->wq_requests[priclass]--;

		if ( !(tl->th_flags & TH_LIST_CONSTRAINED)) {
			wq->wq_constrained_threads_scheduled++;
			tl->th_flags |= TH_LIST_CONSTRAINED;
		}
	} else {
		if (tl->th_flags & TH_LIST_CONSTRAINED) {
			wq->wq_constrained_threads_scheduled--;
			tl->th_flags &= ~TH_LIST_CONSTRAINED;
		}
	}

	orig_class = tl->th_priority;
	tl->th_priority = (uint8_t)priclass;

	if (adjust_counters && (orig_class != priclass)) {
		/*
		 * we need to adjust these counters based on this
		 * thread's new disposition w/r to priority
		 */
		OSAddAtomic(-1, &wq->wq_thactive_count[orig_class]);
		OSAddAtomic(1, &wq->wq_thactive_count[priclass]);

		wq->wq_thscheduled_count[orig_class]--;
		wq->wq_thscheduled_count[priclass]++;
	}
	wq->wq_thread_yielded_count = 0;
	workqueue_unlock(p);

	if (orig_class != priclass) {
		pthread_priority_t pri = pthread_priority_from_class_index(priclass);

		thread_qos_policy_data_t qosinfo;

		/* Set the QoS tier on the thread, along with the ceiling of max importance for this class. */
		qosinfo.qos_tier = pthread_priority_get_qos_class(pri);
		qosinfo.tier_importance = 0;

		PTHREAD_TRACE(TRACE_wq_reset_priority | DBG_FUNC_START, wq, thread_tid(tl->th_thread), pthread_priority_from_class_index(orig_class), 0, 0);

		/* All the previous implementation here now boils down to setting the QoS policy on the thread. */
		pthread_kern->thread_policy_set_internal(th_to_run, THREAD_QOS_POLICY, (thread_policy_t)&qosinfo, THREAD_QOS_POLICY_COUNT);

		PTHREAD_TRACE(TRACE_wq_reset_priority | DBG_FUNC_END, wq, thread_tid(tl->th_thread), pthread_priority_from_class_index(priclass), qosinfo.qos_tier, 0);
	}

	/*
	 * if the current thread is reused for the work request, it does not return via unix_syscall
	 */
	wq_runreq(p, overcommit, pthread_priority_from_class_index(priclass), th_to_run, tl, reuse_thread, wake_thread, (thread == th_to_run));

	PTHREAD_TRACE(TRACE_wq_run_nextitem|DBG_FUNC_END, wq, thread_tid(th_to_run), overcommit, 1, 0);

	return (TRUE);

out_of_work:
	/*
	 * we have no work to do or we are fully booked
	 * w/r to running threads...
	 */
no_thread_to_run:
	workqueue_unlock(p);

	if (start_timer)
		workqueue_interval_timer_start(wq);

	PTHREAD_TRACE(TRACE_wq_run_nextitem|DBG_FUNC_END, wq, thread_tid(thread), start_timer, 2, 0);

	return (FALSE);
parkit:
	/*
	 * this is a workqueue thread with no more
	 * work to do... park it for now
	 */
	TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
	tl->th_flags &= ~TH_LIST_RUNNING;

	tl->th_flags |= TH_LIST_BLOCKED;
	TAILQ_INSERT_HEAD(&wq->wq_thidlelist, tl, th_entry);

	pthread_kern->thread_sched_call(th_to_park, NULL);

	OSAddAtomic(-1, &wq->wq_thactive_count[tl->th_priority]);
	wq->wq_thscheduled_count[tl->th_priority]--;
	wq->wq_threads_scheduled--;

	if (tl->th_flags & TH_LIST_CONSTRAINED) {
		wq->wq_constrained_threads_scheduled--;
		wq->wq_lflags &= ~WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
		tl->th_flags &= ~TH_LIST_CONSTRAINED;
	}
	if (wq->wq_thidlecount < 100)
		us_to_wait = wq_reduce_pool_window_usecs - (wq->wq_thidlecount * (wq_reduce_pool_window_usecs / 100));
	else
		us_to_wait = wq_reduce_pool_window_usecs / 100;

	wq->wq_thidlecount++;
	wq->wq_lflags &= ~WQL_EXCEEDED_TOTAL_THREAD_LIMIT;

	assert_wait_timeout_with_leeway((caddr_t)tl, (THREAD_INTERRUPTIBLE),
			TIMEOUT_URGENCY_SYS_BACKGROUND|TIMEOUT_URGENCY_LEEWAY, us_to_wait,
			wq_reduce_pool_window_usecs, NSEC_PER_USEC);

	workqueue_unlock(p);

	if (start_timer)
		workqueue_interval_timer_start(wq);

	PTHREAD_TRACE1(TRACE_wq_thread_park | DBG_FUNC_START, wq, wq->wq_threads_scheduled, wq->wq_thidlecount, us_to_wait, thread_tid(th_to_park));
	PTHREAD_TRACE(TRACE_wq_run_nextitem | DBG_FUNC_END, wq, thread_tid(thread), 0, 3, 0);

	thread_block((thread_continue_t)wq_unpark_continue);
	/* NOT REACHED */

	return (FALSE);
}
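
/*
 * Continuation run when a newly created workqueue thread, parked in the
 * suspended state, is resumed (or aborted).  It either re-arms the
 * scheduler callback and returns to user space, or tears the thread down
 * if the process is aborting.
 */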
static void
wq_unsuspend_continue(void)
{
	struct uthread *uth = NULL;
	thread_t th_to_unsuspend;
	struct threadlist *tl;
	proc_t p;

	th_to_unsuspend = current_thread();
	uth = pthread_kern->get_bsdthread_info(th_to_unsuspend);

	if (uth != NULL && (tl = pthread_kern->uthread_get_threadlist(uth)) != NULL) {

		if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) {
			/*
			 * most likely a normal resume of this thread occurred...
			 * it's also possible that the thread was aborted after we
			 * finished setting it up so that it could be dispatched... if
			 * so, thread_bootstrap_return will notice the abort and put
			 * the thread on the path to self-destruction
			 */
normal_resume_to_user:
			pthread_kern->thread_sched_call(th_to_unsuspend, workqueue_callback);
			pthread_kern->thread_bootstrap_return();
		}
		/*
		 * if we get here, it's because we've been resumed due to
		 * an abort of this thread (process is crashing)
		 */
		p = current_proc();

		workqueue_lock_spin(p);

		if (tl->th_flags & TH_LIST_SUSPENDED) {
			/*
			 * thread has been aborted while still on our idle
			 * queue... remove it from our domain...
			 * workqueue_removethread consumes the lock
			 */
			workqueue_removethread(tl, 0);
			pthread_kern->thread_bootstrap_return();
		}
		while ((tl->th_flags & TH_LIST_BUSY)) {
			/*
			 * this thread was aborted after we started making
			 * it runnable, but before we finished dispatching it...
			 * we need to wait for that process to finish,
			 * and we need to ask for a wakeup instead of a
			 * thread_resume since the abort has already resumed us
			 */
			tl->th_flags |= TH_LIST_NEED_WAKEUP;

			assert_wait((caddr_t)tl, (THREAD_UNINT));

			workqueue_unlock(p);
			thread_block(THREAD_CONTINUE_NULL);
			workqueue_lock_spin(p);
		}
		workqueue_unlock(p);
		/*
		 * we have finished setting up the thread's context...
		 * thread_bootstrap_return will take us through the abort path
		 * where the thread will self destruct
		 */
		goto normal_resume_to_user;
	}
	pthread_kern->thread_bootstrap_return();
}
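
/*
 * Continuation run when a parked idle workqueue thread is woken, either by
 * wq_runreq dispatching new work to it or by its idle timeout expiring.
 * A timed-out thread that is still on the idle list destroys itself via
 * workqueue_removethread.
 */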
static void
wq_unpark_continue(void)
{
	struct uthread *uth = NULL;
	struct threadlist *tl;
	thread_t th_to_unpark;
	proc_t p;

	th_to_unpark = current_thread();
	uth = pthread_kern->get_bsdthread_info(th_to_unpark);

	if ((tl = pthread_kern->uthread_get_threadlist(uth)) != NULL) {

		if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) {
			/*
			 * a normal wakeup of this thread occurred... no need
			 * for any synchronization with the timer and wq_runreq
			 */
normal_return_to_user:
			pthread_kern->thread_sched_call(th_to_unpark, workqueue_callback);

			PTHREAD_TRACE(0xefffd018 | DBG_FUNC_END, tl->th_workq, 0, 0, 0, 0);

			pthread_kern->thread_exception_return();
		}
		p = current_proc();

		workqueue_lock_spin(p);

		if ( !(tl->th_flags & TH_LIST_RUNNING)) {
			/*
			 * the timer popped us out and we've not
			 * been moved off of the idle list
			 * so we should now self-destruct
			 *
			 * workqueue_removethread consumes the lock
			 */
			workqueue_removethread(tl, 0);
			pthread_kern->thread_exception_return();
		}
		/*
		 * the timer woke us up, but we have already
		 * started to make this a runnable thread,
		 * but have not yet finished that process...
		 * so wait for the normal wakeup
		 */
		while ((tl->th_flags & TH_LIST_BUSY)) {

			assert_wait((caddr_t)tl, (THREAD_UNINT));

			workqueue_unlock(p);

			thread_block(THREAD_CONTINUE_NULL);

			workqueue_lock_spin(p);
		}
		/*
		 * we have finished setting up the thread's context
		 * now we can return as if we got a normal wakeup
		 */
		workqueue_unlock(p);

		goto normal_return_to_user;
	}
	pthread_kern->thread_exception_return();
}
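
/*
 * Dispatch a work request on a specific thread: load the user register
 * state via _setup_wqthread, then hand control to the thread by returning
 * directly to user space (when the current thread is being reused), by
 * waking a blocked idle thread, or by resuming a freshly created,
 * suspended thread.
 */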
static void
wq_runreq(proc_t p, boolean_t overcommit, pthread_priority_t priority, thread_t th, struct threadlist *tl,
	  int reuse_thread, int wake_thread, int return_directly)
{
	int ret = 0;
	boolean_t need_resume = FALSE;

	PTHREAD_TRACE1(TRACE_wq_runitem | DBG_FUNC_START, tl->th_workq, overcommit, priority, thread_tid(current_thread()), thread_tid(th));

	ret = _setup_wqthread(p, th, overcommit, priority, reuse_thread, tl);

	if (ret != 0)
		panic("setup_wqthread failed %x\n", ret);

	if (return_directly) {
		PTHREAD_TRACE(TRACE_wq_run_nextitem|DBG_FUNC_END, tl->th_workq, 0, 0, 4, 0);

		pthread_kern->thread_exception_return();
		panic("wq_runreq: thread_exception_return returned ...\n");
	}
	if (wake_thread) {
		workqueue_lock_spin(p);

		tl->th_flags &= ~TH_LIST_BUSY;
		wakeup(tl);

		workqueue_unlock(p);
	} else {
		PTHREAD_TRACE1(TRACE_wq_thread_suspend | DBG_FUNC_END, tl->th_workq, 0, 0, thread_tid(current_thread()), thread_tid(th));

		workqueue_lock_spin(p);

		if (tl->th_flags & TH_LIST_NEED_WAKEUP) {
			wakeup(tl);
		} else {
			need_resume = TRUE;
		}

		tl->th_flags &= ~(TH_LIST_BUSY | TH_LIST_NEED_WAKEUP);

		workqueue_unlock(p);

		if (need_resume) {
			/*
			 * need to do this outside of the workqueue spin lock
			 * since thread_resume locks the thread via a full mutex
			 */
			pthread_kern->thread_resume(th);
		}
	}
}
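
/*
 * Populate the register state for a workqueue thread so that it enters
 * user space at the process's wqthread trampoline with its stack pointer,
 * thread port, and dispatch flags in the expected registers.
 */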
int
_setup_wqthread(proc_t p, thread_t th, boolean_t overcommit, pthread_priority_t priority, int reuse_thread, struct threadlist *tl)
{
	int error = 0;
	uint32_t flags = reuse_thread | WQ_FLAG_THREAD_NEWSPI;
	mach_vm_size_t guardsize = vm_map_page_size(tl->th_workq->wq_map);

	if (overcommit) {
		flags |= WQ_FLAG_THREAD_OVERCOMMIT;
	}

	/* Put the QoS class value into the lower bits of the reuse_thread register, this is where
	 * the thread priority used to be stored anyway.
	 */
	flags |= (_pthread_priority_get_qos_newest(priority) & WQ_FLAG_THREAD_PRIOMASK);

#if defined(__i386__) || defined(__x86_64__)
	int isLP64 = proc_is64bit(p);

	/*
	 * Set up i386 registers & function call.
	 */
	if (isLP64 == 0) {
		x86_thread_state32_t state;
		x86_thread_state32_t *ts = &state;

		ts->eip = (unsigned int)pthread_kern->proc_get_wqthread(p);
		ts->eax = (unsigned int)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize);
		ts->ebx = (unsigned int)tl->th_thport;
		ts->ecx = (unsigned int)(tl->th_stackaddr + guardsize);
		ts->edx = (unsigned int)0;
		ts->edi = (unsigned int)flags;
		ts->esi = (unsigned int)0;
		/*
		 * set stack pointer
		 */
		ts->esp = (int)((vm_offset_t)((tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize) - C_32_STK_ALIGN));

		(void)pthread_kern->thread_set_wq_state32(th, (thread_state_t)ts);

	} else {
		x86_thread_state64_t state64;
		x86_thread_state64_t *ts64 = &state64;

		ts64->rip = (uint64_t)pthread_kern->proc_get_wqthread(p);
		ts64->rdi = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize);
		ts64->rsi = (uint64_t)(tl->th_thport);
		ts64->rdx = (uint64_t)(tl->th_stackaddr + guardsize);
		ts64->rcx = (uint64_t)0;
		ts64->r8 = (uint64_t)flags;
		ts64->r9 = (uint64_t)0;

		/*
		 * set stack pointer aligned to 16 byte boundary
		 */
		ts64->rsp = (uint64_t)((tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize) - C_64_REDZONE_LEN);

		error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)ts64);
		if (error != KERN_SUCCESS) {
			error = EINVAL;
		}
	}
#else
#error setup_wqthread not defined for this architecture
#endif

	return error;
}
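
/*
 * Fill in a proc_workqueueinfo snapshot (thread counts and limit flags)
 * for the proc_info interface.
 */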
int
_fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
{
	struct workqueue *wq;
	int error = 0;
	int activecount;
	uint32_t pri;

	workqueue_lock_spin(p);
	if ((wq = pthread_kern->proc_get_wqptr(p)) == NULL) {
		error = EINVAL;
		goto out;
	}
	activecount = 0;

	for (pri = 0; pri < WORKQUEUE_NUM_BUCKETS; pri++) {
		activecount += wq->wq_thactive_count[pri];
	}
	pwqinfo->pwq_nthreads = wq->wq_nthreads;
	pwqinfo->pwq_runthreads = activecount;
	pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
	pwqinfo->pwq_state = 0;

	if (wq->wq_lflags & WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT) {
		pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
	}

	if (wq->wq_lflags & WQL_EXCEEDED_TOTAL_THREAD_LIMIT) {
		pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
	}

out:
	workqueue_unlock(p);
	return (error);
}
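
/*
 * Return the 64-bit thread ID of the calling thread.
 */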
int
_thread_selfid(__unused struct proc *p, uint64_t *retval)
{
	thread_t thread = current_thread();
	*retval = thread_tid(thread);
	return KERN_SUCCESS;
}
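
/*
 * One-time initialization: allocate the lock group and attributes used by
 * pthread synchronizers and the workqueue code, set up the psynch hash and
 * its cleanup thread call, and register the workqueue tuning sysctls.
 */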
void
_pthread_init(void)
{
	pthread_lck_grp_attr = lck_grp_attr_alloc_init();
	pthread_lck_grp = lck_grp_alloc_init("pthread", pthread_lck_grp_attr);

	/*
	 * allocate the lock attribute for pthread synchronizers
	 */
	pthread_lck_attr = lck_attr_alloc_init();

	_workqueue_init_lock((proc_t)get_bsdtask_info(kernel_task));
	pthread_list_mlock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);

	pth_global_hashinit();
	psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL);

	/*
	 * register the workqueue tuning sysctls
	 */
	sysctl_register_oid(&sysctl__kern_wq_yielded_threshold);
	sysctl_register_oid(&sysctl__kern_wq_yielded_window_usecs);
	sysctl_register_oid(&sysctl__kern_wq_stalled_window_usecs);
	sysctl_register_oid(&sysctl__kern_wq_reduce_pool_window_usecs);
	sysctl_register_oid(&sysctl__kern_wq_max_timer_interval_usecs);
	sysctl_register_oid(&sysctl__kern_wq_max_threads);
	sysctl_register_oid(&sysctl__kern_wq_max_constrained_threads);
	sysctl_register_oid(&sysctl__kern_pthread_debug_tracing);
}