/*
 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995-2005 Apple Computer, Inc. All Rights Reserved */
#define	_PTHREAD_CONDATTR_T
#define	_PTHREAD_COND_T
#define	_PTHREAD_MUTEXATTR_T
#define	_PTHREAD_MUTEX_T
#define	_PTHREAD_RWLOCKATTR_T
#define	_PTHREAD_RWLOCK_T

#undef pthread_mutexattr_t
#undef pthread_mutex_t
#undef pthread_condattr_t
#undef pthread_cond_t
#undef pthread_rwlockattr_t
#undef pthread_rwlock_t
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/timeb.h>
#include <sys/times.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/syslog.h>
#include <sys/kdebug.h>
#include <sys/sysproto.h>
#include <sys/pthread_internal.h>
#include <sys/user.h>		/* for coredump */
#include <sys/proc_info.h>	/* for fill_procworkqueue */

#include <mach/mach_types.h>
#include <mach/vm_prot.h>
#include <mach/semaphore.h>
#include <mach/sync_policy.h>
#include <mach/task.h>
#include <kern/kern_types.h>
#include <kern/task.h>
#include <kern/clock.h>
#include <mach/kern_return.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/kalloc.h>
#include <kern/sched_prim.h>	/* for thread_exception_return */
#include <kern/processor.h>
#include <kern/affinity.h>
#include <kern/assert.h>
#include <mach/mach_vm.h>
#include <mach/mach_param.h>
#include <mach/thread_status.h>
#include <mach/thread_policy.h>
#include <mach/message.h>
#include <mach/port.h>
#include <vm/vm_protos.h>
#include <vm/vm_map.h>		/* for current_map() */
#include <vm/vm_fault.h>
#include <mach/thread_act.h>	/* for thread_resume */
#include <machine/machine_routines.h>
#if defined(__i386__) || defined(__x86_64__)
#include <i386/machine_routines.h>
#include <i386/eflags.h>
#include <i386/psl.h>
#include <i386/seg.h>
#endif

#include <libkern/OSAtomic.h>
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
#define KERNEL_DEBUG1 KERNEL_DEBUG_CONSTANT1
lck_grp_attr_t	*pthread_lck_grp_attr;
lck_grp_t	*pthread_lck_grp;
lck_attr_t	*pthread_lck_attr;
extern kern_return_t thread_getstatus(register thread_t act, int flavor,
			thread_state_t tstate, mach_msg_type_number_t *count);
extern kern_return_t thread_setstatus(thread_t thread, int flavor,
			thread_state_t tstate, mach_msg_type_number_t count);
extern void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64);
extern kern_return_t mach_port_deallocate(ipc_space_t, mach_port_name_t);
extern kern_return_t semaphore_signal_internal_trap(mach_port_name_t);
extern void workqueue_thread_yielded(void);

static int workqueue_additem(struct workqueue *wq, int prio, user_addr_t item, int affinity);
static boolean_t workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t th,
					user_addr_t oc_item, int oc_prio, int oc_affinity);
static void wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl,
		       int reuse_thread, int wake_thread, int return_directly);
static void wq_unpark_continue(void);
static void wq_unsuspend_continue(void);
static int setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl);
static boolean_t workqueue_addnewthread(struct workqueue *wq, boolean_t oc_thread);
static void workqueue_removethread(struct threadlist *tl, int fromexit);
static void workqueue_lock_spin(proc_t);
static void workqueue_unlock(proc_t);

int proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc);
int proc_setalltargetconc(pid_t pid, int32_t * targetconcp);
#define WQ_MAXPRI_MIN	0	/* low prio queue num */
#define WQ_MAXPRI_MAX	2	/* max prio queuenum */
#define WQ_PRI_NUM	3	/* number of prio work queues */

#define C_32_STK_ALIGN		16
#define C_64_STK_ALIGN		16
#define C_64_REDZONE_LEN	128
#define TRUNC_DOWN32(a,c)	((((uint32_t)a)-(c)) & ((uint32_t)(-(c))))
#define TRUNC_DOWN64(a,c)	((((uint64_t)a)-(c)) & ((uint64_t)(-(c))))
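/*
 * Example (illustrative, not part of the original source): TRUNC_DOWN64(0x1007, 16)
 * evaluates to 0xff0 -- reserve 'c' bytes below 'a', then round the result down to a
 * 'c'-byte boundary.  TRUNC_DOWN32() behaves the same way on 32-bit addresses.
 */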
/*
 * Flags field passed to bsdthread_create and back in pthread_start
 * 31  <---------------------------------> 0
 * _________________________________________
 * | flags(8) | policy(8) | importance(16) |
 * -----------------------------------------
 */
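/*
 * Illustrative only (a sketch, not the libpthread implementation): given the
 * decode performed in bsdthread_create() below, user space would pack this
 * word roughly as
 *
 *	flags = PTHREAD_START_SETSCHED
 *	      | ((policy & PTHREAD_START_POLICY_MASK) << PTHREAD_START_POLICY_BITSHIFT)
 *	      | (importance & PTHREAD_START_IMPORTANCE_MASK);
 */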
void _pthread_start(pthread_t self, mach_port_t kport, void *(*fun)(void *), void * funarg, size_t stacksize, unsigned int flags);
#define PTHREAD_START_CUSTOM	0x01000000
#define PTHREAD_START_SETSCHED	0x02000000
#define PTHREAD_START_DETACHED	0x04000000
#define PTHREAD_START_POLICY_BITSHIFT 16
#define PTHREAD_START_POLICY_MASK 0xff
#define PTHREAD_START_IMPORTANCE_MASK 0xffff

#define SCHED_OTHER	POLICY_TIMESHARE
#define SCHED_FIFO	POLICY_FIFO
#define SCHED_RR	POLICY_RR
int
bsdthread_create(__unused struct proc *p, struct bsdthread_create_args *uap, user_addr_t *retval)
{
	kern_return_t kret;
	void *sright;
	int error = 0;
	int allocated = 0;
	mach_vm_offset_t stackaddr;
	mach_vm_size_t th_allocsize = 0;
	mach_vm_size_t user_stacksize;
	mach_vm_size_t th_stacksize;
	mach_vm_offset_t th_stackaddr;
	mach_vm_offset_t th_stack;
	mach_vm_offset_t th_pthread;
	mach_port_name_t th_thport;
	thread_t th;
	user_addr_t user_func = uap->func;
	user_addr_t user_funcarg = uap->func_arg;
	user_addr_t user_stack = uap->stack;
	user_addr_t user_pthread = uap->pthread;
	unsigned int flags = (unsigned int)uap->flags;
	vm_map_t vmap = current_map();
	task_t ctask = current_task();
	unsigned int policy, importance;
	int isLP64 = 0;

	if ((p->p_lflag & P_LREGISTER) == 0)
		return(EINVAL);

	KERNEL_DEBUG_CONSTANT(0x9000080 | DBG_FUNC_START, flags, 0, 0, 0, 0);

	isLP64 = IS_64BIT_PROCESS(p);

#if defined(__i386__) || defined(__x86_64__)
	stackaddr = 0xB0000000;
#else
#error Need to define a stack address hint for this architecture
#endif

	kret = thread_create(ctask, &th);
	if (kret != KERN_SUCCESS)
		return(ENOMEM);
	thread_reference(th);

	sright = (void *)convert_thread_to_port(th);
	th_thport = ipc_port_copyout_send(sright, get_task_ipcspace(ctask));

	if ((flags & PTHREAD_START_CUSTOM) == 0) {
		th_stacksize = (mach_vm_size_t)user_stack;	/* if it is custom then it is stacksize */
		th_allocsize = th_stacksize + PTH_DEFAULT_GUARDSIZE + p->p_pthsize;

		kret = mach_vm_map(vmap, &stackaddr,
				th_allocsize,
				page_size - 1,
				VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE, NULL,
				0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
				VM_INHERIT_DEFAULT);
		if (kret != KERN_SUCCESS)
			kret = mach_vm_allocate(vmap,
					&stackaddr, th_allocsize,
					VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE);
		if (kret != KERN_SUCCESS) {
			error = ENOMEM;
			goto out;
		}
		KERNEL_DEBUG_CONSTANT(0x9000080 | DBG_FUNC_NONE, th_allocsize, stackaddr, 0, 2, 0);

		th_stackaddr = stackaddr;
		allocated = 1;
		/*
		 * The guard page is at the lowest address
		 * The stack base is the highest address
		 */
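		/*
		 * Illustrative layout of the region mapped above (th_allocsize =
		 * th_stacksize + PTH_DEFAULT_GUARDSIZE + p->p_pthsize), low to high:
		 *
		 *	stackaddr                            guard page(s), VM_PROT_NONE
		 *	stackaddr + PTH_DEFAULT_GUARDSIZE    thread stack, growing down toward the guard
		 *	stackaddr + PTH_DEFAULT_GUARDSIZE
		 *	          + th_stacksize             pthread_t structure (p_pthsize bytes)
		 *
		 * which is why th_stack and th_pthread below are both set to
		 * stackaddr + th_stacksize + PTH_DEFAULT_GUARDSIZE.
		 */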
		kret = mach_vm_protect(vmap, stackaddr, PTH_DEFAULT_GUARDSIZE, FALSE, VM_PROT_NONE);

		if (kret != KERN_SUCCESS) {
			error = ENOMEM;
			goto out1;
		}
		th_stack = (stackaddr + th_stacksize + PTH_DEFAULT_GUARDSIZE);
		th_pthread = (stackaddr + th_stacksize + PTH_DEFAULT_GUARDSIZE);
		user_stacksize = th_stacksize;

		/*
		 * Pre-fault the first page of the new thread's stack and the page that will
		 * contain the pthread_t structure.
		 */
		vm_fault(vmap,
		  vm_map_trunc_page(th_stack - PAGE_SIZE_64),
		  VM_PROT_READ | VM_PROT_WRITE,
		  FALSE,
		  THREAD_UNINT, NULL, 0);

		vm_fault(vmap,
		  vm_map_trunc_page(th_pthread),
		  VM_PROT_READ | VM_PROT_WRITE,
		  FALSE,
		  THREAD_UNINT, NULL, 0);
	} else {
		th_stack = user_stack;
		user_stacksize = user_stack;
		th_pthread = user_pthread;

		KERNEL_DEBUG_CONSTANT(0x9000080 | DBG_FUNC_NONE, 0, 0, 0, 3, 0);
	}

#if defined(__i386__) || defined(__x86_64__)
	/*
	 * Set up i386 registers & function call.
	 */
	if (isLP64 == 0) {
		x86_thread_state32_t state;
		x86_thread_state32_t *ts = &state;

		ts->eip = (int)p->p_threadstart;
		ts->eax = (unsigned int)th_pthread;
		ts->ebx = (unsigned int)th_thport;
		ts->ecx = (unsigned int)user_func;
		ts->edx = (unsigned int)user_funcarg;
		ts->edi = (unsigned int)user_stacksize;
		ts->esi = (unsigned int)uap->flags;
		/*
		 * set stack pointer
		 */
		ts->esp = (int)((vm_offset_t)(th_stack - C_32_STK_ALIGN));

		thread_set_wq_state32(th, (thread_state_t)ts);

	} else {
		x86_thread_state64_t state64;
		x86_thread_state64_t *ts64 = &state64;

		ts64->rip = (uint64_t)p->p_threadstart;
		ts64->rdi = (uint64_t)th_pthread;
		ts64->rsi = (uint64_t)(th_thport);
		ts64->rdx = (uint64_t)user_func;
		ts64->rcx = (uint64_t)user_funcarg;
		ts64->r8 = (uint64_t)user_stacksize;
		ts64->r9 = (uint64_t)uap->flags;
		/*
		 * set stack pointer aligned to 16 byte boundary
		 */
		ts64->rsp = (uint64_t)(th_stack - C_64_REDZONE_LEN);

		thread_set_wq_state64(th, (thread_state_t)ts64);
	}
#else
#error bsdthread_create not defined for this architecture
#endif
	/* Set scheduling parameters if needed */
	if ((flags & PTHREAD_START_SETSCHED) != 0) {
		thread_extended_policy_data_t extinfo;
		thread_precedence_policy_data_t precedinfo;

		importance = (flags & PTHREAD_START_IMPORTANCE_MASK);
		policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;

		if (policy == SCHED_OTHER)
			extinfo.timeshare = 1;
		else
			extinfo.timeshare = 0;
		thread_policy_set(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT);

#define BASEPRI_DEFAULT 31
		precedinfo.importance = (importance - BASEPRI_DEFAULT);
		thread_policy_set(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT);
	}

	kret = thread_resume(th);
	if (kret != KERN_SUCCESS) {
		error = EINVAL;
		goto out1;
	}
	thread_deallocate(th);	/* drop the creator reference */

	KERNEL_DEBUG_CONSTANT(0x9000080 | DBG_FUNC_END, error, th_pthread, 0, 0, 0);

	*retval = th_pthread;

	return(0);

out1:
	if (allocated != 0)
		(void)mach_vm_deallocate(vmap, stackaddr, th_allocsize);
out:
	(void)mach_port_deallocate(get_task_ipcspace(ctask), th_thport);
	(void)thread_terminate(th);
	(void)thread_deallocate(th);
	return(error);
}
int
bsdthread_terminate(__unused struct proc *p, struct bsdthread_terminate_args *uap, __unused int32_t *retval)
{
	mach_vm_offset_t freeaddr;
	mach_vm_size_t freesize;
	kern_return_t kret;
	mach_port_name_t kthport = (mach_port_name_t)uap->port;
	mach_port_name_t sem = (mach_port_name_t)uap->sem;

	freeaddr = (mach_vm_offset_t)uap->stackaddr;
	freesize = uap->freesize;

	KERNEL_DEBUG_CONSTANT(0x9000084 | DBG_FUNC_START, freeaddr, freesize, kthport, 0xff, 0);

	if ((freesize != (mach_vm_size_t)0) && (freeaddr != (mach_vm_offset_t)0)) {
		kret = mach_vm_deallocate(current_map(), freeaddr, freesize);
		if (kret != KERN_SUCCESS) {
			return(EINVAL);
		}
	}

	(void) thread_terminate(current_thread());
	if (sem != MACH_PORT_NULL) {
		kret = semaphore_signal_internal_trap(sem);
		if (kret != KERN_SUCCESS) {
			return(EINVAL);
		}
	}

	if (kthport != MACH_PORT_NULL)
		mach_port_deallocate(get_task_ipcspace(current_task()), kthport);
	thread_exception_return();
	panic("bsdthread_terminate: still running\n");

	KERNEL_DEBUG_CONSTANT(0x9000084 | DBG_FUNC_END, 0, 0, 0, 0xff, 0);

	return(0);
}
int
bsdthread_register(struct proc *p, struct bsdthread_register_args *uap, __unused int32_t *retval)
{
	/* prevent multiple registrations */
	if ((p->p_lflag & P_LREGISTER) != 0)
		return(EINVAL);
	/* syscall randomizer test can pass bogus values */
	if (uap->pthsize > MAX_PTHREAD_SIZE) {
		return(EINVAL);
	}
	p->p_threadstart = uap->threadstart;
	p->p_wqthread = uap->wqthread;
	p->p_pthsize = uap->pthsize;
	p->p_targconc = uap->targetconc_ptr;
	p->p_dispatchqueue_offset = uap->dispatchqueue_offset;
	proc_setregister(p);

	return(0);
}
uint32_t wq_yielded_threshold		= WQ_YIELDED_THRESHOLD;
uint32_t wq_yielded_window_usecs	= WQ_YIELDED_WINDOW_USECS;
uint32_t wq_stalled_window_usecs	= WQ_STALLED_WINDOW_USECS;
uint32_t wq_reduce_pool_window_usecs	= WQ_REDUCE_POOL_WINDOW_USECS;
uint32_t wq_max_timer_interval_usecs	= WQ_MAX_TIMER_INTERVAL_USECS;
uint32_t wq_max_threads			= WORKQUEUE_MAXTHREADS;
uint32_t wq_max_constrained_threads	= WORKQUEUE_MAXTHREADS / 8;
SYSCTL_INT(_kern, OID_AUTO, wq_yielded_threshold, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_yielded_threshold, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_yielded_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_yielded_window_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_stalled_window_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_reduce_pool_window_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_max_timer_interval_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_max_threads, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
	   &wq_max_constrained_threads, 0, "");
static uint32_t wq_init_constrained_limit = 1;
void
workqueue_init_lock(proc_t p)
{
	lck_spin_init(&p->p_wqlock, pthread_lck_grp, pthread_lck_attr);

	p->p_wqiniting = FALSE;
}

void
workqueue_destroy_lock(proc_t p)
{
	lck_spin_destroy(&p->p_wqlock, pthread_lck_grp);
}

static void
workqueue_lock_spin(proc_t p)
{
	lck_spin_lock(&p->p_wqlock);
}

static void
workqueue_unlock(proc_t p)
{
	lck_spin_unlock(&p->p_wqlock);
}
static void
workqueue_interval_timer_start(struct workqueue *wq)
{
	uint64_t deadline;

	if (wq->wq_timer_interval == 0)
		wq->wq_timer_interval = wq_stalled_window_usecs;
	else {
		wq->wq_timer_interval = wq->wq_timer_interval * 2;

		if (wq->wq_timer_interval > wq_max_timer_interval_usecs)
			wq->wq_timer_interval = wq_max_timer_interval_usecs;
	}
	clock_interval_to_deadline(wq->wq_timer_interval, 1000, &deadline);

	thread_call_enter_delayed(wq->wq_atimer_call, deadline);

	KERNEL_DEBUG(0xefffd110, wq, wq->wq_itemcount, wq->wq_flags, wq->wq_timer_interval, 0);
}
static boolean_t
wq_thread_is_busy(uint64_t cur_ts, uint64_t *lastblocked_tsp)
{
	clock_sec_t	secs;
	clock_usec_t	usecs;
	uint64_t	lastblocked_ts;
	uint64_t	elapsed;

	/*
	 * the timestamp is updated atomically w/o holding the workqueue lock
	 * so we need to do an atomic read of the 64 bits so that we don't see
	 * a mismatched pair of 32 bit reads... we accomplish this in an architecturally
	 * independent fashion by using OSCompareAndSwap64 to write back the
	 * value we grabbed... if it succeeds, then we have a good timestamp to
	 * evaluate... if it fails, we straddled grabbing the timestamp while it
	 * was being updated... treat a failed update as a busy thread since
	 * it implies we are about to see a really fresh timestamp anyway
	 */
	lastblocked_ts = *lastblocked_tsp;

	if ( !OSCompareAndSwap64((UInt64)lastblocked_ts, (UInt64)lastblocked_ts, lastblocked_tsp))
		return (TRUE);

	if (lastblocked_ts >= cur_ts) {
		/*
		 * because the update of the timestamp when a thread blocks isn't
		 * serialized against us looking at it (i.e. we don't hold the workq lock)
		 * it's possible to have a timestamp that matches the current time or
		 * that even looks to be in the future relative to when we grabbed the current
		 * time... just treat this as a busy thread since it must have just blocked.
		 */
		return (TRUE);
	}
	elapsed = cur_ts - lastblocked_ts;

	absolutetime_to_microtime(elapsed, &secs, &usecs);

	if (secs == 0 && usecs < wq_stalled_window_usecs)
		return (TRUE);
	return (FALSE);
}
#define WQ_TIMER_NEEDED(wq, start_timer) do {		\
	int oldflags = wq->wq_flags;			\
							\
	if ( !(oldflags & (WQ_EXITING | WQ_ATIMER_RUNNING))) {	\
		if (OSCompareAndSwap(oldflags, oldflags | WQ_ATIMER_RUNNING, (UInt32 *)&wq->wq_flags)) \
			start_timer = TRUE;		\
	}						\
} while (0)
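/*
 * Typical usage of WQ_TIMER_NEEDED() in this file (see workqueue_callback and
 * workqueue_add_timer below): the caller keeps a local start_timer flag and
 * arms the delayed thread call only once it is safe to do so:
 *
 *	boolean_t start_timer = FALSE;
 *	...
 *	WQ_TIMER_NEEDED(wq, start_timer);
 *	...
 *	if (start_timer == TRUE)
 *		workqueue_interval_timer_start(wq);
 */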
static void
workqueue_add_timer(struct workqueue *wq, __unused int param1)
{
	proc_t		p;
	boolean_t	start_timer = FALSE;
	boolean_t	retval;
	boolean_t	add_thread;
	uint32_t	busycount;

	KERNEL_DEBUG(0xefffd108 | DBG_FUNC_START, wq, wq->wq_flags, wq->wq_nthreads, wq->wq_thidlecount, 0);

	p = wq->wq_proc;

	workqueue_lock_spin(p);

	/*
	 * because workqueue_callback now runs w/o taking the workqueue lock
	 * we are unsynchronized w/r to a change in state of the running threads...
	 * to make sure we always evaluate that change, we allow it to start up
	 * a new timer if the current one is actively evaluating the state
	 * however, we do not need more than 2 timers fired up (1 active and 1 pending)
	 * and we certainly do not want 2 active timers evaluating the state
	 * simultaneously... so use WQL_ATIMER_BUSY to serialize the timers...
	 * note that WQL_ATIMER_BUSY is in a different flag word from WQ_ATIMER_RUNNING since
	 * it is always protected by the workq lock... WQ_ATIMER_RUNNING is evaluated
	 * and set atomically since the callback function needs to manipulate it
	 * w/o holding the workq lock...
	 *
	 * !WQ_ATIMER_RUNNING && !WQL_ATIMER_BUSY   ==   no pending timer, no active timer
	 * !WQ_ATIMER_RUNNING && WQL_ATIMER_BUSY    ==   no pending timer, 1 active timer
	 * WQ_ATIMER_RUNNING && !WQL_ATIMER_BUSY    ==   1 pending timer, no active timer
	 * WQ_ATIMER_RUNNING && WQL_ATIMER_BUSY     ==   1 pending timer, 1 active timer
	 */
	while (wq->wq_lflags & WQL_ATIMER_BUSY) {
		wq->wq_lflags |= WQL_ATIMER_WAITING;

		assert_wait((caddr_t)wq, (THREAD_UNINT));
		workqueue_unlock(p);

		thread_block(THREAD_CONTINUE_NULL);

		workqueue_lock_spin(p);
	}
	wq->wq_lflags |= WQL_ATIMER_BUSY;

	/*
	 * the workq lock will protect us from seeing WQ_EXITING change state, but we
	 * still need to update this atomically in case someone else tries to start
	 * the timer just as we're releasing it
	 */
	while ( !(OSCompareAndSwap(wq->wq_flags, (wq->wq_flags & ~WQ_ATIMER_RUNNING), (UInt32 *)&wq->wq_flags)));

again:
	retval = TRUE;
	add_thread = FALSE;
	busycount = 0;

	if ( !(wq->wq_flags & WQ_EXITING)) {
		/*
		 * check to see if the stall frequency was beyond our tolerance
		 * or we have work on the queue, but haven't scheduled any
		 * new work within our acceptable time interval because
		 * there were no idle threads left to schedule
		 */
		if (wq->wq_itemcount) {
			uint32_t	priority;
			uint32_t	affinity_tag;
			uint32_t	i;
			uint64_t	curtime;

			for (priority = 0; priority < WORKQUEUE_NUMPRIOS; priority++) {
				if (wq->wq_list_bitmap & (1 << priority))
					break;
			}
			assert(priority < WORKQUEUE_NUMPRIOS);

			curtime = mach_absolute_time();

			for (affinity_tag = 0; affinity_tag < wq->wq_reqconc[priority]; affinity_tag++) {
				/*
				 * if we have no idle threads, we can try to add them if needed
				 */
				if (wq->wq_thidlecount == 0)
					add_thread = TRUE;

				/*
				 * look for first affinity group that is currently not active
				 * i.e. no active threads at this priority level or higher
				 * and has not been active recently at this priority level or higher
				 */
				for (i = 0; i <= priority; i++) {
					if (wq->wq_thactive_count[i][affinity_tag]) {
						add_thread = FALSE;
						break;
					}
					if (wq->wq_thscheduled_count[i][affinity_tag]) {
						if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i][affinity_tag])) {
							add_thread = FALSE;
							busycount++;
							break;
						}
					}
				}
				if (add_thread == TRUE) {
					retval = workqueue_addnewthread(wq, FALSE);
					break;
				}
			}
			if (wq->wq_itemcount) {
				/*
				 * as long as we have threads to schedule, and we successfully
				 * scheduled new work, keep trying
				 */
				while (wq->wq_thidlecount && !(wq->wq_flags & WQ_EXITING)) {
					/*
					 * workqueue_run_nextitem is responsible for
					 * dropping the workqueue lock in all cases
					 */
					retval = workqueue_run_nextitem(p, wq, THREAD_NULL, 0, 0, 0);
					workqueue_lock_spin(p);

					if (retval == FALSE)
						break;
				}
				if ( !(wq->wq_flags & WQ_EXITING) && wq->wq_itemcount) {

					if (wq->wq_thidlecount == 0 && retval == TRUE && add_thread == TRUE)
						goto again;

					if (wq->wq_thidlecount == 0 || busycount)
						WQ_TIMER_NEEDED(wq, start_timer);

					KERNEL_DEBUG(0xefffd108 | DBG_FUNC_NONE, wq, wq->wq_itemcount, wq->wq_thidlecount, busycount, 0);
				}
			}
		}
	}
	if ( !(wq->wq_flags & WQ_ATIMER_RUNNING))
		wq->wq_timer_interval = 0;

	wq->wq_lflags &= ~WQL_ATIMER_BUSY;

	if ((wq->wq_flags & WQ_EXITING) || (wq->wq_lflags & WQL_ATIMER_WAITING)) {
		/*
		 * wakeup the thread hung up in workqueue_exit or workqueue_add_timer waiting for this timer
		 * to finish getting out of the way
		 */
		wq->wq_lflags &= ~WQL_ATIMER_WAITING;
		wakeup(wq);
	}
	KERNEL_DEBUG(0xefffd108 | DBG_FUNC_END, wq, start_timer, wq->wq_nthreads, wq->wq_thidlecount, 0);

	workqueue_unlock(p);

	if (start_timer == TRUE)
		workqueue_interval_timer_start(wq);
}
void
workqueue_thread_yielded(void)
{
	struct workqueue *wq;
	proc_t p;

	p = current_proc();

	if ((wq = p->p_wqptr) == NULL || wq->wq_itemcount == 0)
		return;

	workqueue_lock_spin(p);

	if (wq->wq_itemcount) {
		uint64_t	curtime;
		uint64_t	elapsed;
		clock_sec_t	secs;
		clock_usec_t	usecs;

		if (wq->wq_thread_yielded_count++ == 0)
			wq->wq_thread_yielded_timestamp = mach_absolute_time();

		if (wq->wq_thread_yielded_count < wq_yielded_threshold) {
			workqueue_unlock(p);
			return;
		}
		KERNEL_DEBUG(0xefffd138 | DBG_FUNC_START, wq, wq->wq_thread_yielded_count, wq->wq_itemcount, 0, 0);

		wq->wq_thread_yielded_count = 0;

		curtime = mach_absolute_time();
		elapsed = curtime - wq->wq_thread_yielded_timestamp;
		absolutetime_to_microtime(elapsed, &secs, &usecs);

		if (secs == 0 && usecs < wq_yielded_window_usecs) {

			if (wq->wq_thidlecount == 0) {
				workqueue_addnewthread(wq, TRUE);
				/*
				 * 'workqueue_addnewthread' drops the workqueue lock
				 * when creating the new thread and then retakes it before
				 * returning... this window allows other threads to process
				 * work on the queue, so we need to recheck for available work
				 * if none found, we just return...  the newly created thread
				 * will eventually get used (if it hasn't already)...
				 */
				if (wq->wq_itemcount == 0) {
					workqueue_unlock(p);
					return;
				}
			}
			if (wq->wq_thidlecount) {
				uint32_t	priority;
				uint32_t	affinity = -1;
				user_addr_t	item;
				struct workitem *witem = NULL;
				struct workitemlist *wl = NULL;
				struct uthread *uth;
				struct threadlist *tl;

				uth = get_bsdthread_info(current_thread());
				if ((tl = uth->uu_threadlist))
					affinity = tl->th_affinity_tag;

				for (priority = 0; priority < WORKQUEUE_NUMPRIOS; priority++) {
					if (wq->wq_list_bitmap & (1 << priority)) {
						wl = (struct workitemlist *)&wq->wq_list[priority];
						break;
					}
				}
				assert(wl != NULL);
				assert(!(TAILQ_EMPTY(&wl->wl_itemlist)));

				witem = TAILQ_FIRST(&wl->wl_itemlist);
				TAILQ_REMOVE(&wl->wl_itemlist, witem, wi_entry);

				if (TAILQ_EMPTY(&wl->wl_itemlist))
					wq->wq_list_bitmap &= ~(1 << priority);
				wq->wq_itemcount--;

				item = witem->wi_item;
				witem->wi_item = (user_addr_t)0;
				witem->wi_affinity = 0;

				TAILQ_INSERT_HEAD(&wl->wl_freelist, witem, wi_entry);

				(void)workqueue_run_nextitem(p, wq, THREAD_NULL, item, priority, affinity);
				/*
				 * workqueue_run_nextitem is responsible for
				 * dropping the workqueue lock in all cases
				 */
				KERNEL_DEBUG(0xefffd138 | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_itemcount, 1, 0);

				return;
			}
		}
		KERNEL_DEBUG(0xefffd138 | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_itemcount, 2, 0);
	}
	workqueue_unlock(p);
}
static void
workqueue_callback(int type, thread_t thread)
{
	struct uthread    *uth;
	struct threadlist *tl;
	struct workqueue  *wq;

	uth = get_bsdthread_info(thread);
	tl = uth->uu_threadlist;
	wq = tl->th_workq;

	switch (type) {

	case SCHED_CALL_BLOCK: {
		uint32_t	old_activecount;

		old_activecount = OSAddAtomic(-1, &wq->wq_thactive_count[tl->th_priority][tl->th_affinity_tag]);

		if (old_activecount == 1) {
			boolean_t	start_timer = FALSE;
			uint64_t	curtime;
			UInt64		*lastblocked_ptr;

			/*
			 * we were the last active thread on this affinity set
			 * and we've got work to do
			 */
			lastblocked_ptr = (UInt64 *)&wq->wq_lastblocked_ts[tl->th_priority][tl->th_affinity_tag];
			curtime = mach_absolute_time();

			/*
			 * if we collide with another thread trying to update the last_blocked (really unlikely
			 * since another thread would have to get scheduled and then block after we start down
			 * this path), it's not a problem.  Either timestamp is adequate, so no need to retry
			 */
			OSCompareAndSwap64(*lastblocked_ptr, (UInt64)curtime, lastblocked_ptr);

			if (wq->wq_itemcount)
				WQ_TIMER_NEEDED(wq, start_timer);

			if (start_timer == TRUE)
				workqueue_interval_timer_start(wq);
		}
		KERNEL_DEBUG1(0xefffd020 | DBG_FUNC_START, wq, old_activecount, tl->th_priority, tl->th_affinity_tag, thread_tid(thread));
		}
		break;

	case SCHED_CALL_UNBLOCK:
		/*
		 * we cannot take the workqueue_lock here...
		 * an UNBLOCK can occur from a timer event which
		 * is run from an interrupt context... if the workqueue_lock
		 * is already held by this processor, we'll deadlock...
		 * the thread lock for the thread being UNBLOCKED
		 * is also held
		 */
		OSAddAtomic(1, &wq->wq_thactive_count[tl->th_priority][tl->th_affinity_tag]);

		KERNEL_DEBUG1(0xefffd020 | DBG_FUNC_END, wq, wq->wq_threads_scheduled, tl->th_priority, tl->th_affinity_tag, thread_tid(thread));

		break;
	}
}
static void
workqueue_removethread(struct threadlist *tl, int fromexit)
{
	struct workqueue *wq;
	struct uthread * uth;

	/*
	 * If fromexit is set, the call is from workqueue_exit(),
	 * so some cleanups are to be avoided.
	 */
	wq = tl->th_workq;

	TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);

	if (fromexit == 0) {
		wq->wq_nthreads--;
		wq->wq_thidlecount--;
	}

	/*
	 * Clear the threadlist pointer in uthread so
	 * blocked thread on wakeup for termination will
	 * not access the thread list as it is going to be
	 * freed.
	 */
	thread_sched_call(tl->th_thread, NULL);

	uth = get_bsdthread_info(tl->th_thread);
	if (uth != (struct uthread *)0) {
		uth->uu_threadlist = NULL;
	}
	if (fromexit == 0) {
		/* during exit the lock is not held */
		workqueue_unlock(wq->wq_proc);
	}

	if ( (tl->th_flags & TH_LIST_SUSPENDED) ) {
		/*
		 * thread was created, but never used...
		 * need to clean up the stack and port ourselves
		 * since we're not going to spin up through the
		 * normal exit path triggered from Libc
		 */
		if (fromexit == 0) {
			/* vm map is already deallocated when this is called from exit */
			(void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, tl->th_allocsize);
		}
		(void)mach_port_deallocate(get_task_ipcspace(wq->wq_task), tl->th_thport);

		KERNEL_DEBUG1(0xefffd014 | DBG_FUNC_END, wq, (uintptr_t)thread_tid(current_thread()), wq->wq_nthreads, 0xdead, thread_tid(tl->th_thread));
	} else {

		KERNEL_DEBUG1(0xefffd018 | DBG_FUNC_END, wq, (uintptr_t)thread_tid(current_thread()), wq->wq_nthreads, 0xdead, thread_tid(tl->th_thread));
	}
	/*
	 * drop our ref on the thread
	 */
	thread_deallocate(tl->th_thread);

	kfree(tl, sizeof(struct threadlist));
}

/*
 * called with workq lock held
 * dropped and retaken around thread creation
 * return with workq lock held
 */
static boolean_t
workqueue_addnewthread(struct workqueue *wq, boolean_t oc_thread)
{
	struct threadlist *tl;
	struct uthread	*uth;
	kern_return_t	kret;
	thread_t	th;
	proc_t		p;
	void		*sright;
	mach_vm_offset_t stackaddr;

	if (wq->wq_nthreads >= wq_max_threads || wq->wq_nthreads >= (CONFIG_THREAD_MAX - 20)) {
		wq->wq_lflags |= WQL_EXCEEDED_TOTAL_THREAD_LIMIT;
		return (FALSE);
	}
	wq->wq_lflags &= ~WQL_EXCEEDED_TOTAL_THREAD_LIMIT;

	if (oc_thread == FALSE && wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
		/*
		 * if we're not creating this thread to service an overcommit request,
		 * then check the size of the constrained thread pool...  if we've already
		 * reached our max for threads scheduled from this pool, don't create a new
		 * one... the callers of this function are prepared for failure.
		 */
		wq->wq_lflags |= WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
		return (FALSE);
	}
	if (wq->wq_constrained_threads_scheduled < wq_max_constrained_threads)
		wq->wq_lflags &= ~WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT;

	wq->wq_nthreads++;

	p = wq->wq_proc;
	workqueue_unlock(p);

	kret = thread_create_workq(wq->wq_task, (thread_continue_t)wq_unsuspend_continue, &th);

	if (kret != KERN_SUCCESS)
		goto failed;

	tl = kalloc(sizeof(struct threadlist));
	bzero(tl, sizeof(struct threadlist));

#if defined(__i386__) || defined(__x86_64__)
	stackaddr = 0xB0000000;
#else
#error Need to define a stack address hint for this architecture
#endif
	tl->th_allocsize = PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE + p->p_pthsize;

	kret = mach_vm_map(wq->wq_map, &stackaddr,
			tl->th_allocsize,
			page_size - 1,
			VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE, NULL,
			0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
			VM_INHERIT_DEFAULT);

	if (kret != KERN_SUCCESS) {
		kret = mach_vm_allocate(wq->wq_map,
					&stackaddr, tl->th_allocsize,
					VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE);
	}
	if (kret == KERN_SUCCESS) {
		/*
		 * The guard page is at the lowest address
		 * The stack base is the highest address
		 */
		kret = mach_vm_protect(wq->wq_map, stackaddr, PTH_DEFAULT_GUARDSIZE, FALSE, VM_PROT_NONE);

		if (kret != KERN_SUCCESS)
			(void) mach_vm_deallocate(wq->wq_map, stackaddr, tl->th_allocsize);
	}
	if (kret != KERN_SUCCESS) {
		(void) thread_terminate(th);
		thread_deallocate(th);

		kfree(tl, sizeof(struct threadlist));
		goto failed;
	}
	thread_reference(th);

	sright = (void *) convert_thread_to_port(th);
	tl->th_thport = ipc_port_copyout_send(sright, get_task_ipcspace(wq->wq_task));

	thread_static_param(th, TRUE);

	tl->th_flags = TH_LIST_INITED | TH_LIST_SUSPENDED;

	tl->th_thread = th;
	tl->th_workq = wq;
	tl->th_stackaddr = stackaddr;
	tl->th_affinity_tag = -1;
	tl->th_priority = WORKQUEUE_NUMPRIOS;

	uth = get_bsdthread_info(tl->th_thread);

	workqueue_lock_spin(p);

	uth->uu_threadlist = (void *)tl;
	TAILQ_INSERT_TAIL(&wq->wq_thidlelist, tl, th_entry);

	wq->wq_thidlecount++;

	KERNEL_DEBUG1(0xefffd014 | DBG_FUNC_START, wq, wq->wq_nthreads, 0, thread_tid(current_thread()), thread_tid(tl->th_thread));

	return (TRUE);

failed:
	workqueue_lock_spin(p);
	wq->wq_nthreads--;

	return (FALSE);
}
int
workq_open(struct proc *p, __unused struct workq_open_args *uap, __unused int32_t *retval)
{
	struct workqueue * wq;
	int wq_size;
	char * ptr;
	char * nptr;
	int j;
	uint32_t i;
	uint32_t num_cpus;
	int error = 0;
	boolean_t need_wakeup = FALSE;
	struct workitem * witem;
	struct workitemlist *wl;

	if ((p->p_lflag & P_LREGISTER) == 0)
		return(EINVAL);

	num_cpus = ml_get_max_cpus();

	if (wq_init_constrained_limit) {
		uint32_t limit;
		/*
		 * set up the limit for the constrained pool
		 * this is a virtual pool in that we don't
		 * maintain it on a separate idle and run list
		 */
		limit = num_cpus * (WORKQUEUE_NUMPRIOS + 1);

		if (limit > wq_max_constrained_threads)
			wq_max_constrained_threads = limit;

		wq_init_constrained_limit = 0;
	}
	workqueue_lock_spin(p);

	if (p->p_wqptr == NULL) {

		while (p->p_wqiniting == TRUE) {

			assert_wait((caddr_t)&p->p_wqiniting, THREAD_UNINT);
			workqueue_unlock(p);

			thread_block(THREAD_CONTINUE_NULL);

			workqueue_lock_spin(p);
		}
		if (p->p_wqptr != NULL)
			goto out;

		p->p_wqiniting = TRUE;

		workqueue_unlock(p);

		wq_size = sizeof(struct workqueue) +
			(num_cpus * WORKQUEUE_NUMPRIOS * sizeof(uint32_t)) +
			(num_cpus * WORKQUEUE_NUMPRIOS * sizeof(uint32_t)) +
			(num_cpus * WORKQUEUE_NUMPRIOS * sizeof(uint64_t)) +
			sizeof(uint64_t);

		ptr = (char *)kalloc(wq_size);
		bzero(ptr, wq_size);

		wq = (struct workqueue *)ptr;
		wq->wq_flags = WQ_LIST_INITED;
		wq->wq_proc = p;
		wq->wq_affinity_max = num_cpus;
		wq->wq_task = current_task();
		wq->wq_map  = current_map();

		for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) {
			wl = (struct workitemlist *)&wq->wq_list[i];
			TAILQ_INIT(&wl->wl_itemlist);
			TAILQ_INIT(&wl->wl_freelist);

			for (j = 0; j < WORKITEM_SIZE; j++) {
				witem = &wq->wq_array[(i * WORKITEM_SIZE) + j];
				TAILQ_INSERT_TAIL(&wl->wl_freelist, witem, wi_entry);
			}
			wq->wq_reqconc[i] = wq->wq_affinity_max;
		}
		nptr = ptr + sizeof(struct workqueue);

		for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) {
			wq->wq_thactive_count[i] = (uint32_t *)nptr;
			nptr += (num_cpus * sizeof(uint32_t));
		}
		for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) {
			wq->wq_thscheduled_count[i] = (uint32_t *)nptr;
			nptr += (num_cpus * sizeof(uint32_t));
		}
		/*
		 * align nptr on a 64 bit boundary so that we can do nice
		 * atomic64 operations on the timestamps...
		 * note that we requested an extra uint64_t when calculating
		 * the size for the allocation of the workqueue struct
		 */
		nptr += (sizeof(uint64_t) - 1);
		nptr = (char *)((uintptr_t)nptr & ~(sizeof(uint64_t) - 1));

		for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) {
			wq->wq_lastblocked_ts[i] = (uint64_t *)nptr;
			nptr += (num_cpus * sizeof(uint64_t));
		}
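		/*
		 * Resulting layout of the single kalloc'd block (illustrative,
		 * matching the wq_size computation above):
		 *
		 *	struct workqueue
		 *	wq_thactive_count[0..WORKQUEUE_NUMPRIOS-1]	num_cpus * uint32_t each
		 *	wq_thscheduled_count[0..WORKQUEUE_NUMPRIOS-1]	num_cpus * uint32_t each
		 *	(padding up to an 8-byte boundary)
		 *	wq_lastblocked_ts[0..WORKQUEUE_NUMPRIOS-1]	num_cpus * uint64_t each
		 */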
		TAILQ_INIT(&wq->wq_thrunlist);
		TAILQ_INIT(&wq->wq_thidlelist);

		wq->wq_atimer_call = thread_call_allocate((thread_call_func_t)workqueue_add_timer, (thread_call_param_t)wq);

		workqueue_lock_spin(p);

		p->p_wqptr = (void *)wq;
		p->p_wqsize = wq_size;

		p->p_wqiniting = FALSE;
		need_wakeup = TRUE;
	}
out:
	workqueue_unlock(p);

	if (need_wakeup == TRUE)
		wakeup(&p->p_wqiniting);
	return(error);
}
int
workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, __unused int32_t *retval)
{
	user_addr_t item = uap->item;
	int options	= uap->options;
	int prio	= uap->prio;	/* should be used to find the right workqueue */
	int affinity	= uap->affinity;
	int error	= 0;
	thread_t th	= THREAD_NULL;
	user_addr_t oc_item = 0;
	struct workqueue *wq;

	if ((p->p_lflag & P_LREGISTER) == 0)
		return(EINVAL);

	/*
	 * affinity not yet hooked up on this path
	 */
	affinity = -1;

	switch (options) {

		case WQOPS_QUEUE_ADD: {

			if (prio & WORKQUEUE_OVERCOMMIT) {
				prio &= ~WORKQUEUE_OVERCOMMIT;
				oc_item = item;
			}
			if ((prio < 0) || (prio >= WORKQUEUE_NUMPRIOS))
				return (EINVAL);

			workqueue_lock_spin(p);

			if ((wq = (struct workqueue *)p->p_wqptr) == NULL) {
				workqueue_unlock(p);
				return (EINVAL);
			}
			if (wq->wq_thidlecount == 0 && (oc_item || (wq->wq_constrained_threads_scheduled < wq->wq_affinity_max))) {

				workqueue_addnewthread(wq, oc_item ? TRUE : FALSE);

				if (wq->wq_thidlecount == 0)
					oc_item = 0;
			}
			if (oc_item == 0)
				error = workqueue_additem(wq, prio, item, affinity);

			KERNEL_DEBUG(0xefffd008 | DBG_FUNC_NONE, wq, prio, affinity, oc_item, 0);
			}
			break;
		case WQOPS_THREAD_RETURN: {

			th = current_thread();
			struct uthread *uth = get_bsdthread_info(th);

			/* reset signal mask on the workqueue thread to default state */
			if (uth->uu_sigmask != (sigset_t)(~workq_threadmask)) {
				proc_lock(p);
				uth->uu_sigmask = ~workq_threadmask;
				proc_unlock(p);
			}

			workqueue_lock_spin(p);

			if ((wq = (struct workqueue *)p->p_wqptr) == NULL || (uth->uu_threadlist == NULL)) {
				workqueue_unlock(p);
				return (EINVAL);
			}
			KERNEL_DEBUG(0xefffd004 | DBG_FUNC_END, wq, 0, 0, 0, 0);
			}
			break;
		case WQOPS_THREAD_SETCONC: {

			if ((prio < 0) || (prio > WORKQUEUE_NUMPRIOS))
				return (EINVAL);

			workqueue_lock_spin(p);

			if ((wq = (struct workqueue *)p->p_wqptr) == NULL) {
				workqueue_unlock(p);
				return (EINVAL);
			}
			/*
			 * for this operation, we re-purpose the affinity
			 * argument as the concurrency target
			 */
			if (prio < WORKQUEUE_NUMPRIOS)
				wq->wq_reqconc[prio] = affinity;
			else {
				for (prio = 0; prio < WORKQUEUE_NUMPRIOS; prio++)
					wq->wq_reqconc[prio] = affinity;
			}
			}
			break;
		default:
			return (EINVAL);
	}
	(void)workqueue_run_nextitem(p, wq, th, oc_item, prio, affinity);
	/*
	 * workqueue_run_nextitem is responsible for
	 * dropping the workqueue lock in all cases
	 */
	return (error);
}
void
workqueue_exit(struct proc *p)
{
	struct workqueue  * wq;
	struct threadlist  * tl, *tlist;
	struct uthread	*uth;
	int wq_size = 0;

	if (p->p_wqptr != NULL) {

		KERNEL_DEBUG(0x900808c | DBG_FUNC_START, p->p_wqptr, 0, 0, 0, 0);

		workqueue_lock_spin(p);

		wq = (struct workqueue *)p->p_wqptr;

		if (wq == NULL) {
			workqueue_unlock(p);

			KERNEL_DEBUG(0x900808c | DBG_FUNC_END, 0, 0, 0, -1, 0);
			return;
		}
		wq_size = p->p_wqsize;
		p->p_wqptr = NULL;
		p->p_wqsize = 0;

		/*
		 * we now arm the timer in the callback function w/o holding the workq lock...
		 * we do this by setting WQ_ATIMER_RUNNING via OSCompareAndSwap in order to
		 * ensure only a single timer is running and to notice that WQ_EXITING has
		 * been set (we don't want to start a timer once WQ_EXITING is posted)
		 *
		 * so once we have successfully set WQ_EXITING, we cannot fire up a new timer...
		 * therefore no need to clear the timer state atomically from the flags
		 *
		 * since we always hold the workq lock when dropping WQ_ATIMER_RUNNING
		 * the check for and sleep until clear is protected
		 */
		while ( !(OSCompareAndSwap(wq->wq_flags, (wq->wq_flags | WQ_EXITING), (UInt32 *)&wq->wq_flags)));

		if (wq->wq_flags & WQ_ATIMER_RUNNING) {
			if (thread_call_cancel(wq->wq_atimer_call) == TRUE)
				wq->wq_flags &= ~WQ_ATIMER_RUNNING;
		}
		while ((wq->wq_flags & WQ_ATIMER_RUNNING) || (wq->wq_lflags & WQL_ATIMER_BUSY)) {

			assert_wait((caddr_t)wq, (THREAD_UNINT));
			workqueue_unlock(p);

			thread_block(THREAD_CONTINUE_NULL);

			workqueue_lock_spin(p);
		}
		workqueue_unlock(p);

		TAILQ_FOREACH_SAFE(tl, &wq->wq_thrunlist, th_entry, tlist) {

			thread_sched_call(tl->th_thread, NULL);

			uth = get_bsdthread_info(tl->th_thread);
			if (uth != (struct uthread *)0) {
				uth->uu_threadlist = NULL;
			}
			TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);

			/*
			 * drop our last ref on the thread
			 */
			thread_deallocate(tl->th_thread);

			kfree(tl, sizeof(struct threadlist));
		}
		TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlelist, th_entry, tlist) {
			workqueue_removethread(tl, 1);
		}
		thread_call_free(wq->wq_atimer_call);

		kfree(wq, wq_size);

		KERNEL_DEBUG(0x900808c | DBG_FUNC_END, 0, 0, 0, 0, 0);
	}
}
static int
workqueue_additem(struct workqueue *wq, int prio, user_addr_t item, int affinity)
{
	struct workitem	*witem;
	struct workitemlist *wl;

	wl = (struct workitemlist *)&wq->wq_list[prio];

	if (TAILQ_EMPTY(&wl->wl_freelist))
		return (ENOMEM);

	witem = (struct workitem *)TAILQ_FIRST(&wl->wl_freelist);
	TAILQ_REMOVE(&wl->wl_freelist, witem, wi_entry);

	witem->wi_item = item;
	witem->wi_affinity = affinity;
	TAILQ_INSERT_TAIL(&wl->wl_itemlist, witem, wi_entry);

	wq->wq_list_bitmap |= (1 << prio);

	wq->wq_itemcount++;

	return (0);
}
static int workqueue_importance[WORKQUEUE_NUMPRIOS] =

#define WORKQ_POLICY_TIMESHARE 1

static int workqueue_policy[WORKQUEUE_NUMPRIOS] =
{
	WORKQ_POLICY_TIMESHARE, WORKQ_POLICY_TIMESHARE, WORKQ_POLICY_TIMESHARE, WORKQ_POLICY_TIMESHARE
};
/*
 * workqueue_run_nextitem:
 *   called with the workqueue lock held...
 *   responsible for dropping it in all cases
 */
static boolean_t
workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_addr_t oc_item, int oc_prio, int oc_affinity)
{
1444         struct workitem 
*witem 
= NULL
; 
1445         user_addr_t item 
= 0; 
1446         thread_t th_to_run 
= THREAD_NULL
; 
1447         thread_t th_to_park 
= THREAD_NULL
; 
1448         int wake_thread 
= 0; 
1449         int reuse_thread 
= 1; 
1450         uint32_t priority
, orig_priority
; 
1451         uint32_t affinity_tag
, orig_affinity_tag
; 
1453         uint32_t activecount
; 
1455         uint32_t us_to_wait
; 
1456         struct threadlist 
*tl 
= NULL
; 
1457         struct threadlist 
*ttl 
= NULL
; 
1458         struct uthread 
*uth 
= NULL
; 
1459         struct workitemlist 
*wl 
= NULL
; 
1460         boolean_t start_timer 
= FALSE
; 
1461         boolean_t adjust_counters 
= TRUE
; 
1465         KERNEL_DEBUG(0xefffd000 | DBG_FUNC_START
, wq
, thread
, wq
->wq_thidlecount
, wq
->wq_itemcount
, 0); 
1468          * from here until we drop the workq lock 
1469          * we can't be pre-empted since we hold  
1470          * the lock in spin mode... this is important 
1471          * since we have to independently update the priority 
1472          * and affinity that the thread is associated with 
1473          * and these values are used to index the multi-dimensional 
1474          * counter arrays in 'workqueue_callback' 
1477                 uint32_t min_scheduled 
= 0; 
1478                 uint32_t scheduled_count
; 
1479                 uint32_t active_count
; 
1480                 uint32_t t_affinity 
= 0; 
1485                 if ((affinity_tag 
= oc_affinity
) == (uint32_t)-1) { 
1486                         for (affinity_tag 
= 0; affinity_tag 
< wq
->wq_reqconc
[priority
]; affinity_tag
++) { 
1488                                  * look for the affinity group with the least number of threads 
1490                                 scheduled_count 
= 0; 
1493                                 for (i 
= 0; i 
<= priority
; i
++) { 
1494                                         scheduled_count 
+= wq
->wq_thscheduled_count
[i
][affinity_tag
]; 
1495                                         active_count 
+= wq
->wq_thactive_count
[i
][affinity_tag
]; 
1497                                 if (active_count 
== 0) { 
1498                                         t_affinity 
= affinity_tag
; 
1501                                 if (affinity_tag 
== 0 || scheduled_count 
< min_scheduled
) { 
1502                                         min_scheduled 
= scheduled_count
; 
1503                                         t_affinity 
= affinity_tag
; 
1506                         affinity_tag 
= t_affinity
; 
1508                 goto grab_idle_thread
; 
1511          * if we get here, the work should be handled by a constrained thread 
1513         if (wq
->wq_itemcount 
== 0 || wq
->wq_constrained_threads_scheduled 
>= wq_max_constrained_threads
) { 
1515                  * no work to do, or we're already at or over the scheduling limit for 
1516                  * constrained threads...  just return or park the thread... 
1517                  * do not start the timer for this condition... if we don't have any work, 
1518                  * we'll check again when new work arrives... if we're over the limit, we need 1 or more 
1519                  * constrained threads to return to the kernel before we can dispatch work from our queue 
1521                 if ((th_to_park 
= thread
) == THREAD_NULL
) 
1525         for (priority 
= 0; priority 
< WORKQUEUE_NUMPRIOS
; priority
++) { 
1526                 if (wq
->wq_list_bitmap 
& (1 << priority
)) { 
1527                         wl 
= (struct workitemlist 
*)&wq
->wq_list
[priority
]; 
1532         assert(!(TAILQ_EMPTY(&wl
->wl_itemlist
))); 
1534         curtime 
= mach_absolute_time(); 
1536         if (thread 
!= THREAD_NULL
) { 
1537                 uth 
= get_bsdthread_info(thread
); 
1538                 tl 
= uth
->uu_threadlist
; 
1539                 affinity_tag 
= tl
->th_affinity_tag
; 
1542                  * check to see if the affinity group this thread is 
1543                  * associated with is still within the bounds of the 
1544                  * specified concurrency for the priority level 
1545                  * we're considering running work for 
1547                 if (affinity_tag 
< wq
->wq_reqconc
[priority
]) { 
1549                          * we're a worker thread from the pool... currently we 
1550                          * are considered 'active' which means we're counted 
1551                          * in "wq_thactive_count" 
1552                          * add up the active counts of all the priority levels 
1553                          * up to and including the one we want to schedule 
1555                         for (activecount 
= 0, i 
= 0; i 
<= priority
; i
++) { 
1558                                 acount 
= wq
->wq_thactive_count
[i
][affinity_tag
]; 
1560                                 if (acount 
== 0 && wq
->wq_thscheduled_count
[i
][affinity_tag
]) { 
1561                                         if (wq_thread_is_busy(curtime
, &wq
->wq_lastblocked_ts
[i
][affinity_tag
])) 
1564                                 activecount 
+= acount
; 
1566                         if (activecount 
== 1) { 
1568                                  * we're the only active thread associated with our 
1569                                  * affinity group at this priority level and higher, 
1570                                  * so pick up some work and keep going 
1577                  * there's more than 1 thread running in this affinity group 
1578                  * or the concurrency level has been cut back for this priority... 
1579                  * lets continue on and look for an 'empty' group to run this 
	for (affinity_tag = 0; affinity_tag < wq->wq_reqconc[priority]; affinity_tag++) {
		/*
		 * look for first affinity group that is currently not active
		 * i.e. no active threads at this priority level or higher
		 * and no threads that have run recently
		 */
		for (activecount = 0, i = 0; i <= priority; i++) {
			if ((activecount = wq->wq_thactive_count[i][affinity_tag]))
				break;

			if (wq->wq_thscheduled_count[i][affinity_tag]) {
				if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i][affinity_tag])) {
					busycount++;
					break;
				}
			}
		}
		if (activecount == 0 && busycount == 0)
			break;
	}
	if (affinity_tag >= wq->wq_reqconc[priority]) {
		/*
		 * we've already got at least 1 thread per
		 * affinity group in the active state...
		 */
		if (busycount) {
			/*
			 * we found at least 1 thread in the
			 * 'busy' state... make sure we start
			 * the timer because if they are the only
			 * threads keeping us from scheduling
			 * this workitem, we won't get a callback
			 * to kick off the timer... we need to
			 * start it now...
			 */
			WQ_TIMER_NEEDED(wq, start_timer);
		}
		KERNEL_DEBUG(0xefffd000 | DBG_FUNC_NONE, wq, busycount, start_timer, 0, 0);

		if (thread != THREAD_NULL) {
			/*
			 * go park this one for later
			 */
			th_to_park = thread;
			goto parkit;
		}
		goto out_of_work;
	}
	if (thread != THREAD_NULL) {
		/*
		 * we're overbooked on the affinity group this thread is
		 * currently associated with, but we have work to do
		 * and at least 1 idle processor, so we'll just retarget
		 * this thread to a new affinity group
		 */
		th_to_run = thread;
		goto pick_up_work;
	}
	if (wq->wq_thidlecount == 0) {
		/*
		 * we don't have a thread to schedule, but we have
		 * work to do and at least 1 affinity group that
		 * doesn't currently have an active thread...
		 */
		WQ_TIMER_NEEDED(wq, start_timer);

		KERNEL_DEBUG(0xefffd118, wq, wq->wq_nthreads, start_timer, 0, 0);

		goto no_thread_to_run;
	}
grab_idle_thread:
	/*
	 * we've got a candidate (affinity group with no currently
	 * active threads) to start a new thread on...
	 * we already know there is both work available
	 * and an idle thread, so activate a thread and then
	 * fall into the code that pulls a new workitem...
	 */
	TAILQ_FOREACH(ttl, &wq->wq_thidlelist, th_entry) {
		if (ttl->th_affinity_tag == affinity_tag || ttl->th_affinity_tag == (uint16_t)-1) {

			TAILQ_REMOVE(&wq->wq_thidlelist, ttl, th_entry);
			tl = ttl;

			break;
		}
	}
	if (tl == NULL) {
		tl = TAILQ_FIRST(&wq->wq_thidlelist);
		TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
	}
	wq->wq_thidlecount--;

	TAILQ_INSERT_TAIL(&wq->wq_thrunlist, tl, th_entry);

	if ((tl->th_flags & TH_LIST_SUSPENDED) == TH_LIST_SUSPENDED) {
		tl->th_flags &= ~TH_LIST_SUSPENDED;
		reuse_thread = 0;

	} else if ((tl->th_flags & TH_LIST_BLOCKED) == TH_LIST_BLOCKED) {
		tl->th_flags &= ~TH_LIST_BLOCKED;
		wake_thread = 1;
	}
	tl->th_flags |= TH_LIST_RUNNING | TH_LIST_BUSY;

	wq->wq_threads_scheduled++;
	wq->wq_thscheduled_count[priority][affinity_tag]++;
	OSAddAtomic(1, &wq->wq_thactive_count[priority][affinity_tag]);

	adjust_counters = FALSE;
	th_to_run = tl->th_thread;
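	/*
	 * fall through to pull a workitem off of the selected priority
	 * list (or, on the overcommit path, run the item that was handed
	 * to us) and to update this thread's constrained accounting
	 */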
pick_up_work:
	if (item == 0) {
		witem = TAILQ_FIRST(&wl->wl_itemlist);
		TAILQ_REMOVE(&wl->wl_itemlist, witem, wi_entry);

		if (TAILQ_EMPTY(&wl->wl_itemlist))
			wq->wq_list_bitmap &= ~(1 << priority);
		wq->wq_itemcount--;

		item = witem->wi_item;
		witem->wi_item = (user_addr_t)0;
		witem->wi_affinity = 0;
		TAILQ_INSERT_HEAD(&wl->wl_freelist, witem, wi_entry);

		if ( !(tl->th_flags & TH_LIST_CONSTRAINED)) {
			wq->wq_constrained_threads_scheduled++;
			tl->th_flags |= TH_LIST_CONSTRAINED;
		}
	} else {
		if (tl->th_flags & TH_LIST_CONSTRAINED) {
			wq->wq_constrained_threads_scheduled--;
			tl->th_flags &= ~TH_LIST_CONSTRAINED;
		}
	}
	orig_priority = tl->th_priority;
	orig_affinity_tag = tl->th_affinity_tag;

	tl->th_priority = priority;
	tl->th_affinity_tag = affinity_tag;

	if (adjust_counters == TRUE && (orig_priority != priority || orig_affinity_tag != affinity_tag)) {
		/*
		 * we need to adjust these counters based on this
		 * thread's new disposition w/r to affinity and priority
		 */
		OSAddAtomic(-1, &wq->wq_thactive_count[orig_priority][orig_affinity_tag]);
		OSAddAtomic(1, &wq->wq_thactive_count[priority][affinity_tag]);

		wq->wq_thscheduled_count[orig_priority][orig_affinity_tag]--;
		wq->wq_thscheduled_count[priority][affinity_tag]++;
	}
	wq->wq_thread_yielded_count = 0;
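	/*
	 * the remaining setup (affinity retargeting and scheduling policy
	 * changes) operates only on the chosen thread, so the workqueue
	 * lock can be dropped before making those calls
	 */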
	workqueue_unlock(p);

	if (orig_affinity_tag != affinity_tag) {
		/*
		 * this thread's affinity does not match the affinity group
		 * it's being placed on (it's either a brand new thread or
		 * we're retargeting an existing thread to a new group)...
		 * affinity tag of 0 means no affinity...
		 * but we want our tags to be 0 based because they
		 * are used to index arrays, so...
		 * keep it 0 based internally and bump by 1 when
		 * calling out to set it
		 */
		KERNEL_DEBUG(0xefffd114 | DBG_FUNC_START, wq, orig_affinity_tag, 0, 0, 0);

		(void)thread_affinity_set(th_to_run, affinity_tag + 1);

		KERNEL_DEBUG(0xefffd114 | DBG_FUNC_END, wq, affinity_tag, 0, 0, 0);
	}
	if (orig_priority != priority) {
		thread_precedence_policy_data_t	precedinfo;
		thread_extended_policy_data_t	extinfo;
		uint32_t	policy;

		policy = workqueue_policy[priority];

		KERNEL_DEBUG(0xefffd120 | DBG_FUNC_START, wq, orig_priority, tl->th_policy, 0, 0);

		if ((orig_priority == WORKQUEUE_BG_PRIOQUEUE) || (priority == WORKQUEUE_BG_PRIOQUEUE)) {
			struct uthread *ut = NULL;

			ut = get_bsdthread_info(th_to_run);

			if (orig_priority == WORKQUEUE_BG_PRIOQUEUE) {
				/* remove the disk throttle, importance will be reset in any case */
#if !CONFIG_EMBEDDED
				proc_restore_workq_bgthreadpolicy(th_to_run);
#else /* !CONFIG_EMBEDDED */
				if ((ut->uu_flag & UT_BACKGROUND) != 0) {
					ut->uu_flag &= ~UT_BACKGROUND;
					ut->uu_iopol_disk = IOPOL_NORMAL;
				}
#endif /* !CONFIG_EMBEDDED */
			}

			if (priority == WORKQUEUE_BG_PRIOQUEUE) {
#if !CONFIG_EMBEDDED
				proc_apply_workq_bgthreadpolicy(th_to_run);
#else /* !CONFIG_EMBEDDED */
				if ((ut->uu_flag & UT_BACKGROUND) == 0) {
					/* set diskthrottling */
					ut->uu_flag |= UT_BACKGROUND;
					ut->uu_iopol_disk = IOPOL_THROTTLE;
				}
#endif /* !CONFIG_EMBEDDED */
			}
		}

		if (tl->th_policy != policy) {
			extinfo.timeshare = policy;
			(void)thread_policy_set_internal(th_to_run, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT);

			tl->th_policy = policy;
		}

		precedinfo.importance = workqueue_importance[priority];
		(void)thread_policy_set_internal(th_to_run, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT);

		KERNEL_DEBUG(0xefffd120 | DBG_FUNC_END, wq, priority, policy, 0, 0);
	}
	if (kdebug_enable) {
		int	lpri = -1;
		int	laffinity = -1;
		int	first = -1;
		uint32_t  code = 0xefffd02c | DBG_FUNC_START;

		for (n = 0; n < WORKQUEUE_NUMPRIOS; n++) {
			for (i = 0; i < wq->wq_affinity_max; i++) {
				if (wq->wq_thactive_count[n][i]) {
					if (lpri != -1) {
						KERNEL_DEBUG(code, lpri, laffinity, wq->wq_thactive_count[lpri][laffinity], first, 0);
						code = 0xefffd02c;
						first = 0;
					}
					lpri = n;
					laffinity = i;
				}
			}
		}
		if (lpri != -1) {
			if (first == -1)
				first = 0xeeeeeeee;
			KERNEL_DEBUG(0xefffd02c | DBG_FUNC_END, lpri, laffinity, wq->wq_thactive_count[lpri][laffinity], first, 0);
		}
	}
	/*
	 * if the current thread is reused for the workitem, this does not return via unix_syscall
	 */
	wq_runitem(p, item, th_to_run, tl, reuse_thread, wake_thread, (thread == th_to_run));

	KERNEL_DEBUG(0xefffd000 | DBG_FUNC_END, wq, thread_tid(th_to_run), item, 1, 0);

	return (TRUE);
out_of_work:
	/*
	 * we have no work to do or we are fully booked
	 * w/r to running threads...
	 */
no_thread_to_run:
	workqueue_unlock(p);

	if (start_timer)
		workqueue_interval_timer_start(wq);

	KERNEL_DEBUG(0xefffd000 | DBG_FUNC_END, wq, thread_tid(thread), 0, 2, 0);

	return (FALSE);
parkit:
	/*
	 * this is a workqueue thread with no more
	 * work to do... park it for now
	 */
	uth = get_bsdthread_info(th_to_park);
	tl = uth->uu_threadlist;
	if (tl == 0)
		panic("wq thread with no threadlist ");

	TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
	tl->th_flags &= ~TH_LIST_RUNNING;

	tl->th_flags |= TH_LIST_BLOCKED;
	TAILQ_INSERT_HEAD(&wq->wq_thidlelist, tl, th_entry);

	thread_sched_call(th_to_park, NULL);

	OSAddAtomic(-1, &wq->wq_thactive_count[tl->th_priority][tl->th_affinity_tag]);
	wq->wq_thscheduled_count[tl->th_priority][tl->th_affinity_tag]--;
	wq->wq_threads_scheduled--;

	if (tl->th_flags & TH_LIST_CONSTRAINED) {
		wq->wq_constrained_threads_scheduled--;
		wq->wq_lflags &= ~WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
		tl->th_flags &= ~TH_LIST_CONSTRAINED;
	}
	if (wq->wq_thidlecount < 100)
		us_to_wait = wq_reduce_pool_window_usecs - (wq->wq_thidlecount * (wq_reduce_pool_window_usecs / 100));
	else
		us_to_wait = wq_reduce_pool_window_usecs / 100;

	wq->wq_thidlecount++;
	wq->wq_lflags &= ~WQL_EXCEEDED_TOTAL_THREAD_LIMIT;

	assert_wait_timeout((caddr_t)tl, (THREAD_INTERRUPTIBLE), us_to_wait, NSEC_PER_USEC);

	workqueue_unlock(p);

	if (start_timer)
		workqueue_interval_timer_start(wq);

	KERNEL_DEBUG1(0xefffd018 | DBG_FUNC_START, wq, wq->wq_threads_scheduled, wq->wq_thidlecount, us_to_wait, thread_tid(th_to_park));
	KERNEL_DEBUG(0xefffd000 | DBG_FUNC_END, wq, thread_tid(thread), 0, 3, 0);

	thread_block((thread_continue_t)wq_unpark_continue);
	/* NOT REACHED */

	return (FALSE);
}
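/*
 * continuation run when a newly created workqueue thread, left suspended
 * while it was being set up, is finally resumed... the normal case simply
 * re-arms the scheduler callback and returns to user space; the other paths
 * handle the thread having been aborted (the process is crashing) before or
 * during dispatch
 */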
static void
wq_unsuspend_continue(void)
{
	struct uthread *uth = NULL;
	thread_t th_to_unsuspend;
	struct threadlist *tl;
	proc_t	p;

	th_to_unsuspend = current_thread();
	uth = get_bsdthread_info(th_to_unsuspend);

	if (uth != NULL && (tl = uth->uu_threadlist) != NULL) {

		if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) {
			/*
			 * most likely a normal resume of this thread occurred...
			 * it's also possible that the thread was aborted after we
			 * finished setting it up so that it could be dispatched... if
			 * so, thread_bootstrap_return will notice the abort and put
			 * the thread on the path to self-destruction
			 */
normal_resume_to_user:
			thread_sched_call(th_to_unsuspend, workqueue_callback);

			thread_bootstrap_return();
		}
		/*
		 * if we get here, it's because we've been resumed due to
		 * an abort of this thread (process is crashing)
		 */
		p = current_proc();

		workqueue_lock_spin(p);

		if (tl->th_flags & TH_LIST_SUSPENDED) {
			/*
			 * thread has been aborted while still on our idle
			 * queue... remove it from our domain...
			 * workqueue_removethread consumes the lock
			 */
			workqueue_removethread(tl, 0);

			thread_bootstrap_return();
		}
		while ((tl->th_flags & TH_LIST_BUSY)) {
			/*
			 * this thread was aborted after we started making
			 * it runnable, but before we finished dispatching it...
			 * we need to wait for that process to finish,
			 * and we need to ask for a wakeup instead of a
			 * thread_resume since the abort has already resumed us
			 */
			tl->th_flags |= TH_LIST_NEED_WAKEUP;

			assert_wait((caddr_t)tl, (THREAD_UNINT));

			workqueue_unlock(p);

			thread_block(THREAD_CONTINUE_NULL);

			workqueue_lock_spin(p);
		}
		workqueue_unlock(p);
		/*
		 * we have finished setting up the thread's context...
		 * thread_bootstrap_return will take us through the abort path
		 * where the thread will self destruct
		 */
		goto normal_resume_to_user;
	}
	thread_bootstrap_return();
}
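/*
 * continuation run when a parked idle workqueue thread is woken, either
 * because wq_runitem handed it new work or because the pool-reduction
 * timeout armed in the parkit path fired... in the timeout case the thread
 * removes itself from the pool via workqueue_removethread
 */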
static void
wq_unpark_continue(void)
{
	struct uthread *uth = NULL;
	struct threadlist *tl;
	thread_t th_to_unpark;
	proc_t	p;

	th_to_unpark = current_thread();
	uth = get_bsdthread_info(th_to_unpark);

	if (uth != NULL) {
		if ((tl = uth->uu_threadlist) != NULL) {

			if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) {
				/*
				 * a normal wakeup of this thread occurred... no need
				 * for any synchronization with the timer and wq_runitem
				 */
normal_return_to_user:
				thread_sched_call(th_to_unpark, workqueue_callback);

				KERNEL_DEBUG(0xefffd018 | DBG_FUNC_END, tl->th_workq, 0, 0, 0, 0);

				thread_exception_return();
			}
			p = current_proc();

			workqueue_lock_spin(p);

			if ( !(tl->th_flags & TH_LIST_RUNNING)) {
				/*
				 * the timer popped us out and we've not
				 * been moved off of the idle list
				 * so we should now self-destruct
				 *
				 * workqueue_removethread consumes the lock
				 */
				workqueue_removethread(tl, 0);

				thread_exception_return();
			}
			/*
			 * the timer woke us up, but we have already
			 * started to make this a runnable thread,
			 * but have not yet finished that process...
			 * so wait for the normal wakeup
			 */
			while ((tl->th_flags & TH_LIST_BUSY)) {

				assert_wait((caddr_t)tl, (THREAD_UNINT));

				workqueue_unlock(p);

				thread_block(THREAD_CONTINUE_NULL);

				workqueue_lock_spin(p);
			}
			/*
			 * we have finished setting up the thread's context
			 * now we can return as if we got a normal wakeup
			 */
			workqueue_unlock(p);

			goto normal_return_to_user;
		}
	}
	thread_exception_return();
}
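/*
 * hand a work item to a specific thread... set up its user register state
 * via setup_wqthread and then either return directly to user space (when the
 * calling thread is the one being reused), wake a blocked thread, or resume
 * a suspended one
 */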
static void
wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl,
	   int reuse_thread, int wake_thread, int return_directly)
{
	int ret = 0;
	boolean_t need_resume = FALSE;

	KERNEL_DEBUG1(0xefffd004 | DBG_FUNC_START, tl->th_workq, tl->th_priority, tl->th_affinity_tag, thread_tid(current_thread()), thread_tid(th));

	ret = setup_wqthread(p, th, item, reuse_thread, tl);

	if (ret != 0)
		panic("setup_wqthread failed  %x\n", ret);

	if (return_directly) {
		KERNEL_DEBUG(0xefffd000 | DBG_FUNC_END, tl->th_workq, 0, 0, 4, 0);

		thread_exception_return();

		panic("wq_runitem: thread_exception_return returned ...\n");
	}
	if (wake_thread) {
		workqueue_lock_spin(p);

		tl->th_flags &= ~TH_LIST_BUSY;
		wakeup(tl);

		workqueue_unlock(p);

		KERNEL_DEBUG1(0xefffd014 | DBG_FUNC_END, tl->th_workq, 0, 0, thread_tid(current_thread()), thread_tid(th));
	} else {
		workqueue_lock_spin(p);

		if (tl->th_flags & TH_LIST_NEED_WAKEUP)
			wakeup(tl);
		else
			need_resume = TRUE;

		tl->th_flags &= ~(TH_LIST_BUSY | TH_LIST_NEED_WAKEUP);

		workqueue_unlock(p);

		if (need_resume) {
			/*
			 * need to do this outside of the workqueue spin lock
			 * since thread_resume locks the thread via a full mutex
			 */
			thread_resume(th);
		}
	}
}
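/*
 * load the user register state for a workqueue thread so that, on its next
 * return to user space, it enters the process's registered start routine
 * (p->p_wqthread)... the values passed are, in order: the top of the stack
 * allocation (treated as the pthread pointer by the reuse check below), the
 * thread's kernel port, the base of the usable stack, the work item and the
 * reuse flag, carried in eax/ebx/ecx/edx/edi for 32-bit processes and in
 * rdi/rsi/rdx/rcx/r8 for 64-bit ones... the matching user-side entry point
 * is provided by the pthread library and is not spelled out here
 */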
int
setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl)
{
#if defined(__i386__) || defined(__x86_64__)
	int isLP64 = 0;

	isLP64 = IS_64BIT_PROCESS(p);
	/*
	 * Set up i386 registers & function call.
	 */
	if (isLP64 == 0) {
		x86_thread_state32_t state;
		x86_thread_state32_t *ts = &state;

		ts->eip = (int)p->p_wqthread;
		ts->eax = (unsigned int)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE);
		ts->ebx = (unsigned int)tl->th_thport;
		ts->ecx = (unsigned int)(tl->th_stackaddr + PTH_DEFAULT_GUARDSIZE);
		ts->edx = (unsigned int)item;
		ts->edi = (unsigned int)reuse_thread;
		ts->esi = (unsigned int)0;
		/*
		 * set stack pointer
		 */
		ts->esp = (int)((vm_offset_t)((tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE) - C_32_STK_ALIGN));

		if ((reuse_thread != 0) && (ts->eax == (unsigned int)0))
			panic("setup_wqthread: setting reuse thread with null pthread\n");
		thread_set_wq_state32(th, (thread_state_t)ts);

	} else {
		x86_thread_state64_t state64;
		x86_thread_state64_t *ts64 = &state64;

		ts64->rip = (uint64_t)p->p_wqthread;
		ts64->rdi = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE);
		ts64->rsi = (uint64_t)(tl->th_thport);
		ts64->rdx = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_GUARDSIZE);
		ts64->rcx = (uint64_t)item;
		ts64->r8 = (uint64_t)reuse_thread;
		ts64->r9 = (uint64_t)0;

		/*
		 * set stack pointer aligned to 16 byte boundary
		 */
		ts64->rsp = (uint64_t)((tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE) - C_64_REDZONE_LEN);

		if ((reuse_thread != 0) && (ts64->rdi == (uint64_t)0))
			panic("setup_wqthread: setting reuse thread with null pthread\n");
		thread_set_wq_state64(th, (thread_state_t)ts64);
	}
#else
#error setup_wqthread  not defined for this architecture
#endif
	return(0);
}
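/*
 * report a snapshot of this process's workqueue state: total threads,
 * currently active (running) threads, blocked threads, and whether the
 * constrained or total thread limits have been exceeded
 */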
int
fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
{
	struct workqueue * wq;
	int error = 0;
	int	activecount;
	uint32_t pri, affinity;

	workqueue_lock_spin(p);
	if ((wq = p->p_wqptr) == NULL) {
		error = EINVAL;
		goto out;
	}
	activecount = 0;

	for (pri = 0; pri < WORKQUEUE_NUMPRIOS; pri++) {
		for (affinity = 0; affinity < wq->wq_affinity_max; affinity++)
			activecount += wq->wq_thactive_count[pri][affinity];
	}
	pwqinfo->pwq_nthreads = wq->wq_nthreads;
	pwqinfo->pwq_runthreads = activecount;
	pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
	pwqinfo->pwq_state = 0;

	if (wq->wq_lflags & WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT)
		pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;

	if (wq->wq_lflags & WQL_EXCEEDED_TOTAL_THREAD_LIMIT)
		pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;

out:
	workqueue_unlock(p);
	return(error);
}
/* Set target concurrency of one of the queues (0,1,2) to the specified value */
int
proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc)
{
	proc_t p, self;
	uint64_t addr;
	int32_t conc = targetconc;
	int error = 0;
	vm_map_t oldmap = VM_MAP_NULL;
	int gotref = 0;

	self = current_proc();
	if (self->p_pid != pid) {
		/* if not on self, hold a reference on the process */

		if (pid == 0)
			return(EINVAL);

		p = proc_find(pid);
		if (p == PROC_NULL)
			return(ESRCH);
		gotref = 1;

	} else
		p = self;

	if ((addr = p->p_targconc) == (uint64_t)0) {
		error = EINVAL;
		goto out;
	}

	if ((queuenum >= WQ_MAXPRI_MIN) && (queuenum <= WQ_MAXPRI_MAX)) {
		addr += (queuenum * sizeof(int32_t));
		if (gotref == 1)
			oldmap = vm_map_switch(get_task_map(p->task));
		error = copyout(&conc, addr, sizeof(int32_t));
		if (gotref == 1)
			(void)vm_map_switch(oldmap);
	} else {
		error = EINVAL;
	}
out:
	if (gotref != 0)
		proc_rele(p);
	return(error);
}
/* Set target concurrency on all the prio queues to the specified values */
int
proc_setalltargetconc(pid_t pid, int32_t * targetconcp)
{
	proc_t p, self;
	uint64_t addr;
	int error = 0;
	vm_map_t oldmap = VM_MAP_NULL;
	int gotref = 0;

	self = current_proc();
	if (self->p_pid != pid) {
		/* if not on self, hold a reference on the process */

		if (pid == 0)
			return(EINVAL);

		p = proc_find(pid);
		if (p == PROC_NULL)
			return(ESRCH);
		gotref = 1;

	} else
		p = self;

	if ((addr = (uint64_t)p->p_targconc) == (uint64_t)0) {
		error = EINVAL;
		goto out;
	}

	if (gotref == 1)
		oldmap = vm_map_switch(get_task_map(p->task));

	error = copyout(targetconcp, addr, WQ_PRI_NUM * sizeof(int32_t));
	if (gotref == 1)
		(void)vm_map_switch(oldmap);

out:
	if (gotref != 0)
		proc_rele(p);
	return(error);
}
int thread_selfid(__unused struct proc *p, __unused struct thread_selfid_args *uap, uint64_t *retval)
{
	thread_t thread = current_thread();
	*retval = thread_tid(thread);
	return KERN_SUCCESS;
}
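/*
 * one-time startup initialization for this subsystem: allocate the pthread
 * lock group and attributes, initialize the kernel task's workqueue lock,
 * and set up the global list lock, hash and cleanup thread call used by the
 * psynch synchronizers
 */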
void
pthread_init(void)
{
	pthread_lck_grp_attr = lck_grp_attr_alloc_init();
	pthread_lck_grp = lck_grp_alloc_init("pthread", pthread_lck_grp_attr);

	/*
	 * allocate the lock attribute for pthread synchronizers
	 */
	pthread_lck_attr = lck_attr_alloc_init();

	workqueue_init_lock((proc_t) get_bsdtask_info(kernel_task));

	pthread_list_mlock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);

	pth_global_hashinit();
	psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL);