/*
 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995-2005 Apple Computer, Inc. All Rights Reserved */
#define _PTHREAD_CONDATTR_T
#define _PTHREAD_COND_T
#define _PTHREAD_MUTEXATTR_T
#define _PTHREAD_MUTEX_T
#define _PTHREAD_RWLOCKATTR_T
#define _PTHREAD_RWLOCK_T

#undef pthread_mutexattr_t
#undef pthread_mutex_t
#undef pthread_condattr_t
#undef pthread_rwlockattr_t
#undef pthread_rwlock_t
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/timeb.h>
#include <sys/times.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/syslog.h>
#include <sys/kdebug.h>
#include <sys/sysproto.h>
#include <sys/pthread_internal.h>
#include <sys/user.h>       /* for coredump */
#include <sys/proc_info.h>  /* for fill_procworkqueue */
#include <mach/mach_types.h>
#include <mach/vm_prot.h>
#include <mach/semaphore.h>
#include <mach/sync_policy.h>
#include <mach/task.h>
#include <kern/kern_types.h>
#include <kern/task.h>
#include <kern/clock.h>
#include <mach/kern_return.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/kalloc.h>
#include <kern/sched_prim.h>    /* for thread_exception_return */
#include <kern/processor.h>
#include <kern/affinity.h>
#include <kern/assert.h>
#include <mach/mach_vm.h>
#include <mach/mach_param.h>
#include <mach/thread_status.h>
#include <mach/thread_policy.h>
#include <mach/message.h>
#include <mach/port.h>
#include <vm/vm_protos.h>
#include <vm/vm_map.h>          /* for current_map() */
#include <mach/thread_act.h>    /* for thread_resume */
#include <machine/machine_routines.h>
#include <i386/machine_routines.h>
#include <i386/eflags.h>
#include <i386/seg.h>

#include <libkern/OSAtomic.h>
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
#define KERNEL_DEBUG1 KERNEL_DEBUG_CONSTANT1

#if defined(__ppc__) || defined(__ppc64__)
#include <architecture/ppc/cframe.h>
lck_grp_attr_t  *pthread_lck_grp_attr;
lck_grp_t       *pthread_lck_grp;
lck_attr_t      *pthread_lck_attr;
extern kern_return_t thread_getstatus(register thread_t act, int flavor,
            thread_state_t tstate, mach_msg_type_number_t *count);
extern kern_return_t thread_setstatus(thread_t thread, int flavor,
            thread_state_t tstate, mach_msg_type_number_t count);
extern void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64);
extern kern_return_t mach_port_deallocate(ipc_space_t, mach_port_name_t);
extern kern_return_t semaphore_signal_internal_trap(mach_port_name_t);
extern void workqueue_thread_yielded(void);

static int workqueue_additem(struct workqueue *wq, int prio, user_addr_t item, int affinity);
static int workqueue_removeitem(struct workqueue *wq, int prio, user_addr_t item);
static boolean_t workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t th,
            user_addr_t oc_item, int oc_prio, int oc_affinity);
static void wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl,
            int reuse_thread, int wake_thread, int return_directly);
static void wq_unpark_continue(void);
static int setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl);
static boolean_t workqueue_addnewthread(struct workqueue *wq);
static void workqueue_removethread(struct threadlist *tl);
static void workqueue_lock_spin(proc_t);
static void workqueue_unlock(proc_t);
int proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc);
int proc_setalltargetconc(pid_t pid, int32_t * targetconcp);
#define WQ_MAXPRI_MIN   0   /* low prio queue num */
#define WQ_MAXPRI_MAX   2   /* max prio queuenum */
#define WQ_PRI_NUM      3   /* number of prio work queues */

#define C_32_STK_ALIGN      16
#define C_64_STK_ALIGN      16
#define C_64_REDZONE_LEN    128
#define TRUNC_DOWN32(a,c)   ((((uint32_t)a)-(c)) & ((uint32_t)(-(c))))
#define TRUNC_DOWN64(a,c)   ((((uint64_t)a)-(c)) & ((uint64_t)(-(c))))
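/*
 * Illustrative note (not in the original source): TRUNC_DOWN rounds an
 * address down by at least 'c' bytes onto a 'c'-byte boundary.  For a
 * hypothetical stack pointer of 0x1237 and c == C_32_STK_ALIGN (16):
 *
 *	TRUNC_DOWN32(0x1237, 16) == (0x1227 & 0xfffffff0) == 0x1220
 */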
/*
 * Flags field passed to bsdthread_create and back in pthread_start
 * 31  <---------------------------------> 0
 * _________________________________________
 * | flags(8) | policy(8) | importance(16) |
 * -----------------------------------------
 */
void _pthread_start(pthread_t self, mach_port_t kport, void *(*fun)(void *), void * funarg, size_t stacksize, unsigned int flags);
#define PTHREAD_START_CUSTOM    0x01000000
#define PTHREAD_START_SETSCHED  0x02000000
#define PTHREAD_START_DETACHED  0x04000000
#define PTHREAD_START_POLICY_BITSHIFT 16
#define PTHREAD_START_POLICY_MASK 0xff
#define PTHREAD_START_IMPORTANCE_MASK 0xffff

#define SCHED_OTHER      POLICY_TIMESHARE
#define SCHED_FIFO       POLICY_FIFO
#define SCHED_RR         POLICY_RR
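/*
 * Illustrative sketch (not in the original source): packing the flags word
 * described by the diagram above.  A user-space caller asking for explicit
 * scheduling would build it roughly as
 *
 *	unsigned int flags = PTHREAD_START_SETSCHED |
 *	    ((policy & PTHREAD_START_POLICY_MASK) << PTHREAD_START_POLICY_BITSHIFT) |
 *	    (importance & PTHREAD_START_IMPORTANCE_MASK);
 *
 * bsdthread_create() below recovers 'policy' and 'importance' with the
 * mirror-image shift and masks.
 */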
bsdthread_create(__unused struct proc *p, struct bsdthread_create_args *uap, user_addr_t *retval)
{
    mach_vm_offset_t stackaddr;
    mach_vm_size_t th_allocsize = 0;
    mach_vm_size_t user_stacksize;
    mach_vm_size_t th_stacksize;
    mach_vm_offset_t th_stackaddr;
    mach_vm_offset_t th_stack;
    mach_vm_offset_t th_pthread;
    mach_port_name_t th_thport;

    user_addr_t user_func = uap->func;
    user_addr_t user_funcarg = uap->func_arg;
    user_addr_t user_stack = uap->stack;
    user_addr_t user_pthread = uap->pthread;
    unsigned int flags = (unsigned int)uap->flags;
    vm_map_t vmap = current_map();
    task_t ctask = current_task();
    unsigned int policy, importance;
    if ((p->p_lflag & P_LREGISTER) == 0)

    KERNEL_DEBUG_CONSTANT(0x9000080 | DBG_FUNC_START, flags, 0, 0, 0, 0);

    isLP64 = IS_64BIT_PROCESS(p);

#if defined(__ppc__)
    stackaddr = 0xF0000000;
#elif defined(__i386__) || defined(__x86_64__)
    stackaddr = 0xB0000000;
#else
#error Need to define a stack address hint for this architecture
#endif

    kret = thread_create(ctask, &th);
    if (kret != KERN_SUCCESS)

    thread_reference(th);

    sright = (void *) convert_thread_to_port(th);
    th_thport = ipc_port_copyout_send(sright, get_task_ipcspace(ctask));

    if ((flags & PTHREAD_START_CUSTOM) == 0) {
        th_stacksize = (mach_vm_size_t)user_stack;  /* if it is custom then it is stacksize */
        th_allocsize = th_stacksize + PTH_DEFAULT_GUARDSIZE + p->p_pthsize;

        kret = mach_vm_map(vmap, &stackaddr,
                VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE, NULL,
                0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
        if (kret != KERN_SUCCESS)
            kret = mach_vm_allocate(vmap,
                    &stackaddr, th_allocsize,
                    VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE);
        if (kret != KERN_SUCCESS) {

        KERNEL_DEBUG_CONSTANT(0x9000080 |DBG_FUNC_NONE, th_allocsize, stackaddr, 0, 2, 0);

        th_stackaddr = stackaddr;

        /*
         * The guard page is at the lowest address
         * The stack base is the highest address
         */
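        /*
         * Illustrative layout (not in the original source) of the region
         * reserved above, growing up from the low address in 'stackaddr':
         *
         *  stackaddr                                    th_stack / th_pthread
         *  |  guard page(s)             |   thread stack    | pthread struct |
         *  |<-- PTH_DEFAULT_GUARDSIZE ->|<-- th_stacksize ->|<-- p_pthsize ->|
         *
         * The stack grows down from th_stack toward the VM_PROT_NONE guard
         * page, and the user-visible pthread_t sits just above the stack base.
         */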
        kret = mach_vm_protect(vmap, stackaddr, PTH_DEFAULT_GUARDSIZE, FALSE, VM_PROT_NONE);

        if (kret != KERN_SUCCESS) {

        th_stack = (stackaddr + th_stacksize + PTH_DEFAULT_GUARDSIZE);
        th_pthread = (stackaddr + th_stacksize + PTH_DEFAULT_GUARDSIZE);
        user_stacksize = th_stacksize;
    } else {
        th_stack = user_stack;
        user_stacksize = user_stack;
        th_pthread = user_pthread;
    }
    KERNEL_DEBUG_CONSTANT(0x9000080 |DBG_FUNC_NONE, 0, 0, 0, 3, 0);

#if defined(__ppc__)
    /*
     * Set up PowerPC registers...
     * internally they are always kept as 64 bit and
     * since the register set is the same between 32 and 64bit modes
     * we don't need 2 different methods for setting the state
     */
    ppc_thread_state64_t state64;
    ppc_thread_state64_t *ts64 = &state64;

    ts64->srr0 = (uint64_t)p->p_threadstart;
    ts64->r1 = (uint64_t)(th_stack - C_ARGSAVE_LEN - C_RED_ZONE);
    ts64->r3 = (uint64_t)th_pthread;
    ts64->r4 = (uint64_t)(th_thport);
    ts64->r5 = (uint64_t)user_func;
    ts64->r6 = (uint64_t)user_funcarg;
    ts64->r7 = (uint64_t)user_stacksize;
    ts64->r8 = (uint64_t)uap->flags;

    thread_set_wq_state64(th, (thread_state_t)ts64);

    thread_set_cthreadself(th, (uint64_t)th_pthread, isLP64);

#elif defined(__i386__) || defined(__x86_64__)
    /*
     * Set up i386 registers & function call.
     */
    x86_thread_state32_t state;
    x86_thread_state32_t *ts = &state;

    ts->eip = (int)p->p_threadstart;
    ts->eax = (unsigned int)th_pthread;
    ts->ebx = (unsigned int)th_thport;
    ts->ecx = (unsigned int)user_func;
    ts->edx = (unsigned int)user_funcarg;
    ts->edi = (unsigned int)user_stacksize;
    ts->esi = (unsigned int)uap->flags;

    ts->esp = (int)((vm_offset_t)(th_stack - C_32_STK_ALIGN));

    thread_set_wq_state32(th, (thread_state_t)ts);

    x86_thread_state64_t state64;
    x86_thread_state64_t *ts64 = &state64;

    ts64->rip = (uint64_t)p->p_threadstart;
    ts64->rdi = (uint64_t)th_pthread;
    ts64->rsi = (uint64_t)(th_thport);
    ts64->rdx = (uint64_t)user_func;
    ts64->rcx = (uint64_t)user_funcarg;
    ts64->r8 = (uint64_t)user_stacksize;
    ts64->r9 = (uint64_t)uap->flags;
    /*
     * set stack pointer aligned to 16 byte boundary
     */
    ts64->rsp = (uint64_t)(th_stack - C_64_REDZONE_LEN);

    thread_set_wq_state64(th, (thread_state_t)ts64);

#else
#error bsdthread_create not defined for this architecture
#endif

    /* Set scheduling parameters if needed */
    if ((flags & PTHREAD_START_SETSCHED) != 0) {
        thread_extended_policy_data_t extinfo;
        thread_precedence_policy_data_t precedinfo;

        importance = (flags & PTHREAD_START_IMPORTANCE_MASK);
        policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;

        if (policy == SCHED_OTHER)
            extinfo.timeshare = 1;
        else
            extinfo.timeshare = 0;
        thread_policy_set(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT);

#define BASEPRI_DEFAULT 31
        precedinfo.importance = (importance - BASEPRI_DEFAULT);
        thread_policy_set(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT);

    kret = thread_resume(th);
    if (kret != KERN_SUCCESS) {

    thread_deallocate(th);  /* drop the creator reference */

    KERNEL_DEBUG_CONSTANT(0x9000080 |DBG_FUNC_END, error, th_pthread, 0, 0, 0);

    *retval = th_pthread;

    (void)mach_vm_deallocate(vmap, stackaddr, th_allocsize);

    (void)mach_port_deallocate(get_task_ipcspace(ctask), th_thport);
    (void)thread_terminate(th);
    (void)thread_deallocate(th);
bsdthread_terminate(__unused struct proc *p, struct bsdthread_terminate_args *uap, __unused int32_t *retval)
{
    mach_vm_offset_t freeaddr;
    mach_vm_size_t freesize;

    mach_port_name_t kthport = (mach_port_name_t)uap->port;
    mach_port_name_t sem = (mach_port_name_t)uap->sem;

    freeaddr = (mach_vm_offset_t)uap->stackaddr;
    freesize = uap->freesize;

    KERNEL_DEBUG_CONSTANT(0x9000084 |DBG_FUNC_START, freeaddr, freesize, kthport, 0xff, 0);

    if ((freesize != (mach_vm_size_t)0) && (freeaddr != (mach_vm_offset_t)0)) {
        kret = mach_vm_deallocate(current_map(), freeaddr, freesize);
        if (kret != KERN_SUCCESS) {

    (void) thread_terminate(current_thread());

    if (sem != MACH_PORT_NULL) {
        kret = semaphore_signal_internal_trap(sem);
        if (kret != KERN_SUCCESS) {

    if (kthport != MACH_PORT_NULL)
        mach_port_deallocate(get_task_ipcspace(current_task()), kthport);

    thread_exception_return();
    panic("bsdthread_terminate: still running\n");

    KERNEL_DEBUG_CONSTANT(0x9000084 |DBG_FUNC_END, 0, 0, 0, 0xff, 0);
bsdthread_register(struct proc *p, struct bsdthread_register_args *uap, __unused int32_t *retval)
{
    /* prevent multiple registrations */
    if ((p->p_lflag & P_LREGISTER) != 0)

    /* syscall randomizer test can pass bogus values */
    if (uap->pthsize > MAX_PTHREAD_SIZE) {

    p->p_threadstart = uap->threadstart;
    p->p_wqthread = uap->wqthread;
    p->p_pthsize = uap->pthsize;
    p->p_targconc = uap->targetconc_ptr;
    p->p_dispatchqueue_offset = uap->dispatchqueue_offset;
uint32_t wq_yielded_threshold           = WQ_YIELDED_THRESHOLD;
uint32_t wq_yielded_window_usecs        = WQ_YIELDED_WINDOW_USECS;
uint32_t wq_stalled_window_usecs        = WQ_STALLED_WINDOW_USECS;
uint32_t wq_reduce_pool_window_usecs    = WQ_REDUCE_POOL_WINDOW_USECS;
uint32_t wq_max_timer_interval_usecs    = WQ_MAX_TIMER_INTERVAL_USECS;
uint32_t wq_max_threads                 = WORKQUEUE_MAXTHREADS;

SYSCTL_INT(_kern, OID_AUTO, wq_yielded_threshold, CTLFLAG_RW,
        &wq_yielded_threshold, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_yielded_window_usecs, CTLFLAG_RW,
        &wq_yielded_window_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW,
        &wq_stalled_window_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW,
        &wq_reduce_pool_window_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW,
        &wq_max_timer_interval_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW,
        &wq_max_threads, 0, "");
workqueue_init_lock(proc_t p)
{
    lck_spin_init(&p->p_wqlock, pthread_lck_grp, pthread_lck_attr);

    p->p_wqiniting = FALSE;
}

workqueue_destroy_lock(proc_t p)
{
    lck_spin_destroy(&p->p_wqlock, pthread_lck_grp);
}

workqueue_lock_spin(proc_t p)
{
    lck_spin_lock(&p->p_wqlock);
}

workqueue_unlock(proc_t p)
{
    lck_spin_unlock(&p->p_wqlock);
}
workqueue_interval_timer_start(struct workqueue *wq)
{
    if (wq->wq_timer_interval == 0)
        wq->wq_timer_interval = wq_stalled_window_usecs;
    else
        wq->wq_timer_interval = wq->wq_timer_interval * 2;

    if (wq->wq_timer_interval > wq_max_timer_interval_usecs)
        wq->wq_timer_interval = wq_max_timer_interval_usecs;

    clock_interval_to_deadline(wq->wq_timer_interval, 1000, &deadline);

    thread_call_enter_delayed(wq->wq_atimer_call, deadline);

    KERNEL_DEBUG(0xefffd110, wq, wq->wq_itemcount, wq->wq_flags, wq->wq_timer_interval, 0);
wq_thread_is_busy(uint64_t cur_ts, uint64_t *lastblocked_tsp)
{
    uint64_t lastblocked_ts;

    /*
     * the timestamp is updated atomically w/o holding the workqueue lock
     * so we need to do an atomic read of the 64 bits so that we don't see
     * a mismatched pair of 32 bit reads... we accomplish this in an architecturally
     * independent fashion by using OSCompareAndSwap64 to write back the
     * value we grabbed... if it succeeds, then we have a good timestamp to
     * evaluate... if it fails, we straddled grabbing the timestamp while it
     * was being updated... treat a failed update as a busy thread since
     * it implies we are about to see a really fresh timestamp anyway
     */
    lastblocked_ts = *lastblocked_tsp;

    if ( !OSCompareAndSwap64((UInt64)lastblocked_ts, (UInt64)lastblocked_ts, lastblocked_tsp))

    if (lastblocked_ts >= cur_ts) {
        /*
         * because the update of the timestamp when a thread blocks isn't
         * serialized against us looking at it (i.e. we don't hold the workq lock)
         * it's possible to have a timestamp that matches the current time or
         * that even looks to be in the future relative to when we grabbed the current
         * time... just treat this as a busy thread since it must have just blocked.
         */

    elapsed = cur_ts - lastblocked_ts;

    absolutetime_to_microtime(elapsed, &secs, &usecs);

    if (secs == 0 && usecs < wq_stalled_window_usecs)
#define WQ_TIMER_NEEDED(wq, start_timer) do {                           \
    int oldflags = wq->wq_flags;                                        \
                                                                        \
    if ( !(oldflags & (WQ_EXITING | WQ_ATIMER_RUNNING))) {              \
        if (OSCompareAndSwap(oldflags, oldflags | WQ_ATIMER_RUNNING, (UInt32 *)&wq->wq_flags)) \
            start_timer = TRUE;                                         \
    }                                                                   \
} while (0)
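/*
 * Usage note (not in the original source): this macro test-and-sets
 * WQ_ATIMER_RUNNING so that at most one delayed timer gets armed; callers
 * then start the timer themselves once it is safe to do so, e.g.
 *
 *	boolean_t start_timer = FALSE;
 *	...
 *	WQ_TIMER_NEEDED(wq, start_timer);
 *	...
 *	if (start_timer == TRUE)
 *		workqueue_interval_timer_start(wq);
 */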
workqueue_add_timer(struct workqueue *wq, __unused int param1)
{
    boolean_t start_timer = FALSE;
    boolean_t add_thread;

    KERNEL_DEBUG(0xefffd108 | DBG_FUNC_START, wq, wq->wq_flags, wq->wq_nthreads, wq->wq_thidlecount, 0);

    workqueue_lock_spin(p);

    /*
     * because workqueue_callback now runs w/o taking the workqueue lock
     * we are unsynchronized w/r to a change in state of the running threads...
     * to make sure we always evaluate that change, we allow it to start up
     * a new timer if the current one is actively evaluating the state
     * however, we do not need more than 2 timers fired up (1 active and 1 pending)
     * and we certainly do not want 2 active timers evaluating the state
     * simultaneously... so use WQL_ATIMER_BUSY to serialize the timers...
     * note that WQL_ATIMER_BUSY is in a different flag word from WQ_ATIMER_RUNNING since
     * it is always protected by the workq lock... WQ_ATIMER_RUNNING is evaluated
     * and set atomically since the callback function needs to manipulate it
     * w/o holding the workq lock...
     *
     * !WQ_ATIMER_RUNNING && !WQL_ATIMER_BUSY   ==   no pending timer, no active timer
     * !WQ_ATIMER_RUNNING &&  WQL_ATIMER_BUSY   ==   no pending timer, 1 active timer
     *  WQ_ATIMER_RUNNING && !WQL_ATIMER_BUSY   ==   1 pending timer, no active timer
     *  WQ_ATIMER_RUNNING &&  WQL_ATIMER_BUSY   ==   1 pending timer, 1 active timer
     */
    while (wq->wq_lflags & WQL_ATIMER_BUSY) {
        wq->wq_lflags |= WQL_ATIMER_WAITING;

        assert_wait((caddr_t)wq, (THREAD_UNINT));
        thread_block(THREAD_CONTINUE_NULL);

        workqueue_lock_spin(p);
    }
    wq->wq_lflags |= WQL_ATIMER_BUSY;

    /*
     * the workq lock will protect us from seeing WQ_EXITING change state, but we
     * still need to update this atomically in case someone else tries to start
     * the timer just as we're releasing it
     */
    while ( !(OSCompareAndSwap(wq->wq_flags, (wq->wq_flags & ~WQ_ATIMER_RUNNING), (UInt32 *)&wq->wq_flags)));

    if ( !(wq->wq_flags & WQ_EXITING)) {
        /*
         * check to see if the stall frequency was beyond our tolerance
         * or we have work on the queue, but haven't scheduled any
         * new work within our acceptable time interval because
         * there were no idle threads left to schedule
         */
        if (wq->wq_itemcount) {
            uint32_t affinity_tag;

            for (priority = 0; priority < WORKQUEUE_NUMPRIOS; priority++) {
                if (wq->wq_list_bitmap & (1 << priority))

            assert(priority < WORKQUEUE_NUMPRIOS);

            curtime = mach_absolute_time();

            for (affinity_tag = 0; affinity_tag < wq->wq_reqconc[priority]; affinity_tag++) {
                /*
                 * if we have no idle threads, we can try to add them if needed
                 */
                if (wq->wq_thidlecount == 0)

                /*
                 * look for first affinity group that is currently not active
                 * i.e. no active threads at this priority level or higher
                 * and has not been active recently at this priority level or higher
                 */
                for (i = 0; i <= priority; i++) {
                    if (wq->wq_thactive_count[i][affinity_tag]) {

                    if (wq->wq_thscheduled_count[i][affinity_tag]) {
                        if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i][affinity_tag])) {

            if (add_thread == TRUE) {
                retval = workqueue_addnewthread(wq);

        if (wq->wq_itemcount) {
            /*
             * as long as we have threads to schedule, and we successfully
             * scheduled new work, keep trying
             */
            while (wq->wq_thidlecount && !(wq->wq_flags & WQ_EXITING)) {
                /*
                 * workqueue_run_nextitem is responsible for
                 * dropping the workqueue lock in all cases
                 */
                retval = workqueue_run_nextitem(p, wq, THREAD_NULL, 0, 0, 0);
                workqueue_lock_spin(p);
            }
            if ( !(wq->wq_flags & WQ_EXITING) && wq->wq_itemcount) {

                if (wq->wq_thidlecount == 0 && retval == TRUE && add_thread == TRUE)

                if (wq->wq_thidlecount == 0 || busycount)
                    WQ_TIMER_NEEDED(wq, start_timer);

                KERNEL_DEBUG(0xefffd108 | DBG_FUNC_NONE, wq, wq->wq_itemcount, wq->wq_thidlecount, busycount, 0);

    if ( !(wq->wq_flags & WQ_ATIMER_RUNNING))
        wq->wq_timer_interval = 0;

    wq->wq_lflags &= ~WQL_ATIMER_BUSY;

    if ((wq->wq_flags & WQ_EXITING) || (wq->wq_lflags & WQL_ATIMER_WAITING)) {
        /*
         * wakeup the thread hung up in workqueue_exit or workqueue_add_timer waiting for this timer
         * to finish getting out of the way
         */
        wq->wq_lflags &= ~WQL_ATIMER_WAITING;

    KERNEL_DEBUG(0xefffd108 | DBG_FUNC_END, wq, start_timer, wq->wq_nthreads, wq->wq_thidlecount, 0);

    if (start_timer == TRUE)
        workqueue_interval_timer_start(wq);
workqueue_thread_yielded(void)
{
    struct workqueue *wq;

    if ((wq = p->p_wqptr) == NULL || wq->wq_itemcount == 0)

    workqueue_lock_spin(p);

    if (wq->wq_itemcount) {

        if (wq->wq_thread_yielded_count++ == 0)
            wq->wq_thread_yielded_timestamp = mach_absolute_time();

        if (wq->wq_thread_yielded_count < wq_yielded_threshold) {

        KERNEL_DEBUG(0xefffd138 | DBG_FUNC_START, wq, wq->wq_thread_yielded_count, wq->wq_itemcount, 0, 0);

        wq->wq_thread_yielded_count = 0;

        curtime = mach_absolute_time();
        elapsed = curtime - wq->wq_thread_yielded_timestamp;
        absolutetime_to_microtime(elapsed, &secs, &usecs);

        if (secs == 0 && usecs < wq_yielded_window_usecs) {

            if (wq->wq_thidlecount == 0) {
                workqueue_addnewthread(wq);
                /*
                 * 'workqueue_addnewthread' drops the workqueue lock
                 * when creating the new thread and then retakes it before
                 * returning... this window allows other threads to process
                 * work on the queue, so we need to recheck for available work
                 * if none found, we just return... the newly created thread
                 * will eventually get used (if it hasn't already)...
                 */
                if (wq->wq_itemcount == 0) {

            if (wq->wq_thidlecount) {
                uint32_t affinity = -1;
                struct workitem *witem = NULL;
                struct workitemlist *wl = NULL;
                struct threadlist *tl;

                uth = get_bsdthread_info(current_thread());
                if ((tl = uth->uu_threadlist))
                    affinity = tl->th_affinity_tag;

                for (priority = 0; priority < WORKQUEUE_NUMPRIOS; priority++) {
                    if (wq->wq_list_bitmap & (1 << priority)) {
                        wl = (struct workitemlist *)&wq->wq_list[priority];

                assert(!(TAILQ_EMPTY(&wl->wl_itemlist)));

                witem = TAILQ_FIRST(&wl->wl_itemlist);
                TAILQ_REMOVE(&wl->wl_itemlist, witem, wi_entry);

                if (TAILQ_EMPTY(&wl->wl_itemlist))
                    wq->wq_list_bitmap &= ~(1 << priority);

                item = witem->wi_item;
                witem->wi_item = (user_addr_t)0;
                witem->wi_affinity = 0;

                TAILQ_INSERT_HEAD(&wl->wl_freelist, witem, wi_entry);

                (void)workqueue_run_nextitem(p, wq, THREAD_NULL, item, priority, affinity);
                /*
                 * workqueue_run_nextitem is responsible for
                 * dropping the workqueue lock in all cases
                 */
                KERNEL_DEBUG(0xefffd138 | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_itemcount, 1, 0);

    KERNEL_DEBUG(0xefffd138 | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_itemcount, 2, 0);
workqueue_callback(int type, thread_t thread)
{
    struct threadlist *tl;
    struct workqueue *wq;

    uth = get_bsdthread_info(thread);
    tl = uth->uu_threadlist;

    case SCHED_CALL_BLOCK:

        uint32_t old_activecount;

        old_activecount = OSAddAtomic(-1, &wq->wq_thactive_count[tl->th_priority][tl->th_affinity_tag]);

        if (old_activecount == 1) {
            boolean_t start_timer = FALSE;

            UInt64 *lastblocked_ptr;

            /*
             * we were the last active thread on this affinity set
             * and we've got work to do
             */
            lastblocked_ptr = (UInt64 *)&wq->wq_lastblocked_ts[tl->th_priority][tl->th_affinity_tag];
            curtime = mach_absolute_time();

            /*
             * if we collide with another thread trying to update the last_blocked (really unlikely
             * since another thread would have to get scheduled and then block after we start down
             * this path), it's not a problem.  Either timestamp is adequate, so no need to retry
             */
#if defined(__ppc__)
            /*
             * this doesn't have to actually work reliably for PPC, it just has to compile/link
             */
            *lastblocked_ptr = (UInt64)curtime;
#else
            OSCompareAndSwap64(*lastblocked_ptr, (UInt64)curtime, lastblocked_ptr);
#endif
            if (wq->wq_itemcount)
                WQ_TIMER_NEEDED(wq, start_timer);

            if (start_timer == TRUE)
                workqueue_interval_timer_start(wq);

        KERNEL_DEBUG1(0xefffd020 | DBG_FUNC_START, wq, old_activecount, tl->th_priority, tl->th_affinity_tag, thread_tid(thread));

    case SCHED_CALL_UNBLOCK:
        /*
         * we cannot take the workqueue_lock here...
         * an UNBLOCK can occur from a timer event which
         * is run from an interrupt context... if the workqueue_lock
         * is already held by this processor, we'll deadlock...
         * the thread lock for the thread being UNBLOCKED
         * is also held
         */
        if (tl->th_suspended) {
            OSAddAtomic(-1, &tl->th_suspended);
            KERNEL_DEBUG1(0xefffd024, wq, wq->wq_threads_scheduled, tl->th_priority, tl->th_affinity_tag, thread_tid(thread));

            OSAddAtomic(1, &wq->wq_thactive_count[tl->th_priority][tl->th_affinity_tag]);

            KERNEL_DEBUG1(0xefffd020 | DBG_FUNC_END, wq, wq->wq_threads_scheduled, tl->th_priority, tl->th_affinity_tag, thread_tid(thread));
workqueue_removethread(struct threadlist *tl)
{
    struct workqueue *wq;
    struct uthread * uth;

    TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);

    wq->wq_thidlecount--;

    /*
     * Clear the threadlist pointer in uthread so
     * blocked thread on wakeup for termination will
     * not access the thread list as it is going to be
     * freed.
     */
    thread_sched_call(tl->th_thread, NULL);

    uth = get_bsdthread_info(tl->th_thread);
    if (uth != (struct uthread *)0) {
        uth->uu_threadlist = NULL;

    workqueue_unlock(wq->wq_proc);

    if ( (tl->th_flags & TH_LIST_SUSPENDED) ) {
        /*
         * thread was created, but never used...
         * need to clean up the stack and port ourselves
         * since we're not going to spin up through the
         * normal exit path triggered from Libc
         */
        (void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, tl->th_allocsize);
        (void)mach_port_deallocate(get_task_ipcspace(wq->wq_task), tl->th_thport);

        KERNEL_DEBUG1(0xefffd014 | DBG_FUNC_END, wq, (uintptr_t)thread_tid(current_thread()), wq->wq_nthreads, 0xdead, thread_tid(tl->th_thread));

        KERNEL_DEBUG1(0xefffd018 | DBG_FUNC_END, wq, (uintptr_t)thread_tid(current_thread()), wq->wq_nthreads, 0xdead, thread_tid(tl->th_thread));

    /*
     * drop our ref on the thread
     */
    thread_deallocate(tl->th_thread);

    kfree(tl, sizeof(struct threadlist));
workqueue_addnewthread(struct workqueue *wq)
{
    struct threadlist *tl;
    mach_vm_offset_t stackaddr;

    if (wq->wq_nthreads >= wq_max_threads || wq->wq_nthreads >= (CONFIG_THREAD_MAX - 20))

    kret = thread_create_workq(wq->wq_task, &th);

    if (kret != KERN_SUCCESS)

    tl = kalloc(sizeof(struct threadlist));
    bzero(tl, sizeof(struct threadlist));

#if defined(__ppc__)
    stackaddr = 0xF0000000;
#elif defined(__i386__) || defined(__x86_64__)
    stackaddr = 0xB0000000;
#else
#error Need to define a stack address hint for this architecture
#endif
    tl->th_allocsize = PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE + p->p_pthsize;

    kret = mach_vm_map(wq->wq_map, &stackaddr,
            VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE, NULL,
            0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
            VM_INHERIT_DEFAULT);

    if (kret != KERN_SUCCESS) {
        kret = mach_vm_allocate(wq->wq_map,
                &stackaddr, tl->th_allocsize,
                VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE);

    if (kret == KERN_SUCCESS) {
        /*
         * The guard page is at the lowest address
         * The stack base is the highest address
         */
        kret = mach_vm_protect(wq->wq_map, stackaddr, PTH_DEFAULT_GUARDSIZE, FALSE, VM_PROT_NONE);

        if (kret != KERN_SUCCESS)
            (void) mach_vm_deallocate(wq->wq_map, stackaddr, tl->th_allocsize);

    if (kret != KERN_SUCCESS) {
        (void) thread_terminate(th);

        kfree(tl, sizeof(struct threadlist));

    thread_reference(th);

    sright = (void *) convert_thread_to_port(th);
    tl->th_thport = ipc_port_copyout_send(sright, get_task_ipcspace(wq->wq_task));

    thread_static_param(th, TRUE);

    tl->th_flags = TH_LIST_INITED | TH_LIST_SUSPENDED;

    tl->th_stackaddr = stackaddr;
    tl->th_affinity_tag = -1;
    tl->th_priority = WORKQUEUE_NUMPRIOS;

    tl->th_suspended = 1;

#if defined(__ppc__)
    //ml_fp_setvalid(FALSE);
    thread_set_cthreadself(th, (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE), IS_64BIT_PROCESS(p));
#endif /* __ppc__ */

    uth = get_bsdthread_info(tl->th_thread);
    uth->uu_threadlist = (void *)tl;

    workqueue_lock_spin(p);

    TAILQ_INSERT_TAIL(&wq->wq_thidlelist, tl, th_entry);

    wq->wq_thidlecount++;

    KERNEL_DEBUG1(0xefffd014 | DBG_FUNC_START, wq, wq->wq_nthreads, 0, thread_tid(current_thread()), thread_tid(tl->th_thread));

    workqueue_lock_spin(p);
workq_open(struct proc *p, __unused struct workq_open_args *uap, __unused int32_t *retval)
{
    struct workqueue * wq;
    boolean_t need_wakeup = FALSE;
    struct workitem * witem;
    struct workitemlist *wl;

    if ((p->p_lflag & P_LREGISTER) == 0)

    workqueue_lock_spin(p);

    if (p->p_wqptr == NULL) {

        while (p->p_wqiniting == TRUE) {

            assert_wait((caddr_t)&p->p_wqiniting, THREAD_UNINT);
            workqueue_unlock(p);

            thread_block(THREAD_CONTINUE_NULL);

            workqueue_lock_spin(p);
        }
        if (p->p_wqptr != NULL)

        p->p_wqiniting = TRUE;

        workqueue_unlock(p);

        num_cpus = ml_get_max_cpus();

        wq_size = sizeof(struct workqueue) +
            (num_cpus * WORKQUEUE_NUMPRIOS * sizeof(uint32_t)) +
            (num_cpus * WORKQUEUE_NUMPRIOS * sizeof(uint32_t)) +
            (num_cpus * WORKQUEUE_NUMPRIOS * sizeof(uint64_t)) +

        ptr = (char *)kalloc(wq_size);
        bzero(ptr, wq_size);

        wq = (struct workqueue *)ptr;
        wq->wq_flags = WQ_LIST_INITED;

        wq->wq_affinity_max = num_cpus;
        wq->wq_task = current_task();
        wq->wq_map  = current_map();

        for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) {
            wl = (struct workitemlist *)&wq->wq_list[i];
            TAILQ_INIT(&wl->wl_itemlist);
            TAILQ_INIT(&wl->wl_freelist);

            for (j = 0; j < WORKITEM_SIZE; j++) {
                witem = &wq->wq_array[(i*WORKITEM_SIZE) + j];
                TAILQ_INSERT_TAIL(&wl->wl_freelist, witem, wi_entry);

            wq->wq_reqconc[i] = wq->wq_affinity_max;

        nptr = ptr + sizeof(struct workqueue);

        for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) {
            wq->wq_thactive_count[i] = (uint32_t *)nptr;
            nptr += (num_cpus * sizeof(uint32_t));

        for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) {
            wq->wq_thscheduled_count[i] = (uint32_t *)nptr;
            nptr += (num_cpus * sizeof(uint32_t));

        /*
         * align nptr on a 64 bit boundary so that we can do nice
         * atomic64 operations on the timestamps...
         * note that we requested an extra uint64_t when calculating
         * the size for the allocation of the workqueue struct
         */
        nptr += (sizeof(uint64_t) - 1);
        nptr = (char *)((long)nptr & ~(sizeof(uint64_t) - 1));
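        /*
         * Illustrative note (not in the original source): the two statements
         * above round 'nptr' up to the next 8-byte boundary.  For a
         * hypothetical pointer value of 0x1004:
         *
         *	0x1004 + (8 - 1) = 0x100b
         *	0x100b & ~7UL    = 0x1008
         *
         * which is why the allocation above reserved an extra uint64_t of
         * slack.
         */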
        for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) {
            wq->wq_lastblocked_ts[i] = (uint64_t *)nptr;
            nptr += (num_cpus * sizeof(uint64_t));

        TAILQ_INIT(&wq->wq_thrunlist);
        TAILQ_INIT(&wq->wq_thidlelist);

        wq->wq_atimer_call = thread_call_allocate((thread_call_func_t)workqueue_add_timer, (thread_call_param_t)wq);

        workqueue_lock_spin(p);

        p->p_wqptr = (void *)wq;
        p->p_wqsize = wq_size;

        p->p_wqiniting = FALSE;

    workqueue_unlock(p);

    if (need_wakeup == TRUE)
        wakeup(&p->p_wqiniting);
workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, __unused int32_t *retval)
{
    user_addr_t item = uap->item;
    int options      = uap->options;
    int prio         = uap->prio;   /* should be used to find the right workqueue */
    int affinity     = uap->affinity;
    thread_t th      = THREAD_NULL;
    user_addr_t oc_item = 0;
    struct workqueue *wq;

    if ((p->p_lflag & P_LREGISTER) == 0)

    /*
     * affinity not yet hooked up on this path
     */
    case WQOPS_QUEUE_ADD: {

        if (prio & WORKQUEUE_OVERCOMMIT) {
            prio &= ~WORKQUEUE_OVERCOMMIT;

        if ((prio < 0) || (prio >= WORKQUEUE_NUMPRIOS))

        workqueue_lock_spin(p);

        if ((wq = (struct workqueue *)p->p_wqptr) == NULL) {
            workqueue_unlock(p);

        if (wq->wq_thidlecount == 0 && (oc_item || (wq->wq_nthreads < wq->wq_affinity_max))) {

            workqueue_addnewthread(wq);

            if (wq->wq_thidlecount == 0)

        error = workqueue_additem(wq, prio, item, affinity);

        KERNEL_DEBUG(0xefffd008 | DBG_FUNC_NONE, wq, prio, affinity, oc_item, 0);

    case WQOPS_QUEUE_REMOVE: {

        if ((prio < 0) || (prio >= WORKQUEUE_NUMPRIOS))

        workqueue_lock_spin(p);

        if ((wq = (struct workqueue *)p->p_wqptr) == NULL) {
            workqueue_unlock(p);

        error = workqueue_removeitem(wq, prio, item);

    case WQOPS_THREAD_RETURN: {

        th = current_thread();
        struct uthread *uth = get_bsdthread_info(th);

        /* reset signal mask on the workqueue thread to default state */
        if (uth->uu_sigmask != (sigset_t)(~workq_threadmask)) {
            uth->uu_sigmask = ~workq_threadmask;

        workqueue_lock_spin(p);

        if ((wq = (struct workqueue *)p->p_wqptr) == NULL || (uth->uu_threadlist == NULL)) {
            workqueue_unlock(p);

        KERNEL_DEBUG(0xefffd004 | DBG_FUNC_END, wq, 0, 0, 0, 0);

    case WQOPS_THREAD_SETCONC: {

        if ((prio < 0) || (prio > WORKQUEUE_NUMPRIOS))

        workqueue_lock_spin(p);

        if ((wq = (struct workqueue *)p->p_wqptr) == NULL) {
            workqueue_unlock(p);

        /*
         * for this operation, we re-purpose the affinity
         * argument as the concurrency target
         */
        if (prio < WORKQUEUE_NUMPRIOS)
            wq->wq_reqconc[prio] = affinity;
        else
            for (prio = 0; prio < WORKQUEUE_NUMPRIOS; prio++)
                wq->wq_reqconc[prio] = affinity;

    (void)workqueue_run_nextitem(p, wq, th, oc_item, prio, affinity);
    /*
     * workqueue_run_nextitem is responsible for
     * dropping the workqueue lock in all cases
     */
workqueue_exit(struct proc *p)
{
    struct workqueue  * wq;
    struct threadlist * tl, *tlist;
    struct uthread    *uth;

    if (p->p_wqptr != NULL) {

        KERNEL_DEBUG(0x900808c | DBG_FUNC_START, p->p_wqptr, 0, 0, 0, 0);

        workqueue_lock_spin(p);

        wq = (struct workqueue *)p->p_wqptr;

            workqueue_unlock(p);

            KERNEL_DEBUG(0x900808c | DBG_FUNC_END, 0, 0, 0, -1, 0);

        wq_size = p->p_wqsize;

        /*
         * we now arm the timer in the callback function w/o holding the workq lock...
         * we do this by setting WQ_ATIMER_RUNNING via OSCompareAndSwap in order to
         * ensure only a single timer is running and to notice that WQ_EXITING has
         * been set (we don't want to start a timer once WQ_EXITING is posted)
         *
         * so once we have successfully set WQ_EXITING, we cannot fire up a new timer...
         * therefore no need to clear the timer state atomically from the flags
         *
         * since we always hold the workq lock when dropping WQ_ATIMER_RUNNING
         * the check for and sleep until clear is protected
         */
        while ( !(OSCompareAndSwap(wq->wq_flags, (wq->wq_flags | WQ_EXITING), (UInt32 *)&wq->wq_flags)));

        if (wq->wq_flags & WQ_ATIMER_RUNNING) {
            if (thread_call_cancel(wq->wq_atimer_call) == TRUE)
                wq->wq_flags &= ~WQ_ATIMER_RUNNING;

        while ((wq->wq_flags & WQ_ATIMER_RUNNING) || (wq->wq_lflags & WQL_ATIMER_BUSY)) {

            assert_wait((caddr_t)wq, (THREAD_UNINT));
            workqueue_unlock(p);

            thread_block(THREAD_CONTINUE_NULL);

            workqueue_lock_spin(p);

        workqueue_unlock(p);

        TAILQ_FOREACH_SAFE(tl, &wq->wq_thrunlist, th_entry, tlist) {

            thread_sched_call(tl->th_thread, NULL);

            uth = get_bsdthread_info(tl->th_thread);
            if (uth != (struct uthread *)0) {
                uth->uu_threadlist = NULL;

            TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);

            /*
             * drop our last ref on the thread
             */
            thread_deallocate(tl->th_thread);

            kfree(tl, sizeof(struct threadlist));

        TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlelist, th_entry, tlist) {

            thread_sched_call(tl->th_thread, NULL);

            uth = get_bsdthread_info(tl->th_thread);
            if (uth != (struct uthread *)0) {
                uth->uu_threadlist = NULL;

            TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);

            /*
             * drop our last ref on the thread
             */
            thread_deallocate(tl->th_thread);

            kfree(tl, sizeof(struct threadlist));

        thread_call_free(wq->wq_atimer_call);

        KERNEL_DEBUG(0x900808c | DBG_FUNC_END, 0, 0, 0, 0, 0);
workqueue_additem(struct workqueue *wq, int prio, user_addr_t item, int affinity)
{
    struct workitem *witem;
    struct workitemlist *wl;

    wl = (struct workitemlist *)&wq->wq_list[prio];

    if (TAILQ_EMPTY(&wl->wl_freelist))

    witem = (struct workitem *)TAILQ_FIRST(&wl->wl_freelist);
    TAILQ_REMOVE(&wl->wl_freelist, witem, wi_entry);

    witem->wi_item = item;
    witem->wi_affinity = affinity;
    TAILQ_INSERT_TAIL(&wl->wl_itemlist, witem, wi_entry);

    wq->wq_list_bitmap |= (1 << prio);
workqueue_removeitem(struct workqueue *wq, int prio, user_addr_t item)
{
    struct workitem *witem;
    struct workitemlist *wl;

    wl = (struct workitemlist *)&wq->wq_list[prio];

    TAILQ_FOREACH(witem, &wl->wl_itemlist, wi_entry) {
        if (witem->wi_item == item) {
            TAILQ_REMOVE(&wl->wl_itemlist, witem, wi_entry);

            if (TAILQ_EMPTY(&wl->wl_itemlist))
                wq->wq_list_bitmap &= ~(1 << prio);

            witem->wi_item = (user_addr_t)0;
            witem->wi_affinity = 0;
            TAILQ_INSERT_HEAD(&wl->wl_freelist, witem, wi_entry);

static int workqueue_importance[WORKQUEUE_NUMPRIOS] =

static int workqueue_policy[WORKQUEUE_NUMPRIOS] =
/*
 * workqueue_run_nextitem:
 *   called with the workqueue lock held...
 *   responsible for dropping it in all cases
 */
workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_addr_t oc_item, int oc_prio, int oc_affinity)
{
    struct workitem *witem = NULL;
    user_addr_t item = 0;
    thread_t th_to_run = THREAD_NULL;
    thread_t th_to_park = THREAD_NULL;
    int wake_thread = 0;
    int reuse_thread = 1;
    uint32_t priority, orig_priority;
    uint32_t affinity_tag, orig_affinity_tag;
    uint32_t activecount;
    uint32_t us_to_wait;
    struct threadlist *tl = NULL;
    struct threadlist *ttl = NULL;
    struct uthread *uth = NULL;
    struct workitemlist *wl = NULL;
    boolean_t start_timer = FALSE;
    boolean_t adjust_counters = TRUE;

    KERNEL_DEBUG(0xefffd000 | DBG_FUNC_START, wq, thread, wq->wq_thidlecount, wq->wq_itemcount, 0);

    /*
     * from here until we drop the workq lock
     * we can't be pre-empted since we hold
     * the lock in spin mode... this is important
     * since we have to independently update the priority
     * and affinity that the thread is associated with
     * and these values are used to index the multi-dimensional
     * counter arrays in 'workqueue_callback'
     */
        uint32_t min_scheduled = 0;
        uint32_t scheduled_count;
        uint32_t active_count;
        uint32_t t_affinity = 0;

        if ((affinity_tag = oc_affinity) == (uint32_t)-1) {
            for (affinity_tag = 0; affinity_tag < wq->wq_reqconc[priority]; affinity_tag++) {
                /*
                 * look for the affinity group with the least number of threads
                 */
                scheduled_count = 0;

                for (i = 0; i <= priority; i++) {
                    scheduled_count += wq->wq_thscheduled_count[i][affinity_tag];
                    active_count += wq->wq_thactive_count[i][affinity_tag];

                if (active_count == 0) {
                    t_affinity = affinity_tag;

                if (affinity_tag == 0 || scheduled_count < min_scheduled) {
                    min_scheduled = scheduled_count;
                    t_affinity = affinity_tag;

            affinity_tag = t_affinity;

        goto grab_idle_thread;
    if (wq->wq_itemcount == 0) {
        if ((th_to_park = thread) == THREAD_NULL)

    for (priority = 0; priority < WORKQUEUE_NUMPRIOS; priority++) {
        if (wq->wq_list_bitmap & (1 << priority)) {
            wl = (struct workitemlist *)&wq->wq_list[priority];

    assert(!(TAILQ_EMPTY(&wl->wl_itemlist)));

    curtime = mach_absolute_time();

    if (thread != THREAD_NULL) {
        uth = get_bsdthread_info(thread);
        tl = uth->uu_threadlist;
        affinity_tag = tl->th_affinity_tag;

        /*
         * check to see if the affinity group this thread is
         * associated with is still within the bounds of the
         * specified concurrency for the priority level
         * we're considering running work for
         */
        if (affinity_tag < wq->wq_reqconc[priority]) {
            /*
             * we're a worker thread from the pool... currently we
             * are considered 'active' which means we're counted
             * in "wq_thactive_count"
             * add up the active counts of all the priority levels
             * up to and including the one we want to schedule
             */
            for (activecount = 0, i = 0; i <= priority; i++) {

                acount = wq->wq_thactive_count[i][affinity_tag];

                if (acount == 0 && wq->wq_thscheduled_count[i][affinity_tag]) {
                    if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i][affinity_tag]))

                activecount += acount;

            if (activecount == 1) {
                /*
                 * we're the only active thread associated with our
                 * affinity group at this priority level and higher,
                 * so pick up some work and keep going
                 */

            /*
             * there's more than 1 thread running in this affinity group
             * or the concurrency level has been cut back for this priority...
             * lets continue on and look for an 'empty' group to run this
             * workitem...
             */

    for (affinity_tag = 0; affinity_tag < wq->wq_reqconc[priority]; affinity_tag++) {
        /*
         * look for first affinity group that is currently not active
         * i.e. no active threads at this priority level or higher
         * and no threads that have run recently
         */
        for (activecount = 0, i = 0; i <= priority; i++) {
            if ((activecount = wq->wq_thactive_count[i][affinity_tag]))

            if (wq->wq_thscheduled_count[i][affinity_tag]) {
                if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i][affinity_tag])) {

        if (activecount == 0 && busycount == 0)

    if (affinity_tag >= wq->wq_reqconc[priority]) {
        /*
         * we've already got at least 1 thread per
         * affinity group in the active state...
         */
            /*
             * we found at least 1 thread in the
             * 'busy' state... make sure we start
             * the timer because if they are the only
             * threads keeping us from scheduling
             * this workitem, we won't get a callback
             * to kick off the timer... we need to
             * start it now...
             */
            WQ_TIMER_NEEDED(wq, start_timer);

        KERNEL_DEBUG(0xefffd000 | DBG_FUNC_NONE, wq, busycount, start_timer, 0, 0);

        if (thread != THREAD_NULL) {
            /*
             * go park this one for later
             */
            th_to_park = thread;

    if (thread != THREAD_NULL) {
        /*
         * we're overbooked on the affinity group this thread is
         * currently associated with, but we have work to do
         * and at least 1 idle processor, so we'll just retarget
         * this thread to a new affinity group
         */

    if (wq->wq_thidlecount == 0) {
        /*
         * we don't have a thread to schedule, but we have
         * work to do and at least 1 affinity group that
         * doesn't currently have an active thread...
         */
        WQ_TIMER_NEEDED(wq, start_timer);

        KERNEL_DEBUG(0xefffd118, wq, wq->wq_nthreads, start_timer, 0, 0);

        goto no_thread_to_run;
grab_idle_thread:
    /*
     * we've got a candidate (affinity group with no currently
     * active threads) to start a new thread on...
     * we already know there is both work available
     * and an idle thread, so activate a thread and then
     * fall into the code that pulls a new workitem...
     */
    TAILQ_FOREACH(ttl, &wq->wq_thidlelist, th_entry) {
        if (ttl->th_affinity_tag == affinity_tag || ttl->th_affinity_tag == (uint16_t)-1) {

            TAILQ_REMOVE(&wq->wq_thidlelist, ttl, th_entry);

    tl = TAILQ_FIRST(&wq->wq_thidlelist);
    TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);

    wq->wq_thidlecount--;

    TAILQ_INSERT_TAIL(&wq->wq_thrunlist, tl, th_entry);

    if ((tl->th_flags & TH_LIST_SUSPENDED) == TH_LIST_SUSPENDED) {
        tl->th_flags &= ~TH_LIST_SUSPENDED;

        thread_sched_call(tl->th_thread, workqueue_callback);

    } else if ((tl->th_flags & TH_LIST_BLOCKED) == TH_LIST_BLOCKED) {
        tl->th_flags &= ~TH_LIST_BLOCKED;
        tl->th_flags |= TH_LIST_BUSY;

    tl->th_flags |= TH_LIST_RUNNING;

    wq->wq_threads_scheduled++;
    wq->wq_thscheduled_count[priority][affinity_tag]++;
    OSAddAtomic(1, &wq->wq_thactive_count[priority][affinity_tag]);

    adjust_counters = FALSE;
    th_to_run = tl->th_thread;
    witem = TAILQ_FIRST(&wl->wl_itemlist);
    TAILQ_REMOVE(&wl->wl_itemlist, witem, wi_entry);

    if (TAILQ_EMPTY(&wl->wl_itemlist))
        wq->wq_list_bitmap &= ~(1 << priority);

    item = witem->wi_item;
    witem->wi_item = (user_addr_t)0;
    witem->wi_affinity = 0;
    TAILQ_INSERT_HEAD(&wl->wl_freelist, witem, wi_entry);

    orig_priority = tl->th_priority;
    orig_affinity_tag = tl->th_affinity_tag;

    tl->th_priority = priority;
    tl->th_affinity_tag = affinity_tag;

    if (adjust_counters == TRUE && (orig_priority != priority || orig_affinity_tag != affinity_tag)) {
        /*
         * we need to adjust these counters based on this
         * thread's new disposition w/r to affinity and priority
         */
        OSAddAtomic(-1, &wq->wq_thactive_count[orig_priority][orig_affinity_tag]);
        OSAddAtomic(1, &wq->wq_thactive_count[priority][affinity_tag]);

        wq->wq_thscheduled_count[orig_priority][orig_affinity_tag]--;
        wq->wq_thscheduled_count[priority][affinity_tag]++;

    wq->wq_thread_yielded_count = 0;

    workqueue_unlock(p);

    if (orig_affinity_tag != affinity_tag) {
        /*
         * this thread's affinity does not match the affinity group
         * it's being placed on (it's either a brand new thread or
         * we're retargeting an existing thread to a new group)...
         * affinity tag of 0 means no affinity...
         * but we want our tags to be 0 based because they
         * are used to index arrays, so...
         * keep it 0 based internally and bump by 1 when
         * calling out to set it
         */
        KERNEL_DEBUG(0xefffd114 | DBG_FUNC_START, wq, orig_affinity_tag, 0, 0, 0);

        (void)thread_affinity_set(th_to_run, affinity_tag + 1);

        KERNEL_DEBUG(0xefffd114 | DBG_FUNC_END, wq, affinity_tag, 0, 0, 0);

    if (orig_priority != priority) {
        thread_precedence_policy_data_t precedinfo;
        thread_extended_policy_data_t extinfo;

        policy = workqueue_policy[priority];

        KERNEL_DEBUG(0xefffd120 | DBG_FUNC_START, wq, orig_priority, tl->th_policy, 0, 0);

        if (tl->th_policy != policy) {

            extinfo.timeshare = policy;
            (void)thread_policy_set_internal(th_to_run, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT);

            tl->th_policy = policy;

        precedinfo.importance = workqueue_importance[priority];
        (void)thread_policy_set_internal(th_to_run, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT);

        KERNEL_DEBUG(0xefffd120 | DBG_FUNC_END, wq, priority, policy, 0, 0);

    if (kdebug_enable) {

        uint32_t code = 0xefffd02c | DBG_FUNC_START;

        for (n = 0; n < WORKQUEUE_NUMPRIOS; n++) {
            for (i = 0; i < wq->wq_affinity_max; i++) {
                if (wq->wq_thactive_count[n][i]) {

                    KERNEL_DEBUG(code, lpri, laffinity, wq->wq_thactive_count[lpri][laffinity], first, 0);

        KERNEL_DEBUG(0xefffd02c | DBG_FUNC_END, lpri, laffinity, wq->wq_thactive_count[lpri][laffinity], first, 0);

    /*
     * if current thread is reused for workitem, does not return via unix_syscall
     */
    wq_runitem(p, item, th_to_run, tl, reuse_thread, wake_thread, (thread == th_to_run));

    KERNEL_DEBUG(0xefffd000 | DBG_FUNC_END, wq, thread_tid(th_to_run), item, 1, 0);
no_thread_to_run:
    /*
     * we have no work to do or we are fully booked
     * w/r to running threads...
     */
    workqueue_unlock(p);

    workqueue_interval_timer_start(wq);

    KERNEL_DEBUG(0xefffd000 | DBG_FUNC_END, wq, thread_tid(thread), 0, 2, 0);

    /*
     * this is a workqueue thread with no more
     * work to do... park it for now
     */
    uth = get_bsdthread_info(th_to_park);
    tl = uth->uu_threadlist;
        panic("wq thread with no threadlist ");

    TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
    tl->th_flags &= ~TH_LIST_RUNNING;

    tl->th_flags |= TH_LIST_BLOCKED;
    TAILQ_INSERT_HEAD(&wq->wq_thidlelist, tl, th_entry);

    thread_sched_call(th_to_park, NULL);

    OSAddAtomic(-1, &wq->wq_thactive_count[tl->th_priority][tl->th_affinity_tag]);
    wq->wq_thscheduled_count[tl->th_priority][tl->th_affinity_tag]--;
    wq->wq_threads_scheduled--;

    if (wq->wq_thidlecount < 100)
        us_to_wait = wq_reduce_pool_window_usecs - (wq->wq_thidlecount * (wq_reduce_pool_window_usecs / 100));
    else
        us_to_wait = wq_reduce_pool_window_usecs / 100;

    wq->wq_thidlecount++;

    assert_wait_timeout((caddr_t)tl, (THREAD_INTERRUPTIBLE), us_to_wait, NSEC_PER_USEC);

    workqueue_unlock(p);

    workqueue_interval_timer_start(wq);

    KERNEL_DEBUG1(0xefffd018 | DBG_FUNC_START, wq, wq->wq_threads_scheduled, wq->wq_thidlecount, us_to_wait, thread_tid(th_to_park));
    KERNEL_DEBUG(0xefffd000 | DBG_FUNC_END, wq, thread_tid(thread), 0, 3, 0);

    thread_block((thread_continue_t)wq_unpark_continue);
wq_unpark_continue(void)
{
    struct uthread *uth = NULL;
    struct threadlist *tl;
    thread_t th_to_unpark;

    th_to_unpark = current_thread();
    uth = get_bsdthread_info(th_to_unpark);

    if ((tl = uth->uu_threadlist) != NULL) {

        if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) {
            /*
             * a normal wakeup of this thread occurred... no need
             * for any synchronization with the timer and wq_runitem
             */
normal_return_to_user:
            thread_sched_call(th_to_unpark, workqueue_callback);

            KERNEL_DEBUG(0xefffd018 | DBG_FUNC_END, tl->th_workq, 0, 0, 0, 0);

            thread_exception_return();

        workqueue_lock_spin(p);

        if ( !(tl->th_flags & TH_LIST_RUNNING)) {
            /*
             * the timer popped us out and we've not
             * been moved off of the idle list
             * so we should now self-destruct
             *
             * workqueue_removethread consumes the lock
             */
            workqueue_removethread(tl);

            thread_exception_return();

        /*
         * the timer woke us up, but we have already
         * started to make this a runnable thread,
         * but have not yet finished that process...
         * so wait for the normal wakeup
         */
        while ((tl->th_flags & TH_LIST_BUSY)) {

            assert_wait((caddr_t)tl, (THREAD_UNINT));

            workqueue_unlock(p);

            thread_block(THREAD_CONTINUE_NULL);

            workqueue_lock_spin(p);

        /*
         * we have finished setting up the thread's context
         * now we can return as if we got a normal wakeup
         */
        workqueue_unlock(p);

        goto normal_return_to_user;

    thread_exception_return();
wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl,
        int reuse_thread, int wake_thread, int return_directly)
{
    KERNEL_DEBUG1(0xefffd004 | DBG_FUNC_START, tl->th_workq, tl->th_priority, tl->th_affinity_tag, thread_tid(current_thread()), thread_tid(th));

    ret = setup_wqthread(p, th, item, reuse_thread, tl);

        panic("setup_wqthread failed %x\n", ret);

    if (return_directly) {
        KERNEL_DEBUG(0xefffd000 | DBG_FUNC_END, tl->th_workq, 0, 0, 4, 0);

        thread_exception_return();

        panic("wq_runitem: thread_exception_return returned ...\n");

    workqueue_lock_spin(p);

    tl->th_flags &= ~TH_LIST_BUSY;

    workqueue_unlock(p);

    KERNEL_DEBUG1(0xefffd014 | DBG_FUNC_END, tl->th_workq, 0, 0, thread_tid(current_thread()), thread_tid(th));
setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl)
{
#if defined(__ppc__)
    /*
     * Set up PowerPC registers...
     * internally they are always kept as 64 bit and
     * since the register set is the same between 32 and 64bit modes
     * we don't need 2 different methods for setting the state
     */
    ppc_thread_state64_t state64;
    ppc_thread_state64_t *ts64 = &state64;

    ts64->srr0 = (uint64_t)p->p_wqthread;
    ts64->r1 = (uint64_t)((tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE) - C_ARGSAVE_LEN - C_RED_ZONE);
    ts64->r3 = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE);
    ts64->r4 = (uint64_t)(tl->th_thport);
    ts64->r5 = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_GUARDSIZE);
    ts64->r6 = (uint64_t)item;
    ts64->r7 = (uint64_t)reuse_thread;
    ts64->r8 = (uint64_t)0;

    if ((reuse_thread != 0) && (ts64->r3 == (uint64_t)0))
        panic("setup_wqthread: setting reuse thread with null pthread\n");
    thread_set_wq_state64(th, (thread_state_t)ts64);

#elif defined(__i386__) || defined(__x86_64__)
    isLP64 = IS_64BIT_PROCESS(p);

    /*
     * Set up i386 registers & function call.
     */
    x86_thread_state32_t state;
    x86_thread_state32_t *ts = &state;

    ts->eip = (int)p->p_wqthread;
    ts->eax = (unsigned int)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE);
    ts->ebx = (unsigned int)tl->th_thport;
    ts->ecx = (unsigned int)(tl->th_stackaddr + PTH_DEFAULT_GUARDSIZE);
    ts->edx = (unsigned int)item;
    ts->edi = (unsigned int)reuse_thread;
    ts->esi = (unsigned int)0;

    ts->esp = (int)((vm_offset_t)((tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE) - C_32_STK_ALIGN));

    if ((reuse_thread != 0) && (ts->eax == (unsigned int)0))
        panic("setup_wqthread: setting reuse thread with null pthread\n");
    thread_set_wq_state32(th, (thread_state_t)ts);

    x86_thread_state64_t state64;
    x86_thread_state64_t *ts64 = &state64;

    ts64->rip = (uint64_t)p->p_wqthread;
    ts64->rdi = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE);
    ts64->rsi = (uint64_t)(tl->th_thport);
    ts64->rdx = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_GUARDSIZE);
    ts64->rcx = (uint64_t)item;
    ts64->r8 = (uint64_t)reuse_thread;
    ts64->r9 = (uint64_t)0;

    /*
     * set stack pointer aligned to 16 byte boundary
     */
    ts64->rsp = (uint64_t)((tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE) - C_64_REDZONE_LEN);

    if ((reuse_thread != 0) && (ts64->rdi == (uint64_t)0))
        panic("setup_wqthread: setting reuse thread with null pthread\n");
    thread_set_wq_state64(th, (thread_state_t)ts64);

#else
#error setup_wqthread not defined for this architecture
#endif
fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
{
    struct workqueue * wq;
    uint32_t pri, affinity;

    workqueue_lock_spin(p);
    if ((wq = p->p_wqptr) == NULL) {

    for (pri = 0; pri < WORKQUEUE_NUMPRIOS; pri++) {
        for (affinity = 0; affinity < wq->wq_affinity_max; affinity++)
            activecount += wq->wq_thactive_count[pri][affinity];

    pwqinfo->pwq_nthreads = wq->wq_nthreads;
    pwqinfo->pwq_runthreads = activecount;
    pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;

    workqueue_unlock(p);
/* Set target concurrency of one of the queue(0,1,2) with specified value */
proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc)
{
    int32_t conc = targetconc;
    vm_map_t oldmap = VM_MAP_NULL;

    self = current_proc();
    if (self->p_pid != pid) {
        /* if not on self, hold a reference on the process */

    if ((addr = p->p_targconc) == (uint64_t)0) {

    if ((queuenum >= WQ_MAXPRI_MIN) && (queuenum <= WQ_MAXPRI_MAX)) {
        addr += (queuenum * sizeof(int32_t));

        oldmap = vm_map_switch(get_task_map(p->task));
        error = copyout(&conc, addr, sizeof(int32_t));

        (void)vm_map_switch(oldmap);
/* Set target concurrency on all the prio queues with specified value */
proc_setalltargetconc(pid_t pid, int32_t * targetconcp)
{
    vm_map_t oldmap = VM_MAP_NULL;

    self = current_proc();
    if (self->p_pid != pid) {
        /* if not on self, hold a reference on the process */

    if ((addr = (uint64_t)p->p_targconc) == (uint64_t)0) {

    oldmap = vm_map_switch(get_task_map(p->task));

    error = copyout(targetconcp, addr, WQ_PRI_NUM * sizeof(int32_t));

    (void)vm_map_switch(oldmap);
int thread_selfid(__unused struct proc *p, __unused struct thread_selfid_args *uap, user_addr_t *retval)
{
    thread_t thread = current_thread();
    uint64_t thread_id = thread_tid(thread);
    *retval = thread_id;
    return KERN_SUCCESS;
}
    pthread_lck_grp_attr = lck_grp_attr_alloc_init();
    pthread_lck_grp = lck_grp_alloc_init("pthread", pthread_lck_grp_attr);

    /*
     * allocate the lock attribute for pthread synchronizers
     */
    pthread_lck_attr = lck_attr_alloc_init();

    workqueue_init_lock((proc_t) get_bsdtask_info(kernel_task));

    pthread_list_mlock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);

    pth_global_hashinit();
    psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL);
);