/*
 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995-2005 Apple Computer, Inc. All Rights Reserved */
#define _PTHREAD_CONDATTR_T
#define _PTHREAD_COND_T
#define _PTHREAD_MUTEXATTR_T
#define _PTHREAD_MUTEX_T
#define _PTHREAD_RWLOCKATTR_T
#define _PTHREAD_RWLOCK_T

#undef pthread_mutexattr_t
#undef pthread_mutex_t
#undef pthread_condattr_t
#undef pthread_rwlockattr_t
#undef pthread_rwlock_t
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/timeb.h>
#include <sys/times.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/syslog.h>
#include <sys/kdebug.h>
#include <sys/sysproto.h>
#include <sys/pthread_internal.h>
#include <sys/user.h>		/* for coredump */

#include <mach/mach_types.h>
#include <mach/vm_prot.h>
#include <mach/semaphore.h>
#include <mach/sync_policy.h>
#include <mach/task.h>
#include <kern/kern_types.h>
#include <kern/task.h>
#include <kern/clock.h>
#include <mach/kern_return.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/kalloc.h>
#include <kern/sched_prim.h>	/* for thread_exception_return */
#include <kern/processor.h>
#include <kern/affinity.h>
#include <mach/mach_vm.h>
#include <mach/mach_param.h>
#include <mach/thread_status.h>
#include <mach/thread_policy.h>
#include <mach/message.h>
#include <mach/port.h>
#include <vm/vm_protos.h>
#include <vm/vm_map.h>		/* for current_map() */
#include <mach/thread_act.h>	/* for thread_resume */
#include <machine/machine_routines.h>

#include <i386/machine_routines.h>
#include <i386/eflags.h>

#include <libkern/OSAtomic.h>

#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
#define KERNEL_DEBUG1 KERNEL_DEBUG_CONSTANT1

#if defined(__ppc__) || defined(__ppc64__)
#include <architecture/ppc/cframe.h>
#endif
lck_grp_attr_t   *pthread_lck_grp_attr;
lck_grp_t    *pthread_lck_grp;
lck_attr_t   *pthread_lck_attr;
lck_mtx_t * pthread_list_mlock;

extern void pthread_init(void);
extern kern_return_t thread_getstatus(register thread_t act, int flavor,
			thread_state_t tstate, mach_msg_type_number_t *count);
extern kern_return_t thread_setstatus(thread_t thread, int flavor,
			thread_state_t tstate, mach_msg_type_number_t count);
extern void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64);
extern kern_return_t mach_port_deallocate(ipc_space_t, mach_port_name_t);
extern kern_return_t semaphore_signal_internal_trap(mach_port_name_t);
static int workqueue_additem(struct workqueue *wq, int prio, user_addr_t item);
static int workqueue_removeitem(struct workqueue *wq, int prio, user_addr_t item);
static void workqueue_run_nextitem(proc_t p, thread_t th);
static void wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl,
		       int reuse_thread, int wake_thread, int return_directly);
static int setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl);
static int workqueue_addnewthread(struct workqueue *wq);
static void workqueue_removethread(struct workqueue *wq);
static void workqueue_lock(proc_t);
static void workqueue_lock_spin(proc_t);
static void workqueue_unlock(proc_t);
#define C_32_STK_ALIGN		16
#define C_64_STK_ALIGN		16
#define C_64_REDZONE_LEN	128
#define TRUNC_DOWN32(a,c)	((((uint32_t)a)-(c)) & ((uint32_t)(-(c))))
#define TRUNC_DOWN64(a,c)	((((uint64_t)a)-(c)) & ((uint64_t)(-(c))))
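
/*
 * Illustrative note (added, not part of the original source): TRUNC_DOWN32/64
 * first back off by 'c' bytes and then round the result down to a 'c'-byte
 * boundary, e.g. TRUNC_DOWN32(0x1001, 16) == 0x0ff0.
 */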
/*
 * Flags field passed to bsdthread_create and back in pthread_start
 * 31  <---------------------------------> 0
 * _________________________________________
 * |  flags(8) | policy(8) |  importance(16) |
 * -----------------------------------------
 */
void _pthread_start(pthread_t self, mach_port_t kport, void *(*fun)(void *), void * funarg, size_t stacksize, unsigned int flags);

#define PTHREAD_START_CUSTOM	0x01000000
#define PTHREAD_START_SETSCHED	0x02000000
#define PTHREAD_START_DETACHED	0x04000000
#define PTHREAD_START_POLICY_BITSHIFT 16
#define PTHREAD_START_POLICY_MASK 0xffff
#define PTHREAD_START_IMPORTANCE_MASK 0xffff

#define SCHED_OTHER      POLICY_TIMESHARE
#define SCHED_FIFO       POLICY_FIFO
#define SCHED_RR         POLICY_RR
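
/*
 * Illustrative example (added, not from the original source): given the layout
 * documented above, a caller could pack the flags word roughly as
 *
 *	unsigned int flags = PTHREAD_START_SETSCHED |
 *	    ((SCHED_RR & PTHREAD_START_POLICY_MASK) << PTHREAD_START_POLICY_BITSHIFT) |
 *	    (importance & PTHREAD_START_IMPORTANCE_MASK);
 *
 * which bsdthread_create() below unpacks with PTHREAD_START_POLICY_BITSHIFT
 * and the two masks.
 */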
	pthread_lck_grp_attr = lck_grp_attr_alloc_init();
	pthread_lck_grp = lck_grp_alloc_init("pthread", pthread_lck_grp_attr);

	/*
	 * allocate the lock attribute for pthread synchronizers
	 */
	pthread_lck_attr = lck_attr_alloc_init();

	pthread_list_mlock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);

pthread_list_lock(void)
{
	lck_mtx_lock(pthread_list_mlock);
}

pthread_list_unlock(void)
{
	lck_mtx_unlock(pthread_list_mlock);
}
__pthread_mutex_destroy(__unused struct proc *p, struct __pthread_mutex_destroy_args *uap, __unused register_t *retval)
	int mutexid = uap->mutexid;
	pthread_mutex_t * mutex;

	mutex = pthread_id_to_mutex(mutexid);

	MTX_LOCK(mutex->lock);
	if (mutex->sig == _PTHREAD_KERN_MUTEX_SIG)
		if (mutex->owner == (thread_t)NULL &&
		    mutex->refcount == 1)
			mutex->sig = _PTHREAD_NO_SIG;
			pthread_id_mutex_remove(mutexid);
			MTX_UNLOCK(mutex->lock);
			lck_mtx_free(lmtx, pthread_lck_grp);
			lck_mtx_free(lmtx1, pthread_lck_grp);
			kfree((void *)mutex, sizeof(struct _pthread_mutex));

	MTX_UNLOCK(mutex->lock);
	pthread_mutex_release(mutex);
/*
 * Initialize a mutex variable, possibly with additional attributes.
 */
pthread_mutex_init_internal(pthread_mutex_t *mutex, const pthread_mutexattr_t *attr)
	mutex->prioceiling = attr->prioceiling;
	mutex->protocol = attr->protocol;
	mutex->type = attr->type;
	mutex->pshared = attr->pshared;
	mutex->owner = (thread_t)NULL;
	mutex->owner_proc = current_proc();
	mutex->sig = _PTHREAD_KERN_MUTEX_SIG;
	mutex->lock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);
	mutex->mutex = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);
/*
 * Initialize a mutex variable, possibly with additional attributes.
 * Public interface - so don't trust the lock - initialize it first.
 */
__pthread_mutex_init(__unused struct proc *p, struct __pthread_mutex_init_args *uap, __unused register_t *retval)
	user_addr_t umutex = uap->mutex;
	pthread_mutex_t * mutex;
	user_addr_t uattr = uap->attr;
	pthread_mutexattr_t attr;
	unsigned int addr = (unsigned int)((uintptr_t)uap->mutex);

	if ((umutex == 0) || (uattr == 0))

	if ((error = copyin(uattr, &attr, sizeof(pthread_mutexattr_t))))

	if (attr.sig != _PTHREAD_MUTEX_ATTR_SIG)

	if ((error = copyin(umutex, &pmutex_sig, sizeof(int))))

	if (pmutex_sig == _PTHREAD_KERN_MUTEX_SIG)

	mutex = (pthread_mutex_t *)kalloc(sizeof(pthread_mutex_t));

	pthread_mutex_init_internal(mutex, &attr);

	mutexid = pthread_id_mutex_add(mutex);

	if ((error = copyout(&mutexid, ((user_addr_t)((uintptr_t)(addr))), 4)))

	pthread_id_mutex_remove(mutexid);
	lck_mtx_free(mutex->lock, pthread_lck_grp);
	lck_mtx_free(mutex->mutex, pthread_lck_grp);
	kfree(mutex, sizeof(struct _pthread_mutex));
/*
 * TODO: Priority inheritance stuff
 */
__pthread_mutex_lock(struct proc *p, struct __pthread_mutex_lock_args *uap, __unused register_t *retval)
	int mutexid = uap->mutexid;
	pthread_mutex_t * mutex;

	mutex = pthread_id_to_mutex(mutexid);

	MTX_LOCK(mutex->lock);

	if (mutex->sig != _PTHREAD_KERN_MUTEX_SIG)

	if ((p != mutex->owner_proc) && (mutex->pshared != PTHREAD_PROCESS_SHARED)) {

	MTX_UNLOCK(mutex->lock);

	lck_mtx_lock(mutex->mutex);

	MTX_LOCK(mutex->lock);
	mutex->owner = current_thread();

	MTX_UNLOCK(mutex->lock);
	pthread_mutex_release(mutex);
/*
 * Attempt to lock a mutex, but don't block if this isn't possible.
 */
__pthread_mutex_trylock(struct proc *p, struct __pthread_mutex_trylock_args *uap, __unused register_t *retval)
	int mutexid = uap->mutexid;
	pthread_mutex_t * mutex;

	mutex = pthread_id_to_mutex(mutexid);

	MTX_LOCK(mutex->lock);

	if (mutex->sig != _PTHREAD_KERN_MUTEX_SIG)

	if ((p != mutex->owner_proc) && (mutex->pshared != PTHREAD_PROCESS_SHARED)) {

	MTX_UNLOCK(mutex->lock);

	state = lck_mtx_try_lock(mutex->mutex);

	MTX_LOCK(mutex->lock);
	mutex->owner = current_thread();
	MTX_UNLOCK(mutex->lock);

	pthread_mutex_release(mutex);

	MTX_UNLOCK(mutex->lock);
	pthread_mutex_release(mutex);
/*
 * TODO: Priority inheritance stuff
 */
__pthread_mutex_unlock(struct proc *p, struct __pthread_mutex_unlock_args *uap, __unused register_t *retval)
	int mutexid = uap->mutexid;
	pthread_mutex_t * mutex;

	mutex = pthread_id_to_mutex(mutexid);

	MTX_LOCK(mutex->lock);

	if (mutex->sig != _PTHREAD_KERN_MUTEX_SIG)

	if ((p != mutex->owner_proc) && (mutex->pshared != PTHREAD_PROCESS_SHARED)) {

	MTX_UNLOCK(mutex->lock);

	lck_mtx_unlock(mutex->mutex);

	MTX_LOCK(mutex->lock);

	MTX_UNLOCK(mutex->lock);
	pthread_mutex_release(mutex);
__pthread_cond_init(__unused struct proc *p, struct __pthread_cond_init_args *uap, __unused register_t *retval)
	pthread_cond_t * cond;
	pthread_condattr_t attr;
	user_addr_t ucond = uap->cond;
	user_addr_t uattr = uap->attr;
	unsigned int addr = (unsigned int)((uintptr_t)uap->cond);
	int condid, error, cond_sig;

	if ((ucond == 0) || (uattr == 0))

	if ((error = copyin(uattr, &attr, sizeof(pthread_condattr_t))))

	if (attr.sig != _PTHREAD_COND_ATTR_SIG)

	if ((error = copyin(ucond, &cond_sig, sizeof(int))))

	if (cond_sig == _PTHREAD_KERN_COND_SIG)

	kret = semaphore_create(kernel_task, &sem, SYNC_POLICY_FIFO, value);
	if (kret != KERN_SUCCESS)

	cond = (pthread_cond_t *)kalloc(sizeof(pthread_cond_t));

	cond->lock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);
	cond->pshared = attr.pshared;
	cond->sig = _PTHREAD_KERN_COND_SIG;
	cond->sigpending = 0;
	cond->mutex = (pthread_mutex_t *)0;
	cond->owner_proc = current_proc();

	condid = pthread_id_cond_add(cond);

	if ((error = copyout(&condid, ((user_addr_t)((uintptr_t)(addr))), 4)))

	pthread_id_cond_remove(condid);
	semaphore_destroy(kernel_task, cond->sem);
	kfree(cond, sizeof(pthread_cond_t));
/*
 * Destroy a condition variable.
 */
__pthread_cond_destroy(__unused struct proc *p, struct __pthread_cond_destroy_args *uap, __unused register_t *retval)
	pthread_cond_t *cond;
	int condid = uap->condid;

	cond = pthread_id_to_cond(condid);

	COND_LOCK(cond->lock);
	if (cond->sig == _PTHREAD_KERN_COND_SIG)
		if (cond->refcount == 1)
			cond->sig = _PTHREAD_NO_SIG;
			pthread_id_cond_remove(condid);
			COND_UNLOCK(cond->lock);
			lck_mtx_free(lmtx, pthread_lck_grp);
			(void)semaphore_destroy(kernel_task, sem);
			kfree((void *)cond, sizeof(pthread_cond_t));

	COND_UNLOCK(cond->lock);
	pthread_cond_release(cond);
/*
 * Signal a condition variable, waking up all threads waiting for it.
 */
__pthread_cond_broadcast(__unused struct proc *p, struct __pthread_cond_broadcast_args *uap, __unused register_t *retval)
	int condid = uap->condid;
	pthread_cond_t * cond;

	cond = pthread_id_to_cond(condid);

	COND_LOCK(cond->lock);

	if (cond->sig != _PTHREAD_KERN_COND_SIG)

	if ((p != cond->owner_proc) && (cond->pshared != PTHREAD_PROCESS_SHARED)) {

	COND_UNLOCK(cond->lock);

	kret = semaphore_signal_all(cond->sem);
	case KERN_INVALID_ADDRESS:
	case KERN_PROTECTION_FAILURE:
	case KERN_OPERATION_TIMED_OUT:

	COND_LOCK(cond->lock);

	COND_UNLOCK(cond->lock);
	pthread_cond_release(cond);
/*
 * Signal a condition variable, waking only one thread.
 */
__pthread_cond_signal(__unused struct proc *p, struct __pthread_cond_signal_args *uap, __unused register_t *retval)
	int condid = uap->condid;
	pthread_cond_t * cond;

	cond = pthread_id_to_cond(condid);

	COND_LOCK(cond->lock);

	if (cond->sig != _PTHREAD_KERN_COND_SIG)

	if ((p != cond->owner_proc) && (cond->pshared != PTHREAD_PROCESS_SHARED)) {

	COND_UNLOCK(cond->lock);

	kret = semaphore_signal(cond->sem);
	case KERN_INVALID_ADDRESS:
	case KERN_PROTECTION_FAILURE:
	case KERN_OPERATION_TIMED_OUT:

	COND_LOCK(cond->lock);

	COND_UNLOCK(cond->lock);
	pthread_cond_release(cond);
__pthread_cond_wait(__unused struct proc *p, struct __pthread_cond_wait_args *uap, __unused register_t *retval)
	int condid = uap->condid;
	pthread_cond_t * cond;
	int mutexid = uap->mutexid;
	pthread_mutex_t * mutex;

	cond = pthread_id_to_cond(condid);

	mutex = pthread_id_to_mutex(mutexid);
	pthread_cond_release(cond);

	COND_LOCK(cond->lock);

	if (cond->sig != _PTHREAD_KERN_COND_SIG)

	if ((p != cond->owner_proc) && (cond->pshared != PTHREAD_PROCESS_SHARED)) {

	COND_UNLOCK(cond->lock);

	kret = semaphore_wait(cond->sem);
	case KERN_INVALID_ADDRESS:
	case KERN_PROTECTION_FAILURE:
	case KERN_OPERATION_TIMED_OUT:

	COND_LOCK(cond->lock);

	COND_UNLOCK(cond->lock);
	pthread_cond_release(cond);
	pthread_mutex_release(mutex);
__pthread_cond_timedwait(__unused struct proc *p, struct __pthread_cond_timedwait_args *uap, __unused register_t *retval)
	int condid = uap->condid;
	pthread_cond_t * cond;
	int mutexid = uap->mutexid;
	pthread_mutex_t * mutex;
	mach_timespec_t absts;

	if ((error = copyin(uap->abstime, &absts, sizeof(mach_timespec_t))))

	cond = pthread_id_to_cond(condid);

	mutex = pthread_id_to_mutex(mutexid);
	pthread_cond_release(cond);

	COND_LOCK(cond->lock);

	if (cond->sig != _PTHREAD_KERN_COND_SIG)

	if ((p != cond->owner_proc) && (cond->pshared != PTHREAD_PROCESS_SHARED)) {

	COND_UNLOCK(cond->lock);

	kret = semaphore_timedwait(cond->sem, absts);
	case KERN_INVALID_ADDRESS:
	case KERN_PROTECTION_FAILURE:
	case KERN_OPERATION_TIMED_OUT:

	COND_LOCK(cond->lock);

	COND_UNLOCK(cond->lock);
	pthread_cond_release(cond);
	pthread_mutex_release(mutex);
bsdthread_create(__unused struct proc *p, struct bsdthread_create_args *uap, user_addr_t *retval)
	mach_vm_offset_t stackaddr;
	mach_vm_size_t th_allocsize = 0;
	mach_vm_size_t user_stacksize;
	mach_vm_size_t th_stacksize;
	mach_vm_offset_t th_stackaddr;
	mach_vm_offset_t th_stack;
	mach_vm_offset_t th_pthread;
	mach_port_t th_thport;
	user_addr_t user_func = uap->func;
	user_addr_t user_funcarg = uap->func_arg;
	user_addr_t user_stack = uap->stack;
	user_addr_t user_pthread = uap->pthread;
	unsigned int flags = (unsigned int)uap->flags;
	vm_map_t vmap = current_map();
	task_t ctask = current_task();
	unsigned int policy, importance;

	KERNEL_DEBUG_CONSTANT(0x9000080 | DBG_FUNC_START, flags, 0, 0, 0, 0);

	isLP64 = IS_64BIT_PROCESS(p);

#if defined(__ppc__)
	stackaddr = 0xF0000000;
#elif defined(__i386__)
	stackaddr = 0xB0000000;
#elif defined(__arm__)
	stackaddr = 0xB0000000;		/* XXX ARM */
#else
#error Need to define a stack address hint for this architecture
#endif
	kret = thread_create(ctask, &th);
	if (kret != KERN_SUCCESS)

	thread_reference(th);

	sright = (void *) convert_thread_to_port(th);
	th_thport = (void *)ipc_port_copyout_send(sright, get_task_ipcspace(ctask));

	if ((flags & PTHREAD_START_CUSTOM) == 0) {
		th_stacksize = (mach_vm_size_t)user_stack;	/* if it is custom then it is stacksize */
		th_allocsize = th_stacksize + PTH_DEFAULT_GUARDSIZE + p->p_pthsize;

		kret = mach_vm_map(vmap, &stackaddr,
				VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE, NULL,
				0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
		if (kret != KERN_SUCCESS)
			kret = mach_vm_allocate(vmap,
					&stackaddr, th_allocsize,
					VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE);
		if (kret != KERN_SUCCESS) {

		KERNEL_DEBUG_CONSTANT(0x9000080 |DBG_FUNC_NONE, th_allocsize, stackaddr, 0, 2, 0);

		th_stackaddr = stackaddr;

		/*
		 * The guard page is at the lowest address
		 * The stack base is the highest address
		 */
		kret = mach_vm_protect(vmap, stackaddr, PTH_DEFAULT_GUARDSIZE, FALSE, VM_PROT_NONE);

		if (kret != KERN_SUCCESS) {

		th_stack = (stackaddr + th_stacksize + PTH_DEFAULT_GUARDSIZE);
		th_pthread = (stackaddr + th_stacksize + PTH_DEFAULT_GUARDSIZE);
		user_stacksize = th_stacksize;
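
		/*
		 * Illustrative layout sketch (added comment, not in the original
		 * source): th_allocsize = guard + stack + pthread struct, i.e.
		 *
		 *  stackaddr                                      stackaddr + th_allocsize
		 *  | guard (PTH_DEFAULT_GUARDSIZE) | stack, grows down (th_stacksize) | pthread_t area (p_pthsize) |
		 *
		 * which is why th_stack and th_pthread above both point at
		 * stackaddr + th_stacksize + PTH_DEFAULT_GUARDSIZE.
		 */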
		th_stack = user_stack;
		user_stacksize = user_stack;
		th_pthread = user_pthread;

		KERNEL_DEBUG_CONSTANT(0x9000080 |DBG_FUNC_NONE, 0, 0, 0, 3, 0);

#if defined(__ppc__)
	/*
	 * Set up PowerPC registers...
	 * internally they are always kept as 64 bit and
	 * since the register set is the same between 32 and 64bit modes
	 * we don't need 2 different methods for setting the state
	 */
	ppc_thread_state64_t state64;
	ppc_thread_state64_t *ts64 = &state64;

	ts64->srr0 = (uint64_t)p->p_threadstart;
	ts64->r1 = (uint64_t)(th_stack - C_ARGSAVE_LEN - C_RED_ZONE);
	ts64->r3 = (uint64_t)th_pthread;
	ts64->r4 = (uint64_t)((unsigned int)th_thport);
	ts64->r5 = (uint64_t)user_func;
	ts64->r6 = (uint64_t)user_funcarg;
	ts64->r7 = (uint64_t)user_stacksize;
	ts64->r8 = (uint64_t)uap->flags;

	thread_set_wq_state64(th, (thread_state_t)ts64);

	thread_set_cthreadself(th, (uint64_t)th_pthread, isLP64);

#elif defined(__i386__)
	/*
	 * Set up i386 registers & function call.
	 */
	x86_thread_state32_t state;
	x86_thread_state32_t *ts = &state;

	ts->eip = (int)p->p_threadstart;
	ts->eax = (unsigned int)th_pthread;
	ts->ebx = (unsigned int)th_thport;
	ts->ecx = (unsigned int)user_func;
	ts->edx = (unsigned int)user_funcarg;
	ts->edi = (unsigned int)user_stacksize;
	ts->esi = (unsigned int)uap->flags;

	ts->esp = (int)((vm_offset_t)(th_stack - C_32_STK_ALIGN));

	thread_set_wq_state32(th, (thread_state_t)ts);

	x86_thread_state64_t state64;
	x86_thread_state64_t *ts64 = &state64;

	ts64->rip = (uint64_t)p->p_threadstart;
	ts64->rdi = (uint64_t)th_pthread;
	ts64->rsi = (uint64_t)((unsigned int)(th_thport));
	ts64->rdx = (uint64_t)user_func;
	ts64->rcx = (uint64_t)user_funcarg;
	ts64->r8 = (uint64_t)user_stacksize;
	ts64->r9 = (uint64_t)uap->flags;
	/*
	 * set stack pointer aligned to 16 byte boundary
	 */
	ts64->rsp = (uint64_t)(th_stack - C_64_REDZONE_LEN);

	thread_set_wq_state64(th, (thread_state_t)ts64);

#elif defined(__arm__)
	int flavor = 0, count = 0;

	kret = thread_getstatus(th, flavor, (thread_state_t)&state, &count);
	if (kret != KERN_SUCCESS) {

	kret = thread_setstatus(th, flavor, (thread_state_t)&state, count);
	if (kret != KERN_SUCCESS)

#else
#error bsdthread_create not defined for this architecture
#endif
	/* Set scheduling parameters if needed */
	if ((flags & PTHREAD_START_SETSCHED) != 0) {
		thread_extended_policy_data_t extinfo;
		thread_precedence_policy_data_t precedinfo;

		importance = (flags & PTHREAD_START_IMPORTANCE_MASK);
		policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;

		if (policy == SCHED_OTHER)
			extinfo.timeshare = 1;
			extinfo.timeshare = 0;
		thread_policy_set(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT);

		precedinfo.importance = importance;
		thread_policy_set(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT);

	kret = thread_resume(th);
	if (kret != KERN_SUCCESS) {

	thread_deallocate(th);	/* drop the creator reference */

	KERNEL_DEBUG_CONSTANT(0x9000080 |DBG_FUNC_END, error, (unsigned int)th_pthread, 0, 0, 0);

	*retval = th_pthread;

	(void)mach_vm_deallocate(vmap, stackaddr, th_allocsize);

	(void)mach_port_deallocate(get_task_ipcspace(ctask), (mach_port_name_t)th_thport);
	(void)thread_terminate(th);
	(void)thread_deallocate(th);
bsdthread_terminate(__unused struct proc *p, struct bsdthread_terminate_args *uap, __unused register_t *retval)
	mach_vm_offset_t freeaddr;
	mach_vm_size_t freesize;
	mach_port_name_t kthport = (mach_port_name_t)uap->port;
	mach_port_name_t sem = (mach_port_name_t)uap->sem;

	freeaddr = (mach_vm_offset_t)uap->stackaddr;
	freesize = uap->freesize;

	KERNEL_DEBUG_CONSTANT(0x9000084 |DBG_FUNC_START, (unsigned int)freeaddr, (unsigned int)freesize, (unsigned int)kthport, 0xff, 0);

	if (sem != MACH_PORT_NULL) {
		kret = semaphore_signal_internal_trap(sem);
		if (kret != KERN_SUCCESS) {

	if ((freesize != (mach_vm_size_t)0) && (freeaddr != (mach_vm_offset_t)0)) {
		kret = mach_vm_deallocate(current_map(), freeaddr, freesize);
		if (kret != KERN_SUCCESS) {

	(void) thread_terminate(current_thread());
	if (kthport != MACH_PORT_NULL)
		mach_port_deallocate(get_task_ipcspace(current_task()), kthport);
	thread_exception_return();
	panic("bsdthread_terminate: still running\n");

	KERNEL_DEBUG_CONSTANT(0x9000084 |DBG_FUNC_END, 0, 0, 0, 0xff, 0);
bsdthread_register(struct proc *p, struct bsdthread_register_args *uap, __unused register_t *retval)
	/* syscall randomizer test can pass bogus values */
	if (uap->pthsize > MAX_PTHREAD_SIZE) {

	p->p_threadstart = uap->threadstart;
	p->p_wqthread = uap->wqthread;
	p->p_pthsize = uap->pthsize;
int wq_stalled_window_usecs	= WQ_STALLED_WINDOW_USECS;
int wq_reduce_pool_window_usecs	= WQ_REDUCE_POOL_WINDOW_USECS;
int wq_max_run_latency_usecs	= WQ_MAX_RUN_LATENCY_USECS;
int wq_timer_interval_msecs	= WQ_TIMER_INTERVAL_MSECS;

SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW,
	   &wq_stalled_window_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW,
	   &wq_reduce_pool_window_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_max_run_latency_usecs, CTLFLAG_RW,
	   &wq_max_run_latency_usecs, 0, "");

SYSCTL_INT(_kern, OID_AUTO, wq_timer_interval_msecs, CTLFLAG_RW,
	   &wq_timer_interval_msecs, 0, "");
workqueue_init_lock(proc_t p)
{
	lck_mtx_init(&p->p_wqlock, pthread_lck_grp, pthread_lck_attr);
}

workqueue_destroy_lock(proc_t p)
{
	lck_mtx_destroy(&p->p_wqlock, pthread_lck_grp);
}

workqueue_lock(proc_t p)
{
	lck_mtx_lock(&p->p_wqlock);
}

workqueue_lock_spin(proc_t p)
{
	lck_mtx_lock_spin(&p->p_wqlock);
}

workqueue_unlock(proc_t p)
{
	lck_mtx_unlock(&p->p_wqlock);
}

workqueue_interval_timer_start(thread_call_t call, int interval_in_ms)
{
	uint64_t deadline;

	clock_interval_to_deadline(interval_in_ms, 1000 * 1000, &deadline);

	thread_call_enter_delayed(call, deadline);
}
workqueue_timer(struct workqueue *wq, __unused int param1)
	struct timeval tv, dtv;
	boolean_t added_more_threads = FALSE;
	boolean_t reset_maxactive = FALSE;
	boolean_t restart_timer = FALSE;

	KERNEL_DEBUG(0xefffd108, (int)wq, 0, 0, 0, 0);

	/*
	 * check to see if the stall frequency was beyond our tolerance
	 * or we have work on the queue, but haven't scheduled any
	 * new work within our acceptable time interval because
	 * there were no idle threads left to schedule
	 *
	 * WQ_TIMER_WATCH will only be set if we have 1 or more affinity
	 * groups that have stalled (no active threads and no idle threads)...
	 * it will not be set if all affinity groups have at least 1 thread
	 * that is currently runnable... if all processors have a runnable
	 * thread, there is no need to add more threads even if we're not
	 * scheduling new work within our allowed window... it just means
	 * that the work items are taking a long time to complete.
	 */
	if (wq->wq_flags & (WQ_ADD_TO_POOL | WQ_TIMER_WATCH)) {

		if (wq->wq_flags & WQ_ADD_TO_POOL)
			added_more_threads = TRUE;

		timersub(&tv, &wq->wq_lastran_ts, &dtv);

		if (((dtv.tv_sec * 1000000) + dtv.tv_usec) > wq_stalled_window_usecs)
			added_more_threads = TRUE;

		if (added_more_threads == TRUE) {
			for (i = 0; i < wq->wq_affinity_max && wq->wq_nthreads < WORKQUEUE_MAXTHREADS; i++) {
				(void)workqueue_addnewthread(wq);

	timersub(&tv, &wq->wq_reduce_ts, &dtv);

	if (((dtv.tv_sec * 1000000) + dtv.tv_usec) > wq_reduce_pool_window_usecs)
		reset_maxactive = TRUE;

	/*
	 * if the pool size has grown beyond the minimum number
	 * of threads needed to keep all of the processors busy, and
	 * the maximum number of threads scheduled concurrently during
	 * the last sample period didn't exceed half the current pool
	 * size, then it's time to trim the pool size back
	 */
	if (added_more_threads == FALSE &&
	    reset_maxactive == TRUE &&
	    wq->wq_nthreads > wq->wq_affinity_max &&
	    wq->wq_max_threads_scheduled <= (wq->wq_nthreads / 2)) {
		uint32_t nthreads_to_remove;

		if ((nthreads_to_remove = (wq->wq_nthreads / 4)) == 0)
			nthreads_to_remove = 1;

		for (i = 0; i < nthreads_to_remove && wq->wq_nthreads > wq->wq_affinity_max; i++)
			workqueue_removethread(wq);

	workqueue_lock_spin(wq->wq_proc);

	if (reset_maxactive == TRUE) {
		wq->wq_max_threads_scheduled = 0;
		microuptime(&wq->wq_reduce_ts);

	if (added_more_threads) {
		wq->wq_flags &= ~(WQ_ADD_TO_POOL | WQ_TIMER_WATCH);

		/*
		 * since we added more threads, we should be
		 * able to run some work if it's still available
		 */
		workqueue_run_nextitem(wq->wq_proc, THREAD_NULL);
		workqueue_lock_spin(wq->wq_proc);

	if ((wq->wq_nthreads > wq->wq_affinity_max) ||
	    (wq->wq_flags & WQ_TIMER_WATCH)) {
		restart_timer = TRUE;
		wq->wq_flags &= ~WQ_TIMER_RUNNING;

	workqueue_unlock(wq->wq_proc);

	/*
	 * we needed to knock down the WQ_TIMER_RUNNING flag while behind
	 * the workqueue lock... however, we don't want to hold the lock
	 * while restarting the timer and we certainly don't want 2 or more
	 * instances of the timer... so set a local to indicate the need
	 * for a restart since the state of wq_flags may change once we
	 * drop the workqueue lock...
	 */
	if (restart_timer == TRUE)
		workqueue_interval_timer_start(wq->wq_timer_call, wq_timer_interval_msecs);
	struct uthread *uth;
	struct threadlist *tl;
	struct workqueue *wq;

	uth = get_bsdthread_info(thread);
	tl = uth->uu_threadlist;

	case SCHED_CALL_BLOCK:
		uint32_t old_activecount;

		old_activecount = OSAddAtomic(-1, (SInt32 *)&wq->wq_thactivecount[tl->th_affinity_tag]);

		if (old_activecount == 1 && wq->wq_itemcount) {
			/*
			 * we were the last active thread on this affinity set
			 * and we've got work to do
			 */
			workqueue_lock_spin(wq->wq_proc);
			/*
			 * if this thread is blocking (not parking)
			 * and the idle list is empty for this affinity group
			 * we'll count it as a 'stall'
			 */
			if ((tl->th_flags & TH_LIST_RUNNING) &&
			    TAILQ_EMPTY(&wq->wq_thidlelist[tl->th_affinity_tag]))
				wq->wq_stalled_count++;

			workqueue_run_nextitem(wq->wq_proc, THREAD_NULL);
			/*
			 * workqueue_run_nextitem will drop the workqueue
			 * lock before it returns
			 */

		KERNEL_DEBUG(0xefffd020, (int)thread, wq->wq_threads_scheduled, tl->th_affinity_tag, 0, 0);

	case SCHED_CALL_UNBLOCK:
		/*
		 * we cannot take the workqueue_lock here...
		 * an UNBLOCK can occur from a timer event which
		 * is run from an interrupt context... if the workqueue_lock
		 * is already held by this processor, we'll deadlock...
		 * the thread lock for the thread being UNBLOCKED
		 */
		if (tl->th_unparked)
			OSAddAtomic(-1, (SInt32 *)&tl->th_unparked);
			OSAddAtomic(1, (SInt32 *)&wq->wq_thactivecount[tl->th_affinity_tag]);

		KERNEL_DEBUG(0xefffd024, (int)thread, wq->wq_threads_scheduled, tl->th_affinity_tag, 0, 0);
workqueue_removethread(struct workqueue *wq)
	struct threadlist *tl;
	uint32_t i, affinity_tag = 0;

	workqueue_lock_spin(wq->wq_proc);

	for (i = 0; i < wq->wq_affinity_max; i++) {

		affinity_tag = wq->wq_nextaffinitytag;

		if (affinity_tag == 0)
			affinity_tag = wq->wq_affinity_max - 1;

		wq->wq_nextaffinitytag = affinity_tag;

		/*
		 * look for an idle thread to steal from this affinity group
		 * but don't grab the only thread associated with it
		 */
		if (!TAILQ_EMPTY(&wq->wq_thidlelist[affinity_tag]) && wq->wq_thcount[affinity_tag] > 1) {
			tl = TAILQ_FIRST(&wq->wq_thidlelist[affinity_tag]);
			TAILQ_REMOVE(&wq->wq_thidlelist[affinity_tag], tl, th_entry);

			wq->wq_thcount[affinity_tag]--;

	workqueue_unlock(wq->wq_proc);

	thread_sched_call(tl->th_thread, NULL);

	if ( (tl->th_flags & TH_LIST_BLOCKED) )
		/*
		 * thread was created, but never used...
		 * need to clean up the stack and port ourselves
		 * since we're not going to spin up through the
		 * normal exit path triggered from Libc
		 */
		(void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, tl->th_allocsize);
		(void)mach_port_deallocate(get_task_ipcspace(wq->wq_task), (mach_port_name_t)tl->th_thport);

		thread_terminate(tl->th_thread);

	KERNEL_DEBUG(0xefffd030, (int)tl->th_thread, wq->wq_nthreads, tl->th_flags & TH_LIST_BLOCKED, 0, 0);
	/*
	 * drop our ref on the thread
	 */
	thread_deallocate(tl->th_thread);

	kfree(tl, sizeof(struct threadlist));
workqueue_addnewthread(struct workqueue *wq)
	struct threadlist *tl;
	struct uthread *uth;
	mach_vm_offset_t stackaddr;
	uint32_t affinity_tag;

	kret = thread_create(wq->wq_task, &th);

	if (kret != KERN_SUCCESS)

	tl = kalloc(sizeof(struct threadlist));
	bzero(tl, sizeof(struct threadlist));

#if defined(__ppc__)
	stackaddr = 0xF0000000;
#elif defined(__i386__)
	stackaddr = 0xB0000000;
#elif defined(__arm__)
	stackaddr = 0xB0000000;		/* XXX ARM */
#else
#error Need to define a stack address hint for this architecture
#endif
	tl->th_allocsize = PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE + p->p_pthsize;

	kret = mach_vm_map(wq->wq_map, &stackaddr,
			VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE, NULL,
			0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
			VM_INHERIT_DEFAULT);

	if (kret != KERN_SUCCESS) {
		kret = mach_vm_allocate(wq->wq_map,
				&stackaddr, tl->th_allocsize,
				VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE);

	if (kret == KERN_SUCCESS) {
		/*
		 * The guard page is at the lowest address
		 * The stack base is the highest address
		 */
		kret = mach_vm_protect(wq->wq_map, stackaddr, PTH_DEFAULT_GUARDSIZE, FALSE, VM_PROT_NONE);

		if (kret != KERN_SUCCESS)
			(void) mach_vm_deallocate(wq->wq_map, stackaddr, tl->th_allocsize);

	if (kret != KERN_SUCCESS) {
		(void) thread_terminate(th);

		kfree(tl, sizeof(struct threadlist));

	thread_reference(th);

	sright = (void *) convert_thread_to_port(th);
	tl->th_thport = (void *)ipc_port_copyout_send(sright, get_task_ipcspace(wq->wq_task));

	thread_static_param(th, TRUE);

	workqueue_lock_spin(p);

	affinity_tag = wq->wq_nextaffinitytag;
	wq->wq_nextaffinitytag = (affinity_tag + 1) % wq->wq_affinity_max;

	workqueue_unlock(p);

	tl->th_flags = TH_LIST_INITED | TH_LIST_SUSPENDED;

	tl->th_stackaddr = stackaddr;
	tl->th_affinity_tag = affinity_tag;

#if defined(__ppc__)
	//ml_fp_setvalid(FALSE);
	thread_set_cthreadself(th, (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE), IS_64BIT_PROCESS(p));
#endif /* __ppc__ */
	/*
	 * affinity tag of 0 means no affinity...
	 * but we want our tags to be 0 based because they
	 * are used to index arrays, so...
	 * keep it 0 based internally and bump by 1 when
	 * calling out to set it
	 */
	(void)thread_affinity_set(th, affinity_tag + 1);
	thread_sched_call(th, workqueue_callback);

	uth = get_bsdthread_info(tl->th_thread);
	uth->uu_threadlist = (void *)tl;

	workqueue_lock_spin(p);

	TAILQ_INSERT_TAIL(&wq->wq_thidlelist[tl->th_affinity_tag], tl, th_entry);

	wq->wq_thcount[affinity_tag]++;

	KERNEL_DEBUG1(0xefffd014 | DBG_FUNC_START, (int)current_thread(), affinity_tag, wq->wq_nthreads, 0, (int)tl->th_thread);

	/*
	 * work may have come into the queue while
	 * no threads were available to run... since
	 * we're adding a new thread, go evaluate the
	 */
	workqueue_run_nextitem(p, THREAD_NULL);
	/*
	 * workqueue_run_nextitem is responsible for
	 * dropping the workqueue lock in all cases
	 */
workq_open(__unused struct proc *p, __unused struct workq_open_args *uap, __unused register_t *retval)
	struct workqueue * wq;
	struct workitem * witem;
	struct workitemlist *wl;

	if (p->p_wqptr == NULL) {
		num_cpus = ml_get_max_cpus();

		size = (sizeof(struct workqueue)) +
			(num_cpus * sizeof(int *)) +
			(num_cpus * sizeof(TAILQ_HEAD(, threadlist)));

		ptr = (char *)kalloc(size);

		wq = (struct workqueue *)ptr;
		wq->wq_flags = WQ_LIST_INITED;
		wq->wq_affinity_max = num_cpus;
		wq->wq_task = current_task();
		wq->wq_map  = current_map();

		for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) {
			wl = (struct workitemlist *)&wq->wq_list[i];
			TAILQ_INIT(&wl->wl_itemlist);
			TAILQ_INIT(&wl->wl_freelist);

			for (j = 0; j < WORKITEM_SIZE; j++) {
				witem = &wq->wq_array[(i*WORKITEM_SIZE) + j];
				TAILQ_INSERT_TAIL(&wl->wl_freelist, witem, wi_entry);

		wq->wq_thactivecount = (uint32_t *)((char *)ptr + sizeof(struct workqueue));
		wq->wq_thcount = (uint32_t *)&wq->wq_thactivecount[wq->wq_affinity_max];
		wq->wq_thidlelist = (struct wq_thidlelist *)&wq->wq_thcount[wq->wq_affinity_max];
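
		/*
		 * Added note: the single kalloc'd block is carved up in place --
		 * the struct workqueue itself is followed by the per-affinity-group
		 * wq_thactivecount[] and wq_thcount[] arrays, and then by the
		 * per-group idle-thread TAILQ heads pointed at by wq_thidlelist,
		 * which is what the three pointer assignments above set up.
		 */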
		for (i = 0; i < wq->wq_affinity_max; i++)
			TAILQ_INIT(&wq->wq_thidlelist[i]);

		TAILQ_INIT(&wq->wq_thrunlist);

		p->p_wqptr = (void *)wq;

		workqueue_unlock(p);

		wq->wq_timer_call = thread_call_allocate((thread_call_func_t)workqueue_timer, (thread_call_param_t)wq);

		for (i = 0; i < wq->wq_affinity_max; i++) {
			(void)workqueue_addnewthread(wq);
		/* If unable to create any threads, return error */
		if (wq->wq_nthreads == 0)

		workqueue_lock_spin(p);

		microuptime(&wq->wq_reduce_ts);
		microuptime(&wq->wq_lastran_ts);
		wq->wq_max_threads_scheduled = 0;
		wq->wq_stalled_count = 0;

	workqueue_unlock(p);
workq_ops(struct proc *p, struct workq_ops_args *uap, __unused register_t *retval)
	int options = uap->options;
	int prio = uap->prio;	/* should be used to find the right workqueue */
	user_addr_t item = uap->item;
	thread_t th = THREAD_NULL;
	struct workqueue *wq;

	prio += 2;	/* normalize prio -2..+2 to 0..4 */

	case WQOPS_QUEUE_ADD: {

		KERNEL_DEBUG(0xefffd008 | DBG_FUNC_NONE, (int)item, 0, 0, 0, 0);

		workqueue_lock_spin(p);

		if ((wq = (struct workqueue *)p->p_wqptr) == NULL) {
			workqueue_unlock(p);

		error = workqueue_additem(wq, prio, item);

	case WQOPS_QUEUE_REMOVE: {

		workqueue_lock_spin(p);

		if ((wq = (struct workqueue *)p->p_wqptr) == NULL) {
			workqueue_unlock(p);

		error = workqueue_removeitem(wq, prio, item);

	case WQOPS_THREAD_RETURN: {

		th = current_thread();

		KERNEL_DEBUG(0xefffd004 | DBG_FUNC_END, 0, 0, 0, 0, 0);

		workqueue_lock_spin(p);

		if ((wq = (struct workqueue *)p->p_wqptr) == NULL) {
			workqueue_unlock(p);

	workqueue_run_nextitem(p, th);
	/*
	 * workqueue_run_nextitem is responsible for
	 * dropping the workqueue lock in all cases
	 */
workqueue_exit(struct proc *p)
	struct workqueue * wq;
	struct threadlist * tl, *tlist;

	if (p->p_wqptr != NULL) {

		workqueue_lock_spin(p);

		wq = (struct workqueue *)p->p_wqptr;

		workqueue_unlock(p);

		if (wq->wq_flags & WQ_TIMER_RUNNING)
			thread_call_cancel(wq->wq_timer_call);
		thread_call_free(wq->wq_timer_call);

		TAILQ_FOREACH_SAFE(tl, &wq->wq_thrunlist, th_entry, tlist) {
			/*
			 * drop our last ref on the thread
			 */
			thread_sched_call(tl->th_thread, NULL);
			thread_deallocate(tl->th_thread);

			TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
			kfree(tl, sizeof(struct threadlist));

		for (i = 0; i < wq->wq_affinity_max; i++) {
			TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlelist[i], th_entry, tlist) {
				/*
				 * drop our last ref on the thread
				 */
				thread_sched_call(tl->th_thread, NULL);
				thread_deallocate(tl->th_thread);

				TAILQ_REMOVE(&wq->wq_thidlelist[i], tl, th_entry);
				kfree(tl, sizeof(struct threadlist));

		kfree(wq, p->p_wqsize);
workqueue_additem(struct workqueue *wq, int prio, user_addr_t item)
	struct workitem *witem;
	struct workitemlist *wl;

	wl = (struct workitemlist *)&wq->wq_list[prio];

	if (TAILQ_EMPTY(&wl->wl_freelist))

	witem = (struct workitem *)TAILQ_FIRST(&wl->wl_freelist);
	TAILQ_REMOVE(&wl->wl_freelist, witem, wi_entry);

	witem->wi_item = item;
	TAILQ_INSERT_TAIL(&wl->wl_itemlist, witem, wi_entry);

	if (wq->wq_itemcount == 0) {
		microuptime(&wq->wq_lastran_ts);
		wq->wq_stalled_count = 0;
workqueue_removeitem(struct workqueue *wq, int prio, user_addr_t item)
	struct workitem *witem;
	struct workitemlist *wl;

	wl = (struct workitemlist *)&wq->wq_list[prio];

	TAILQ_FOREACH(witem, &wl->wl_itemlist, wi_entry) {
		if (witem->wi_item == item) {
			TAILQ_REMOVE(&wl->wl_itemlist, witem, wi_entry);

			witem->wi_item = (user_addr_t)0;
			TAILQ_INSERT_HEAD(&wl->wl_freelist, witem, wi_entry);

	if (wq->wq_itemcount == 0)
		wq->wq_flags &= ~(WQ_ADD_TO_POOL | WQ_TIMER_WATCH);
/*
 * workqueue_run_nextitem:
 *   called with the workqueue lock held...
 *   responsible for dropping it in all cases
 */
workqueue_run_nextitem(proc_t p, thread_t thread)
	struct workqueue *wq;
	struct workitem *witem = NULL;
	user_addr_t item = 0;
	thread_t th_to_run = THREAD_NULL;
	thread_t th_to_park = THREAD_NULL;
	int wake_thread = 0;
	int reuse_thread = 1;
	uint32_t stalled_affinity_count = 0;
	uint32_t affinity_tag;
	struct threadlist *tl = NULL;
	struct uthread *uth = NULL;
	struct workitemlist *wl;
	boolean_t start_timer = FALSE;
	struct timeval tv, lat_tv;

	wq = (struct workqueue *)p->p_wqptr;

	KERNEL_DEBUG(0xefffd000 | DBG_FUNC_START, (int)thread, wq->wq_threads_scheduled, wq->wq_stalled_count, 0, 0);

	if (wq->wq_itemcount == 0) {
		if ((th_to_park = thread) == THREAD_NULL)

	if (thread != THREAD_NULL) {
		/*
		 * we're a worker thread from the pool... currently we
		 * are considered 'active' which means we're counted
		 * in "wq_thactivecount"
		 */
		uth = get_bsdthread_info(thread);
		tl = uth->uu_threadlist;

		if (wq->wq_thactivecount[tl->th_affinity_tag] == 1) {
			/*
			 * we're the only active thread associated with our
			 * affinity group, so pick up some work and keep going
			 */

	for (affinity_tag = 0; affinity_tag < wq->wq_affinity_max; affinity_tag++) {
		/*
		 * look for first affinity group that is currently not active
		 * and has at least 1 idle thread
		 */
		if (wq->wq_thactivecount[affinity_tag] == 0) {
			if (!TAILQ_EMPTY(&wq->wq_thidlelist[affinity_tag]))

			stalled_affinity_count++;

	if (thread == THREAD_NULL) {
		/*
		 * we're not one of the 'worker' threads
		 */
		if (affinity_tag >= wq->wq_affinity_max) {
			/*
			 * we've already got at least 1 thread per
			 * affinity group in the active state... or
			 * we've got no idle threads to play with
			 */
			if (stalled_affinity_count) {

				if ( !(wq->wq_flags & WQ_TIMER_RUNNING) ) {
					wq->wq_flags |= WQ_TIMER_RUNNING;

				wq->wq_flags |= WQ_TIMER_WATCH;

		/*
		 * we're overbooked on the affinity group we're associated with,
		 * so park this thread
		 */
		th_to_park = thread;

		if (affinity_tag >= wq->wq_affinity_max) {
			/*
			 * all the affinity groups have active threads
			 * running, or there are no idle threads to
			 */
			if (stalled_affinity_count) {

				if ( !(wq->wq_flags & WQ_TIMER_RUNNING) ) {
					wq->wq_flags |= WQ_TIMER_RUNNING;

				wq->wq_flags |= WQ_TIMER_WATCH;

		/*
		 * we've got a candidate (affinity group with no currently
		 * active threads) to start a new thread on...
		 * we already know there is both work available
		 * and an idle thread with the correct affinity tag, so
		 * fall into the code that pulls a new thread and workitem...
		 * once we've kicked that thread off, we'll park this one
		 */
	tl = TAILQ_FIRST(&wq->wq_thidlelist[affinity_tag]);
	TAILQ_REMOVE(&wq->wq_thidlelist[affinity_tag], tl, th_entry);

	th_to_run = tl->th_thread;
	TAILQ_INSERT_TAIL(&wq->wq_thrunlist, tl, th_entry);

	if ((tl->th_flags & TH_LIST_SUSPENDED) == TH_LIST_SUSPENDED) {
		tl->th_flags &= ~TH_LIST_SUSPENDED;
	} else if ((tl->th_flags & TH_LIST_BLOCKED) == TH_LIST_BLOCKED) {
		tl->th_flags &= ~TH_LIST_BLOCKED;

	tl->th_flags |= TH_LIST_RUNNING;

	wq->wq_threads_scheduled++;

	if (wq->wq_threads_scheduled > wq->wq_max_threads_scheduled)
		wq->wq_max_threads_scheduled = wq->wq_threads_scheduled;

	for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) {
		wl = (struct workitemlist *)&wq->wq_list[i];

		if (!(TAILQ_EMPTY(&wl->wl_itemlist))) {

			witem = TAILQ_FIRST(&wl->wl_itemlist);
			TAILQ_REMOVE(&wl->wl_itemlist, witem, wi_entry);

			item = witem->wi_item;
			witem->wi_item = (user_addr_t)0;
			TAILQ_INSERT_HEAD(&wl->wl_freelist, witem, wi_entry);

		panic("workq_run_nextitem: NULL witem");

	if (thread != th_to_run) {
		/*
		 * we're starting up a thread from a parked/suspended condition
		 */
		OSAddAtomic(1, (SInt32 *)&wq->wq_thactivecount[tl->th_affinity_tag]);
		OSAddAtomic(1, (SInt32 *)&tl->th_unparked);

	if (wq->wq_itemcount == 0)
		wq->wq_flags &= ~WQ_TIMER_WATCH;

	/*
	 * if we had any affinity groups stall (no threads runnable)
	 * since we last scheduled an item... and
	 * the elapsed time since we last scheduled an item
	 * exceeds the latency tolerance...
	 * we ask the timer thread (which should already be running)
	 * to add some more threads to the pool
	 */
	if (wq->wq_stalled_count && !(wq->wq_flags & WQ_ADD_TO_POOL)) {
		timersub(&tv, &wq->wq_lastran_ts, &lat_tv);

		if (((lat_tv.tv_sec * 1000000) + lat_tv.tv_usec) > wq_max_run_latency_usecs)
			wq->wq_flags |= WQ_ADD_TO_POOL;

		KERNEL_DEBUG(0xefffd10c, wq->wq_stalled_count, lat_tv.tv_sec, lat_tv.tv_usec, wq->wq_flags, 0);

	wq->wq_lastran_ts = tv;

	wq->wq_stalled_count = 0;
	workqueue_unlock(p);

	KERNEL_DEBUG(0xefffd02c, wq->wq_thactivecount[0], wq->wq_thactivecount[1],
		     wq->wq_thactivecount[2], wq->wq_thactivecount[3], 0);

	KERNEL_DEBUG(0xefffd02c, wq->wq_thactivecount[4], wq->wq_thactivecount[5],
		     wq->wq_thactivecount[6], wq->wq_thactivecount[7], 0);

	/*
	 * if current thread is reused for workitem, does not return via unix_syscall
	 */
	wq_runitem(p, item, th_to_run, tl, reuse_thread, wake_thread, (thread == th_to_run));

	if (th_to_park == THREAD_NULL) {

		KERNEL_DEBUG(0xefffd000 | DBG_FUNC_END, (int)thread, (int)item, wq->wq_flags, 1, 0);

	workqueue_lock_spin(p);

	wq->wq_threads_scheduled--;
	/*
	 * this is a workqueue thread with no more
	 * work to do... park it for now
	 */
	uth = get_bsdthread_info(th_to_park);
	tl = uth->uu_threadlist;
		panic("wq thread with no threadlist ");

	TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
	tl->th_flags &= ~TH_LIST_RUNNING;

	tl->th_flags |= TH_LIST_BLOCKED;
	TAILQ_INSERT_HEAD(&wq->wq_thidlelist[tl->th_affinity_tag], tl, th_entry);

	assert_wait((caddr_t)tl, (THREAD_INTERRUPTIBLE));

	workqueue_unlock(p);

		workqueue_interval_timer_start(wq->wq_timer_call, wq_timer_interval_msecs);

	KERNEL_DEBUG1(0xefffd018 | DBG_FUNC_START, (int)current_thread(), wq->wq_threads_scheduled, 0, 0, (int)th_to_park);

	thread_block((thread_continue_t)thread_exception_return);

	panic("unexpected return from thread_block");

	workqueue_unlock(p);

		workqueue_interval_timer_start(wq->wq_timer_call, wq_timer_interval_msecs);

	KERNEL_DEBUG(0xefffd000 | DBG_FUNC_END, (int)thread, 0, wq->wq_flags, 2, 0);
wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl,
	   int reuse_thread, int wake_thread, int return_directly)

	KERNEL_DEBUG1(0xefffd004 | DBG_FUNC_START, (int)current_thread(), (int)item, wake_thread, tl->th_affinity_tag, (int)th);

	ret = setup_wqthread(p, th, item, reuse_thread, tl);

		panic("setup_wqthread failed %x\n", ret);

	if (return_directly) {
		thread_exception_return();

		panic("wq_runitem: thread_exception_return returned ...\n");

	KERNEL_DEBUG1(0xefffd018 | DBG_FUNC_END, (int)current_thread(), 0, 0, 0, (int)th);

	KERNEL_DEBUG1(0xefffd014 | DBG_FUNC_END, (int)current_thread(), 0, 0, 0, (int)th);
setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl)
#if defined(__ppc__)
	/*
	 * Set up PowerPC registers...
	 * internally they are always kept as 64 bit and
	 * since the register set is the same between 32 and 64bit modes
	 * we don't need 2 different methods for setting the state
	 */
	ppc_thread_state64_t state64;
	ppc_thread_state64_t *ts64 = &state64;

	ts64->srr0 = (uint64_t)p->p_wqthread;
	ts64->r1 = (uint64_t)((tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE) - C_ARGSAVE_LEN - C_RED_ZONE);
	ts64->r3 = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE);
	ts64->r4 = (uint64_t)((unsigned int)tl->th_thport);
	ts64->r5 = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_GUARDSIZE);
	ts64->r6 = (uint64_t)item;
	ts64->r7 = (uint64_t)reuse_thread;
	ts64->r8 = (uint64_t)0;

	thread_set_wq_state64(th, (thread_state_t)ts64);

#elif defined(__i386__)
	isLP64 = IS_64BIT_PROCESS(p);
	/*
	 * Set up i386 registers & function call.
	 */
	x86_thread_state32_t state;
	x86_thread_state32_t *ts = &state;

	ts->eip = (int)p->p_wqthread;
	ts->eax = (unsigned int)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE);
	ts->ebx = (unsigned int)tl->th_thport;
	ts->ecx = (unsigned int)(tl->th_stackaddr + PTH_DEFAULT_GUARDSIZE);
	ts->edx = (unsigned int)item;
	ts->edi = (unsigned int)reuse_thread;
	ts->esi = (unsigned int)0;

	ts->esp = (int)((vm_offset_t)((tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE) - C_32_STK_ALIGN));

	thread_set_wq_state32(th, (thread_state_t)ts);
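
	/*
	 * Added note (assumption about the user-side convention, not stated in
	 * this file): the registers initialized above are what the user-space
	 * workqueue entry point registered via bsdthread_register (p->p_wqthread)
	 * sees on entry -- roughly (stack top / self, thread port, address just
	 * above the guard page, work item, reuse flag); the 64-bit and ARM cases
	 * below set up the equivalent state for their calling conventions.
	 */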
	x86_thread_state64_t state64;
	x86_thread_state64_t *ts64 = &state64;

	ts64->rip = (uint64_t)p->p_wqthread;
	ts64->rdi = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE);
	ts64->rsi = (uint64_t)((unsigned int)(tl->th_thport));
	ts64->rdx = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_GUARDSIZE);
	ts64->rcx = (uint64_t)item;
	ts64->r8 = (uint64_t)reuse_thread;
	ts64->r9 = (uint64_t)0;
	/*
	 * set stack pointer aligned to 16 byte boundary
	 */
	ts64->rsp = (uint64_t)((tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE) - C_64_REDZONE_LEN);

	thread_set_wq_state64(th, (thread_state_t)ts64);

#elif defined(__arm__)
	arm_thread_state_t state;
	arm_thread_state_t *ts = &state;

	/* XXX ARM add more */
	ts->pc = p->p_wqthread;
	ts->sp = tl->th_stackaddr + PTH_DEFAULT_GUARDSIZE;

	thread_set_wq_state32(th, (thread_state_t)ts);

#else
#error setup_wqthread not defined for this architecture
#endif