xnu-2050.48.11: apple/xnu.git / bsd/kern/pthread_synch.c
index 7a00399cc23cd26e444a9a958bbd04a946858f8f..d037ee0a1402234839c5e9c7754c9776403d3d13 100644
@@ -91,6 +91,7 @@
 #include <mach/port.h>
 #include <vm/vm_protos.h>
 #include <vm/vm_map.h> /* for current_map() */
+#include <vm/vm_fault.h>
 #include <mach/thread_act.h> /* for thread_resume */
 #include <machine/machine_routines.h>
 #if defined(__i386__)
 #define KERNEL_DEBUG1 KERNEL_DEBUG_CONSTANT1
 #endif
 
-
-#if defined(__ppc__) || defined(__ppc64__)
-#include <architecture/ppc/cframe.h>
-#endif
-
-
 lck_grp_attr_t   *pthread_lck_grp_attr;
 lck_grp_t    *pthread_lck_grp;
 lck_attr_t   *pthread_lck_attr;
 
-extern kern_return_t thread_getstatus(register thread_t act, int flavor,
-                       thread_state_t tstate, mach_msg_type_number_t *count);
-extern kern_return_t thread_setstatus(thread_t thread, int flavor,
-                       thread_state_t tstate, mach_msg_type_number_t count);
 extern void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64);
 extern kern_return_t mach_port_deallocate(ipc_space_t, mach_port_name_t);
 extern kern_return_t semaphore_signal_internal_trap(mach_port_name_t);
 
 extern void workqueue_thread_yielded(void);
 
-static int workqueue_additem(struct workqueue *wq, int prio, user_addr_t item, int affinity);
-static int workqueue_removeitem(struct workqueue *wq, int prio, user_addr_t item);
-static boolean_t workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t th,
-                                       user_addr_t oc_item, int oc_prio, int oc_affinity);
-static void wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl,
+#if defined(__i386__) || defined(__x86_64__)
+extern boolean_t is_useraddr64_canonical(uint64_t addr64);
+#endif
+
+static boolean_t workqueue_run_nextreq(proc_t p, struct workqueue *wq, thread_t th, boolean_t force_oc,
+                                       boolean_t  overcommit, int oc_prio, int oc_affinity);
+
+static boolean_t workqueue_run_one(proc_t p, struct workqueue *wq, boolean_t overcommit, int priority);
+
+static void wq_runreq(proc_t p, boolean_t overcommit, uint32_t priority, thread_t th, struct threadlist *tl,
                       int reuse_thread, int wake_thread, int return_directly);
+
+static int setup_wqthread(proc_t p, thread_t th, boolean_t overcommit, uint32_t priority, int reuse_thread, struct threadlist *tl);
+
 static void wq_unpark_continue(void);
 static void wq_unsuspend_continue(void);
-static int setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl);
-static boolean_t workqueue_addnewthread(struct workqueue *wq);
-static void workqueue_removethread(struct threadlist *tl);
+
+static boolean_t workqueue_addnewthread(struct workqueue *wq, boolean_t oc_thread);
+static void workqueue_removethread(struct threadlist *tl, int fromexit);
 static void workqueue_lock_spin(proc_t);
 static void workqueue_unlock(proc_t);
+
 int proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc);
 int proc_setalltargetconc(pid_t pid, int32_t * targetconcp);
 
@@ -156,6 +156,12 @@ int proc_setalltargetconc(pid_t pid, int32_t * targetconcp);
 #define TRUNC_DOWN64(a,c)       ((((uint64_t)a)-(c)) & ((uint64_t)(-(c))))
 
 
+/* flag values for reuse field in the libc side _pthread_wqthread */
+#define        WQ_FLAG_THREAD_PRIOMASK         0x0000ffff
+#define        WQ_FLAG_THREAD_OVERCOMMIT       0x00010000      /* thread is with overcommit prio */
+#define        WQ_FLAG_THREAD_REUSE            0x00020000      /* thread is being reused */
+#define        WQ_FLAG_THREAD_NEWSPI           0x00040000      /* the call is with new SPIs */
+
 /*
 * Flags field passed to bsdthread_create and back in pthread_start 
 31  <---------------------------------> 0
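
The WQ_FLAG_* values defined in the hunk above are packed into the single "reuse" word the kernel hands back to libc's _pthread_wqthread. A minimal user-space sketch of unpacking that word (the struct and helper names are illustrative, not the actual libc code):

    #include <stdint.h>
    #include <stdbool.h>

    /* Hypothetical decode of the 'reuse' word; masks mirror the WQ_FLAG_* values above. */
    struct wq_start_flags {
        uint32_t priority;      /* WQ_FLAG_THREAD_PRIOMASK bits */
        bool     overcommit;    /* WQ_FLAG_THREAD_OVERCOMMIT */
        bool     reused;        /* WQ_FLAG_THREAD_REUSE */
        bool     new_spi;       /* WQ_FLAG_THREAD_NEWSPI */
    };

    static struct wq_start_flags
    wq_decode_start_flags(uint32_t reuse)
    {
        struct wq_start_flags f;
        f.priority   = reuse & 0x0000ffff;
        f.overcommit = (reuse & 0x00010000) != 0;
        f.reused     = (reuse & 0x00020000) != 0;
        f.new_spi    = (reuse & 0x00040000) != 0;
        return f;
    }
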
@@ -215,9 +221,7 @@ bsdthread_create(__unused struct proc *p, struct bsdthread_create_args  *uap, us
        isLP64 = IS_64BIT_PROCESS(p);
 
 
-#if defined(__ppc__)
-       stackaddr = 0xF0000000;
-#elif defined(__i386__) || defined(__x86_64__)
+#if defined(__i386__) || defined(__x86_64__)
        stackaddr = 0xB0000000;
 #else
 #error Need to define a stack address hint for this architecture
@@ -266,6 +270,22 @@ bsdthread_create(__unused struct proc *p, struct bsdthread_create_args  *uap, us
                th_stack = (stackaddr + th_stacksize + PTH_DEFAULT_GUARDSIZE);
                th_pthread = (stackaddr + th_stacksize + PTH_DEFAULT_GUARDSIZE);
                user_stacksize = th_stacksize;
+               
+              /*
+               * Pre-fault the first page of the new thread's stack and the page that will
+               * contain the pthread_t structure.
+               */      
+               vm_fault( vmap,
+                 vm_map_trunc_page(th_stack - PAGE_SIZE_64),
+                 VM_PROT_READ | VM_PROT_WRITE,
+                 FALSE, 
+                 THREAD_UNINT, NULL, 0);
+               
+               vm_fault( vmap,
+                 vm_map_trunc_page(th_pthread),
+                 VM_PROT_READ | VM_PROT_WRITE,
+                 FALSE, 
+                 THREAD_UNINT, NULL, 0);
        } else {
                th_stack = user_stack;
                user_stacksize = user_stack;
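
The two vm_fault() calls added above warm the top page of the new stack and the page that will hold the pthread_t before the thread first runs, so its first stores do not take a fault. A rough user-space analogue of the same idea, assuming stacksize and guardsize are page multiples (illustrative only, not the kernel path):

    #include <sys/mman.h>
    #include <string.h>
    #include <stddef.h>

    /* Allocate a thread stack with a guard region below it and a descriptor page
     * above it, then touch the pages that will be written first. */
    static void *alloc_prefaulted_stack(size_t stacksize, size_t guardsize, size_t pagesize)
    {
        size_t total = guardsize + stacksize + pagesize;
        char *base = mmap(NULL, total, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANON, -1, 0);
        if (base == MAP_FAILED)
            return NULL;

        mprotect(base, guardsize, PROT_NONE);            /* guard pages below the stack */

        char *stack_top = base + guardsize + stacksize;  /* stack grows down from here */
        memset(stack_top - pagesize, 0, pagesize);       /* pre-fault first stack page */
        memset(stack_top, 0, pagesize);                  /* pre-fault the descriptor page */
        return stack_top;
    }
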
@@ -275,31 +295,7 @@ bsdthread_create(__unused struct proc *p, struct bsdthread_create_args  *uap, us
 #endif
        }
        
-#if defined(__ppc__)
-       /*
-        * Set up PowerPC registers...
-        * internally they are always kept as 64 bit and
-        * since the register set is the same between 32 and 64bit modes
-        * we don't need 2 different methods for setting the state
-        */
-       {
-               ppc_thread_state64_t state64;
-               ppc_thread_state64_t *ts64 = &state64;
-
-               ts64->srr0 = (uint64_t)p->p_threadstart;
-               ts64->r1 = (uint64_t)(th_stack - C_ARGSAVE_LEN - C_RED_ZONE);
-               ts64->r3 = (uint64_t)th_pthread;
-               ts64->r4 = (uint64_t)(th_thport);
-               ts64->r5 = (uint64_t)user_func;
-               ts64->r6 = (uint64_t)user_funcarg;
-               ts64->r7 = (uint64_t)user_stacksize;
-               ts64->r8 = (uint64_t)uap->flags;
-
-               thread_set_wq_state64(th, (thread_state_t)ts64);
-
-               thread_set_cthreadself(th, (uint64_t)th_pthread, isLP64);
-       }
-#elif defined(__i386__) || defined(__x86_64__)
+#if defined(__i386__) || defined(__x86_64__)
        {
         /*
          * Set up i386 registers & function call.
@@ -338,6 +334,13 @@ bsdthread_create(__unused struct proc *p, struct bsdthread_create_args  *uap, us
                 */
                ts64->rsp = (uint64_t)(th_stack - C_64_REDZONE_LEN);
 
+               /* Disallow setting non-canonical PC or stack */
+               if (!is_useraddr64_canonical(ts64->rsp) ||
+                   !is_useraddr64_canonical(ts64->rip)) {
+                       error = EINVAL;
+                       goto out;
+               }
+
                thread_set_wq_state64(th, (thread_state_t)ts64);
        }
        }
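
The new check rejects a non-canonical rip or rsp before the thread state is installed. A minimal sketch of the rule being enforced, assuming the standard x86-64 48-bit canonical-address definition (an illustration, not the kernel's is_useraddr64_canonical() implementation):

    #include <stdint.h>
    #include <stdbool.h>

    /* Canonical: bits 63..47 are copies of bit 47. */
    static bool addr64_is_canonical(uint64_t addr)
    {
        return (uint64_t)((int64_t)(addr << 16) >> 16) == addr;   /* sign-extend from bit 47 */
    }

    /* User-space addresses live in the lower half, so the top 17 bits are zero. */
    static bool useraddr64_is_canonical(uint64_t addr)
    {
        return (addr >> 47) == 0;
    }
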
@@ -348,8 +351,16 @@ bsdthread_create(__unused struct proc *p, struct bsdthread_create_args  *uap, us
        if ((flags & PTHREAD_START_SETSCHED) != 0) {
                thread_extended_policy_data_t    extinfo;
                thread_precedence_policy_data_t   precedinfo;
+#if CONFIG_EMBEDDED
+               int ret = 0;
+#endif /* CONFIG_EMBEDDED */
 
                importance = (flags & PTHREAD_START_IMPORTANCE_MASK);
+#if CONFIG_EMBEDDED
+               /* sets the saved importance for an Apple iOS daemon if backgrounded; otherwise returns 0 */
+               ret = proc_setthread_saved_importance(th, importance);
+               if (ret == 0) {
+#endif /* CONFIG_EMBEDDED */
                policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;
 
                if (policy == SCHED_OTHER)
@@ -361,6 +372,9 @@ bsdthread_create(__unused struct proc *p, struct bsdthread_create_args  *uap, us
 #define BASEPRI_DEFAULT 31
                precedinfo.importance = (importance - BASEPRI_DEFAULT);
                thread_policy_set(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT);
+#if CONFIG_EMBEDDED
+               }
+#endif /* CONFIG_EMBEDDED */
        }
 
        kret = thread_resume(th);
@@ -453,26 +467,33 @@ uint32_t wq_stalled_window_usecs  = WQ_STALLED_WINDOW_USECS;
 uint32_t wq_reduce_pool_window_usecs   = WQ_REDUCE_POOL_WINDOW_USECS;
 uint32_t wq_max_timer_interval_usecs   = WQ_MAX_TIMER_INTERVAL_USECS;
 uint32_t wq_max_threads                        = WORKQUEUE_MAXTHREADS;
+uint32_t wq_max_constrained_threads    = WORKQUEUE_MAXTHREADS / 8;
 
 
-SYSCTL_INT(_kern, OID_AUTO, wq_yielded_threshold, CTLFLAG_RW,
+SYSCTL_INT(_kern, OID_AUTO, wq_yielded_threshold, CTLFLAG_RW | CTLFLAG_LOCKED,
           &wq_yielded_threshold, 0, "");
 
-SYSCTL_INT(_kern, OID_AUTO, wq_yielded_window_usecs, CTLFLAG_RW,
+SYSCTL_INT(_kern, OID_AUTO, wq_yielded_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
           &wq_yielded_window_usecs, 0, "");
 
-SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW,
+SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
           &wq_stalled_window_usecs, 0, "");
 
-SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW,
+SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
           &wq_reduce_pool_window_usecs, 0, "");
 
-SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW,
+SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
           &wq_max_timer_interval_usecs, 0, "");
 
-SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW,
+SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
           &wq_max_threads, 0, "");
 
+SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
+          &wq_max_constrained_threads, 0, "");
+
+
+static uint32_t wq_init_constrained_limit = 1;
+
 
 void
 workqueue_init_lock(proc_t p)
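
Like the existing knobs, the new constrained-pool limit is tunable at run time through sysctl. A small user-space probe, assuming a kernel with this change applied (the OID name follows from the SYSCTL_INT declaration above):

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t limit = 0;
        size_t len = sizeof(limit);

        if (sysctlbyname("kern.wq_max_constrained_threads", &limit, &len, NULL, 0) == 0)
            printf("constrained workqueue thread limit: %u\n", limit);
        else
            perror("sysctlbyname");
        return 0;
    }
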
@@ -519,7 +540,7 @@ workqueue_interval_timer_start(struct workqueue *wq)
 
        thread_call_enter_delayed(wq->wq_atimer_call, deadline);
 
-       KERNEL_DEBUG(0xefffd110, wq, wq->wq_itemcount, wq->wq_flags, wq->wq_timer_interval, 0);
+       KERNEL_DEBUG(0xefffd110, wq, wq->wq_reqcount, wq->wq_flags, wq->wq_timer_interval, 0);
 }
 
 
@@ -542,11 +563,9 @@ wq_thread_is_busy(uint64_t cur_ts, uint64_t *lastblocked_tsp)
         */
        lastblocked_ts = *lastblocked_tsp;
 
-#if defined(__ppc__)
-#else
        if ( !OSCompareAndSwap64((UInt64)lastblocked_ts, (UInt64)lastblocked_ts, lastblocked_tsp))
                return (TRUE);
-#endif
+
        if (lastblocked_ts >= cur_ts) {
                /*
                 * because the update of the timestamp when a thread blocks isn't
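
With the PPC special case gone, the self-compare OSCompareAndSwap64(x, x, p) above is kept as a 32-bit-safe atomic read of the 64-bit timestamp: if the exchange fails, the value changed mid-read and the thread is simply treated as busy. In C11 terms the read side is just an atomic load; a sketch, not the kernel code:

    #include <stdatomic.h>
    #include <stdint.h>

    /* Equivalent read of the last-blocked timestamp using a C11 atomic load. */
    static uint64_t read_lastblocked(_Atomic uint64_t *lastblocked_tsp)
    {
        return atomic_load_explicit(lastblocked_tsp, memory_order_relaxed);
    }
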
@@ -641,14 +660,14 @@ again:
                 * new work within our acceptable time interval because
                 * there were no idle threads left to schedule
                 */
-               if (wq->wq_itemcount) {
+               if (wq->wq_reqcount) {
                        uint32_t        priority;
                        uint32_t        affinity_tag;
                        uint32_t        i;
                        uint64_t        curtime;
 
                        for (priority = 0; priority < WORKQUEUE_NUMPRIOS; priority++) {
-                               if (wq->wq_list_bitmap & (1 << priority))
+                               if (wq->wq_requests[priority])
                                        break;
                        }
                        assert(priority < WORKQUEUE_NUMPRIOS);
@@ -682,27 +701,27 @@ again:
                                        }
                                }
                                if (add_thread == TRUE) {
-                                       retval = workqueue_addnewthread(wq);
+                                       retval = workqueue_addnewthread(wq, FALSE);
                                        break;
                                }
                        }
-                       if (wq->wq_itemcount) {
+                       if (wq->wq_reqcount) {
                                /*
                                 * as long as we have threads to schedule, and we successfully
                                 * scheduled new work, keep trying
                                 */
                                while (wq->wq_thidlecount && !(wq->wq_flags & WQ_EXITING)) {
                                        /*
-                                        * workqueue_run_nextitem is responsible for
+                                        * workqueue_run_nextreq is responsible for
                                         * dropping the workqueue lock in all cases
                                         */
-                                       retval = workqueue_run_nextitem(p, wq, THREAD_NULL, 0, 0, 0);
+                                       retval = workqueue_run_nextreq(p, wq, THREAD_NULL, FALSE, FALSE, 0, 0);
                                        workqueue_lock_spin(p);
 
                                        if (retval == FALSE)
                                                break;
                                }
-                               if ( !(wq->wq_flags & WQ_EXITING) && wq->wq_itemcount) {
+                               if ( !(wq->wq_flags & WQ_EXITING) && wq->wq_reqcount) {
 
                                        if (wq->wq_thidlecount == 0 && retval == TRUE && add_thread == TRUE)
                                                goto again;
@@ -710,7 +729,7 @@ again:
                                        if (wq->wq_thidlecount == 0 || busycount)
                                                WQ_TIMER_NEEDED(wq, start_timer);
 
-                                       KERNEL_DEBUG(0xefffd108 | DBG_FUNC_NONE, wq, wq->wq_itemcount, wq->wq_thidlecount, busycount, 0);
+                                       KERNEL_DEBUG(0xefffd108 | DBG_FUNC_NONE, wq, wq->wq_reqcount, wq->wq_thidlecount, busycount, 0);
                                }
                        }
                }
@@ -745,12 +764,12 @@ workqueue_thread_yielded(void)
 
        p = current_proc();
 
-       if ((wq = p->p_wqptr) == NULL || wq->wq_itemcount == 0)
+       if ((wq = p->p_wqptr) == NULL || wq->wq_reqcount == 0)
                return;
        
        workqueue_lock_spin(p);
 
-       if (wq->wq_itemcount) {
+       if (wq->wq_reqcount) {
                uint64_t        curtime;
                uint64_t        elapsed;
                clock_sec_t     secs;
@@ -763,7 +782,7 @@ workqueue_thread_yielded(void)
                        workqueue_unlock(p);
                        return;
                }
-               KERNEL_DEBUG(0xefffd138 | DBG_FUNC_START, wq, wq->wq_thread_yielded_count, wq->wq_itemcount, 0, 0);
+               KERNEL_DEBUG(0xefffd138 | DBG_FUNC_START, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 0, 0);
 
                wq->wq_thread_yielded_count = 0;
 
@@ -774,16 +793,16 @@ workqueue_thread_yielded(void)
                if (secs == 0 && usecs < wq_yielded_window_usecs) {
 
                        if (wq->wq_thidlecount == 0) {
-                               workqueue_addnewthread(wq);
+                               workqueue_addnewthread(wq, TRUE);
                                /*
                                 * 'workqueue_addnewthread' drops the workqueue lock
                                 * when creating the new thread and then retakes it before
                                 * returning... this window allows other threads to process
-                                * work on the queue, so we need to recheck for available work
+                                * requests, so we need to recheck for available work
                                 * if none found, we just return...  the newly created thread
                                 * will eventually get used (if it hasn't already)...
                                 */
-                               if (wq->wq_itemcount == 0) {
+                               if (wq->wq_reqcount == 0) {
                                        workqueue_unlock(p);
                                        return;
                                }
@@ -791,9 +810,8 @@ workqueue_thread_yielded(void)
                        if (wq->wq_thidlecount) {
                                uint32_t        priority;
                                uint32_t        affinity = -1;
-                               user_addr_t     item;
-                               struct workitem *witem = NULL;
-                               struct workitemlist *wl = NULL;
+                               boolean_t       overcommit = FALSE;
+                               boolean_t       force_oc = FALSE;
                                struct uthread    *uth;
                                struct threadlist *tl;
 
@@ -802,38 +820,31 @@ workqueue_thread_yielded(void)
                                        affinity = tl->th_affinity_tag;
 
                                for (priority = 0; priority < WORKQUEUE_NUMPRIOS; priority++) {
-                                       if (wq->wq_list_bitmap & (1 << priority)) {
-                                               wl = (struct workitemlist *)&wq->wq_list[priority];
+                                       if (wq->wq_requests[priority])
                                                break;
-                                       }
                                }
-                               assert(wl != NULL);
-                               assert(!(TAILQ_EMPTY(&wl->wl_itemlist)));
-
-                               witem = TAILQ_FIRST(&wl->wl_itemlist);
-                               TAILQ_REMOVE(&wl->wl_itemlist, witem, wi_entry);
+                               assert(priority < WORKQUEUE_NUMPRIOS);
 
-                               if (TAILQ_EMPTY(&wl->wl_itemlist))
-                                       wq->wq_list_bitmap &= ~(1 << priority);
-                               wq->wq_itemcount--;
+                               wq->wq_reqcount--;
+                               wq->wq_requests[priority]--;
 
-                               item = witem->wi_item;
-                               witem->wi_item = (user_addr_t)0;
-                               witem->wi_affinity = 0;
+                               if (wq->wq_ocrequests[priority]) {
+                                       wq->wq_ocrequests[priority]--;
+                                       overcommit = TRUE;
+                               } else
+                                       force_oc = TRUE;
 
-                               TAILQ_INSERT_HEAD(&wl->wl_freelist, witem, wi_entry);
-
-                               (void)workqueue_run_nextitem(p, wq, THREAD_NULL, item, priority, affinity);
+                               (void)workqueue_run_nextreq(p, wq, THREAD_NULL, force_oc, overcommit, priority, affinity);
                                /*
-                                * workqueue_run_nextitem is responsible for
+                                * workqueue_run_nextreq is responsible for
                                 * dropping the workqueue lock in all cases
                                 */
-                               KERNEL_DEBUG(0xefffd138 | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_itemcount, 1, 0);
+                               KERNEL_DEBUG(0xefffd138 | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 1, 0);
 
                                return;
                        }
                }
-               KERNEL_DEBUG(0xefffd138 | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_itemcount, 2, 0);
+               KERNEL_DEBUG(0xefffd138 | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 2, 0);
        }
        workqueue_unlock(p);
 }
@@ -876,15 +887,10 @@ workqueue_callback(int type, thread_t thread)
                         * since another thread would have to get scheduled and then block after we start down 
                         * this path), it's not a problem.  Either timestamp is adequate, so no need to retry
                         */
-#if defined(__ppc__)
-                       /*
-                        * this doesn't have to actually work reliablly for PPC, it just has to compile/link
-                        */
-                       *lastblocked_ptr = (UInt64)curtime;
-#else
+
                        OSCompareAndSwap64(*lastblocked_ptr, (UInt64)curtime, lastblocked_ptr);
-#endif
-                       if (wq->wq_itemcount)
+
+                       if (wq->wq_reqcount)
                                WQ_TIMER_NEEDED(wq, start_timer);
 
                        if (start_timer == TRUE)
@@ -913,17 +919,23 @@ workqueue_callback(int type, thread_t thread)
 
 
 static void
-workqueue_removethread(struct threadlist *tl)
+workqueue_removethread(struct threadlist *tl, int fromexit)
 {
        struct workqueue *wq;
        struct uthread * uth;
 
+       /*
+        * If fromexit is set, the call is from workqueue_exit(),
+        * so some of the cleanup is skipped.
+        */
        wq = tl->th_workq;
 
        TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
 
-       wq->wq_nthreads--;
-       wq->wq_thidlecount--;
+       if (fromexit == 0) {
+               wq->wq_nthreads--;
+               wq->wq_thidlecount--;
+       }
 
        /*
         * Clear the threadlist pointer in uthread so 
@@ -937,7 +949,10 @@ workqueue_removethread(struct threadlist *tl)
        if (uth != (struct uthread *)0) {
                uth->uu_threadlist = NULL;
        }
-       workqueue_unlock(wq->wq_proc);
+       if (fromexit == 0) {
+               /* during exit the lock is not held */
+               workqueue_unlock(wq->wq_proc);
+       }
 
        if ( (tl->th_flags & TH_LIST_SUSPENDED) ) {
                /*
@@ -946,7 +961,10 @@ workqueue_removethread(struct threadlist *tl)
                 * since we're not going to spin up through the
                 * normal exit path triggered from Libc
                 */
-               (void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, tl->th_allocsize);
+               if (fromexit == 0) {
+                       /* vm map is already deallocated when this is called from exit */
+                       (void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, tl->th_allocsize);
+               }
                (void)mach_port_deallocate(get_task_ipcspace(wq->wq_task), tl->th_thport);
 
                KERNEL_DEBUG1(0xefffd014 | DBG_FUNC_END, wq, (uintptr_t)thread_tid(current_thread()), wq->wq_nthreads, 0xdead, thread_tid(tl->th_thread));
@@ -963,9 +981,13 @@ workqueue_removethread(struct threadlist *tl)
 }
 
 
-
+/*
+ * called with workq lock held
+ * dropped and retaken around thread creation
+ * return with workq lock held
+ */
 static boolean_t
-workqueue_addnewthread(struct workqueue *wq)
+workqueue_addnewthread(struct workqueue *wq, boolean_t oc_thread)
 {
        struct threadlist *tl;
        struct uthread  *uth;
@@ -975,8 +997,28 @@ workqueue_addnewthread(struct workqueue *wq)
        void            *sright;
        mach_vm_offset_t stackaddr;
 
-       if (wq->wq_nthreads >= wq_max_threads || wq->wq_nthreads >= (CONFIG_THREAD_MAX - 20))
+       if ((wq->wq_flags & WQ_EXITING) == WQ_EXITING)
                return (FALSE);
+
+       if (wq->wq_nthreads >= wq_max_threads || wq->wq_nthreads >= (CONFIG_THREAD_MAX - 20)) {
+               wq->wq_lflags |= WQL_EXCEEDED_TOTAL_THREAD_LIMIT;
+               return (FALSE);
+       }
+       wq->wq_lflags &= ~WQL_EXCEEDED_TOTAL_THREAD_LIMIT;
+
+       if (oc_thread == FALSE && wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
+               /*
+                * if we're not creating this thread to service an overcommit request,
+                * then check the size of the constrained thread pool...  if we've already
+                * reached our max for threads scheduled from this pool, don't create a new
+                * one... the callers of this function are prepared for failure.
+                */
+               wq->wq_lflags |= WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
+               return (FALSE);
+       }
+       if (wq->wq_constrained_threads_scheduled < wq_max_constrained_threads)
+               wq->wq_lflags &= ~WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
+
        wq->wq_nthreads++;
 
        p = wq->wq_proc;
@@ -990,9 +1032,7 @@ workqueue_addnewthread(struct workqueue *wq)
        tl = kalloc(sizeof(struct threadlist));
        bzero(tl, sizeof(struct threadlist));
 
-#if defined(__ppc__)
-       stackaddr = 0xF0000000;
-#elif defined(__i386__) || defined(__x86_64__)
+#if defined(__i386__) || defined(__x86_64__)
        stackaddr = 0xB0000000;
 #else
 #error Need to define a stack address hint for this architecture
@@ -1023,6 +1063,7 @@ workqueue_addnewthread(struct workqueue *wq)
        }
        if (kret != KERN_SUCCESS) {
                (void) thread_terminate(th);
+               thread_deallocate(th);
 
                kfree(tl, sizeof(struct threadlist));
                goto failed;
@@ -1043,16 +1084,11 @@ workqueue_addnewthread(struct workqueue *wq)
        tl->th_priority = WORKQUEUE_NUMPRIOS;
        tl->th_policy = -1;
 
-#if defined(__ppc__)
-       //ml_fp_setvalid(FALSE);
-       thread_set_cthreadself(th, (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE), IS_64BIT_PROCESS(p));
-#endif /* __ppc__ */
-
        uth = get_bsdthread_info(tl->th_thread);
-       uth->uu_threadlist = (void *)tl;
 
         workqueue_lock_spin(p);
        
+       uth->uu_threadlist = (void *)tl;
        TAILQ_INSERT_TAIL(&wq->wq_thidlelist, tl, th_entry);
 
        wq->wq_thidlecount++;
@@ -1076,17 +1112,31 @@ workq_open(struct proc *p, __unused struct workq_open_args  *uap, __unused int32
        int wq_size;
        char * ptr;
        char * nptr;
-       int j;
        uint32_t i;
        uint32_t num_cpus;
        int error = 0;
        boolean_t need_wakeup = FALSE;
-       struct workitem * witem;
-       struct workitemlist *wl;
+
 
        if ((p->p_lflag & P_LREGISTER) == 0)
                return(EINVAL);
 
+       num_cpus = ml_get_max_cpus();
+
+       if (wq_init_constrained_limit) {
+               uint32_t limit;
+               /*
+                * set up the limit for the constrained pool
+                * this is a virtual pool in that we don't
+                * maintain it on a separate idle and run list
+                */
+               limit = num_cpus * (WORKQUEUE_NUMPRIOS + 1);
+
+               if (limit > wq_max_constrained_threads)
+                       wq_max_constrained_threads = limit;
+
+               wq_init_constrained_limit = 0;
+       }
        workqueue_lock_spin(p);
 
        if (p->p_wqptr == NULL) {
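
The one-time sizing above scales the constrained pool with the CPU count and only raises the boot-time default, never lowers it. A worked sketch of the arithmetic (WORKQUEUE_NUMPRIOS is assumed to be 4, matching the importance/policy tables later in this patch):

    #include <stdint.h>

    #define WORKQUEUE_NUMPRIOS 4   /* assumption for this sketch */

    /* e.g. 8 CPUs -> 8 * (4 + 1) = 40 threads, used only if larger than the default. */
    static uint32_t constrained_limit(uint32_t num_cpus, uint32_t boot_default)
    {
        uint32_t limit = num_cpus * (WORKQUEUE_NUMPRIOS + 1);
        return (limit > boot_default) ? limit : boot_default;
    }
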
@@ -1107,13 +1157,11 @@ workq_open(struct proc *p, __unused struct workq_open_args  *uap, __unused int32
 
                workqueue_unlock(p);
 
-               num_cpus = ml_get_max_cpus();
-
                wq_size = sizeof(struct workqueue) +
-                       (num_cpus * WORKQUEUE_NUMPRIOS * sizeof(uint32_t)) +
+                       (num_cpus * WORKQUEUE_NUMPRIOS * sizeof(uint16_t)) +
                        (num_cpus * WORKQUEUE_NUMPRIOS * sizeof(uint32_t)) +
                        (num_cpus * WORKQUEUE_NUMPRIOS * sizeof(uint64_t)) +
-                       sizeof(uint64_t);
+                       sizeof(uint32_t) + sizeof(uint64_t);
 
                ptr = (char *)kalloc(wq_size);
                bzero(ptr, wq_size);
@@ -1125,25 +1173,20 @@ workq_open(struct proc *p, __unused struct workq_open_args  *uap, __unused int32
                wq->wq_task = current_task();
                wq->wq_map  = current_map();
 
-               for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) {
-                       wl = (struct workitemlist *)&wq->wq_list[i];
-                       TAILQ_INIT(&wl->wl_itemlist);
-                       TAILQ_INIT(&wl->wl_freelist);
-
-                       for (j = 0; j < WORKITEM_SIZE; j++) {
-                               witem = &wq->wq_array[(i*WORKITEM_SIZE) + j];
-                               TAILQ_INSERT_TAIL(&wl->wl_freelist, witem, wi_entry);
-                       }
+               for (i = 0; i < WORKQUEUE_NUMPRIOS; i++)
                        wq->wq_reqconc[i] = wq->wq_affinity_max;
-               }
+
                nptr = ptr + sizeof(struct workqueue);
 
                for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) {
-                       wq->wq_thactive_count[i] = (uint32_t *)nptr;
-                       nptr += (num_cpus * sizeof(uint32_t));
+                       wq->wq_thscheduled_count[i] = (uint16_t *)nptr;
+                       nptr += (num_cpus * sizeof(uint16_t));
                }
+               nptr += (sizeof(uint32_t) - 1);
+               nptr = (char *)((uintptr_t)nptr & ~(sizeof(uint32_t) - 1));
+
                for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) {
-                       wq->wq_thscheduled_count[i] = (uint32_t *)nptr;
+                       wq->wq_thactive_count[i] = (uint32_t *)nptr;
                        nptr += (num_cpus * sizeof(uint32_t));
                }
                /*
@@ -1153,7 +1196,7 @@ workq_open(struct proc *p, __unused struct workq_open_args  *uap, __unused int32
                 * the size for the allocation of the workqueue struct
                 */
                nptr += (sizeof(uint64_t) - 1);
-               nptr = (char *)((long)nptr & ~(sizeof(uint64_t) - 1));
+               nptr = (char *)((uintptr_t)nptr & ~(sizeof(uint64_t) - 1));
 
                for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) {
                        wq->wq_lastblocked_ts[i] = (uint64_t *)nptr;
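
Because the per-priority arrays are now carved out of one kalloc'd block with mixed element sizes (uint16_t, uint32_t, uint64_t), the cursor has to be realigned between them. The add-then-mask steps above are the usual round-up idiom; its generic form:

    #include <stdint.h>
    #include <stddef.h>

    /* Round a cursor up to the next 'align' boundary (align must be a power of two). */
    static char *align_up(char *p, size_t align)
    {
        uintptr_t u = (uintptr_t)p + (align - 1);
        return (char *)(u & ~(uintptr_t)(align - 1));
    }
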
@@ -1180,73 +1223,86 @@ out:
        return(error);
 }
 
+
 int
 workq_kernreturn(struct proc *p, struct workq_kernreturn_args  *uap, __unused int32_t *retval)
 {
-       user_addr_t item = uap->item;
-       int options     = uap->options;
-       int prio        = uap->prio;    /* should  be used to find the right workqueue */
-       int affinity    = uap->affinity;
-       int error       = 0;
-       thread_t th     = THREAD_NULL;
-       user_addr_t oc_item = 0;
         struct workqueue *wq;
+       int error       = 0;
 
        if ((p->p_lflag & P_LREGISTER) == 0)
                return(EINVAL);
 
-       /*
-        * affinity not yet hooked up on this path
-        */
-       affinity = -1;
+       switch (uap->options) {
 
-       switch (options) {
+               case WQOPS_QUEUE_NEWSPISUPP:
+                       break;
+
+               case WQOPS_QUEUE_REQTHREADS: {
+                       /*
+                        * for this operation, we re-purpose the affinity
+                        * argument as the number of threads to start
+                        */
+                       boolean_t overcommit = FALSE;
+                       int priority         = uap->prio;
+                       int reqcount         = uap->affinity;
 
-               case WQOPS_QUEUE_ADD: {
-                       
-                       if (prio & WORKQUEUE_OVERCOMMIT) {
-                               prio &= ~WORKQUEUE_OVERCOMMIT;
-                               oc_item = item;
+                       if (priority & WORKQUEUE_OVERCOMMIT) {
+                               priority &= ~WORKQUEUE_OVERCOMMIT;
+                               overcommit = TRUE;
+                       }
+                       if ((reqcount <= 0) || (priority < 0) || (priority >= WORKQUEUE_NUMPRIOS)) {
+                               error = EINVAL;
+                               break;
                        }
-                       if ((prio < 0) || (prio >= WORKQUEUE_NUMPRIOS))
-                               return (EINVAL);
+                        workqueue_lock_spin(p);
 
-                       workqueue_lock_spin(p);
+                        if ((wq = (struct workqueue *)p->p_wqptr) == NULL) {
+                                workqueue_unlock(p);
 
-                       if ((wq = (struct workqueue *)p->p_wqptr) == NULL) {
-                               workqueue_unlock(p);
-                               return (EINVAL);
-                       }
-                       if (wq->wq_thidlecount == 0 && (oc_item || (wq->wq_nthreads < wq->wq_affinity_max))) {
+                               error = EINVAL;
+                               break;
+                        }
+                       if (overcommit == FALSE) {
+                               wq->wq_reqcount += reqcount;
+                               wq->wq_requests[priority] += reqcount;
+                               
+                               KERNEL_DEBUG(0xefffd008 | DBG_FUNC_NONE, wq, priority, wq->wq_requests[priority], reqcount, 0);
+
+                               while (wq->wq_reqcount) {
+                                       if (workqueue_run_one(p, wq, overcommit, priority) == FALSE)
+                                               break;
+                               }
+                       } else {
+                               KERNEL_DEBUG(0xefffd13c | DBG_FUNC_NONE, wq, priority, wq->wq_requests[priority], reqcount, 0);
 
-                               workqueue_addnewthread(wq);
+                               while (reqcount) {
+                                       if (workqueue_run_one(p, wq, overcommit, priority) == FALSE)
+                                               break;
+                                       reqcount--;
+                               }
+                               if (reqcount) {
+                                       /*
+                                        * we need to delay starting some of the overcommit requests...
+                                        * we should only fail to create the overcommit threads if
+                                        * we're at the max thread limit... as existing threads
+                                        * return to the kernel, we'll notice the ocrequests
+                                        * and spin them back to user space as the overcommit variety
+                                        */
+                                       wq->wq_reqcount += reqcount;
+                                       wq->wq_requests[priority] += reqcount;
+                                       wq->wq_ocrequests[priority] += reqcount;
 
-                               if (wq->wq_thidlecount == 0)
-                                       oc_item = 0;
+                                       KERNEL_DEBUG(0xefffd140 | DBG_FUNC_NONE, wq, priority, wq->wq_requests[priority], reqcount, 0);
+                               }
                        }
-                       if (oc_item == 0)
-                               error = workqueue_additem(wq, prio, item, affinity);
+                       workqueue_unlock(p);
 
-                       KERNEL_DEBUG(0xefffd008 | DBG_FUNC_NONE, wq, prio, affinity, oc_item, 0);
                        }
                        break;
-               case WQOPS_QUEUE_REMOVE: {
 
-                       if ((prio < 0) || (prio >= WORKQUEUE_NUMPRIOS))
-                               return (EINVAL);
-
-                       workqueue_lock_spin(p);
-
-                       if ((wq = (struct workqueue *)p->p_wqptr) == NULL) {
-                               workqueue_unlock(p);
-                               return (EINVAL);
-                       }
-                       error = workqueue_removeitem(wq, prio, item);
-                       }
-                       break;
                case WQOPS_THREAD_RETURN: {
-
-                       th = current_thread();
+                       thread_t th = current_thread();
                        struct uthread *uth = get_bsdthread_info(th);
 
                        /* reset signal mask on the workqueue thread to default state */
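
The WQOPS_QUEUE_REQTHREADS handler above replaces the old per-item queue with plain per-priority request counters: constrained requests are recorded in full, while overcommit requests that cannot be started immediately are also remembered in wq_ocrequests[] so they are serviced ahead of normal requests later. A compact model of that bookkeeping (illustrative only, not the kernel code; NUMPRIOS is an assumption):

    #include <stdint.h>
    #include <stdbool.h>

    #define NUMPRIOS 4

    struct wq_model {
        uint32_t reqcount;               /* total outstanding requests */
        uint16_t requests[NUMPRIOS];     /* per-priority outstanding requests */
        uint16_t ocrequests[NUMPRIOS];   /* delayed overcommit requests */
    };

    static void queue_reqthreads(struct wq_model *wq, int priority, int reqcount,
                                 bool overcommit, int started_now)
    {
        if (!overcommit) {
            wq->reqcount += reqcount;
            wq->requests[priority] += reqcount;
        } else if (reqcount > started_now) {
            int delayed = reqcount - started_now;   /* overcommit threads we couldn't start yet */
            wq->reqcount += delayed;
            wq->requests[priority] += delayed;
            wq->ocrequests[priority] += delayed;
        }
    }
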
@@ -1255,78 +1311,51 @@ workq_kernreturn(struct proc *p, struct workq_kernreturn_args  *uap, __unused in
                                uth->uu_sigmask = ~workq_threadmask;
                                proc_unlock(p);
                        }
-
                        workqueue_lock_spin(p);
 
                        if ((wq = (struct workqueue *)p->p_wqptr) == NULL || (uth->uu_threadlist == NULL)) {
                                workqueue_unlock(p);
-                               return (EINVAL);
+
+                               error = EINVAL;
+                               break;
                        }
                        KERNEL_DEBUG(0xefffd004 | DBG_FUNC_END, wq, 0, 0, 0, 0);
-                       }
-                       break;
-               case WQOPS_THREAD_SETCONC: {
-
-                       if ((prio < 0) || (prio > WORKQUEUE_NUMPRIOS))
-                               return (EINVAL);
 
-                       workqueue_lock_spin(p);
-
-                       if ((wq = (struct workqueue *)p->p_wqptr) == NULL) {
-                               workqueue_unlock(p);
-                               return (EINVAL);
-                       }
+                       (void)workqueue_run_nextreq(p, wq, th, FALSE, FALSE, 0, -1);
                        /*
-                        * for this operation, we re-purpose the affinity
-                        * argument as the concurrency target
+                        * workqueue_run_nextreq is responsible for
+                        * dropping the workqueue lock in all cases
                         */
-                       if (prio < WORKQUEUE_NUMPRIOS)
-                               wq->wq_reqconc[prio] = affinity;
-                       else {
-                               for (prio = 0; prio < WORKQUEUE_NUMPRIOS; prio++)
-                                       wq->wq_reqconc[prio] = affinity;
-
-                       }
                        }
                        break;
+               
                default:
-                       return (EINVAL);
+                       error = EINVAL;
+                       break;
        }
-       (void)workqueue_run_nextitem(p, wq, th, oc_item, prio, affinity);
-       /*
-        * workqueue_run_nextitem is responsible for
-        * dropping the workqueue lock in all cases
-        */
        return (error);
-
 }
 
+/*
+ * Routine:    workqueue_mark_exiting
+ *
+ * Function:   Mark the work queue such that new threads will not be added to the
+ *             work queue after we return.  
+ *
+ * Conditions: Called against the current process.
+ */
 void
-workqueue_exit(struct proc *p)
+workqueue_mark_exiting(struct proc *p)
 {
        struct workqueue  * wq;
-       struct threadlist  * tl, *tlist;
-       struct uthread  *uth;
-       int wq_size = 0;
 
-       if (p->p_wqptr != NULL) {
+       wq = p->p_wqptr;
+       if (wq != NULL) {
 
-               KERNEL_DEBUG(0x900808c | DBG_FUNC_START, p->p_wqptr, 0, 0, 0, 0);
+               KERNEL_DEBUG(0x9008088 | DBG_FUNC_START, p->p_wqptr, 0, 0, 0, 0);
 
                workqueue_lock_spin(p);
 
-               wq = (struct workqueue *)p->p_wqptr;
-
-               if (wq == NULL) {
-                       workqueue_unlock(p);
-
-                       KERNEL_DEBUG(0x900808c | DBG_FUNC_END, 0, 0, 0, -1, 0);
-                       return;
-               }
-               wq_size = p->p_wqsize;
-               p->p_wqptr = NULL;
-               p->p_wqsize = 0;
-
                /*
                 * we now arm the timer in the callback function w/o holding the workq lock...
                 * we do this by setting  WQ_ATIMER_RUNNING via OSCompareAndSwap in order to 
@@ -1356,6 +1385,40 @@ workqueue_exit(struct proc *p)
                }
                workqueue_unlock(p);
 
+               KERNEL_DEBUG(0x9008088 | DBG_FUNC_END, 0, 0, 0, 0, 0);
+       }
+}
+
+/*
+ * Routine:    workqueue_exit
+ *
+ * Function:   clean up the work queue structure(s) now that there are no threads
+ *             left running inside the work queue (except possibly current_thread).
+ *
+ * Conditions: Called by the last thread in the process.
+ *             Called against current process.
+ */
+void
+workqueue_exit(struct proc *p)
+{
+       struct workqueue  * wq;
+       struct threadlist  * tl, *tlist;
+       struct uthread  *uth;
+       int wq_size = 0;
+
+       wq = (struct workqueue *)p->p_wqptr;
+       if (wq != NULL) {
+
+               KERNEL_DEBUG(0x900808c | DBG_FUNC_START, p->p_wqptr, 0, 0, 0, 0);
+
+               wq_size = p->p_wqsize;
+               p->p_wqptr = NULL;
+               p->p_wqsize = 0;
+
+               /*
+                * Clean up workqueue data structures for threads that exited and
+                * didn't get a chance to clean up after themselves.
+                */
                TAILQ_FOREACH_SAFE(tl, &wq->wq_thrunlist, th_entry, tlist) {
 
                        thread_sched_call(tl->th_thread, NULL);
@@ -1374,21 +1437,7 @@ workqueue_exit(struct proc *p)
                        kfree(tl, sizeof(struct threadlist));
                }
                TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlelist, th_entry, tlist) {
-
-                       thread_sched_call(tl->th_thread, NULL);
-
-                       uth = get_bsdthread_info(tl->th_thread);
-                       if (uth != (struct uthread *)0) {
-                               uth->uu_threadlist = NULL;
-                       }
-                       TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
-
-                       /*
-                        * drop our last ref on the thread
-                        */
-                       thread_deallocate(tl->th_thread);
-
-                       kfree(tl, sizeof(struct threadlist));
+                       workqueue_removethread(tl, 1);
                }
                thread_call_free(wq->wq_atimer_call);
 
@@ -1398,101 +1447,83 @@ workqueue_exit(struct proc *p)
        }
 }
 
-static int 
-workqueue_additem(struct workqueue *wq, int prio, user_addr_t item, int affinity)
-{
-       struct workitem *witem;
-       struct workitemlist *wl;
-
-       wl = (struct workitemlist *)&wq->wq_list[prio];
-
-       if (TAILQ_EMPTY(&wl->wl_freelist))
-               return (ENOMEM);
-
-       witem = (struct workitem *)TAILQ_FIRST(&wl->wl_freelist);
-       TAILQ_REMOVE(&wl->wl_freelist, witem, wi_entry);
 
-       witem->wi_item = item;
-       witem->wi_affinity = affinity;
-       TAILQ_INSERT_TAIL(&wl->wl_itemlist, witem, wi_entry);
-
-       wq->wq_list_bitmap |= (1 << prio);
-
-       wq->wq_itemcount++;
+static int workqueue_importance[WORKQUEUE_NUMPRIOS] = 
+{
+       2, 0, -2, INT_MIN,
+};
 
-       return (0);
-}
+#define WORKQ_POLICY_TIMESHARE 1
 
-static int 
-workqueue_removeitem(struct workqueue *wq, int prio, user_addr_t item)
+static int workqueue_policy[WORKQUEUE_NUMPRIOS] = 
 {
-       struct workitem *witem;
-       struct workitemlist *wl;
-       int error = ESRCH;
+       WORKQ_POLICY_TIMESHARE, WORKQ_POLICY_TIMESHARE, WORKQ_POLICY_TIMESHARE, WORKQ_POLICY_TIMESHARE
+};
 
-       wl = (struct workitemlist *)&wq->wq_list[prio];
 
-       TAILQ_FOREACH(witem, &wl->wl_itemlist, wi_entry) {
-               if (witem->wi_item == item) {
-                       TAILQ_REMOVE(&wl->wl_itemlist, witem, wi_entry);
 
-                       if (TAILQ_EMPTY(&wl->wl_itemlist))
-                               wq->wq_list_bitmap &= ~(1 << prio);
-                       wq->wq_itemcount--;
-                       
-                       witem->wi_item = (user_addr_t)0;
-                       witem->wi_affinity = 0;
-                       TAILQ_INSERT_HEAD(&wl->wl_freelist, witem, wi_entry);
+static boolean_t
+workqueue_run_one(proc_t p, struct workqueue *wq, boolean_t overcommit, int priority)
+{
+       boolean_t       ran_one;
 
-                       error = 0;
-                       break;
+       if (wq->wq_thidlecount == 0) {
+               if (overcommit == FALSE) {
+                       if (wq->wq_constrained_threads_scheduled < wq->wq_affinity_max)
+                               workqueue_addnewthread(wq, overcommit);
+               } else {
+                       workqueue_addnewthread(wq, overcommit);
+
+                       if (wq->wq_thidlecount == 0)
+                               return (FALSE);
                }
        }
-       return (error);
-}
+       ran_one = workqueue_run_nextreq(p, wq, THREAD_NULL, FALSE, overcommit, priority, -1);
+       /*
+        * workqueue_run_nextreq is responsible for
+        * dropping the workqueue lock in all cases
+        */
+       workqueue_lock_spin(p);
 
-static int workqueue_importance[WORKQUEUE_NUMPRIOS] = 
-{
-       2, 0, -2,
-};
+       return (ran_one);
+}
 
-static int workqueue_policy[WORKQUEUE_NUMPRIOS] = 
-{
-       1, 1, 1,
-};
 
 
 /*
- * workqueue_run_nextitem:
+ * workqueue_run_nextreq:
  *   called with the workqueue lock held...
  *   responsible for dropping it in all cases
  */
 static boolean_t
-workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_addr_t oc_item, int oc_prio, int oc_affinity)
+workqueue_run_nextreq(proc_t p, struct workqueue *wq, thread_t thread,
+                     boolean_t force_oc, boolean_t overcommit, int oc_prio, int oc_affinity)
 {
-       struct workitem *witem = NULL;
-       user_addr_t item = 0;
        thread_t th_to_run = THREAD_NULL;
        thread_t th_to_park = THREAD_NULL;
        int wake_thread = 0;
-       int reuse_thread = 1;
+       int reuse_thread = WQ_FLAG_THREAD_REUSE;
        uint32_t priority, orig_priority;
        uint32_t affinity_tag, orig_affinity_tag;
        uint32_t i, n;
-       uint32_t activecount;
        uint32_t busycount;
        uint32_t us_to_wait;
        struct threadlist *tl = NULL;
        struct threadlist *ttl = NULL;
        struct uthread *uth = NULL;
-       struct workitemlist *wl = NULL;
        boolean_t start_timer = FALSE;
        boolean_t adjust_counters = TRUE;
        uint64_t  curtime;
 
 
-       KERNEL_DEBUG(0xefffd000 | DBG_FUNC_START, wq, thread, wq->wq_thidlecount, wq->wq_itemcount, 0);
+       KERNEL_DEBUG(0xefffd000 | DBG_FUNC_START, wq, thread, wq->wq_thidlecount, wq->wq_reqcount, 0);
 
+       if (thread != THREAD_NULL) {
+               uth = get_bsdthread_info(thread);
+
+               if ( (tl = uth->uu_threadlist) == NULL)
+                       panic("wq thread with no threadlist ");
+       }
        /*
         * from here until we drop the workq lock
         * we can't be pre-empted since we hold 
@@ -1502,14 +1533,15 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add
         * and these values are used to index the multi-dimensional
         * counter arrays in 'workqueue_callback'
         */
-       if (oc_item) {
+dispatch_overcommit:
+
+       if (overcommit == TRUE || force_oc == TRUE) {
                uint32_t min_scheduled = 0;
                uint32_t scheduled_count;
                uint32_t active_count;
                uint32_t t_affinity = 0;
 
                priority = oc_prio;
-               item = oc_item;
 
                if ((affinity_tag = oc_affinity) == (uint32_t)-1) {
                        for (affinity_tag = 0; affinity_tag < wq->wq_reqconc[priority]; affinity_tag++) {
@@ -1534,27 +1566,55 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add
                        }
                        affinity_tag = t_affinity;
                }
+               if (thread != THREAD_NULL) {
+                       th_to_run = thread;
+                       goto pick_up_work;
+               }
                goto grab_idle_thread;
        }
-       if (wq->wq_itemcount == 0) {
+       if (wq->wq_reqcount) {
+               for (priority = 0; priority < WORKQUEUE_NUMPRIOS; priority++) {
+                       if (wq->wq_requests[priority])
+                               break;
+               }
+               assert(priority < WORKQUEUE_NUMPRIOS);
+
+               if (wq->wq_ocrequests[priority] && (thread != THREAD_NULL || wq->wq_thidlecount)) {
+                       /*
+                        * handle delayed overcommit request...
+                        * they have priority over normal requests
+                        * within a given priority level
+                        */
+                       wq->wq_reqcount--;
+                       wq->wq_requests[priority]--;
+                       wq->wq_ocrequests[priority]--;
+
+                       oc_prio = priority;
+                       overcommit = TRUE;
+
+                       goto dispatch_overcommit;
+               }
+       }
+       /*
+        * if we get here, the work should be handled by a constrained thread
+        */
+       if (wq->wq_reqcount == 0 || wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
+               /*
+                * no work to do, or we're already at or over the scheduling limit for
+                * constrained threads...  just return or park the thread...
+                * do not start the timer for this condition... if we don't have any work,
+                * we'll check again when new work arrives... if we're over the limit, we need 1 or more
+                * constrained threads to return to the kernel before we can dispatch additional work
+                */
                if ((th_to_park = thread) == THREAD_NULL)
                        goto out_of_work;
-               goto parkit;
+               goto parkit;
        }
-       for (priority = 0; priority < WORKQUEUE_NUMPRIOS; priority++) {
-               if (wq->wq_list_bitmap & (1 << priority)) {
-                       wl = (struct workitemlist *)&wq->wq_list[priority];
-                       break;
-               }
-       }
-       assert(wl != NULL);
-       assert(!(TAILQ_EMPTY(&wl->wl_itemlist)));
 
        curtime = mach_absolute_time();
 
        if (thread != THREAD_NULL) {
-               uth = get_bsdthread_info(thread);
-               tl = uth->uu_threadlist;
+
                affinity_tag = tl->th_affinity_tag;
 
                /*
@@ -1564,6 +1624,10 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add
                 * we're considering running work for
                 */
                if (affinity_tag < wq->wq_reqconc[priority]) {
+                       uint32_t  bcount = 0;
+                       uint32_t  acount = 0;
+                       uint32_t  tcount = 0;
+
                        /*
                         * we're a worker thread from the pool... currently we
                         * are considered 'active' which means we're counted
@@ -1571,56 +1635,84 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add
                         * add up the active counts of all the priority levels
                         * up to and including the one we want to schedule
                         */
-                       for (activecount = 0, i = 0; i <= priority; i++) {
-                               uint32_t  acount;
+                       for (i = 0; i <= priority; i++) {
 
-                               acount = wq->wq_thactive_count[i][affinity_tag];
+                               tcount = wq->wq_thactive_count[i][affinity_tag];
+                               acount += tcount;
 
-                               if (acount == 0 && wq->wq_thscheduled_count[i][affinity_tag]) {
+                               if (tcount == 0 && wq->wq_thscheduled_count[i][affinity_tag]) {
                                        if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i][affinity_tag]))
-                                               acount = 1;
+                                               bcount++;
                                }
-                               activecount += acount;
                        }
-                       if (activecount == 1) {
+                       if ((acount + bcount) == 1) {
                                /*
                                 * we're the only active thread associated with our
                                 * affinity group at this priority level and higher,
+                                * and there are no threads considered 'busy',
                                 * so pick up some work and keep going
                                 */
                                th_to_run = thread;
                                goto pick_up_work;
                        }
+                       if (wq->wq_reqconc[priority] == 1) {
+                               /*
+                                * we have at least one other active or busy thread running at this
+                                * priority level or higher and since we only have 
+                                * 1 affinity group to schedule against, no need
+                                * to try and find another... we can't start up another thread to
+                                * service the request and we already have the info
+                                * needed to determine if we need to start a timer or not
+                                */
+                               if (acount == 1) {
+                                       /*
+                                        * we're the only active thread, but we must have found
+                                        * at least 1 busy thread, so indicate that we need
+                                        * to start a timer
+                                        */
+                                       busycount = 1;
+                               } else
+                                       busycount = 0;
+
+                               affinity_tag = 1;
+                               goto cant_schedule;
+                       }
                }
                /*
                 * there's more than 1 thread running in this affinity group
                 * or the concurrency level has been cut back for this priority...
-                * lets continue on and look for an 'empty' group to run this
-                * work item in
+                * let's continue on and look for an 'empty' group to run this
+                * work request in
                 */
        }
        busycount = 0;
 
        for (affinity_tag = 0; affinity_tag < wq->wq_reqconc[priority]; affinity_tag++) {
+               boolean_t       can_schedule;
+
                /*
                 * look for first affinity group that is currently not active
                 * i.e. no active threads at this priority level or higher
                 * and no threads that have run recently
                 */
-               for (activecount = 0, i = 0; i <= priority; i++) {
-                       if ((activecount = wq->wq_thactive_count[i][affinity_tag]))
+               for (i = 0; i <= priority; i++) {
+                       can_schedule = FALSE;
+
+                       if (wq->wq_thactive_count[i][affinity_tag])
                                break;
 
-                       if (wq->wq_thscheduled_count[i][affinity_tag]) {
-                               if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i][affinity_tag])) {
-                                       busycount++;
-                                       break;
-                               }
+                       if (wq->wq_thscheduled_count[i][affinity_tag] &&
+                           wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i][affinity_tag])) {
+                               busycount++;
+                               break;
                        }
+                       can_schedule = TRUE;
                }
-               if (activecount == 0 && busycount == 0)
+               if (can_schedule == TRUE)
                        break;
        }
+cant_schedule:
+
        if (affinity_tag >= wq->wq_reqconc[priority]) {
                /*
                 * we've already got at least 1 thread per
@@ -1632,7 +1724,7 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add
                         * 'busy' state... make sure we start
                         * the timer because if they are the only
                         * threads keeping us from scheduling
-                        * this workitem, we won't get a callback
+                        * this work request, we won't get a callback
                         * to kick off the timer... we need to
                         * start it now...
                         */
@@ -1659,6 +1751,8 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add
                th_to_run = thread;
                goto pick_up_work;
        }
+
+grab_idle_thread:
        if (wq->wq_thidlecount == 0) {
                /*
                 * we don't have a thread to schedule, but we have
@@ -1671,14 +1765,12 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add
 
                goto no_thread_to_run;
        }
-
-grab_idle_thread:
        /*
         * we've got a candidate (affinity group with no currently
         * active threads) to start a new thread on...
         * we already know there is both work available
         * and an idle thread, so activate a thread and then
-        * fall into the code that pulls a new workitem...
+        * fall into the code that pulls a new work request...
         */
        TAILQ_FOREACH(ttl, &wq->wq_thidlelist, th_entry) {
                if (ttl->th_affinity_tag == affinity_tag || ttl->th_affinity_tag == (uint16_t)-1) {
@@ -1715,18 +1807,19 @@ grab_idle_thread:
        th_to_run = tl->th_thread;
 
 pick_up_work:
-       if (item == 0) {
-               witem = TAILQ_FIRST(&wl->wl_itemlist);
-               TAILQ_REMOVE(&wl->wl_itemlist, witem, wi_entry);
-
-               if (TAILQ_EMPTY(&wl->wl_itemlist))
-                       wq->wq_list_bitmap &= ~(1 << priority);
-               wq->wq_itemcount--;
-
-               item = witem->wi_item;
-               witem->wi_item = (user_addr_t)0;
-               witem->wi_affinity = 0;
-               TAILQ_INSERT_HEAD(&wl->wl_freelist, witem, wi_entry);
+       if (overcommit == FALSE && force_oc == FALSE) {
+               wq->wq_reqcount--;
+               wq->wq_requests[priority]--;
+
+               if ( !(tl->th_flags & TH_LIST_CONSTRAINED)) {
+                       wq->wq_constrained_threads_scheduled++;
+                       tl->th_flags |= TH_LIST_CONSTRAINED;
+               }
+       } else {
+               if (tl->th_flags & TH_LIST_CONSTRAINED) {
+                       wq->wq_constrained_threads_scheduled--;
+                       tl->th_flags &= ~TH_LIST_CONSTRAINED;
+               }
        }
        orig_priority = tl->th_priority;
        orig_affinity_tag = tl->th_affinity_tag;
@@ -1770,22 +1863,43 @@ pick_up_work:
                thread_precedence_policy_data_t precedinfo;
                thread_extended_policy_data_t   extinfo;
                uint32_t        policy;
+#if CONFIG_EMBEDDED
+               int retval = 0;
 
+               /* sets the saved importance for an Apple iOS daemon if it is backgrounded; otherwise returns 0 */
+               retval = proc_setthread_saved_importance(th_to_run, workqueue_importance[priority]);
+               if (retval == 0) {
+#endif /* CONFIG_EMBEDDED */
                policy = workqueue_policy[priority];
                
                KERNEL_DEBUG(0xefffd120 | DBG_FUNC_START, wq, orig_priority, tl->th_policy, 0, 0);
 
-               if (tl->th_policy != policy) {
+               if ((orig_priority == WORKQUEUE_BG_PRIOQUEUE) || (priority == WORKQUEUE_BG_PRIOQUEUE)) {
+                       if (orig_priority == WORKQUEUE_BG_PRIOQUEUE) {
+                               /* remove the disk throttle; the importance will be reset in any case */
+                               proc_restore_workq_bgthreadpolicy(th_to_run);
+                       } 
 
+                       if (priority == WORKQUEUE_BG_PRIOQUEUE) {
+                               proc_apply_workq_bgthreadpolicy(th_to_run);
+                       }
+               }
+
+               if (tl->th_policy != policy) {
                        extinfo.timeshare = policy;
                        (void)thread_policy_set_internal(th_to_run, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT);
 
                        tl->th_policy = policy;
                }
+
                 precedinfo.importance = workqueue_importance[priority];
                 (void)thread_policy_set_internal(th_to_run, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT);
 
+
                KERNEL_DEBUG(0xefffd120 | DBG_FUNC_END, wq,  priority, policy, 0, 0);
+#if CONFIG_EMBEDDED
+               }
+#endif /* CONFIG_EMBEDDED */
        }
        if (kdebug_enable) {
                int     lpri = -1;
@@ -1813,11 +1927,11 @@ pick_up_work:
                }
        }
        /*
-        * if current thread is reused for workitem, does not return via unix_syscall
+        * if the current thread is being reused for this work request, it does not return via unix_syscall
         */
-       wq_runitem(p, item, th_to_run, tl, reuse_thread, wake_thread, (thread == th_to_run));
+       wq_runreq(p, overcommit, priority, th_to_run, tl, reuse_thread, wake_thread, (thread == th_to_run));
        
-       KERNEL_DEBUG(0xefffd000 | DBG_FUNC_END, wq, thread_tid(th_to_run), item, 1, 0);
+       KERNEL_DEBUG(0xefffd000 | DBG_FUNC_END, wq, thread_tid(th_to_run), overcommit, 1, 0);
 
        return (TRUE);
 
@@ -1841,11 +1955,6 @@ parkit:
         * this is a workqueue thread with no more
         * work to do... park it for now
         */
-       uth = get_bsdthread_info(th_to_park);
-       tl = uth->uu_threadlist;
-       if (tl == 0) 
-               panic("wq thread with no threadlist ");
-       
        TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
        tl->th_flags &= ~TH_LIST_RUNNING;
 
@@ -1858,12 +1967,18 @@ parkit:
        wq->wq_thscheduled_count[tl->th_priority][tl->th_affinity_tag]--;
        wq->wq_threads_scheduled--;
 
+       if (tl->th_flags & TH_LIST_CONSTRAINED) {
+               wq->wq_constrained_threads_scheduled--;
+               wq->wq_lflags &= ~WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
+               tl->th_flags &= ~TH_LIST_CONSTRAINED;
+       }
        if (wq->wq_thidlecount < 100)
                us_to_wait = wq_reduce_pool_window_usecs - (wq->wq_thidlecount * (wq_reduce_pool_window_usecs / 100));
        else
                us_to_wait = wq_reduce_pool_window_usecs / 100;
 
        wq->wq_thidlecount++;
+       wq->wq_lflags &= ~WQL_EXCEEDED_TOTAL_THREAD_LIMIT;
 
        assert_wait_timeout((caddr_t)tl, (THREAD_INTERRUPTIBLE), us_to_wait, NSEC_PER_USEC);
 
@@ -1922,7 +2037,7 @@ normal_resume_to_user:
                         * queue... remove it from our domain...
                         * workqueue_removethread consumes the lock
                         */
-                       workqueue_removethread(tl);
+                       workqueue_removethread(tl, 0);
 
                        thread_bootstrap_return();
                }
@@ -1973,7 +2088,7 @@ wq_unpark_continue(void)
                        if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) {
                                /*
                                 * a normal wakeup of this thread occurred... no need 
-                                * for any synchronization with the timer and wq_runitem
+                                * for any synchronization with the timer and wq_runreq
                                 */
 normal_return_to_user:                 
                                thread_sched_call(th_to_unpark, workqueue_callback);
@@ -1994,7 +2109,7 @@ normal_return_to_user:
                                 *
                                 * workqueue_removethread consumes the lock
                                 */
-                               workqueue_removethread(tl);
+                               workqueue_removethread(tl, 0);
                                        
                                thread_exception_return();
                        }
@@ -2029,7 +2144,7 @@ normal_return_to_user:
 
 
 static void 
-wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl,
+wq_runreq(proc_t p, boolean_t overcommit, uint32_t priority, thread_t th, struct threadlist *tl,
           int reuse_thread, int wake_thread, int return_directly)
 {
        int ret = 0;
@@ -2037,7 +2152,7 @@ wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl,
 
        KERNEL_DEBUG1(0xefffd004 | DBG_FUNC_START, tl->th_workq, tl->th_priority, tl->th_affinity_tag, thread_tid(current_thread()), thread_tid(th));
 
-       ret = setup_wqthread(p, th, item, reuse_thread, tl);
+       ret = setup_wqthread(p, th, overcommit, priority, reuse_thread, tl);
 
        if (ret != 0)
                panic("setup_wqthread failed  %x\n", ret);
@@ -2047,7 +2162,7 @@ wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl,
 
                thread_exception_return();
 
-               panic("wq_runitem: thread_exception_return returned ...\n");
+               panic("wq_runreq: thread_exception_return returned ...\n");
        }
        if (wake_thread) {
                workqueue_lock_spin(p);
@@ -2080,34 +2195,18 @@ wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl,
        }
 }
 
+
 int
-setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl)
+setup_wqthread(proc_t p, thread_t th, boolean_t overcommit, uint32_t priority, int reuse_thread, struct threadlist *tl)
 {
-#if defined(__ppc__)
-       /*
-        * Set up PowerPC registers...
-        * internally they are always kept as 64 bit and
-        * since the register set is the same between 32 and 64bit modes
-        * we don't need 2 different methods for setting the state
-        */
-       {
-               ppc_thread_state64_t state64;
-               ppc_thread_state64_t *ts64 = &state64;
-
-               ts64->srr0 = (uint64_t)p->p_wqthread;
-               ts64->r1 = (uint64_t)((tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE) - C_ARGSAVE_LEN - C_RED_ZONE);
-               ts64->r3 = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE);
-               ts64->r4 = (uint64_t)(tl->th_thport);
-               ts64->r5 = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_GUARDSIZE);
-               ts64->r6 = (uint64_t)item;
-               ts64->r7 = (uint64_t)reuse_thread;
-               ts64->r8 = (uint64_t)0;
-
-               if ((reuse_thread != 0) && (ts64->r3 == (uint64_t)0))
-                       panic("setup_wqthread: setting reuse thread with null pthread\n");
-               thread_set_wq_state64(th, (thread_state_t)ts64);
-       }
-#elif defined(__i386__) || defined(__x86_64__)
+       uint32_t flags = reuse_thread | WQ_FLAG_THREAD_NEWSPI;
+
+       if (overcommit == TRUE)
+               flags |= WQ_FLAG_THREAD_OVERCOMMIT;
+
+       flags |= priority;
+
+#if defined(__i386__) || defined(__x86_64__)
        int isLP64 = 0;
 
        isLP64 = IS_64BIT_PROCESS(p);
@@ -2122,16 +2221,14 @@ setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct
                ts->eax = (unsigned int)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE);
                ts->ebx = (unsigned int)tl->th_thport;
                ts->ecx = (unsigned int)(tl->th_stackaddr + PTH_DEFAULT_GUARDSIZE);
-               ts->edx = (unsigned int)item;
-               ts->edi = (unsigned int)reuse_thread;
+               ts->edx = (unsigned int)0;
+               ts->edi = (unsigned int)flags;
                ts->esi = (unsigned int)0;
                /*
                 * set stack pointer
                 */
                ts->esp = (int)((vm_offset_t)((tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE) - C_32_STK_ALIGN));
 
-               if ((reuse_thread != 0) && (ts->eax == (unsigned int)0))
-                       panic("setup_wqthread: setting reuse thread with null pthread\n");
                thread_set_wq_state32(th, (thread_state_t)ts);
 
        } else {
@@ -2142,8 +2239,8 @@ setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct
                ts64->rdi = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE);
                ts64->rsi = (uint64_t)(tl->th_thport);
                ts64->rdx = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_GUARDSIZE);
-               ts64->rcx = (uint64_t)item;
-               ts64->r8 = (uint64_t)reuse_thread;
+               ts64->rcx = (uint64_t)0;
+               ts64->r8 = (uint64_t)flags;
                ts64->r9 = (uint64_t)0;
 
                /*
@@ -2151,8 +2248,6 @@ setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct
                 */
                ts64->rsp = (uint64_t)((tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE) - C_64_REDZONE_LEN);
 
-               if ((reuse_thread != 0) && (ts64->rdi == (uint64_t)0))
-                       panic("setup_wqthread: setting reuse thread with null pthread\n");
                thread_set_wq_state64(th, (thread_state_t)ts64);
        }
 #else
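
[editorial sketch] With this change the thread start state no longer carries a work item pointer (rcx/edx are now zero); instead a single flags word in r8/edi packs the priority into the low bits together with WQ_FLAG_THREAD_OVERCOMMIT, WQ_FLAG_THREAD_NEWSPI and the reuse indication. A minimal user-side decode sketch, using hypothetical mask names and assumed bit positions (the real libc _pthread_wqthread uses its own constants):

    #include <stdint.h>

    /* Hypothetical masks for illustration only; the kernel composes flags as
     * reuse_thread | WQ_FLAG_THREAD_NEWSPI | (overcommit ? WQ_FLAG_THREAD_OVERCOMMIT : 0) | priority. */
    #define EXAMPLE_PRIO_MASK        0x0000ffffu
    #define EXAMPLE_OVERCOMMIT_BIT   0x00010000u   /* assumed to correspond to WQ_FLAG_THREAD_OVERCOMMIT */
    #define EXAMPLE_REUSE_BIT        0x00020000u   /* assumed reuse indication */

    struct wq_start_args {
            uint32_t priority;
            int      overcommit;
            int      reused;
    };

    static void decode_wq_flags(uint32_t flags, struct wq_start_args *out)
    {
            out->priority   = flags & EXAMPLE_PRIO_MASK;
            out->overcommit = (flags & EXAMPLE_OVERCOMMIT_BIT) != 0;
            out->reused     = (flags & EXAMPLE_REUSE_BIT) != 0;
    }
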
@@ -2183,6 +2278,14 @@ fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
        pwqinfo->pwq_nthreads = wq->wq_nthreads;
        pwqinfo->pwq_runthreads = activecount;
        pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
+       pwqinfo->pwq_state = 0;
+
+       if (wq->wq_lflags & WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT)
+               pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
+
+       if (wq->wq_lflags & WQL_EXCEEDED_TOTAL_THREAD_LIMIT)
+               pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
+
 out:
        workqueue_unlock(p);
        return(error);
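
[editorial sketch] The two new pwq_state bits surface the constrained and total thread-limit conditions to userland. A minimal sketch of reading them through libproc, assuming the standard proc_pidinfo() interface and the PROC_PIDWORKQUEUEINFO flavor from <sys/proc_info.h>:

    #include <stdio.h>
    #include <unistd.h>
    #include <libproc.h>
    #include <sys/proc_info.h>

    int main(void)
    {
            struct proc_workqueueinfo wqinfo;
            int ret = proc_pidinfo(getpid(), PROC_PIDWORKQUEUEINFO, 0, &wqinfo, sizeof(wqinfo));

            if (ret != (int)sizeof(wqinfo)) {
                    perror("proc_pidinfo");
                    return 1;
            }
            printf("threads=%u running=%u blocked=%u\n",
                wqinfo.pwq_nthreads, wqinfo.pwq_runthreads, wqinfo.pwq_blockedthreads);

            if (wqinfo.pwq_state & WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT)
                    printf("constrained workqueue thread limit exceeded\n");
            if (wqinfo.pwq_state & WQ_EXCEEDED_TOTAL_THREAD_LIMIT)
                    printf("total workqueue thread limit exceeded\n");
            return 0;
    }
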
@@ -2308,5 +2411,6 @@ pthread_init(void)
        
        pth_global_hashinit();
        psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL);
+       psynch_zoneinit();
 #endif /* PSYNCH */
 }