apple/xnu.git: osfmk/kern/sched_prim.c (xnu-344.49)
index b4f351d412f68ca88a57c75c4fb60f65d658eadd..2e7a7fc5d9bb4b750504d3f708cfe9d43e3dff71 100644
@@ -3,19 +3,22 @@
  *
  * @APPLE_LICENSE_HEADER_START@
  * 
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License").  You may not use this file except in compliance with the
- * License.  Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
+ * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
  * 
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
- * License for the specific language governing rights and limitations
- * under the License.
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
  * 
  * @APPLE_LICENSE_HEADER_END@
  */
@@ -114,10 +117,6 @@ int                        max_poll_quanta = MAX_POLL_QUANTA;
 #define                SCHED_POLL_YIELD_SHIFT          4               /* 1/16 */
 int                    sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
 
-#define                NO_KERNEL_PREEMPT       0
-#define                KERNEL_PREEMPT          1
-int                    kernel_preemption_mode = KERNEL_PREEMPT;
-
 uint32_t       std_quantum_us;
 
 unsigned       sched_tick;
@@ -127,15 +126,8 @@ int                        sched_usec;
 #endif /* SIMPLE_CLOCK */
 
 /* Forwards */
-void        thread_continue(thread_t);
-
 void           wait_queues_init(void);
 
-void           set_pri(
-                               thread_t                thread,
-                               int                             pri,
-                               int                             resched);
-
 thread_t       choose_pset_thread(
                                processor_t                     myprocessor,
                                processor_set_t         pset);
@@ -143,18 +135,13 @@ thread_t  choose_pset_thread(
 thread_t       choose_thread(
                                processor_t             myprocessor);
 
-int                    run_queue_enqueue(
+boolean_t      run_queue_enqueue(
                                run_queue_t             runq,
                                thread_t                thread,
                                boolean_t               tail);
 
-void           idle_thread_continue(void);
 void           do_thread_scan(void);
 
-void           clear_wait_internal(
-                               thread_t                thread,
-                               int                             result);
-
 #if    DEBUG
 void           dump_run_queues(
                                run_queue_t                     rq);
@@ -395,6 +382,7 @@ void
 thread_timer_terminate(void)
 {
        thread_t                thread = current_thread();
+       wait_result_t   res;
        spl_t                   s;
 
        s = splsched();
@@ -408,11 +396,13 @@ thread_timer_terminate(void)
        thread->wait_timer_active--;
 
        while (thread->wait_timer_active > 0) {
-               assert_wait((event_t)&thread->wait_timer_active, THREAD_UNINT);
+               res = assert_wait((event_t)&thread->wait_timer_active, THREAD_UNINT);
+               assert(res == THREAD_WAITING);
                wake_unlock(thread);
                splx(s);
 
-               thread_block((void (*)(void)) 0);
+               res = thread_block(THREAD_CONTINUE_NULL);
+               assert(res == THREAD_AWAKENED);
 
                s = splsched();
                wake_lock(thread);
@@ -421,11 +411,13 @@ thread_timer_terminate(void)
        thread->depress_timer_active--;
 
        while (thread->depress_timer_active > 0) {
-               assert_wait((event_t)&thread->depress_timer_active, THREAD_UNINT);
+               res = assert_wait((event_t)&thread->depress_timer_active, THREAD_UNINT);
+               assert(res == THREAD_WAITING);
                wake_unlock(thread);
                splx(s);
 
-               thread_block((void (*)(void)) 0);
+               res = thread_block(THREAD_CONTINUE_NULL);
+               assert(res == THREAD_AWAKENED);
 
                s = splsched();
                wake_lock(thread);
@@ -444,44 +436,107 @@ thread_timer_terminate(void)
  *     Conditions:
  *             thread lock held, IPC locks may be held.
  *             thread must have been pulled from wait queue under same lock hold.
+ *  Returns:
+ *             KERN_SUCCESS - Thread was set running
+ *             KERN_NOT_WAITING - Thread was not waiting
  */
-void
+kern_return_t
 thread_go_locked(
        thread_t                thread,
-       int                             result)
+       wait_result_t   result)
 {
        assert(thread->at_safe_point == FALSE);
-       assert(thread->wait_event == NO_EVENT);
+       assert(thread->wait_event == NO_EVENT64);
        assert(thread->wait_queue == WAIT_QUEUE_NULL);
 
-       if (thread->state & TH_WAIT) {
+       if ((thread->state & (TH_WAIT|TH_TERMINATE)) == TH_WAIT) {
                thread->state &= ~(TH_WAIT|TH_UNINT);
                if (!(thread->state & TH_RUN)) {
                        thread->state |= TH_RUN;
 
-                       _mk_sp_thread_unblock(thread);
+                       if (thread->active_callout)
+                               call_thread_unblock();
+
+                       if (!(thread->state & TH_IDLE)) {
+                               _mk_sp_thread_unblock(thread);
+                               hw_atomic_add(&thread->processor_set->run_count, 1);
+                       }
                }
 
                thread->wait_result = result;
+               return KERN_SUCCESS;
        }
+       return KERN_NOT_WAITING;
 }
 
-void
+/*
+ *     Routine:        thread_mark_wait_locked
+ *     Purpose:
+ *             Mark a thread as waiting.  If, given the circumstances,
+ *             it doesn't want to wait (i.e. already aborted), then
+ *             indicate that in the return value.
+ *     Conditions:
+ *             at splsched() and thread is locked.
+ */
+__private_extern__
+wait_result_t
 thread_mark_wait_locked(
-       thread_t                thread,
-       int                 interruptible)
+       thread_t                        thread,
+       wait_interrupt_t        interruptible)
 {
+       wait_result_t   wait_result;
+       boolean_t                       at_safe_point;
 
        assert(thread == current_thread());
 
-       thread->wait_result = -1; /* JMM - Needed for non-assert kernel */
-       thread->state |= (interruptible && thread->interruptible) ? 
-                                                       TH_WAIT : (TH_WAIT | TH_UNINT);
-       thread->at_safe_point = (interruptible == THREAD_ABORTSAFE) && (thread->interruptible);
-       thread->sleep_stamp = sched_tick;
+       /*
+        *      The thread may have certain types of interrupts/aborts masked
+        *      off.  Even if the wait location says these types of interrupts
+        *      are OK, we have to honor mask settings (outer-scoped code may
+        *      not be able to handle aborts at the moment).
+        */
+       if (interruptible > thread->interrupt_level)
+               interruptible = thread->interrupt_level;
+
+       at_safe_point = (interruptible == THREAD_ABORTSAFE);
+
+       if ((interruptible == THREAD_UNINT) || 
+               !(thread->state & TH_ABORT) ||
+               (!at_safe_point && (thread->state & TH_ABORT_SAFELY))) {
+               thread->state |= (interruptible) ? TH_WAIT : (TH_WAIT | TH_UNINT);
+               thread->at_safe_point = at_safe_point;
+               thread->sleep_stamp = sched_tick;
+               return (thread->wait_result = THREAD_WAITING);
+       } else if (thread->state & TH_ABORT_SAFELY) {
+               thread->state &= ~(TH_ABORT|TH_ABORT_SAFELY);
+       }
+       return (thread->wait_result = THREAD_INTERRUPTED);
 }
 
+/*
+ *     Routine:        thread_interrupt_level
+ *     Purpose:
+ *             Set the maximum interruptible state for the
+ *             current thread.  The effective value of any
+ *             interruptible flag passed into assert_wait
+ *             will never exceed this.
+ *
+ *             Useful for code that must not be interrupted,
+ *             but which calls code that doesn't know that.
+ *     Returns:
+ *             The old interrupt level for the thread.
+ */
+__private_extern__ 
+wait_interrupt_t
+thread_interrupt_level(
+       wait_interrupt_t new_level)
+{
+       thread_t thread = current_thread();
+       wait_interrupt_t result = thread->interrupt_level;
 
+       thread->interrupt_level = new_level;
+       return result;
+}
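
A minimal usage sketch, assuming the clamp in thread_mark_wait_locked above: lowering the ceiling to THREAD_UNINT forces even an abortsafe wait inside the protected region to be uninterruptible (some_event is a hypothetical stand-in):

    wait_interrupt_t    saved;
    wait_result_t       wres;

    saved = thread_interrupt_level(THREAD_UNINT);

    /* requested abortsafe, but clamped to uninterruptible
     * by the level set above */
    wres = assert_wait((event_t)&some_event, THREAD_ABORTSAFE);
    if (wres == THREAD_WAITING)
        wres = thread_block(THREAD_CONTINUE_NULL);

    (void) thread_interrupt_level(saved);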
 
 /*
  *     Routine:        assert_wait_timeout
@@ -491,15 +546,17 @@ thread_mark_wait_locked(
  */
 unsigned int assert_wait_timeout_event;
 
-void
+wait_result_t
 assert_wait_timeout(
-        mach_msg_timeout_t             msecs,
-       int                             interruptible)
+       mach_msg_timeout_t              msecs,
+       wait_interrupt_t                interruptible)
 {
-       spl_t           s;
+       wait_result_t res;
 
-       assert_wait((event_t)&assert_wait_timeout_event, interruptible);
-       thread_set_timer(msecs, 1000*NSEC_PER_USEC);
+       res = assert_wait((event_t)&assert_wait_timeout_event, interruptible);
+       if (res == THREAD_WAITING)
+               thread_set_timer(msecs, 1000*NSEC_PER_USEC);
+       return res;
 }
 
 /*
@@ -531,10 +588,10 @@ assert_wait_possible(void)
  *     Assert that the current thread is about to go to
  *     sleep until the specified event occurs.
  */
-void
+wait_result_t
 assert_wait(
        event_t                         event,
-       int                             interruptible)
+       wait_interrupt_t        interruptible)
 {
        register wait_queue_t   wq;
        register int            index;
@@ -544,12 +601,172 @@ assert_wait(
 
        index = wait_hash(event);
        wq = &wait_queues[index];
-       (void)wait_queue_assert_wait(wq,
-                              event,
-                              interruptible);
+       return wait_queue_assert_wait(wq, event, interruptible);
 }
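
Since assert_wait now hands back the wait_queue_assert_wait result, the canonical sleep sequence checks for THREAD_WAITING before blocking; a minimal sketch (the event pointer is illustrative):

    wait_result_t   wres;

    wres = assert_wait((event_t)&some_object, THREAD_INTERRUPTIBLE);
    if (wres == THREAD_WAITING)
        wres = thread_block(THREAD_CONTINUE_NULL);
    if (wres == THREAD_INTERRUPTED) {
        /* rousted before or instead of sleeping: back out */
    }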
 
+
+/*
+ *     thread_sleep_fast_usimple_lock:
+ *
+ *     Cause the current thread to wait until the specified event
+ *     occurs.  The specified simple_lock is unlocked before releasing
+ *     the cpu and re-acquired as part of waking up.
+ *
+ *     This is the simple lock sleep interface for components that use a
+ *     faster version of simple_lock() than is provided by usimple_lock().
+ */
+__private_extern__ wait_result_t
+thread_sleep_fast_usimple_lock(
+       event_t                 event,
+       simple_lock_t           lock,
+       wait_interrupt_t        interruptible)
+{
+       wait_result_t res;
+
+       res = assert_wait(event, interruptible);
+       if (res == THREAD_WAITING) {
+               simple_unlock(lock);
+               res = thread_block(THREAD_CONTINUE_NULL);
+               simple_lock(lock);
+       }
+       return res;
+}
+
+
+/*
+ *     thread_sleep_usimple_lock:
+ *
+ *     Cause the current thread to wait until the specified event
+ *     occurs.  The specified usimple_lock is unlocked before releasing
+ *     the cpu and re-acquired as part of waking up.
+ *
+ *     This is the simple lock sleep interface for components where
+ *     simple_lock() is defined in terms of usimple_lock().
+ */
+wait_result_t
+thread_sleep_usimple_lock(
+       event_t                 event,
+       usimple_lock_t          lock,
+       wait_interrupt_t        interruptible)
+{
+       wait_result_t res;
+
+       res = assert_wait(event, interruptible);
+       if (res == THREAD_WAITING) {
+               usimple_unlock(lock);
+               res = thread_block(THREAD_CONTINUE_NULL);
+               usimple_lock(lock);
+       }
+       return res;
+}
+
+/*
+ *     thread_sleep_mutex:
+ *
+ *     Cause the current thread to wait until the specified event
+ *     occurs.  The specified mutex is unlocked before releasing
+ *     the cpu. The mutex will be re-acquired before returning.
+ *
+ *     JMM - Add hint to make sure mutex is available before rousting
+ */
+wait_result_t
+thread_sleep_mutex(
+       event_t                 event,
+       mutex_t                 *mutex,
+       wait_interrupt_t interruptible)
+{
+       wait_result_t   res;
+
+       res = assert_wait(event, interruptible);
+       if (res == THREAD_WAITING) {
+               mutex_unlock(mutex);
+               res = thread_block(THREAD_CONTINUE_NULL);
+               mutex_lock(mutex);
+       }
+       return res;
+}
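
A wakeup says nothing about the caller's predicate, so these sleep variants normally sit inside a re-check loop; a hedged sketch around thread_sleep_mutex (the queue, its mutex, and the event are hypothetical):

    mutex_lock(&q_mutex);
    while (queue_empty(&q_head)) {
        wait_result_t   wres;

        wres = thread_sleep_mutex((event_t)&q_head, &q_mutex, THREAD_UNINT);
        assert(wres == THREAD_AWAKENED);
    }
    /* predicate holds and q_mutex is held again */
    mutex_unlock(&q_mutex);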
   
+/*
+ *     thread_sleep_mutex_deadline:
+ *
+ *     Cause the current thread to wait until the specified event
+ *     (or deadline) occurs.  The specified mutex is unlocked before
+ *     releasing the cpu. The mutex will be re-acquired before returning.
+ *
+ *     JMM - Add hint to make sure mutex is available before rousting
+ */
+wait_result_t
+thread_sleep_mutex_deadline(
+       event_t                 event,
+       mutex_t                 *mutex,
+       uint64_t                deadline,
+       wait_interrupt_t interruptible)
+{
+       wait_result_t   res;
+
+       res = assert_wait(event, interruptible);
+       if (res == THREAD_WAITING) {
+               mutex_unlock(mutex);
+               thread_set_timer_deadline(deadline);
+               res = thread_block(THREAD_CONTINUE_NULL);
+               if (res != THREAD_TIMED_OUT)
+                       thread_cancel_timer();
+               mutex_lock(mutex);
+       }
+       return res;
+}
+
+/*
+ *     thread_sleep_lock_write:
+ *
+ *     Cause the current thread to wait until the specified event
+ *     occurs.  The specified (write) lock is unlocked before releasing
+ *     the cpu. The (write) lock will be re-acquired before returning.
+ *
+ *     JMM - Add hint to make sure mutex is available before rousting
+ */
+wait_result_t
+thread_sleep_lock_write(
+       event_t                 event,
+       lock_t                  *lock,
+       wait_interrupt_t interruptible)
+{
+       wait_result_t   res;
+
+       res = assert_wait(event, interruptible);
+       if (res == THREAD_WAITING) {
+               lock_write_done(lock);
+               res = thread_block(THREAD_CONTINUE_NULL);
+               lock_write(lock);
+       }
+       return res;
+}
+
+
+/*
+ *     thread_sleep_funnel:
+ *
+ *     Cause the current thread to wait until the specified event
+ *     occurs.  If the thread is funnelled, the funnel will be released
+ *     before giving up the cpu. The funnel will be re-acquired before returning.
+ *
+ *     JMM - Right now the funnel is dropped and re-acquired inside
+ *               thread_block().  At some point, this may give thread_block() a hint.
+ */
+wait_result_t
+thread_sleep_funnel(
+       event_t                 event,
+       wait_interrupt_t interruptible)
+{
+       wait_result_t   res;
+
+       res = assert_wait(event, interruptible);
+       if (res == THREAD_WAITING) {
+               res = thread_block(THREAD_CONTINUE_NULL);
+       }
+       return res;
+}
+
 /*
  * thread_[un]stop(thread)
  *     Once a thread has blocked interruptibly (via assert_wait) prevent 
@@ -568,32 +785,62 @@ assert_wait(
  */
 boolean_t
 thread_stop(
-       thread_t                        thread)
+       thread_t        thread)
 {
-       spl_t                           s;
+       spl_t           s = splsched();
 
-       s = splsched();
        wake_lock(thread);
 
        while (thread->state & TH_SUSP) {
-               int wait_result;
+               wait_result_t   result;
 
                thread->wake_active = TRUE;
-               assert_wait((event_t)&thread->wake_active, THREAD_ABORTSAFE);
+               result = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
                wake_unlock(thread);
                splx(s);
 
-               wait_result = thread_block((void (*)(void)) 0);
-               if (wait_result != THREAD_AWAKENED)
+               if (result == THREAD_WAITING)
+                       result = thread_block(THREAD_CONTINUE_NULL);
+
+               if (result != THREAD_AWAKENED)
                        return (FALSE);
 
                s = splsched();
                wake_lock(thread);
        }
+
        thread_lock(thread);
        thread->state |= TH_SUSP;
-       thread_unlock(thread);
 
+       while (thread->state & TH_RUN) {
+               wait_result_t   result;
+               processor_t             processor = thread->last_processor;
+
+               if (    processor != PROCESSOR_NULL                                             &&
+                               processor->state == PROCESSOR_RUNNING                   &&
+                               processor->cpu_data->active_thread == thread    )
+                       cause_ast_check(processor);
+               thread_unlock(thread);
+
+               thread->wake_active = TRUE;
+               result = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
+               wake_unlock(thread);
+               splx(s);
+
+               if (result == THREAD_WAITING)
+                       result = thread_block(THREAD_CONTINUE_NULL);
+
+               if (result != THREAD_AWAKENED) {
+                       thread_unstop(thread);
+                       return (FALSE);
+               }
+
+               s = splsched();
+               wake_lock(thread);
+               thread_lock(thread);
+       }
+
+       thread_unlock(thread);
        wake_unlock(thread);
        splx(s);
 
@@ -606,19 +853,20 @@ thread_stop(
  */
 void
 thread_unstop(
-       thread_t                        thread)
+       thread_t        thread)
 {
-       spl_t                           s;
+       spl_t           s = splsched();
 
-       s = splsched();
        wake_lock(thread);
        thread_lock(thread);
 
-       if ((thread->state & (TH_RUN|TH_WAIT|TH_SUSP/*|TH_UNINT*/)) == TH_SUSP) {
+       if ((thread->state & (TH_RUN|TH_WAIT|TH_SUSP)) == TH_SUSP) {
                thread->state &= ~TH_SUSP;
                thread->state |= TH_RUN;
 
+               assert(!(thread->state & TH_IDLE));
                _mk_sp_thread_unblock(thread);
+               hw_atomic_add(&thread->processor_set->run_count, 1);
        }
        else
        if (thread->state & TH_SUSP) {
@@ -629,8 +877,8 @@ thread_unstop(
                        thread_unlock(thread);
                        wake_unlock(thread);
                        splx(s);
-                       thread_wakeup((event_t)&thread->wake_active);
 
+                       thread_wakeup(&thread->wake_active);
                        return;
                }
        }
@@ -645,58 +893,46 @@ thread_unstop(
  */
 boolean_t
 thread_wait(
-       thread_t                thread)
+       thread_t        thread)
 {
-       spl_t                   s;
+       spl_t           s = splsched();
 
-       s = splsched();
        wake_lock(thread);
+       thread_lock(thread);
 
-       while (thread->state & (TH_RUN/*|TH_UNINT*/)) {
-               int wait_result;
+       while (thread->state & TH_RUN) {
+               wait_result_t   result;
+               processor_t             processor = thread->last_processor;
 
-               if (thread->last_processor != PROCESSOR_NULL)
-                       cause_ast_check(thread->last_processor);
+               if (    processor != PROCESSOR_NULL                                             &&
+                               processor->state == PROCESSOR_RUNNING                   &&
+                               processor->cpu_data->active_thread == thread    )
+                       cause_ast_check(processor);
+               thread_unlock(thread);
 
                thread->wake_active = TRUE;
-               assert_wait((event_t)&thread->wake_active, THREAD_ABORTSAFE);
+               result = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
                wake_unlock(thread);
                splx(s);
 
-               wait_result = thread_block((void (*)(void))0);
-               if (wait_result != THREAD_AWAKENED)
-                       return FALSE;
+               if (result == THREAD_WAITING)
+                       result = thread_block(THREAD_CONTINUE_NULL);
+
+               if (result != THREAD_AWAKENED)
+                       return (FALSE);
 
                s = splsched();
                wake_lock(thread);
+               thread_lock(thread);
        }
 
+       thread_unlock(thread);
        wake_unlock(thread);
        splx(s);
 
        return (TRUE);
 }
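
A hedged sketch of the suspend pattern built from these primitives (the inspection step is hypothetical). Note that thread_stop above now also waits for TH_RUN to clear, which is what lets the separate thread_stop_wait below be retired:

    if (thread_stop(thread)) {
        /* thread is suspended and off any processor:
         * safe to examine or adjust its state */
        thread_unstop(thread);
    }
    else {
        /* our wait was aborted: thread was not stopped */
    }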
 
-
-/*
- * thread_stop_wait(thread)
- *     Stop the thread then wait for it to block interruptibly
- */
-boolean_t
-thread_stop_wait(
-       thread_t                thread)
-{
-       if (thread_stop(thread)) {
-               if (thread_wait(thread))
-                       return (TRUE);
-
-               thread_unstop(thread);
-       }
-
-       return (FALSE);
-}
-
-
 /*
  *     Routine: clear_wait_internal
  *
@@ -708,23 +944,46 @@ thread_stop_wait(
  *     Conditions:
  *             At splsched
  *             the thread is locked.
+ *     Returns:
+ *             KERN_SUCCESS            thread was rousted out of a wait
+ *             KERN_FAILURE            thread was waiting but could not be rousted
+ *             KERN_NOT_WAITING        thread was not waiting
  */
-void
+__private_extern__ kern_return_t
 clear_wait_internal(
-       thread_t        thread,
-       int             result)
+       thread_t                thread,
+       wait_result_t   result)
 {
-       /*
-        * If the thread isn't in a wait queue, just set it running.  Otherwise,
-        * try to remove it from the queue and, if successful, then set it
-        * running.  NEVER interrupt an uninterruptible thread.
-        */
-       if (!((result == THREAD_INTERRUPTED) && (thread->state & TH_UNINT))) {
-               if (wait_queue_assert_possible(thread) ||
-                   (wait_queue_remove(thread) == KERN_SUCCESS)) {
-                       thread_go_locked(thread, result);
+       wait_queue_t    wq = thread->wait_queue;
+       kern_return_t   ret;
+       int                             loop_count;
+
+       loop_count = 0;
+       do {
+               if ((result == THREAD_INTERRUPTED) && (thread->state & TH_UNINT))
+                       return KERN_FAILURE;
+
+               if (wq != WAIT_QUEUE_NULL) {
+                       if (wait_queue_lock_try(wq)) {
+                               wait_queue_pull_thread_locked(wq, thread, TRUE);
+                               /* wait queue unlocked, thread still locked */
+                       } else {
+                               thread_unlock(thread);
+                               delay(1);
+                               thread_lock(thread);
+
+                               if (wq != thread->wait_queue) {
+                                       return KERN_NOT_WAITING; /* we know it moved */
+                               }
+                               continue;
+                       }
                }
-       }
+               ret = thread_go_locked(thread, result);
+               return ret; 
+       } while (++loop_count < LockTimeOut);
+       panic("clear_wait_internal: deadlock: thread=0x%x, wq=0x%x, cpu=%d\n",
+                 thread, wq, cpu_number());
+       return KERN_FAILURE;
 }
 
 
@@ -738,18 +997,20 @@ clear_wait_internal(
  *       thread                thread to awaken
  *       result                Wakeup result the thread should see
  */
-void
+kern_return_t
 clear_wait(
-       thread_t        thread,
-       int             result)
+       thread_t                thread,
+       wait_result_t   result)
 {
+       kern_return_t ret;
        spl_t           s;
 
        s = splsched();
        thread_lock(thread);
-       clear_wait_internal(thread, result);
+       ret = clear_wait_internal(thread, result);
        thread_unlock(thread);
        splx(s);
+       return ret;
 }
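
With a kern_return_t now propagated out of clear_wait_internal, callers can tell a successful roust from the benign races; a minimal sketch:

    switch (clear_wait(thread, THREAD_INTERRUPTED)) {
    case KERN_SUCCESS:          /* we set the thread running */
        break;
    case KERN_NOT_WAITING:      /* it had already resumed on its own */
        break;
    case KERN_FAILURE:          /* uninterruptible: leave it waiting */
        break;
    }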
 
 
@@ -760,11 +1021,11 @@ clear_wait(
  *     and thread_wakeup_one.
  *
  */
-void
+kern_return_t
 thread_wakeup_prim(
        event_t                 event,
        boolean_t               one_thread,
-       int                     result)
+       wait_result_t   result)
 {
        register wait_queue_t   wq;
        register int                    index;
@@ -772,9 +1033,9 @@ thread_wakeup_prim(
        index = wait_hash(event);
        wq = &wait_queues[index];
        if (one_thread)
-           wait_queue_wakeup_one(wq, event, result);
+           return (wait_queue_wakeup_one(wq, event, result));
        else
-           wait_queue_wakeup_all(wq, event, result);
+           return (wait_queue_wakeup_all(wq, event, result));
 }
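
For context, the wrapping macros are defined in osfmk/kern/sched_prim.h roughly as follows (paraphrased from the header, not part of this diff):

    #define thread_wakeup(x)                    \
            thread_wakeup_prim((x), FALSE, THREAD_AWAKENED)
    #define thread_wakeup_with_result(x, z)     \
            thread_wakeup_prim((x), FALSE, (z))
    #define thread_wakeup_one(x)                \
            thread_wakeup_prim((x), TRUE, THREAD_AWAKENED)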
 
 /*
@@ -818,19 +1079,14 @@ thread_select(
         *      Check for other non-idle runnable threads.
         */
        pset = myprocessor->processor_set;
-       thread = current_thread();
-
-       /*
-        *      Update set_quanta for timesharing.
-        */
-       pset->set_quanta = pset->machine_quanta[
-                                                       (pset->runq.count > pset->processor_count) ?
-                                                                 pset->processor_count : pset->runq.count];
+       thread = myprocessor->cpu_data->active_thread;
 
        /* Update the thread's priority */
        if (thread->sched_stamp != sched_tick)
                update_priority(thread);
 
+       myprocessor->current_pri = thread->sched_pri;
+
        simple_lock(&runq->lock);
        simple_lock(&pset->runq.lock);
 
@@ -852,11 +1108,8 @@ thread_select(
                                (thread->sched_mode & TH_MODE_TIMESHARE)? pset->set_quanta: 1;
        }
        else
-       if (other_runnable) {
-               simple_unlock(&pset->runq.lock);
-               simple_unlock(&runq->lock);
+       if (other_runnable)
                thread = choose_thread(myprocessor);
-       }
        else {
                simple_unlock(&pset->runq.lock);
                simple_unlock(&runq->lock);
@@ -866,23 +1119,19 @@ thread_select(
                 *      was running.  If it was in an assignment or shutdown,
                 *      leave it alone.  Return its idle thread.
                 */
-               simple_lock(&pset->idle_lock);
+               simple_lock(&pset->sched_lock);
                if (myprocessor->state == PROCESSOR_RUNNING) {
+                       remqueue(&pset->active_queue, (queue_entry_t)myprocessor);
                        myprocessor->state = PROCESSOR_IDLE;
-                       /*
-                        *      XXX Until it goes away, put master on end of queue, others
-                        *      XXX on front so master gets used last.
-                        */
+
                        if (myprocessor == master_processor)
-                               queue_enter(&(pset->idle_queue), myprocessor,
-                                                                                       processor_t, processor_queue);
+                               enqueue_tail(&pset->idle_queue, (queue_entry_t)myprocessor);
                        else
-                               queue_enter_first(&(pset->idle_queue), myprocessor,
-                                                                                       processor_t, processor_queue);
+                               enqueue_head(&pset->idle_queue, (queue_entry_t)myprocessor);
 
                        pset->idle_count++;
                }
-               simple_unlock(&pset->idle_lock);
+               simple_unlock(&pset->sched_lock);
 
                thread = myprocessor->idle_thread;
        }
@@ -896,8 +1145,7 @@ thread_select(
  *     If continuation is non-zero, and the current thread is blocked,
  *     then it will resume by executing continuation on a new stack.
  *     Returns TRUE if the hand-off succeeds.
- *     The reason parameter contains | AST_QUANTUM if the thread blocked
- *     because its quantum expired.
+ *
  *     Assumes splsched.
  */
 
@@ -912,297 +1160,300 @@ thread_invoke(
        register thread_t       old_thread,
        register thread_t       new_thread,
        int                                     reason,
-       void                (*continuation)(void))
+       thread_continue_t       old_cont)
 {
-       void                (*lcont)(void);
+       thread_continue_t       new_cont;
+       processor_t                     processor;
 
-       if (cpu_data[cpu_number()].preemption_level != 0)
+       if (get_preemption_level() != 0)
                panic("thread_invoke: preemption_level %d\n",
-                                                               cpu_data[cpu_number()].preemption_level);
+                                                               get_preemption_level());
 
        /*
-        *      Mark thread interruptible.
+        * Mark thread interruptible.
         */
        thread_lock(new_thread);
        new_thread->state &= ~TH_UNINT;
 
        assert(thread_runnable(new_thread));
 
-       assert(old_thread->continuation == (void (*)(void))0);  
+       assert(old_thread->continuation == NULL);       
 
+       /*
+        * Allow time constraint threads to hang onto
+        * a stack.
+        */
        if (    (old_thread->sched_mode & TH_MODE_REALTIME)             &&
                                        !old_thread->stack_privilege                            ) {
                old_thread->stack_privilege = old_thread->kernel_stack;
        }
 
-       if (continuation != (void (*)()) 0) {
-         switch (new_thread->state & TH_STACK_STATE) {
-         case TH_STACK_HANDOFF:
-
-               /*
-                * If the old thread has stack privilege, we can't give
-                * his stack away. So go and get him one and treat this
-                * as a traditional context switch.
-                */
-               if (old_thread->stack_privilege == current_stack()) 
-                       goto get_new_stack;
+       if (old_cont != NULL) {
+               if (new_thread->state & TH_STACK_HANDOFF) {
+                       /*
+                        * If the old thread is using a privileged stack,
+                        * check to see whether we can exchange it with
+                        * that of the new thread.
+                        */
+                       if (    old_thread->kernel_stack == old_thread->stack_privilege &&
+                                                       !new_thread->stack_privilege)
+                               goto need_stack;
 
-               /*
-                * Make the whole handoff/dispatch atomic to match the
-                * non-handoff case.
-                */
-               disable_preemption();
+                       new_thread->state &= ~TH_STACK_HANDOFF;
+                       new_cont = new_thread->continuation;
+                       new_thread->continuation = NULL;
 
-               /*
-                *      Set up ast context of new thread and switch to its timer.
-                */
-               new_thread->state &= ~(TH_STACK_HANDOFF|TH_UNINT);
-               new_thread->last_processor = current_processor();
-               ast_context(new_thread->top_act, cpu_number());
-               timer_switch(&new_thread->system_timer);
-               thread_unlock(new_thread);
+                       /*
+                        * Set up ast context of new thread and switch
+                        * to its timer.
+                        */
+                       processor = current_processor();
+                       new_thread->last_processor = processor;
+                       processor->current_pri = new_thread->sched_pri;
+                       ast_context(new_thread->top_act, processor->slot_num);
+                       timer_switch(&new_thread->system_timer);
+                       thread_unlock(new_thread);
                
-               current_task()->csw++;
-
-               old_thread->continuation = continuation;
-               stack_handoff(old_thread, new_thread);
+                       current_task()->csw++;
 
-               wake_lock(old_thread);
-               thread_lock(old_thread);
-               act_machine_sv_free(old_thread->top_act);
+                       old_thread->reason = reason;
+                       old_thread->continuation = old_cont;
           
-               _mk_sp_thread_done(old_thread);
+                       _mk_sp_thread_done(old_thread, new_thread, processor);
 
-               /* 
-                *  inline thread_dispatch but don't free stack
-                */
+                       stack_handoff(old_thread, new_thread);
+
+                       _mk_sp_thread_begin(new_thread, processor);
 
-               switch (old_thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_IDLE)) {
-           case TH_RUN                     | TH_UNINT:
-           case TH_RUN:
-                 /*
-                  *    No reason to stop.  Put back on a run queue.
-                  */
-                 old_thread->state |= TH_STACK_HANDOFF;
-                 _mk_sp_thread_dispatch(old_thread);
-                 break;
-
-           case TH_RUN | TH_WAIT           | TH_UNINT:
-           case TH_RUN | TH_WAIT:
-                 old_thread->sleep_stamp = sched_tick;
-                 /* fallthrough */
-
-           case          TH_WAIT:                      /* this happens! */
-                 /*
-                  *    Waiting
-                  */
-                 old_thread->state |= TH_STACK_HANDOFF;
-                 old_thread->state &= ~TH_RUN;
-                 if (old_thread->state & TH_TERMINATE)
-                       thread_reaper_enqueue(old_thread);
-
-                 if (old_thread->wake_active) {
-                       old_thread->wake_active = FALSE;
-                       thread_unlock(old_thread);
-                       wake_unlock(old_thread);
-                       thread_wakeup((event_t)&old_thread->wake_active);
                        wake_lock(old_thread);
                        thread_lock(old_thread);
-                 }
-                 break;
-
-           case TH_RUN | TH_IDLE:
-                 /*
-                  *    Drop idle thread -- it is already in
-                  *    idle_thread_array.
-                  */
-                 old_thread->state |= TH_STACK_HANDOFF;
-                 break;
-
-           default:
-                 panic("State 0x%x \n",old_thread->state);
-               }
 
-               thread_unlock(old_thread);
-               wake_unlock(old_thread);
+                       /* 
+                        * Inline thread_dispatch but
+                        * don't free stack.
+                        */
+
+                       switch (old_thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_IDLE)) {
+                       case TH_RUN                             | TH_UNINT:
+                       case TH_RUN:
+                               /*
+                                * Still running, put back
+                                * onto a run queue.
+                                */
+                               old_thread->state |= TH_STACK_HANDOFF;
+                               _mk_sp_thread_dispatch(old_thread);
 
-               thread_lock(new_thread);
-               assert(thread_runnable(new_thread));
-               _mk_sp_thread_begin(new_thread);
+                               thread_unlock(old_thread);
+                               wake_unlock(old_thread);
+                               break;
 
-               lcont = new_thread->continuation;
-               new_thread->continuation = (void(*)(void))0;
+                       case TH_RUN | TH_WAIT   | TH_UNINT:
+                       case TH_RUN | TH_WAIT:
+                       {
+                               boolean_t       reap, wake, callblock;
 
-               thread_unlock(new_thread);
-               enable_preemption();
+                               /*
+                                * Waiting.
+                                */
+                               old_thread->sleep_stamp = sched_tick;
+                               old_thread->state |= TH_STACK_HANDOFF;
+                               old_thread->state &= ~TH_RUN;
+                               hw_atomic_sub(&old_thread->processor_set->run_count, 1);
+                               callblock = old_thread->active_callout;
+                               wake = old_thread->wake_active;
+                               old_thread->wake_active = FALSE;
+                               reap = (old_thread->state & TH_TERMINATE)? TRUE: FALSE;
+
+                               thread_unlock(old_thread);
+                               wake_unlock(old_thread);
+
+                               if (callblock)
+                                       call_thread_block();
+
+                               if (wake)
+                                       thread_wakeup((event_t)&old_thread->wake_active);
+
+                               if (reap)
+                                       thread_reaper_enqueue(old_thread);
+                               break;
+                       }
 
-               counter_always(c_thread_invoke_hits++);
+                       case TH_RUN                             | TH_IDLE:
+                               /*
+                                * The idle threads don't go
+                                * onto a run queue.
+                                */
+                               old_thread->state |= TH_STACK_HANDOFF;
+                               thread_unlock(old_thread);
+                               wake_unlock(old_thread);
+                               break;
 
-               if (new_thread->funnel_state & TH_FN_REFUNNEL) {
-                 kern_return_t save_wait_result;
-                 new_thread->funnel_state = 0;
-                 save_wait_result = new_thread->wait_result;
-                 KERNEL_DEBUG(0x6032428 | DBG_FUNC_NONE, new_thread->funnel_lock, 2, 0, 0, 0);
-                 //mutex_lock(new_thread->funnel_lock);
-                 funnel_lock(new_thread->funnel_lock);
-                 KERNEL_DEBUG(0x6032430 | DBG_FUNC_NONE, new_thread->funnel_lock, 2, 0, 0, 0);
-                 new_thread->funnel_state = TH_FN_OWNED;
-                 new_thread->wait_result = save_wait_result;
-               }
-               (void) spllo();
+                       default:
+                               panic("thread_invoke: state 0x%x\n", old_thread->state);
+                       }
 
-               assert(lcont);
-               call_continuation(lcont);
-               /*NOTREACHED*/
-               return TRUE;
+                       counter_always(c_thread_invoke_hits++);
 
-         case TH_STACK_ALLOC:
-               /*
-                * waiting for a stack
-                */
-               thread_swapin(new_thread);
-               thread_unlock(new_thread);
-               counter_always(c_thread_invoke_misses++);
-               return FALSE;
-
-      case 0:
-        /*
-         * already has a stack - can't handoff
-         */
-               if (new_thread == old_thread) {
+                       if (new_thread->funnel_state & TH_FN_REFUNNEL) {
+                               kern_return_t           wait_result = new_thread->wait_result;
 
-                 /* same thread but with continuation */
-                 counter(++c_thread_invoke_same);
-                 thread_unlock(new_thread);
-
-                 if (old_thread->funnel_state & TH_FN_REFUNNEL) {
-                       kern_return_t save_wait_result;
-
-                       old_thread->funnel_state = 0;
-                       save_wait_result = old_thread->wait_result;
-                       KERNEL_DEBUG(0x6032428 | DBG_FUNC_NONE, old_thread->funnel_lock, 3, 0, 0, 0);
-                       funnel_lock(old_thread->funnel_lock);
-                       KERNEL_DEBUG(0x6032430 | DBG_FUNC_NONE, old_thread->funnel_lock, 3, 0, 0, 0);
-                       old_thread->funnel_state = TH_FN_OWNED;
-                       old_thread->wait_result = save_wait_result;
-                 }
-                 (void) spllo();
-                 call_continuation(continuation);
-          /*NOTREACHED*/
+                               new_thread->funnel_state = 0;
+                               KERNEL_DEBUG(0x6032428 | DBG_FUNC_NONE,
+                                                                       new_thread->funnel_lock, 2, 0, 0, 0);
+                               funnel_lock(new_thread->funnel_lock);
+                               KERNEL_DEBUG(0x6032430 | DBG_FUNC_NONE,
+                                                                       new_thread->funnel_lock, 2, 0, 0, 0);
+                               new_thread->funnel_state = TH_FN_OWNED;
+                               new_thread->wait_result = wait_result;
+                       }
+                       (void) spllo();
+
+                       assert(new_cont);
+                       call_continuation(new_cont);
+                       /*NOTREACHED*/
+                       return (TRUE);
                }
-        break;
-      }
-       } else {
-         /*
-          * check that the new thread has a stack
-          */
-         if (new_thread->state & TH_STACK_STATE) {
-         get_new_stack:
-               /* has no stack. if not already waiting for one try to get one */
-               if ((new_thread->state & TH_STACK_ALLOC) ||
-                       /* not already waiting. nonblocking try to get one */
-                       !stack_alloc_try(new_thread, thread_continue))
-                 {
-                       /* couldn't get one. schedule new thread to get a stack and
-                          return failure so we can try another thread. */
-                       thread_swapin(new_thread);
-                       thread_unlock(new_thread);
+               else
+               if (new_thread->state & TH_STACK_ALLOC) {
+                       /*
+                        * Waiting for a stack
+                        */
                        counter_always(c_thread_invoke_misses++);
-            return FALSE;
-          }
-         } else if (old_thread == new_thread) {
-                 counter(++c_thread_invoke_same);
-                 thread_unlock(new_thread);
-                 return TRUE;
-         }
-
-         /* new thread now has a stack. it has been setup to resume in
-                thread_continue so it can dispatch the old thread, deal with
-                funnelling and then go to it's true continuation point */
+                       thread_unlock(new_thread);
+                       return (FALSE);
+               }
+               else
+               if (new_thread == old_thread) {
+                       /* same thread but with continuation */
+                       counter(++c_thread_invoke_same);
+                       thread_unlock(new_thread);
+
+                       if (new_thread->funnel_state & TH_FN_REFUNNEL) {
+                               kern_return_t   wait_result = new_thread->wait_result;
+
+                               new_thread->funnel_state = 0;
+                               KERNEL_DEBUG(0x6032428 | DBG_FUNC_NONE,
+                                                                       new_thread->funnel_lock, 3, 0, 0, 0);
+                               funnel_lock(new_thread->funnel_lock);
+                               KERNEL_DEBUG(0x6032430 | DBG_FUNC_NONE,
+                                                                       new_thread->funnel_lock, 3, 0, 0, 0);
+                               new_thread->funnel_state = TH_FN_OWNED;
+                               new_thread->wait_result = wait_result;
+                       }
+                       (void) spllo();
+                       call_continuation(old_cont);
+                       /*NOTREACHED*/
+               }
        }
+       else {
+               /*
+                * Check that the new thread has a stack
+                */
+               if (new_thread->state & TH_STACK_HANDOFF) {
+need_stack:
+                       if (!stack_alloc_try(new_thread, thread_continue)) {
+                               counter_always(c_thread_invoke_misses++);
+                               thread_swapin(new_thread);
+                               return (FALSE);
+                       }
         
-       new_thread->state &= ~(TH_STACK_HANDOFF | TH_UNINT);
+                       new_thread->state &= ~TH_STACK_HANDOFF;
+               }
+               else
+               if (new_thread->state & TH_STACK_ALLOC) {
+                       /*
+                        * Waiting for a stack
+                        */
+                       counter_always(c_thread_invoke_misses++);
+                       thread_unlock(new_thread);
+                       return (FALSE);
+               }
+               else
+               if (old_thread == new_thread) {
+                       counter(++c_thread_invoke_same);
+                       thread_unlock(new_thread);
+                       return (TRUE);
+               }
+       }
 
        /*
-        *      Set up ast context of new thread and switch to its timer.
+        * Set up ast context of new thread and switch to its timer.
         */
-       new_thread->last_processor = current_processor();
-       ast_context(new_thread->top_act, cpu_number());
+       processor = current_processor();
+       new_thread->last_processor = processor;
+       processor->current_pri = new_thread->sched_pri;
+       ast_context(new_thread->top_act, processor->slot_num);
        timer_switch(&new_thread->system_timer);
        assert(thread_runnable(new_thread));
-       
-       /*
-        * N.B. On return from the call to switch_context, 'old_thread'
-        * points at the thread that yielded to us.  Unfortunately, at
-        * this point, there are no simple_locks held, so if we are preempted
-        * before the call to thread_dispatch blocks preemption, it is
-        * possible for 'old_thread' to terminate, leaving us with a
-        * stale thread pointer.
-        */
-       disable_preemption();
-
        thread_unlock(new_thread);
 
        counter_always(c_thread_invoke_csw++);
        current_task()->csw++;
 
-       thread_lock(old_thread);
-       old_thread->reason = reason;
        assert(old_thread->runq == RUN_QUEUE_NULL);
+       old_thread->reason = reason;
+       old_thread->continuation = old_cont;
 
-       if (continuation != (void (*)(void))0)
-               old_thread->continuation = continuation;
-
-       _mk_sp_thread_done(old_thread);
-       thread_unlock(old_thread);
+       _mk_sp_thread_done(old_thread, new_thread, processor);
 
        /*
         *      switch_context is machine-dependent.  It does the
         *      machine-dependent components of a context-switch, like
         *      changing address spaces.  It updates active_threads.
         */
-       old_thread = switch_context(old_thread, continuation, new_thread);
+       old_thread = switch_context(old_thread, old_cont, new_thread);
        
        /* Now on new thread's stack.  Set a local variable to refer to it. */
        new_thread = __current_thread();
        assert(old_thread != new_thread);
 
-       thread_lock(new_thread);
        assert(thread_runnable(new_thread));
-       _mk_sp_thread_begin(new_thread);
-       thread_unlock(new_thread);
+       _mk_sp_thread_begin(new_thread, new_thread->last_processor);
 
        /*
         *      We're back.  Now old_thread is the thread that resumed
         *      us, and we have to dispatch it.
         */
-
        thread_dispatch(old_thread);
-       enable_preemption();
-
-       /* if we get here and 'continuation' is set that means the
-        * switch_context() path returned and did not call out
-        * to the continuation. we will do it manually here */
-       if (continuation) {
-         call_continuation(continuation);
-         /* NOTREACHED */
+
+       if (old_cont) {
+               if (new_thread->funnel_state & TH_FN_REFUNNEL) {
+                       kern_return_t           wait_result = new_thread->wait_result;
+
+                       new_thread->funnel_state = 0;
+                       KERNEL_DEBUG(0x6032428 | DBG_FUNC_NONE,
+                                                               new_thread->funnel_lock, 3, 0, 0, 0);
+                       funnel_lock(new_thread->funnel_lock);
+                       KERNEL_DEBUG(0x6032430 | DBG_FUNC_NONE,
+                                                               new_thread->funnel_lock, 3, 0, 0, 0);
+                       new_thread->funnel_state = TH_FN_OWNED;
+                       new_thread->wait_result = wait_result;
+               }
+               (void) spllo();
+               call_continuation(old_cont);
+               /*NOTREACHED*/
        }
 
-       return TRUE;
+       return (TRUE);
 }
 
 /*
  *     thread_continue:
  *
- *     Called when the launching a new thread, at splsched();
+ *     Called when a thread gets a new stack, at splsched();
  */
 void
 thread_continue(
        register thread_t       old_thread)
 {
-       register thread_t       self = current_thread();
-       register void           (*continuation)();
+       register thread_t                       self = current_thread();
+       register thread_continue_t      continuation;
+       
+       continuation = self->continuation;
+       self->continuation = NULL;
+
+       _mk_sp_thread_begin(self, self->last_processor);
        
        /*
         *      We must dispatch the old thread and then
@@ -1212,39 +1463,20 @@ thread_continue(
         */
        if (old_thread != THREAD_NULL)
                thread_dispatch(old_thread);
-       
-       thread_lock(self);
-       continuation = self->continuation;
-       self->continuation = (void (*)(void))0;
-
-       _mk_sp_thread_begin(self);
-       thread_unlock(self);
-
-       /*
-        * N.B. - the following is necessary, since thread_invoke()
-        * inhibits preemption on entry and reenables before it
-        * returns.  Unfortunately, the first time a newly-created
-        * thread executes, it magically appears here, and never
-        * executes the enable_preemption() call in thread_invoke().
-        */
-       enable_preemption();
 
        if (self->funnel_state & TH_FN_REFUNNEL) {
-               kern_return_t   save_wait_result;
+               kern_return_t           wait_result = self->wait_result;
 
                self->funnel_state = 0;
-               save_wait_result = self->wait_result;
                KERNEL_DEBUG(0x6032428 | DBG_FUNC_NONE, self->funnel_lock, 4, 0, 0, 0);
                funnel_lock(self->funnel_lock);
                KERNEL_DEBUG(0x6032430 | DBG_FUNC_NONE, self->funnel_lock, 4, 0, 0, 0);
-               self->wait_result = save_wait_result;
                self->funnel_state = TH_FN_OWNED;
+               self->wait_result = wait_result;
        }
-
-       spllo();
-
+       (void)spllo();
        assert(continuation);
-       (*continuation)();
+       call_continuation(continuation);
        /*NOTREACHED*/
 }
 
@@ -1357,8 +1589,8 @@ counter(mach_counter_t  c_thread_block_calls = 0;)
  
 int
 thread_block_reason(
-       void            (*continuation)(void),
-       int                     reason)
+       thread_continue_t       continuation,
+       ast_t                           reason)
 {
        register thread_t               thread = current_thread();
        register processor_t    myprocessor;
@@ -1374,61 +1606,58 @@ thread_block_reason(
        s = splsched();
 
        if ((thread->funnel_state & TH_FN_OWNED) && !(reason & AST_PREEMPT)) {
-         thread->funnel_state = TH_FN_REFUNNEL;
-         KERNEL_DEBUG(0x603242c | DBG_FUNC_NONE, thread->funnel_lock, 2, 0, 0, 0);
-         funnel_unlock(thread->funnel_lock);
+               thread->funnel_state = TH_FN_REFUNNEL;
+               KERNEL_DEBUG(
+                       0x603242c | DBG_FUNC_NONE, thread->funnel_lock, 2, 0, 0, 0);
+               funnel_unlock(thread->funnel_lock);
        }
 
        myprocessor = current_processor();
 
-       thread_lock(thread);
-       if (thread->state & TH_ABORT)
-               clear_wait_internal(thread, THREAD_INTERRUPTED);
-
-       if (!(reason & AST_BLOCK))
+       /* If we're explicitly yielding, force a subsequent quantum */
+       if (reason & AST_YIELD)
                myprocessor->slice_quanta = 0;
 
-       /* Unconditionally remove either | both */
-       ast_off(AST_PREEMPT);
+       /* We're handling all scheduling AST's */
+       ast_off(AST_SCHEDULING);
 
+       thread_lock(thread);
        new_thread = thread_select(myprocessor);
-       assert(new_thread);
-       assert(thread_runnable(new_thread));
+       assert(new_thread && thread_runnable(new_thread));
        thread_unlock(thread);
        while (!thread_invoke(thread, new_thread, reason, continuation)) {
                thread_lock(thread);
                new_thread = thread_select(myprocessor);
-               assert(new_thread);
-               assert(thread_runnable(new_thread));
+               assert(new_thread && thread_runnable(new_thread));
                thread_unlock(thread);
        }
 
        if (thread->funnel_state & TH_FN_REFUNNEL) {
-               kern_return_t   save_wait_result;
+               kern_return_t   wait_result = thread->wait_result;
 
-               save_wait_result = thread->wait_result;
                thread->funnel_state = 0;
-               KERNEL_DEBUG(0x6032428 | DBG_FUNC_NONE, thread->funnel_lock, 5, 0, 0, 0);
+               KERNEL_DEBUG(
+                       0x6032428 | DBG_FUNC_NONE, thread->funnel_lock, 5, 0, 0, 0);
                funnel_lock(thread->funnel_lock);
-               KERNEL_DEBUG(0x6032430 | DBG_FUNC_NONE, thread->funnel_lock, 5, 0, 0, 0);
+               KERNEL_DEBUG(
+                       0x6032430 | DBG_FUNC_NONE, thread->funnel_lock, 5, 0, 0, 0);
                thread->funnel_state = TH_FN_OWNED;
-               thread->wait_result = save_wait_result;
+               thread->wait_result = wait_result;
        }
 
        splx(s);
 
-       return thread->wait_result;
+       return (thread->wait_result);
 }
 
 /*
  *     thread_block:
  *
- *     Block the current thread if a wait has been asserted,
- *     otherwise yield the remainder of the current quantum.
+ *     Block the current thread if a wait has been asserted.
  */
 int
 thread_block(
-       void            (*continuation)(void))
+       thread_continue_t       continuation)
 {
        return thread_block_reason(continuation, AST_NONE);
 }
@@ -1436,26 +1665,57 @@ thread_block(
 /*
  *     thread_run:
  *
- *     Switch directly from the current thread to a specified
- *     thread.  Both the current and new threads must be
- *     runnable.
+ *     Switch directly from the current (old) thread to the
+ *     specified thread, handing off our quantum if possible.
+ *
+ *     New thread must be runnable, and not on a run queue.
  *
  *  Assumption:
  *     at splsched.
  */
 int
 thread_run(
-       thread_t        old_thread,
-       void            (*continuation)(void),
-       thread_t        new_thread)
+       thread_t                        old_thread,
+       thread_continue_t       continuation,
+       thread_t                        new_thread)
 {
-       while (!thread_invoke(old_thread, new_thread, 0, continuation)) {
-               register processor_t myprocessor = current_processor();
+       ast_t           handoff = AST_HANDOFF;
+
+       assert(old_thread == current_thread());
+
+       machine_clock_assist();
+
+       if (old_thread->funnel_state & TH_FN_OWNED) {
+               old_thread->funnel_state = TH_FN_REFUNNEL;
+               KERNEL_DEBUG(
+                       0x603242c | DBG_FUNC_NONE, old_thread->funnel_lock, 3, 0, 0, 0);
+               funnel_unlock(old_thread->funnel_lock);
+       }
+
+       while (!thread_invoke(old_thread, new_thread, handoff, continuation)) {
+               register processor_t            myprocessor = current_processor();
+
                thread_lock(old_thread);
                new_thread = thread_select(myprocessor);
                thread_unlock(old_thread);
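+
+               /* the quantum handoff is attempted only once */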
+               handoff = AST_NONE;
+       }
+
+       /* If we fell through (resumed here rather than in a continuation) */
+       if (old_thread->funnel_state & TH_FN_REFUNNEL) {
+               kern_return_t   wait_result = old_thread->wait_result;
+
+               old_thread->funnel_state = 0;
+               KERNEL_DEBUG(
+                       0x6032428 | DBG_FUNC_NONE, old_thread->funnel_lock, 6, 0, 0, 0);
+               funnel_lock(old_thread->funnel_lock);
+               KERNEL_DEBUG(
+                       0x6032430 | DBG_FUNC_NONE, old_thread->funnel_lock, 6, 0, 0, 0);
+               old_thread->funnel_state = TH_FN_OWNED;
+               old_thread->wait_result = wait_result;
        }
-       return old_thread->wait_result;
+
+       return (old_thread->wait_result);
 }
 
 /*
@@ -1466,23 +1726,19 @@ void
 thread_dispatch(
        register thread_t       thread)
 {
+       wake_lock(thread);
+       thread_lock(thread);
+
        /*
         *      If we are discarding the thread's stack, we must do it
         *      before the thread has a chance to run.
         */
-       wake_lock(thread);
-       thread_lock(thread);
-
 #ifndef i386
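+       /*
+        * A thread with a continuation does not need its kernel
+        * stack to resume; it restarts at the continuation on a
+        * fresh stack, so the stack can be released here.
+        */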
-       /* no continuations on i386 for now */
-    if (thread->continuation != (void (*)())0) {
-      assert((thread->state & TH_STACK_STATE) == 0);
-      thread->state |= TH_STACK_HANDOFF;
-      stack_free(thread);
-      if (thread->top_act) {
-        act_machine_sv_free(thread->top_act);
-        }
-      }
+       if (thread->continuation != NULL) {
+               assert((thread->state & TH_STACK_STATE) == 0);
+               thread->state |= TH_STACK_HANDOFF;
+               stack_free(thread);
+       }
 #endif
 
        switch (thread->state & (TH_RUN|TH_WAIT|TH_UNINT|TH_IDLE)) {
@@ -1497,101 +1753,128 @@ thread_dispatch(
 
        case TH_RUN | TH_WAIT   | TH_UNINT:
        case TH_RUN | TH_WAIT:
-               thread->sleep_stamp = sched_tick;
-               /* fallthrough */
-       case              TH_WAIT:                      /* this happens! */
+       {
+               boolean_t       reap, wake, callblock;
        
                /*
                 *      Waiting
                 */
+               thread->sleep_stamp = sched_tick;
                thread->state &= ~TH_RUN;
-               if (thread->state & TH_TERMINATE)
-                       thread_reaper_enqueue(thread);
+               hw_atomic_sub(&thread->processor_set->run_count, 1);
+               callblock = thread->active_callout;
+               wake = thread->wake_active;
+               thread->wake_active = FALSE;
+               reap = (thread->state & TH_TERMINATE)? TRUE: FALSE;
 
-               if (thread->wake_active) {
-                   thread->wake_active = FALSE;
-                   thread_unlock(thread);
-                   wake_unlock(thread);
+               thread_unlock(thread);
+               wake_unlock(thread);
+
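+               /*
+                * Act on the latched flags only now, with both
+                * locks dropped.
+                */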
+               if (callblock)
+                       call_thread_block();
+
+               if (wake)
                    thread_wakeup((event_t)&thread->wake_active);
-                   return;
-               }
-               break;
+
+               if (reap)
+                       thread_reaper_enqueue(thread);
+
+               return;
+       }
 
        case TH_RUN                                             | TH_IDLE:
                /*
-                *      Drop idle thread -- it is already in
-                *      idle_thread_array.
+                * The idle threads don't go
+                * onto a run queue.
                 */
                break;
 
        default:
-               panic("State 0x%x \n",thread->state);
+               panic("thread_dispatch: bad thread state 0x%x\n", thread->state);
        }
+
        thread_unlock(thread);
        wake_unlock(thread);
 }
 
 /*
  * Enqueue thread on run queue.  Thread must be locked,
- * and not already be on a run queue.
+ * and not already be on a run queue.  Returns TRUE iff
+ * the particular queue level was empty beforehand.
  */
-int
+boolean_t
 run_queue_enqueue(
        register run_queue_t    rq,
        register thread_t               thread,
        boolean_t                               tail)
 {
-       register int                    whichq;
-       int                                             oldrqcount;
+       register int                    whichq = thread->sched_pri;
+       register queue_t                queue = &rq->queues[whichq];
+       boolean_t                               result = FALSE;
        
-       whichq = thread->sched_pri;
        assert(whichq >= MINPRI && whichq <= MAXPRI);
 
-       simple_lock(&rq->lock); /* lock the run queue */
+       simple_lock(&rq->lock);
        assert(thread->runq == RUN_QUEUE_NULL);
+       if (queue_empty(queue)) {
+               enqueue_tail(queue, (queue_entry_t)thread);
+
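+               /*
+                * The bitmap records occupied priority levels,
+                * inverted so that ffsbit() finds the highest
+                * occupied level first.
+                */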
+               setbit(MAXPRI - whichq, rq->bitmap);
+               if (whichq > rq->highq)
+                       rq->highq = whichq;
+               result = TRUE;
+       }
+       else
        if (tail)
-               enqueue_tail(&rq->queues[whichq], (queue_entry_t)thread);
+               enqueue_tail(queue, (queue_entry_t)thread);
        else
-               enqueue_head(&rq->queues[whichq], (queue_entry_t)thread);
-
-       setbit(MAXPRI - whichq, rq->bitmap);
-       if (whichq > rq->highq)
-               rq->highq = whichq;
+               enqueue_head(queue, (queue_entry_t)thread);
 
-       oldrqcount = rq->count++;
        thread->runq = rq;
-       thread->whichq = whichq;
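+       /* urgency counts enqueued threads that require urgent preemption */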
+       if (thread->sched_mode & TH_MODE_PREEMPT)
+               rq->urgency++;
+       rq->count++;
 #if    DEBUG
        thread_check(thread, rq);
 #endif /* DEBUG */
        simple_unlock(&rq->lock);
 
-       return (oldrqcount);
+       return (result);
 }
 
+struct {
+       uint32_t        pset_idle_last,
+                               pset_idle_any,
+                               pset_self,
+                               pset_last,
+                               pset_other,
+                               bound_idle,
+                               bound_self,
+                               bound_other;
+} dispatch_counts;
+
 /*
  *     thread_setrun:
  *
- *     Make thread runnable; dispatch directly onto an idle processor
- *     if possible.  Else put on appropriate run queue (processor
- *     if bound, else processor set.  Caller must have lock on thread.
- *     This is always called at splsched.
- *     The tail parameter, if TRUE || TAIL_Q, indicates that the 
- *     thread should be placed at the tail of the runq. If 
- *     FALSE || HEAD_Q the thread will be placed at the head of the 
- *      appropriate runq.
+ *     Dispatch thread for execution, directly onto an idle
+ *     processor if possible.  Else put it on the appropriate
+ *     run queue (local if bound, else processor set).
+ *
+ *     Thread must be locked.
+ *
+ *     The tail parameter indicates the proper placement of
+ *     the thread on a run queue.
  */
 void
 thread_setrun(
        register thread_t                       new_thread,
-       boolean_t                                       may_preempt,
        boolean_t                                       tail)
 {
        register processor_t            processor;
-       register run_queue_t            runq;
        register processor_set_t        pset;
-       thread_t                                        thread;
-       ast_t                                           ast_flags = AST_BLOCK;
+       register thread_t                       thread;
+       boolean_t                                       try_preempt = FALSE;
+       ast_t                                           preempt = AST_BLOCK;
 
        assert(thread_runnable(new_thread));
        
@@ -1601,135 +1884,301 @@ thread_setrun(
        if (new_thread->sched_stamp != sched_tick)
                update_priority(new_thread);
 
-       if (    new_thread->sched_pri >= BASEPRI_PREEMPT                &&
-                       kernel_preemption_mode == KERNEL_PREEMPT                        )
-               ast_flags |= AST_URGENT;
-       
-       assert(new_thread->runq == RUN_QUEUE_NULL);
-
        /*
-        *      Try to dispatch the thread directly onto an idle processor.
+        *      Check for urgent preemption.
         */
+       if (new_thread->sched_mode & TH_MODE_PREEMPT)
+               preempt |= AST_URGENT;
+
+       assert(new_thread->runq == RUN_QUEUE_NULL);
+
        if ((processor = new_thread->bound_processor) == PROCESSOR_NULL) {
            /*
-            *  Not bound, any processor in the processor set is ok.
+            *  First try to dispatch on
+            *  the last processor.
             */
            pset = new_thread->processor_set;
-           if (pset->idle_count > 0) {
-                       simple_lock(&pset->idle_lock);
-                       if (pset->idle_count > 0) {
-                               processor = (processor_t) queue_first(&pset->idle_queue);
-                               queue_remove(&(pset->idle_queue), processor, processor_t,
-                                       processor_queue);
+               processor = new_thread->last_processor;
+               if (    pset->processor_count > 1                               &&
+                               processor != PROCESSOR_NULL                             &&
+                               processor->state == PROCESSOR_IDLE              ) {
+                       simple_lock(&processor->lock);
+                       simple_lock(&pset->sched_lock);
+                       if (    processor->processor_set == pset                &&
+                                       processor->state == PROCESSOR_IDLE              ) {
+                               remqueue(&pset->idle_queue, (queue_entry_t)processor);
                                pset->idle_count--;
                                processor->next_thread = new_thread;
                                processor->state = PROCESSOR_DISPATCHING;
-                               simple_unlock(&pset->idle_lock);
-                               if(processor->slot_num != cpu_number()) 
+                               simple_unlock(&pset->sched_lock);
+                               simple_unlock(&processor->lock);
+                               if (processor != current_processor())
                                        machine_signal_idle(processor);
+                               dispatch_counts.pset_idle_last++;
                                return;
                        }
-                       simple_unlock(&pset->idle_lock);
-           }
+                       simple_unlock(&processor->lock);
+               }
+               else
+                       simple_lock(&pset->sched_lock);
+
+               /*
+                *      Next pick any idle processor
+                *      in the processor set.
+                */
+               if (pset->idle_count > 0) {
+                       processor = (processor_t)dequeue_head(&pset->idle_queue);
+                       pset->idle_count--;
+                       processor->next_thread = new_thread;
+                       processor->state = PROCESSOR_DISPATCHING;
+                       simple_unlock(&pset->sched_lock);
+                       if (processor != current_processor())   
+                               machine_signal_idle(processor);
+                       dispatch_counts.pset_idle_any++;
+                       return;
+               }
 
                /*
-                * Place thread on processor set run queue.
+                * Place thread on run queue.
                 */
-           runq = &pset->runq;
-               run_queue_enqueue(runq, new_thread, tail);
+               if (run_queue_enqueue(&pset->runq, new_thread, tail))
+                       try_preempt = TRUE;
+
+               /*
+                *      Update the timesharing quanta.
+                */
+               pset_quanta_update(pset);
        
            /*
-            * Preempt check
+            *  Preempt check.
             */
-               thread = current_thread();
            processor = current_processor();
-           if (        may_preempt                                                     &&
-                               pset == processor->processor_set                ) {
-                   /*
-                    * XXX if we have a non-empty local runq or are
-                    * XXX running a bound thread, ought to check for
-                    * XXX another cpu running lower-pri thread to preempt.
+               thread = processor->cpu_data->active_thread;
+           if (try_preempt) {
+                       /*
+                        * First try the current processor
+                        * if it is a member of the correct
+                        * processor set.
                         */
-                       if (csw_needed(thread, processor))
-                               ast_on(ast_flags);
+                       if (    pset == processor->processor_set        &&
+                                       csw_needed(thread, processor)           ) {
+                               simple_unlock(&pset->sched_lock);
+
+                               ast_on(preempt);
+                               dispatch_counts.pset_self++;
+                               return;
+                       }
+
+                       /*
+                        * If that failed and we have other
+                        * processors available keep trying.
+                        */
+                       if (    pset->processor_count > 1                       ||
+                                       pset != processor->processor_set        ) {
+                               queue_t                 active = &pset->active_queue;
+                               processor_t             myprocessor, lastprocessor;
+                               queue_entry_t   next;
+
+                               /*
+                                * Next try the last processor
+                                * dispatched on.
+                                */
+                               myprocessor = processor;
+                               processor = new_thread->last_processor;
+                               if (    processor != myprocessor                                                &&
+                                               processor != PROCESSOR_NULL                                             &&
+                                               processor->processor_set == pset                                &&
+                                               processor->state == PROCESSOR_RUNNING                   &&
+                                               new_thread->sched_pri > processor->current_pri  ) {
+                                       cause_ast_check(processor);
+                                       simple_unlock(&pset->sched_lock);
+                                       dispatch_counts.pset_last++;
+                                       return;
+                               }
+
+                               /*
+                                * Lastly, pick any other
+                                * available processor.
+                                */
+                               lastprocessor = processor;
+                               processor = (processor_t)queue_first(active);
+                               while (!queue_end(active, (queue_entry_t)processor)) {
+                                       next = queue_next((queue_entry_t)processor);
+
+                                       if (    processor != myprocessor                                                &&
+                                                       processor != lastprocessor                                              &&
+                                                       new_thread->sched_pri > processor->current_pri  ) {
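+                                               /*
+                                                * rotate the victim toward the
+                                                * tail of the active queue
+                                                */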
+                                               if (!queue_end(active, next)) {
+                                                       remqueue(active, (queue_entry_t)processor);
+                                                       enqueue_tail(active, (queue_entry_t)processor);
+                                               }
+                                               cause_ast_check(processor);
+                                               simple_unlock(&pset->sched_lock);
+                                               dispatch_counts.pset_other++;
+                                               return;
+                                       }
+
+                                       processor = (processor_t)next;
+                               }
+                       }
            }
+
+               simple_unlock(&pset->sched_lock);
        }
        else {
            /*
             *  Bound, can only run on bound processor.  Have to lock
             *  processor here because it may not be the current one.
             */
-           if (processor->state == PROCESSOR_IDLE) {
+               if (processor->state == PROCESSOR_IDLE) {
                        simple_lock(&processor->lock);
                        pset = processor->processor_set;
-                       simple_lock(&pset->idle_lock);
+                       simple_lock(&pset->sched_lock);
                        if (processor->state == PROCESSOR_IDLE) {
-                               queue_remove(&pset->idle_queue, processor,
-                               processor_t, processor_queue);
+                               remqueue(&pset->idle_queue, (queue_entry_t)processor);
                                pset->idle_count--;
                                processor->next_thread = new_thread;
                                processor->state = PROCESSOR_DISPATCHING;
-                               simple_unlock(&pset->idle_lock);
+                               simple_unlock(&pset->sched_lock);
                                simple_unlock(&processor->lock);
-                               if(processor->slot_num != cpu_number()) 
+                               if (processor != current_processor())   
                                        machine_signal_idle(processor);
+                               dispatch_counts.bound_idle++;
                                return;
                        }
-                       simple_unlock(&pset->idle_lock);
+                       simple_unlock(&pset->sched_lock);
                        simple_unlock(&processor->lock);
                }
          
-           /*
-            * Cause ast on processor if processor is on line, and the
-            * currently executing thread is not bound to that processor
-            * (bound threads have implicit priority over non-bound threads).
-            * We also avoid sending the AST to the idle thread (if it got
-            * scheduled in the window between the 'if' above and here),
-            * since the idle_thread is bound.
-            */
-           runq = &processor->runq;
-           if (processor == current_processor()) {
-                       run_queue_enqueue(runq, new_thread, tail);
-
-                       thread = current_thread();
-                       if (    thread->bound_processor == PROCESSOR_NULL       ||
-                                               csw_needed(thread, processor))
-                               ast_on(ast_flags);
-           }
+               if (run_queue_enqueue(&processor->runq, new_thread, tail))
+                       try_preempt = TRUE;
+
+               if (processor == current_processor()) {
+                       if (try_preempt) {
+                               thread = processor->cpu_data->active_thread;
+                               if (csw_needed(thread, processor)) {
+                                       ast_on(preempt);
+                                       dispatch_counts.bound_self++;
+                               }
+                       }
+               }
                else {
-                       thread = cpu_data[processor->slot_num].active_thread;
-                       if (    run_queue_enqueue(runq, new_thread, tail) == 0  &&
-                                       processor->state != PROCESSOR_OFF_LINE                  &&
-                                       thread && thread->bound_processor != processor          )
-                               cause_ast_check(processor);
-           }
+                       if (try_preempt) {
+                               if (    processor->state == PROCESSOR_RUNNING                   &&
+                                               new_thread->sched_pri > processor->current_pri  ) {
+                                       cause_ast_check(processor);
+                                       dispatch_counts.bound_other++;
+                                       return;
+                               }
+                       }
+
+                       if (processor->state == PROCESSOR_IDLE) {
+                               machine_signal_idle(processor);
+                               dispatch_counts.bound_idle++;
+                       }
+               }
+       }
+}
+
+/*
+ *     csw_check:
+ *
+ *     Determine whether the current thread should be preempted,
+ *     and how urgently.  Called at splsched by a thread on itself.
+ */
+ast_t
+csw_check(
+       thread_t                thread,
+       processor_t             processor)
+{
+       int                             current_pri = thread->sched_pri;
+       ast_t                   result = AST_NONE;
+       run_queue_t             runq;
+
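+       /*
+        * Within the first quantum, preempt only for a strictly
+        * higher priority thread; afterwards an equal priority
+        * thread also suffices, in effect round-robin.
+        */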
+       if (first_quantum(processor)) {
+               runq = &processor->processor_set->runq;
+               if (runq->highq > current_pri) {
+                       if (runq->urgency > 0)
+                               return (AST_BLOCK | AST_URGENT);
+
+                       result |= AST_BLOCK;
+               }
+
+               runq = &processor->runq;
+               if (runq->highq > current_pri) {
+                       if (runq->urgency > 0)
+                               return (AST_BLOCK | AST_URGENT);
+
+                       result |= AST_BLOCK;
+               }
+       }
+       else {
+               runq = &processor->processor_set->runq;
+               if (runq->highq >= current_pri) {
+                       if (runq->urgency > 0)
+                               return (AST_BLOCK | AST_URGENT);
+
+                       result |= AST_BLOCK;
+               }
+
+               runq = &processor->runq;
+               if (runq->highq >= current_pri) {
+                       if (runq->urgency > 0)
+                               return (AST_BLOCK | AST_URGENT);
+
+                       result |= AST_BLOCK;
+               }
        }
+
+       if (result != AST_NONE)
+               return (result);
+
+       if (thread->state & TH_SUSP)
+               result |= AST_BLOCK;
+
+       return (result);
 }
 
 /*
- *     set_pri:
+ *     set_sched_pri:
  *
- *     Set the priority of the specified thread to the specified
- *     priority.  This may cause the thread to change queues.
+ *     Set the current scheduled priority of the specified thread.
+ *     This may cause the thread to change queues.
  *
  *     The thread *must* be locked by the caller.
  */
 void
-set_pri(
+set_sched_pri(
        thread_t                        thread,
-       int                                     pri,
-       boolean_t                       resched)
+       int                                     priority)
 {
-       register struct run_queue       *rq;
+       register struct run_queue       *rq = rem_runq(thread);
+
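+       /*
+        * Recompute TH_MODE_PREEMPT: a fixed priority thread
+        * preempts urgently when running in the preemption band,
+        * when elevated above an ordinary task priority, or when
+        * explicitly forced.
+        */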
+       if (    !(thread->sched_mode & TH_MODE_TIMESHARE)                               &&
+                       (priority >= BASEPRI_PREEMPT                                            ||
+                        (thread->task_priority < MINPRI_KERNEL                 &&
+                         thread->task_priority >= BASEPRI_BACKGROUND   &&
+                         priority > thread->task_priority)                                     ||
+                        (thread->sched_mode & TH_MODE_FORCEDPREEMPT)           )       )
+               thread->sched_mode |= TH_MODE_PREEMPT;
+       else
+               thread->sched_mode &= ~TH_MODE_PREEMPT;
 
-       rq = rem_runq(thread);
-       assert(thread->runq == RUN_QUEUE_NULL);
-       thread->sched_pri = pri;
-       if (rq != RUN_QUEUE_NULL) {
-           if (resched)
-                       thread_setrun(thread, TRUE, TAIL_Q);
-           else
-                       run_queue_enqueue(rq, thread, TAIL_Q);
+       thread->sched_pri = priority;
+       if (rq != RUN_QUEUE_NULL)
+               thread_setrun(thread, TAIL_Q);
+       else
+       if ((thread->state & (TH_RUN|TH_WAIT)) == TH_RUN) {
+               processor_t             processor = thread->last_processor;
+
+               if (thread == current_thread()) {
+                       ast_t           preempt = csw_check(thread, processor);
+
+                       if (preempt != AST_NONE)
+                               ast_on(preempt);
+                       processor->current_pri = priority;
+               }
+               else
+               if (    processor != PROCESSOR_NULL                                             &&
+                               processor->cpu_data->active_thread == thread    )
+                       cause_ast_check(processor);
        }
 }
 
@@ -1766,6 +2215,9 @@ rem_runq(
 #endif /* DEBUG */
                        remqueue(&rq->queues[0], (queue_entry_t)thread);
                        rq->count--;
+                       if (thread->sched_mode & TH_MODE_PREEMPT)
+                               rq->urgency--;
+                       assert(rq->urgency >= 0);
 
                        if (queue_empty(rq->queues + thread->sched_pri)) {
                                /* update run queue status */
@@ -1792,7 +2244,6 @@ rem_runq(
        return (rq);
 }
 
-
 /*
  *     choose_thread:
  *
@@ -1805,8 +2256,9 @@ rem_runq(
  *             Else check pset runq; if nothing found, return idle thread.
  *
  *     Second line of strategy is implemented by choose_pset_thread.
- *     This is only called on processor startup and when thread_block
- *     thinks there's something in the processor runq.
+ *
+ *     Called with both the local & pset run queues locked; returns
+ *     with both unlocked.
  */
 thread_t
 choose_thread(
@@ -1820,8 +2272,8 @@ choose_thread(
        runq = &myprocessor->runq;
        pset = myprocessor->processor_set;
 
-       simple_lock(&runq->lock);
        if (runq->count > 0 && runq->highq >= pset->runq.highq) {
+               simple_unlock(&pset->runq.lock);
                q = runq->queues + runq->highq;
 #if    MACH_ASSERT
                if (!queue_empty(q)) {
@@ -1831,6 +2283,9 @@ choose_thread(
                        q->next = ((queue_entry_t)thread)->next;
                        thread->runq = RUN_QUEUE_NULL;
                        runq->count--;
+                       if (thread->sched_mode & TH_MODE_PREEMPT)
+                               runq->urgency--;
+                       assert(runq->urgency >= 0);
                        if (queue_empty(q)) {
                                if (runq->highq != IDLEPRI)
                                        clrbit(MAXPRI - runq->highq, runq->bitmap);
@@ -1844,23 +2299,21 @@ choose_thread(
 #endif /*MACH_ASSERT*/
                /*NOTREACHED*/
        }
+       simple_unlock(&myprocessor->runq.lock);
 
-       simple_unlock(&runq->lock);
-       simple_lock(&pset->runq.lock);
        return (choose_pset_thread(myprocessor, pset));
 }
 
-
 /*
  *     choose_pset_thread:  choose a thread from processor_set runq or
  *             set processor idle and choose its idle thread.
  *
- *     Caller must be at splsched and have a lock on the runq.  This
- *     lock is released by this routine.  myprocessor is always the current
- *     processor, and pset must be its processor set.
  *     This routine chooses and removes a thread from the runq if there
  *     is one (and returns it), else it sets the processor idle and
  *     returns its idle thread.
+ *
+ *     Called with both local & pset run queues locked; returns
+ *     with both unlocked.
  */
 thread_t
 choose_pset_thread(
@@ -1882,11 +2335,15 @@ choose_pset_thread(
                        q->next = ((queue_entry_t)thread)->next;
                        thread->runq = RUN_QUEUE_NULL;
                        runq->count--;
+                       if (thread->sched_mode & TH_MODE_PREEMPT)
+                               runq->urgency--;
+                       assert(runq->urgency >= 0);
                        if (queue_empty(q)) {
                                if (runq->highq != IDLEPRI)
                                        clrbit(MAXPRI - runq->highq, runq->bitmap);
                                runq->highq = MAXPRI - ffsbit(runq->bitmap);
                        }
+                       pset_quanta_update(pset);
                        simple_unlock(&runq->lock);
                        return (thread);
 #if    MACH_ASSERT
@@ -1902,23 +2359,19 @@ choose_pset_thread(
         *      was running.  If it was in an assignment or shutdown,
         *      leave it alone.  Return its idle thread.
         */
-       simple_lock(&pset->idle_lock);
+       simple_lock(&pset->sched_lock);
        if (myprocessor->state == PROCESSOR_RUNNING) {
+               remqueue(&pset->active_queue, (queue_entry_t)myprocessor);
            myprocessor->state = PROCESSOR_IDLE;
-           /*
-            *  XXX Until it goes away, put master on end of queue, others
-            *  XXX on front so master gets used last.
-            */
+
            if (myprocessor == master_processor)
-                       queue_enter(&(pset->idle_queue), myprocessor,
-                                                                       processor_t, processor_queue);
+                       enqueue_tail(&pset->idle_queue, (queue_entry_t)myprocessor);
            else
-                       queue_enter_first(&(pset->idle_queue), myprocessor,
-                                                                               processor_t, processor_queue);
+                       enqueue_head(&pset->idle_queue, (queue_entry_t)myprocessor);
 
            pset->idle_count++;
        }
-       simple_unlock(&pset->idle_lock);
+       simple_unlock(&pset->sched_lock);
 
        return (myprocessor->idle_thread);
 }
@@ -1946,15 +2399,11 @@ idle_thread_continue(void)
        int                                                     mycpu;
 
        mycpu = cpu_number();
-       myprocessor = current_processor();
+       myprocessor = cpu_to_processor(mycpu);
        threadp = (volatile thread_t *) &myprocessor->next_thread;
        lcount = (volatile int *) &myprocessor->runq.count;
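+       /*
+        * These are examined while spinning, without taking
+        * locks; hence the volatile qualifiers.
+        */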
 
        for (;;) {
-#ifdef MARK_CPU_IDLE
-               MARK_CPU_IDLE(mycpu);
-#endif /* MARK_CPU_IDLE */
-
                gcount = (volatile int *)&myprocessor->processor_set->runq.count;
 
                (void)splsched();
@@ -1962,11 +2411,9 @@ idle_thread_continue(void)
                                        (*gcount == 0) && (*lcount == 0)                                ) {
 
                        /* check for ASTs while we wait */
-                       if (need_ast[mycpu] &~ (        AST_SCHEDULING | AST_PREEMPT |
-                                                                               AST_BSD | AST_BSD_INIT                  )) {
+                       if (need_ast[mycpu] &~ (        AST_SCHEDULING | AST_BSD        )) {
                                /* don't allow scheduling ASTs */
-                               need_ast[mycpu] &= ~(   AST_SCHEDULING | AST_PREEMPT |
-                                                                               AST_BSD | AST_BSD_INIT                  );
+                               need_ast[mycpu] &= ~(   AST_SCHEDULING | AST_BSD        );
                                ast_taken(AST_ALL, TRUE);       /* back at spllo */
                        }
                        else
@@ -1980,18 +2427,12 @@ idle_thread_continue(void)
                        (void)splsched();
                }
 
-#ifdef MARK_CPU_ACTIVE
-               (void)spllo();
-               MARK_CPU_ACTIVE(mycpu);
-               (void)splsched();
-#endif /* MARK_CPU_ACTIVE */
-
                /*
                 *      This is not a switch statement to avoid the
                 *      bounds checking code in the common case.
                 */
                pset = myprocessor->processor_set;
-               simple_lock(&pset->idle_lock);
+               simple_lock(&pset->sched_lock);
 retry:
                state = myprocessor->state;
                if (state == PROCESSOR_DISPATCHING) {
@@ -2001,33 +2442,24 @@ retry:
                        new_thread = *threadp;
                        *threadp = (volatile thread_t) THREAD_NULL;
                        myprocessor->state = PROCESSOR_RUNNING;
-                       simple_unlock(&pset->idle_lock);
+                       enqueue_tail(&pset->active_queue, (queue_entry_t)myprocessor);
+                       simple_unlock(&pset->sched_lock);
 
-                       thread_lock(new_thread);
-                       simple_lock(&myprocessor->runq.lock);
-                       simple_lock(&pset->runq.lock);
                        if (    myprocessor->runq.highq > new_thread->sched_pri         ||
                                        pset->runq.highq > new_thread->sched_pri                                ) {
-                               simple_unlock(&pset->runq.lock);
-                               simple_unlock(&myprocessor->runq.lock);
-
-                               if (new_thread->bound_processor != PROCESSOR_NULL)
-                                       run_queue_enqueue(&myprocessor->runq, new_thread, HEAD_Q);
-                               else
-                                       run_queue_enqueue(&pset->runq, new_thread, HEAD_Q);
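+                               /*
+                                * A higher priority thread appeared while
+                                * we were being dispatched; requeue the
+                                * handed-off thread and reselect.
+                                */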
+                               thread_lock(new_thread);
+                               thread_setrun(new_thread, HEAD_Q);
                                thread_unlock(new_thread);
 
                                counter(c_idle_thread_block++);
                                thread_block(idle_thread_continue);
+                               /* NOTREACHED */
                        }
                        else {
-                               simple_unlock(&pset->runq.lock);
-                               simple_unlock(&myprocessor->runq.lock);
-                               thread_unlock(new_thread);
-
                                counter(c_idle_thread_handoff++);
                                thread_run(myprocessor->idle_thread,
                                                                        idle_thread_continue, new_thread);
+                               /* NOTREACHED */
                        }
                }
                else
@@ -2044,13 +2476,14 @@ retry:
                         */
                        no_dispatch_count++;
                        pset->idle_count--;
-                       queue_remove(&pset->idle_queue, myprocessor,
-                                                                       processor_t, processor_queue);
+                       remqueue(&pset->idle_queue, (queue_entry_t)myprocessor);
                        myprocessor->state = PROCESSOR_RUNNING;
-                       simple_unlock(&pset->idle_lock);
+                       enqueue_tail(&pset->active_queue, (queue_entry_t)myprocessor);
+                       simple_unlock(&pset->sched_lock);
 
                        counter(c_idle_thread_block++);
                        thread_block(idle_thread_continue);
+                       /* NOTREACHED */
                }
                else
                if (    state == PROCESSOR_ASSIGN               ||
@@ -2062,22 +2495,23 @@ retry:
                         */
                        if ((new_thread = (thread_t)*threadp) != THREAD_NULL) {
                                *threadp = (volatile thread_t) THREAD_NULL;
-                               simple_unlock(&pset->idle_lock);
+                               simple_unlock(&pset->sched_lock);
+
                                thread_lock(new_thread);
-                               thread_setrun(new_thread, FALSE, TAIL_Q);
+                               thread_setrun(new_thread, TAIL_Q);
                                thread_unlock(new_thread);
-                       } else
-                               simple_unlock(&pset->idle_lock);
+                       }
+                       else
+                               simple_unlock(&pset->sched_lock);
 
                        counter(c_idle_thread_block++);
                        thread_block(idle_thread_continue);
+                       /* NOTREACHED */
                }
                else {
-                       simple_unlock(&pset->idle_lock);
-                       printf("Bad processor state %d (Cpu %d)\n",
-                                                                                               cpu_state(mycpu), mycpu);
-                       panic("idle_thread");
+                       simple_unlock(&pset->sched_lock);
 
+                       panic("idle_thread: bad processor state %d\n", cpu_state(mycpu));
                }
 
                (void)spllo();
@@ -2094,16 +2528,13 @@ idle_thread(void)
 
        s = splsched();
        thread_lock(self);
-
        self->priority = IDLEPRI;
-       self->sched_pri = self->priority;
-
+       set_sched_pri(self, self->priority);
        thread_unlock(self);
        splx(s);
 
        counter(c_idle_thread_block++);
-       thread_block((void(*)(void))0);
-       idle_thread_continue();
+       thread_block(idle_thread_continue);
        /*NOTREACHED*/
 }
 
@@ -2196,10 +2627,10 @@ sched_tick_thread(void)
  *     thread ids away in an array  (takes out references for them).
  *     Pass two does the priority updates.  This is necessary because
  *     the run queue lock is required for the candidate scan, but
- *     cannot be held during updates [set_pri will deadlock].
+ *     cannot be held during updates.
  *
  *     Array length should be enough so that restart isn't necessary,
- *     but restart logic is included.  Does not scan processor runqs.
+ *     but restart logic is included.
  *
  */
 thread_t               stuck_threads[MAX_STUCK_THREADS];
@@ -2250,7 +2681,7 @@ do_runq_scan(
                                                 * and ignore this thread if we fail, we might
                                                 * have better luck next time.
                                                 */
-                                               if (simple_lock_try(&thread->lock)) {
+                                               if (thread_lock_try(thread)) {
                                                        thread->ref_count++;
                                                        thread_unlock(thread);
                                                        stuck_threads[stuck_count++] = thread;
@@ -2329,6 +2760,9 @@ do_thread_scan(void)
                        if (!(thread->state & TH_IDLE))
                                thread_deallocate(thread);
            }
+
+               if (restart_needed)
+                       delay(1);                       /* XXX */
                
        } while (restart_needed);
 }
@@ -2348,6 +2782,7 @@ thread_wakeup(
        thread_wakeup_with_result(x, THREAD_AWAKENED);
 }
 
+
 #if    DEBUG
 
 static boolean_t
@@ -2513,9 +2948,6 @@ thread_check(
        if (whichq < MINPRI || whichq > MAXPRI)
                panic("thread_check: bad pri");
 
-       if (whichq != thread->whichq)
-               panic("thread_check: whichq");
-
        queue = &rq->queues[whichq];
        entry = queue_first(queue);
        while (!queue_end(queue, entry)) {