diff --git a/osfmk/kern/sched_prim.c b/osfmk/kern/sched_prim.c
index 0cbde484cd7c7f8e54240934f1229e32af6ad935..4e281607d965916f54c0699912848aeb07baf96e 100644
--- a/osfmk/kern/sched_prim.c
+++ b/osfmk/kern/sched_prim.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -144,12 +144,15 @@ uint32_t  sched_fixed_shift;
 uint32_t       sched_run_count, sched_share_count;
 uint32_t       sched_load_average, sched_mach_factor;
 
-void           (*pm_tick_callout)(void)        = NULL;
-
 /* Forwards */
 void wait_queues_init(void) __attribute__((section("__TEXT, initcode")));
 
 static void load_shift_init(void) __attribute__((section("__TEXT, initcode")));
+static void preempt_pri_init(void) __attribute__((section("__TEXT, initcode")));
+
+static thread_t        run_queue_dequeue(
+                                       run_queue_t             runq,
+                                       integer_t               options);
 
 static thread_t        thread_select_idle(
                                        thread_t                        thread,
@@ -159,10 +162,10 @@ static thread_t   processor_idle(
                                        thread_t                        thread,
                                        processor_t                     processor);
 
-static thread_t        choose_thread(
-                                       processor_t                     processor);
-
 static thread_t        steal_thread(
+                                       processor_set_t         pset);
+
+static thread_t        steal_processor_thread(
                                        processor_t                     processor);
 
 static void            thread_update_scan(void);
@@ -181,8 +184,6 @@ boolean_t   thread_runnable(
 
 #endif /*DEBUG*/
 
-
-
 /*
  *     State machine
  *
@@ -243,6 +244,7 @@ struct wait_queue wait_queues[NUMQUEUES];
        ((((int)(event) < 0)? ~(int)(event): (int)(event)) % NUMQUEUES)
 
 int8_t         sched_load_shifts[NRQS];
+int                    sched_preempt_pri[NRQBM];
 
 void
 sched_init(void)
@@ -262,6 +264,7 @@ sched_init(void)
 
        wait_queues_init();
        load_shift_init();
+       preempt_pri_init();
        simple_lock_init(&rt_lock, 0);
        run_queue_init(&rt_runq);
        sched_tick = 0;
@@ -343,6 +346,18 @@ load_shift_init(void)
        }
 }
 
+static void
+preempt_pri_init(void)
+{
+       int             i, *p = sched_preempt_pri;
+
+       for (i = BASEPRI_FOREGROUND + 1; i < MINPRI_KERNEL; ++i)
+               setbit(i, p);
+
+       for (i = BASEPRI_PREEMPT; i <= MAXPRI; ++i)
+               setbit(i, p);
+}
+
 /*
  *     Thread wait timer expiration.
  */
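The sched_preempt_pri bitmap introduced here replaces the per-thread TH_MODE_PREEMPT flag: urgency becomes a property of the scheduled priority alone. preempt_pri_init() marks two bands, the priorities above BASEPRI_FOREGROUND up to (but not including) MINPRI_KERNEL, and BASEPRI_PREEMPT through MAXPRI; later hunks query the map with testbit(). A self-contained user-space sketch of the same idiom (the set_bit/test_bit helpers and the numeric constants are illustrative stand-ins for the kernel's setbit/testbit macros and sched.h values):

#include <stdio.h>

#define MAXPRI              127
#define NRQS                (MAXPRI + 1)
#define NRQBM               (NRQS / 32)      /* words in the bit map; illustrative sizing */
#define BASEPRI_FOREGROUND  47               /* illustrative values */
#define MINPRI_KERNEL       80
#define BASEPRI_PREEMPT     92

static unsigned preempt_bits[NRQBM];

/* stand-ins for the kernel's setbit()/testbit() macros */
static void set_bit(int n, unsigned *map)        { map[n / 32] |= 1u << (n % 32); }
static int  test_bit(int n, const unsigned *map) { return (map[n / 32] >> (n % 32)) & 1; }

int main(void)
{
    int i;

    /* the same two bands preempt_pri_init() marks */
    for (i = BASEPRI_FOREGROUND + 1; i < MINPRI_KERNEL; ++i)
        set_bit(i, preempt_bits);
    for (i = BASEPRI_PREEMPT; i <= MAXPRI; ++i)
        set_bit(i, preempt_bits);

    /* urgency is now decided by priority alone, not a thread mode bit */
    printf("pri 60 urgent? %d\n", test_bit(60, preempt_bits));  /* 1 */
    printf("pri 31 urgent? %d\n", test_bit(31, preempt_bits));  /* 0 */
    return 0;
}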
@@ -1155,8 +1170,8 @@ thread_select(
        processor_t                     processor)
 {
        processor_set_t         pset = processor->processor_set;
-       thread_t                        new_thread;
-       boolean_t                       other_runnable;
+       thread_t                        new_thread = THREAD_NULL;
+       boolean_t                       other_runnable, inactive_state;
 
        do {
                /*
@@ -1169,6 +1184,8 @@ thread_select(
 
                pset_lock(pset);
 
+               inactive_state = processor->state != PROCESSOR_SHUTDOWN && machine_cpu_is_inactive(processor->cpu_num);
+
                simple_lock(&rt_lock);
 
                /*
@@ -1200,8 +1217,8 @@ thread_select(
                                                ((queue_entry_t)thread)->next->prev = q;
                                                q->next = ((queue_entry_t)thread)->next;
                                                thread->runq = PROCESSOR_NULL;
-                                               assert(thread->sched_mode & TH_MODE_PREEMPT);
                                                runq->count--; runq->urgency--;
+                                               assert(runq->urgency >= 0);
                                                if (queue_empty(q)) {
                                                        if (runq->highq != IDLEPRI)
                                                                clrbit(MAXPRI - runq->highq, runq->bitmap);
@@ -1219,7 +1236,8 @@ thread_select(
                                return (thread);
                        }
 
-                       if (    (!other_runnable                                                        ||
+                       if (!inactive_state &&
+                                       (!other_runnable                                                        ||
                                         (processor->runq.highq < thread->sched_pri             &&
                                          rt_runq.highq < thread->sched_pri))                           ) {
 
@@ -1227,8 +1245,9 @@ thread_select(
 
                                /* I am the highest priority runnable (non-idle) thread */
 
-                               pset_hint_low(pset, processor);
-                               pset_hint_high(pset, processor);
+                               pset_pri_hint(pset, processor, processor->current_pri);
+
+                               pset_count_hint(pset, processor, processor->runq.count);
 
                                processor->deadline = UINT64_MAX;
 
@@ -1238,23 +1257,67 @@ thread_select(
                        }
                }
 
-               if (other_runnable)
-                       return choose_thread(processor);
+               if (other_runnable) {
+                       if (processor->runq.count > 0 && processor->runq.highq >= rt_runq.highq) {
+                               simple_unlock(&rt_lock);
+
+                               thread = run_queue_dequeue(&processor->runq, SCHED_HEADQ);
+
+                               if (!inactive_state) {
+                                       pset_pri_hint(pset, processor, thread->sched_pri);
+
+                                       pset_count_hint(pset, processor, processor->runq.count);
+                               }
+
+                               processor->deadline = UINT64_MAX;
+                               pset_unlock(pset);
+
+                               return (thread);
+                       }
+
+                       thread = run_queue_dequeue(&rt_runq, SCHED_HEADQ);
+                       simple_unlock(&rt_lock);
+
+                       processor->deadline = thread->realtime.deadline;
+                       pset_unlock(pset);
+
+                       return (thread);
+               }
 
                simple_unlock(&rt_lock);
 
+               processor->deadline = UINT64_MAX;
+
+               if (inactive_state) {
+                       if (processor->state == PROCESSOR_RUNNING)
+                               remqueue(&pset->active_queue, (queue_entry_t)processor);
+                       else
+                       if (processor->state == PROCESSOR_IDLE)
+                               remqueue(&pset->idle_queue, (queue_entry_t)processor);
+
+                       processor->state = PROCESSOR_INACTIVE;
+
+                       pset_unlock(pset);
+
+                       return (processor->idle_thread);
+               }
+
                /*
                 *      No runnable threads, attempt to steal
                 *      from other processors.
                 */
-               if (pset->high_hint != PROCESSOR_NULL && pset->high_hint->runq.count > 0) {
-                       new_thread = steal_thread(pset->high_hint);
-                       if (new_thread != THREAD_NULL) {
-                               pset_unlock(pset);
+               new_thread = steal_thread(pset);
+               if (new_thread != THREAD_NULL)
+                       return (new_thread);
 
-                               return (new_thread);
-                       }
-               }
+               /*
+                *      If other threads have appeared, shortcut
+                *      around again.
+                */
+               if (processor->runq.count > 0 || rt_runq.count > 0)
+                       continue;
+
+               pset_lock(pset);
 
                /*
                 *      Nothing is runnable, so set this processor idle if it
@@ -1265,12 +1328,9 @@ thread_select(
                        processor->state = PROCESSOR_IDLE;
 
                        enqueue_head(&pset->idle_queue, (queue_entry_t)processor);
-                       pset->low_hint = processor;
-                       pset->idle_count++;
+                       pset->low_pri = pset->low_count = processor;
                }
 
-               processor->deadline = UINT64_MAX;
-
                pset_unlock(pset);
 
                /*
@@ -1335,6 +1395,9 @@ thread_select_idle(
         */
        spllo(); new_thread = processor_idle(thread, processor);
 
+       /*
+        *      Return at splsched.
+        */
        (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
 
        thread_lock(thread);
@@ -1916,8 +1979,9 @@ run_queue_dequeue(
 
        thread->runq = PROCESSOR_NULL;
        rq->count--;
-       if (thread->sched_mode & TH_MODE_PREEMPT)
-               rq->urgency--;
+       if (testbit(rq->highq, sched_preempt_pri)) {
+               rq->urgency--; assert(rq->urgency >= 0);
+       }
        if (queue_empty(queue)) {
                if (rq->highq != IDLEPRI)
                        clrbit(MAXPRI - rq->highq, rq->bitmap);
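run_queue_dequeue(), which thread_select() now calls directly for both the processor-local queue and the global realtime queue (taking whichever has the higher highq), keeps three counters consistent: count, urgency (decremented only when the dequeued priority tests set in sched_preempt_pri), and the bitmap from which highq is recomputed when a priority level drains. A standalone sketch of that bitmap/highq bookkeeping, with the kernel's ffsbit() approximated by a simple scan (all names here are illustrative):

#include <stdio.h>

#define MAXPRI   127
#define NRQS     (MAXPRI + 1)
#define IDLEPRI  0                         /* illustrative */

static unsigned bitmap[NRQS / 32];         /* bit (MAXPRI - pri) set when level pri is non-empty */
static int      counts[NRQS];              /* threads queued at each priority (stand-in for the queues) */
static int      highq = IDLEPRI;

static void set_bit(int n) { bitmap[n / 32] |=  1u << (n % 32); }
static void clr_bit(int n) { bitmap[n / 32] &= ~(1u << (n % 32)); }

/* first set bit, scanning upward from bit 0; plays the role of ffsbit() */
static int first_set(void)
{
    int w, b;
    for (w = 0; w < (int)(NRQS / 32); w++)
        if (bitmap[w])
            for (b = 0; b < 32; b++)
                if (bitmap[w] & (1u << b))
                    return w * 32 + b;
    return MAXPRI;                          /* empty map maps back to IDLEPRI */
}

static void enqueue(int pri)
{
    if (counts[pri]++ == 0)
        set_bit(MAXPRI - pri);
    if (pri > highq)
        highq = pri;
}

static void dequeue_head(void)              /* always removes from the highest level */
{
    int pri = highq;
    if (--counts[pri] == 0) {
        if (pri != IDLEPRI)
            clr_bit(MAXPRI - pri);
        highq = MAXPRI - first_set();       /* the same recomputation the kernel does */
    }
}

int main(void)
{
    enqueue(31); enqueue(31); enqueue(80);
    printf("highq = %d\n", highq);          /* 80 */
    dequeue_head();
    printf("highq = %d\n", highq);          /* 31 */
    return 0;
}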
@@ -1971,7 +2035,6 @@ realtime_queue_insert(
        }
 
        thread->runq = RT_RUNQ;
-       assert(thread->sched_mode & TH_MODE_PREEMPT);
        rq->count++; rq->urgency++;
 
        simple_unlock(&rt_lock);
@@ -1999,8 +2062,7 @@ realtime_setrun(
         */
        if (processor->state == PROCESSOR_IDLE) {
                remqueue(&pset->idle_queue, (queue_entry_t)processor);
-               pset->idle_count--;
-               enqueue_head(&pset->active_queue, (queue_entry_t)processor);
+               enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
 
                processor->next_thread = thread;
                processor->deadline = thread->realtime.deadline;
@@ -2060,7 +2122,7 @@ processor_enqueue(
                enqueue_head(queue, (queue_entry_t)thread);
 
        thread->runq = processor;
-       if (thread->sched_mode & TH_MODE_PREEMPT)
+       if (testbit(thread->sched_pri, sched_preempt_pri))
                rq->urgency++;
        rq->count++;
 
@@ -2090,8 +2152,7 @@ processor_setrun(
         */
        if (processor->state == PROCESSOR_IDLE) {
                remqueue(&pset->idle_queue, (queue_entry_t)processor);
-               pset->idle_count--;
-               enqueue_head(&pset->active_queue, (queue_entry_t)processor);
+               enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
 
                processor->next_thread = thread;
                processor->deadline = UINT64_MAX;
@@ -2106,10 +2167,10 @@ processor_setrun(
        /*
         *      Set preemption mode.
         */
-       if (thread->sched_mode & TH_MODE_PREEMPT)
+       if (testbit(thread->sched_pri, sched_preempt_pri))
                preempt = (AST_PREEMPT | AST_URGENT);
        else
-       if (thread->sched_mode & TH_MODE_TIMESHARE && thread->priority < BASEPRI_BACKGROUND)
+       if (thread->sched_mode & TH_MODE_TIMESHARE && thread->sched_pri < thread->priority)
                preempt = AST_NONE;
        else
                preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
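In processor_setrun() the preemption decision now keys off the priority bitmap rather than TH_MODE_PREEMPT, and a timeshare thread whose scheduled priority has decayed below its base priority (sched_pri < priority) no longer requests preemption, where the old test compared against BASEPRI_BACKGROUND. A condensed, standalone restatement of the three-way choice (the types, the urgent_pri() band, and the AST values are illustrative):

#include <stdbool.h>
#include <stdio.h>

typedef enum { AST_NONE = 0, AST_PREEMPT = 1, AST_URGENT = 2 } ast_t;

struct thread_like {
    int  sched_pri;      /* current scheduled priority */
    int  priority;       /* base priority              */
    bool timeshare;      /* TH_MODE_TIMESHARE stand-in */
};

/* does this priority demand urgent preemption? (one of the sched_preempt_pri bands) */
static bool urgent_pri(int pri) { return pri > 47 && pri < 80; }   /* illustrative band */

static ast_t setrun_preempt(const struct thread_like *t, bool caller_wants_preempt)
{
    if (urgent_pri(t->sched_pri))
        return AST_PREEMPT | AST_URGENT;    /* urgent band: always preempt */
    if (t->timeshare && t->sched_pri < t->priority)
        return AST_NONE;                    /* decayed timeshare thread: never */
    return caller_wants_preempt ? AST_PREEMPT : AST_NONE;
}

int main(void)
{
    struct thread_like decayed = { .sched_pri = 20, .priority = 31, .timeshare = true };
    printf("decayed timeshare -> %d\n", setrun_preempt(&decayed, true));   /* 0 */
    return 0;
}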
@@ -2117,13 +2178,9 @@ processor_setrun(
        if (!processor_enqueue(processor, thread, options))
                preempt = AST_NONE;
 
-       pset_hint_high(pset, processor);
-
        if (preempt != AST_NONE) {
                if (processor == current_processor()) {
-                       thread_t        self = processor->active_thread;
-
-                       if (csw_needed(self, processor))
+                       if (csw_check(processor) != AST_NONE)
                                ast_on(preempt);
                }
                else
@@ -2163,7 +2220,7 @@ choose_next_pset(
                nset = next_pset(nset);
        } while (nset->processor_count < 1 && nset != pset);
 
-       return ((nset != pset)? nset: pset);
+       return (nset);
 }
 
 /*
@@ -2183,7 +2240,19 @@ choose_processor(
        thread_t                        thread)
 {
        processor_set_t         nset, cset = pset;
-       processor_t                     processor;
+       processor_t                     processor = thread->last_processor;
+
+       /*
+        *      Prefer the last processor, when appropriate.
+        */
+       if (processor != PROCESSOR_NULL) {
+               if (processor->processor_set != pset || processor->state == PROCESSOR_INACTIVE ||
+                               processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE)
+                       processor = PROCESSOR_NULL;
+               else
+               if (processor->state == PROCESSOR_IDLE || ( thread->sched_pri > BASEPRI_DEFAULT && processor->current_pri < thread->sched_pri))
+                       return (processor);
+       }
 
        /*
         *      Iterate through the processor sets to locate
@@ -2208,23 +2277,34 @@ choose_processor(
 
                                processor = (processor_t)queue_next((queue_entry_t)processor);
                        }
+
+                       processor = PROCESSOR_NULL;
                }
                else {
                        /*
-                        *      Choose the low hint processor in the processor set if available.
+                        *      Check any hinted processors in the processor set if available.
                         */
-                       processor = cset->low_hint;
-                       if (processor != PROCESSOR_NULL &&
-                                               processor->state != PROCESSOR_SHUTDOWN && processor->state != PROCESSOR_OFF_LINE)
-                               return (processor);
+                       if (cset->low_pri != PROCESSOR_NULL && cset->low_pri->state != PROCESSOR_INACTIVE &&
+                                       cset->low_pri->state != PROCESSOR_SHUTDOWN && cset->low_pri->state != PROCESSOR_OFF_LINE &&
+                                               (processor == PROCESSOR_NULL ||
+                                                       (thread->sched_pri > BASEPRI_DEFAULT && cset->low_pri->current_pri < thread->sched_pri))) {
+                               processor = cset->low_pri;
+                       }
+                       else
+                       if (cset->low_count != PROCESSOR_NULL && cset->low_count->state != PROCESSOR_INACTIVE &&
+                                       cset->low_count->state != PROCESSOR_SHUTDOWN && cset->low_count->state != PROCESSOR_OFF_LINE &&
+                                               (processor == PROCESSOR_NULL || 
+                                                ( thread->sched_pri <= BASEPRI_DEFAULT && cset->low_count->runq.count < processor->runq.count))) {
+                               processor = cset->low_count;
+                       }
 
                        /*
-                        *      Choose any active processor if the hint was invalid.
+                        *      Otherwise, choose an available processor in the set.
                         */
-                       processor = (processor_t)dequeue_head(&cset->active_queue);
-                       if (processor != PROCESSOR_NULL) {
-                               enqueue_tail(&cset->active_queue, (queue_entry_t)processor);
-                               return (processor);
+                       if (processor == PROCESSOR_NULL) {
+                               processor = (processor_t)dequeue_head(&cset->active_queue);
+                               if (processor != PROCESSOR_NULL)
+                                       enqueue_tail(&cset->active_queue, (queue_entry_t)processor);
                        }
                }
 
@@ -2242,16 +2322,49 @@ choose_processor(
        } while (nset != pset);
 
        /*
-        *      If all else fails choose the current processor,
-        *      this routine must return a running processor.
+        *      Make sure that we pick a running processor,
+        *      and that the correct processor set is locked.
         */
-       processor = current_processor();
-       if (cset != processor->processor_set) {
-               pset_unlock(cset);
+       do {
+               /*
+                *      If we haven't been able to choose a processor,
+                *      pick the boot processor and return it.
+                */
+               if (processor == PROCESSOR_NULL) {
+                       processor = master_processor;
 
-               cset = processor->processor_set;
-               pset_lock(cset);
-       }
+                       /*
+                        *      Check that the correct processor set is
+                        *      returned locked.
+                        */
+                       if (cset != processor->processor_set) {
+                               pset_unlock(cset);
+
+                               cset = processor->processor_set;
+                               pset_lock(cset);
+                       }
+
+                       return (processor);
+               }
+
+               /*
+                *      Check that the processor set for the chosen
+                *      processor is locked.
+                */
+               if (cset != processor->processor_set) {
+                       pset_unlock(cset);
+
+                       cset = processor->processor_set;
+                       pset_lock(cset);
+               }
+
+               /*
+                *      We must verify that the chosen processor is still available.
+                */
+               if (processor->state == PROCESSOR_INACTIVE ||
+                                       processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE)
+                       processor = PROCESSOR_NULL;
+       } while (processor == PROCESSOR_NULL);
 
        return (processor);
 }
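choose_processor() now prefers the thread's last processor when it is still usable, then the per-pset low_pri/low_count hints, then any processor on the set's active queue, and finally falls back to master_processor, re-validating the choice (and re-locking the matching pset) before returning. The availability test repeated throughout can be factored as below; this is a sketch with its own state enum, not the kernel's definitions, and it omits the processor-set membership check:

#include <stdbool.h>

typedef enum {
    PROC_IDLE, PROC_RUNNING, PROC_INACTIVE, PROC_SHUTDOWN, PROC_OFF_LINE
} proc_state_t;

struct proc_like {
    proc_state_t state;
    int          current_pri;    /* priority of the thread it is running now */
};

/* mirrors the state checks repeated in choose_processor() */
static bool proc_selectable(const struct proc_like *p)
{
    return p->state != PROC_INACTIVE &&
           p->state != PROC_SHUTDOWN &&
           p->state != PROC_OFF_LINE;
}

/* "prefer the last processor" test from the top of the routine: take it
 * immediately if it is idle, or if the incoming thread is above default
 * priority and would preempt what is running there. */
static bool prefer_last(const struct proc_like *last, int sched_pri, int basepri_default)
{
    return proc_selectable(last) &&
           (last->state == PROC_IDLE ||
            (sched_pri > basepri_default && last->current_pri < sched_pri));
}

int main(void)
{
    struct proc_like last = { PROC_IDLE, 0 };
    return prefer_last(&last, 60, 31) ? 0 : 1;   /* idle last processor: take it */
}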
@@ -2310,9 +2423,6 @@ thread_setrun(
                        /*
                         *      Choose a different processor in certain cases.
                         */
-                       if (processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE)
-                               processor = choose_processor(pset, thread);
-                       else
                        if (thread->sched_pri >= BASEPRI_RTQUEUES) {
                                /*
                                 *      If the processor is executing an RT thread with
@@ -2323,38 +2433,26 @@ thread_setrun(
                                        processor = choose_processor(pset, thread);
                        }
                        else
-                       if (processor->state != PROCESSOR_IDLE && pset->idle_count > 0) {
                                processor = choose_processor(pset, thread);
-                       }
-                       else {
-                               processor_set_t         nset = choose_next_pset(pset);
-
-                               /*
-                                *      Bump into a lesser loaded processor set if appropriate.
-                                */
-                               if (pset != nset && (nset->low_hint == PROCESSOR_NULL ||
-                                                                                               (pset->idle_count == 0 && nset->idle_count > 0) ||
-                                                                                                               processor->runq.count > nset->low_hint->runq.count)) {
-                                       pset_unlock(pset);
-
-                                       pset = nset;
-                                       pset_lock(pset);
-
-                                       processor = choose_processor(pset, thread);
-                               }
-                       }
                }
                else {
                        /*
                         *      No Affinity case:
                         *
-                        *      Choose a processor from the current processor set.
+                        *      Utilize a per-task hint to spread threads
+                        *      among the available processor sets.
                         */
-                       processor = current_processor();
-                       pset = processor->processor_set;
+                       task_t          task = thread->task;
+
+                       pset = task->pset_hint;
+                       if (pset == PROCESSOR_SET_NULL)
+                               pset = current_processor()->processor_set;
+
+                       pset = choose_next_pset(pset);
                        pset_lock(pset);
 
                        processor = choose_processor(pset, thread);
+                       task->pset_hint = processor->processor_set;
                }
        }
        else {
@@ -2380,8 +2478,8 @@ thread_setrun(
 /*
  *     processor_queue_shutdown:
  *
- *     Shutdown a processor run queue by moving
- *     non-bound threads to the current processor.
+ *     Shutdown a processor run queue by
+ *     re-dispatching non-bound threads.
  *
  *     Associated pset must be locked, and is
  *     returned unlocked.
@@ -2409,8 +2507,9 @@ processor_queue_shutdown(
 
                                thread->runq = PROCESSOR_NULL;
                                rq->count--;
-                               if (thread->sched_mode & TH_MODE_PREEMPT)
-                                       rq->urgency--;
+                               if (testbit(pri, sched_preempt_pri)) {
+                                       rq->urgency--; assert(rq->urgency >= 0);
+                               }
                                if (queue_empty(queue)) {
                                        if (pri != IDLEPRI)
                                                clrbit(MAXPRI - pri, rq->bitmap);
@@ -2429,35 +2528,25 @@ processor_queue_shutdown(
 
        pset_unlock(pset);
 
-       processor = current_processor();
-       pset = processor->processor_set;
-
        while ((thread = (thread_t)dequeue_head(&tqueue)) != THREAD_NULL) {
                thread_lock(thread);
-               thread->last_processor = PROCESSOR_NULL;
-
-               pset_lock(pset);
 
-               processor_enqueue(processor, thread, SCHED_TAILQ);
-
-               pset_unlock(pset);
+               thread_setrun(thread, SCHED_TAILQ);
 
                thread_unlock(thread);
        }
 }
 
 /*
- *     Check for a possible preemption point in
- *     the (current) thread.
+ *     Check for a preemption point in
+ *     the current context.
  *
  *     Called at splsched.
  */
 ast_t
 csw_check(
-       thread_t                thread,
        processor_t             processor)
 {
-       int                             current_pri = thread->sched_pri;
        ast_t                   result = AST_NONE;
        run_queue_t             runq;
 
@@ -2466,7 +2555,7 @@ csw_check(
                if (runq->highq >= BASEPRI_RTQUEUES)
                        return (AST_PREEMPT | AST_URGENT);
 
-               if (runq->highq > current_pri) {
+               if (runq->highq > processor->current_pri) {
                        if (runq->urgency > 0)
                                return (AST_PREEMPT | AST_URGENT);
 
@@ -2474,7 +2563,7 @@ csw_check(
                }
 
                runq = &processor->runq;
-               if (runq->highq > current_pri) {
+               if (runq->highq > processor->current_pri) {
                        if (runq->urgency > 0)
                                return (AST_PREEMPT | AST_URGENT);
 
@@ -2483,7 +2572,7 @@ csw_check(
        }
        else {
                runq = &rt_runq;
-               if (runq->highq >= current_pri) {
+               if (runq->highq >= processor->current_pri) {
                        if (runq->urgency > 0)
                                return (AST_PREEMPT | AST_URGENT);
 
@@ -2491,7 +2580,7 @@ csw_check(
                }
 
                runq = &processor->runq;
-               if (runq->highq >= current_pri) {
+               if (runq->highq >= processor->current_pri) {
                        if (runq->urgency > 0)
                                return (AST_PREEMPT | AST_URGENT);
 
@@ -2502,10 +2591,13 @@ csw_check(
        if (result != AST_NONE)
                return (result);
 
-       if (thread->state & TH_SUSP)
-               result |= AST_PREEMPT;
+       if (machine_cpu_is_inactive(processor->cpu_num))
+               return (AST_PREEMPT);
 
-       return (result);
+       if (processor->active_thread->state & TH_SUSP)
+               return (AST_PREEMPT);
+
+       return (AST_NONE);
 }
 
 /*
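csw_check() now takes only the processor and reads the running priority from processor->current_pri; it checks the realtime queue, then the local run queue, then whether the cpu has been marked inactive, and finally whether the active thread is suspended. A condensed sketch of the queue comparisons (has_quantum stands in for the kernel's first_timeslice() test; the BASEPRI_RTQUEUES special case and the inactive/TH_SUSP checks are omitted):

#include <stdbool.h>

typedef enum { AST_NONE = 0, AST_PREEMPT = 1, AST_URGENT = 2 } ast_t;

struct rq_like   { int highq; int urgency; };
struct proc_like {
    struct rq_like runq;          /* processor-local run queue          */
    int  current_pri;             /* priority of the running thread     */
    bool has_quantum;             /* still within its first timeslice?  */
};

static ast_t check_queue(const struct rq_like *rq, int current_pri, bool strict)
{
    /* within the quantum a waiter must be strictly higher to preempt;
     * once the quantum is used up, an equal priority is enough */
    if (strict ? rq->highq > current_pri : rq->highq >= current_pri)
        return rq->urgency > 0 ? (AST_PREEMPT | AST_URGENT) : AST_PREEMPT;
    return AST_NONE;
}

static ast_t csw_check_sketch(const struct proc_like *p, const struct rq_like *rt_runq)
{
    ast_t result;

    if ((result = check_queue(rt_runq, p->current_pri, p->has_quantum)) != AST_NONE)
        return result;
    return check_queue(&p->runq, p->current_pri, p->has_quantum);
}

int main(void)
{
    struct rq_like   rt   = { .highq = 0, .urgency = 0 };
    struct proc_like self = { .runq = { .highq = 60, .urgency = 1 },
                              .current_pri = 50, .has_quantum = true };
    return csw_check_sketch(&self, &rt) == (AST_PREEMPT | AST_URGENT) ? 0 : 1;
}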
@@ -2524,15 +2616,6 @@ set_sched_pri(
 {
        boolean_t               removed = run_queue_remove(thread);
 
-       if (    !(thread->sched_mode & TH_MODE_TIMESHARE)                               &&
-                       (priority >= BASEPRI_PREEMPT                                            ||
-                        (thread->task_priority < MINPRI_KERNEL                 &&
-                         thread->task_priority >= BASEPRI_BACKGROUND   &&
-                         priority > thread->task_priority)                                     )       )
-               thread->sched_mode |= TH_MODE_PREEMPT;
-       else
-               thread->sched_mode &= ~TH_MODE_PREEMPT;
-
        thread->sched_pri = priority;
        if (removed)
                thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
@@ -2541,11 +2624,11 @@ set_sched_pri(
                processor_t             processor = thread->last_processor;
 
                if (thread == current_thread()) {
-                       ast_t           preempt = csw_check(thread, processor);
+                       ast_t                   preempt;
 
-                       if (preempt != AST_NONE)
-                               ast_on(preempt);
                        processor->current_pri = priority;
+                       if ((preempt = csw_check(processor)) != AST_NONE)
+                               ast_on(preempt);
                }
                else
                if (    processor != PROCESSOR_NULL                                             &&
@@ -2630,9 +2713,9 @@ run_queue_remove(
                         */
                        remqueue(&rq->queues[0], (queue_entry_t)thread);
                        rq->count--;
-                       if (thread->sched_mode & TH_MODE_PREEMPT)
-                               rq->urgency--;
-                       assert(rq->urgency >= 0);
+                       if (testbit(thread->sched_pri, sched_preempt_pri)) {
+                               rq->urgency--; assert(rq->urgency >= 0);
+                       }
 
                        if (queue_empty(rq->queues + thread->sched_pri)) {
                                /* update run queue status */
@@ -2659,79 +2742,22 @@ run_queue_remove(
 }
 
 /*
- *     choose_thread:
+ *     steal_processor_thread:
  *
- *     Choose a thread to execute from the run queues
- *     and return it.  May steal a thread from another
- *     processor.
- *
- *     Called with pset scheduling lock and rt lock held,
- *     released on return.
- */
-static thread_t
-choose_thread(
-       processor_t                     processor)
-{
-       processor_set_t         pset = processor->processor_set;
-       thread_t                        thread;
-
-       if (processor->runq.count > 0 && processor->runq.highq >= rt_runq.highq) {
-               simple_unlock(&rt_lock);
-
-               pset_hint_low(pset, processor);
-
-               if (pset->high_hint != PROCESSOR_NULL) {
-                       if (processor != pset->high_hint) {
-                               if (processor->runq.count >= pset->high_hint->runq.count)
-                                       pset->high_hint = processor;
-                               else
-                               if (pset->high_hint->runq.highq > processor->runq.highq) {
-                                       thread = steal_thread(pset->high_hint);
-                                       if (thread != THREAD_NULL) {
-                                               processor->deadline = UINT64_MAX;
-                                               pset_unlock(pset);
-
-                                               return (thread);
-                                       }
-                               }
-                       }
-               }
-               else
-                       pset->high_hint = processor;
-
-               thread = run_queue_dequeue(&processor->runq, SCHED_HEADQ);
-
-               processor->deadline = UINT64_MAX;
-               pset_unlock(pset);
-
-               return (thread);
-       }
-
-       thread = run_queue_dequeue(&rt_runq, SCHED_HEADQ);
-       simple_unlock(&rt_lock);
-
-       processor->deadline = thread->realtime.deadline;
-       pset_unlock(pset);
-
-       return (thread);
-}
-
-/*
- *     steal_thread:
- *
- *     Steal a thread from a processor and return it.
+ *     Locate a thread to steal from the processor and
+ *     return it.
  *
  *     Associated pset must be locked.  Returns THREAD_NULL
  *     on failure.
  */
 static thread_t
-steal_thread(
+steal_processor_thread(
        processor_t             processor)
 {
        run_queue_t             rq = &processor->runq;
        queue_t                 queue = rq->queues + rq->highq;
        int                             pri = rq->highq, count = rq->count;
-       thread_t                thread = THREAD_NULL;
+       thread_t                thread;
 
        while (count > 0) {
                thread = (thread_t)queue_first(queue);
@@ -2741,8 +2767,9 @@ steal_thread(
 
                                thread->runq = PROCESSOR_NULL;
                                rq->count--;
-                               if (thread->sched_mode & TH_MODE_PREEMPT)
-                                       rq->urgency--;
+                               if (testbit(pri, sched_preempt_pri)) {
+                                       rq->urgency--; assert(rq->urgency >= 0);
+                               }
                                if (queue_empty(queue)) {
                                        if (pri != IDLEPRI)
                                                clrbit(MAXPRI - pri, rq->bitmap);
@@ -2762,6 +2789,57 @@ steal_thread(
        return (THREAD_NULL);
 }
 
+/*
+ *     Locate and steal a thread, beginning
+ *     at the pset.
+ *
+ *     The pset must be locked, and is returned
+ *     unlocked.
+ *
+ *     Returns the stolen thread, or THREAD_NULL on
+ *     failure.
+ */
+static thread_t
+steal_thread(
+       processor_set_t         pset)
+{
+       processor_set_t         nset, cset = pset;
+       processor_t                     processor;
+       thread_t                        thread;
+
+       do {
+               processor = (processor_t)queue_first(&cset->active_queue);
+               while (!queue_end(&cset->active_queue, (queue_entry_t)processor)) {
+                       if (processor->runq.count > 0) {
+                               thread = steal_processor_thread(processor);
+                               if (thread != THREAD_NULL) {
+                                       remqueue(&cset->active_queue, (queue_entry_t)processor);
+                                       enqueue_tail(&cset->active_queue, (queue_entry_t)processor);
+
+                                       pset_unlock(cset);
+
+                                       return (thread);
+                               }
+                       }
+
+                       processor = (processor_t)queue_next((queue_entry_t)processor);
+               }
+
+               nset = next_pset(cset);
+
+               if (nset != pset) {
+                       pset_unlock(cset);
+
+                       cset = nset;
+                       pset_lock(cset);
+               }
+       } while (nset != pset);
+
+       pset_unlock(cset);
+
+       return (THREAD_NULL);
+}
+
 /*
  *     This is the processor idle loop, which just looks for other threads
  *     to execute.  Processor idle threads invoke this without supplying a
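steal_thread() now starts from the caller's pset and walks the ring of processor sets, scanning each set's active queue for a processor with queued work and taking an unbound thread via steal_processor_thread(). The ring walk (visit every set once, beginning with our own, and give up after wrapping around) is the same pattern choose_processor() uses; a schematic, self-contained version:

#include <stdio.h>
#include <stddef.h>

struct pset_like {
    struct pset_like *next;        /* circular list of processor sets         */
    int               stealable;   /* runnable, unbound threads in this set   */
};

/* walk the ring starting at 'start' until work is found or we wrap around,
 * mirroring the do/while (nset != pset) loop in steal_thread() */
static struct pset_like *find_victim(struct pset_like *start)
{
    struct pset_like *cset = start;

    do {
        if (cset->stealable > 0)
            return cset;            /* steal_processor_thread() would run here */
        cset = cset->next;
    } while (cset != start);

    return NULL;                    /* nothing to steal anywhere */
}

int main(void)
{
    struct pset_like a, b, c;
    a.next = &b; b.next = &c; c.next = &a;
    a.stealable = 0; b.stealable = 0; c.stealable = 2;

    printf("victim is %s\n", find_victim(&a) == &c ? "c" : "other");
    return 0;
}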
@@ -2791,25 +2869,16 @@ processor_idle(
                                                                        mach_absolute_time(), &PROCESSOR_DATA(processor, idle_state));
        PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state);
 
-       while (processor->next_thread == THREAD_NULL && processor->runq.count == 0 &&
+       while (processor->next_thread == THREAD_NULL && processor->runq.count == 0 && rt_runq.count == 0 &&
                                (thread == THREAD_NULL || ((thread->state & (TH_WAIT|TH_SUSP)) == TH_WAIT && !thread->wake_active))) {
-               volatile processor_t    hint;
-
                machine_idle();
 
                (void)splsched();
 
-               if (pset->low_hint == PROCESSOR_NULL)
-                       break;
-
-               hint = pset->high_hint;
-               if (hint != PROCESSOR_NULL && hint->runq.count > 0)
+               if (processor->state == PROCESSOR_INACTIVE && !machine_cpu_is_inactive(processor->cpu_num))
                        break;
        }
 
-       KERNEL_DEBUG_CONSTANT(
-               MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (int)thread, 0, 0, 0, 0);
-
        timer_switch(&PROCESSOR_DATA(processor, idle_state),
                                                                        mach_absolute_time(), &PROCESSOR_DATA(processor, system_state));
        PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state);
@@ -2829,8 +2898,8 @@ processor_idle(
                processor->next_thread = THREAD_NULL;
                processor->state = PROCESSOR_RUNNING;
 
-               if (    processor->runq.highq > new_thread->sched_pri   ||
-                               rt_runq.highq >= new_thread->sched_pri                  ) {
+               if (    processor->runq.highq > new_thread->sched_pri                                   ||
+                               (rt_runq.highq > 0 && rt_runq.highq >= new_thread->sched_pri)   ) {
                        processor->deadline = UINT64_MAX;
 
                        pset_unlock(pset);
@@ -2839,20 +2908,30 @@ processor_idle(
                        thread_setrun(new_thread, SCHED_HEADQ);
                        thread_unlock(new_thread);
 
+                       KERNEL_DEBUG_CONSTANT(
+                               MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (int)thread, (int)state, 0, 0, 0);
+       
                        return (THREAD_NULL);
                }
 
                pset_unlock(pset);
 
+               KERNEL_DEBUG_CONSTANT(
+                       MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (int)thread, (int)state, (int)new_thread, 0, 0);
+
                return (new_thread);
        }
        else
        if (state == PROCESSOR_IDLE) {
                remqueue(&pset->idle_queue, (queue_entry_t)processor);
-               pset->idle_count--;
 
                processor->state = PROCESSOR_RUNNING;
-               enqueue_head(&pset->active_queue, (queue_entry_t)processor);
+               enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
+       }
+       else
+       if (state == PROCESSOR_INACTIVE) {
+               processor->state = PROCESSOR_RUNNING;
+               enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
        }
        else
        if (state == PROCESSOR_SHUTDOWN) {
@@ -2870,15 +2949,26 @@ processor_idle(
                        thread_setrun(new_thread, SCHED_HEADQ);
                        thread_unlock(new_thread);
 
+                       KERNEL_DEBUG_CONSTANT(
+                               MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (int)thread, (int)state, 0, 0, 0);
+
                        return (THREAD_NULL);
                }
        }
 
        pset_unlock(pset);
 
+       KERNEL_DEBUG_CONSTANT(
+               MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (int)thread, (int)state, 0, 0, 0);
+
        return (THREAD_NULL);
 }
 
+/*
+ *     Each processor has a dedicated thread which
+ *     executes the idle loop when there is no suitable
+ *     previous context.
+ */
 void
 idle_thread(void)
 {
@@ -2982,9 +3072,6 @@ sched_tick_continue(void)
         */
        thread_update_scan();
 
-       if (pm_tick_callout != NULL)
-           (*pm_tick_callout)();
-
        clock_deadline_for_periodic_event(sched_tick_interval, abstime,
                                                                                                                &sched_tick_deadline);