osfmk/kern/priority.c

   1 /*
   2  * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   priority.c
  60  *      Author: Avadis Tevanian, Jr.
  61  *      Date:   1986
  62  *
  63  *      Priority related scheduler bits.
  64  */
  65
  66 #include <mach/boolean.h>
  67 #include <mach/kern_return.h>
  68 #include <mach/machine.h>
  69 #include <kern/host.h>
  70 #include <kern/mach_param.h>
  71 #include <kern/sched.h>
  72 #include <sys/kdebug.h>
  73 #include <kern/spl.h>
  74 #include <kern/thread.h>
  75 #include <kern/processor.h>
  76 #include <kern/ledger.h>
  77 #include <machine/machparam.h>
  78 #include <kern/machine.h>
  79
  80 #ifdef CONFIG_MACH_APPROXIMATE_TIME
  81 #include <machine/commpage.h>  /* for commpage_update_mach_approximate_time */
  82 #endif
  83
  84 #if MONOTONIC
  85 #include <kern/monotonic.h>
  86 #endif /* MONOTONIC */
  87
  88 static void sched_update_thread_bucket(thread_t thread);
  89
  90 /*
  91  *      thread_quantum_expire:
  92  *
  93  *      Recalculate the quantum and priority for a thread.
  94  *
  95  *      Called at splsched.
  96  */
  97
  98 void
  99 thread_quantum_expire(
 100         timer_call_param_t      p0,
 101         timer_call_param_t      p1)
 102 {
 103         processor_t                     processor = p0;
 104         thread_t                        thread = p1;
 105         ast_t                           preempt;
 106         uint64_t                        ctime;
 107         int                                     urgency;
 108         uint64_t                        ignore1, ignore2;
 109
 110         assert(processor == current_processor());
 111         assert(thread == current_thread());
 112
 113         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_QUANTUM_EXPIRED) | DBG_FUNC_START, 0, 0, 0, 0, 0);
 114
 115         SCHED_STATS_QUANTUM_TIMER_EXPIRATION(processor);
 116
 117         /*
 118          * We bill CPU time to both the individual thread and its task.
 119          *
 120          * Because this balance adjustment could potentially attempt to wake this very
 121          * thread, we must credit the ledger before taking the thread lock. The ledger
 122          * pointers are only manipulated by the thread itself at the ast boundary.
 123          *
 124          * TODO: This fails to account for the time between when the timer was armed and when it fired.
 125          * It should be based on the system_timer and running a thread_timer_event operation here.
 126          */
 127         ledger_credit(thread->t_ledger, task_ledgers.cpu_time, thread->quantum_remaining);
 128         ledger_credit(thread->t_threadledger, thread_ledgers.cpu_time, thread->quantum_remaining);
 129         if (thread->t_bankledger) {
 130                 ledger_credit(thread->t_bankledger, bank_ledgers.cpu_time,
 131                                 (thread->quantum_remaining - thread->t_deduct_bank_ledger_time));
 132         }
 133         thread->t_deduct_bank_ledger_time = 0;
 134
 135         ctime = mach_absolute_time();
 136
 137 #ifdef CONFIG_MACH_APPROXIMATE_TIME
 138         commpage_update_mach_approximate_time(ctime);
 139 #endif
 140
 141 #if MONOTONIC
 142         mt_sched_update(thread);
 143 #endif /* MONOTONIC */
 144
 145         thread_lock(thread);
 146
 147         /*
 148          * We've run up until our quantum expiration, and will (potentially)
 149          * continue without re-entering the scheduler, so update this now.
 150          */
 151         processor->last_dispatch = ctime;
 152         thread->last_run_time = ctime;
 153
 154         /*
 155          *      Check for fail-safe trip.
 156          */
 157         if ((thread->sched_mode == TH_MODE_REALTIME || thread->sched_mode == TH_MODE_FIXED) &&
 158             !(thread->sched_flags & TH_SFLAG_PROMOTED_MASK) &&
 159             !(thread->options & TH_OPT_SYSTEM_CRITICAL)) {
 160                 uint64_t new_computation;
 161
 162                 new_computation = ctime - thread->computation_epoch;
 163                 new_computation += thread->computation_metered;
 164                 if (new_computation > max_unsafe_computation) {
 165                         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_FAILSAFE)|DBG_FUNC_NONE,
 166                                         (uintptr_t)thread->sched_pri, (uintptr_t)thread->sched_mode, 0, 0, 0);
 167
 168                         thread->safe_release = ctime + sched_safe_duration;
 169
 170                         sched_thread_mode_demote(thread, TH_SFLAG_FAILSAFE);
 171                 }
 172         }
 173
 174         /*
 175          *      Recompute scheduled priority if appropriate.
 176          */
 177         if (SCHED(can_update_priority)(thread))
 178                 SCHED(update_priority)(thread);
 179         else
 180                 SCHED(lightweight_update_priority)(thread);
 181
 182         if (thread->sched_mode != TH_MODE_REALTIME)
 183                 SCHED(quantum_expire)(thread);
 184
 185         processor_state_update_from_thread(processor, thread);
 186
 187         /*
 188          *      This quantum is up, give this thread another.
 189          */
 190         processor->first_timeslice = FALSE;
 191
 192         thread_quantum_init(thread);
 193
 194         /* Reload precise timing global policy to thread-local policy */
 195         thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
 196
 197         /*
 198          * Since non-precise user/kernel time doesn't update the state/thread timer
 199          * during privilege transitions, synthesize an event now.
 200          */
 201         if (!thread->precise_user_kernel_time) {
 202                 timer_switch(PROCESSOR_DATA(processor, current_state),
 203                                          ctime,
 204                                          PROCESSOR_DATA(processor, current_state));
 205                 timer_switch(PROCESSOR_DATA(processor, thread_timer),
 206                                          ctime,
 207                                          PROCESSOR_DATA(processor, thread_timer));
 208         }
 209
 210
 211         processor->quantum_end = ctime + thread->quantum_remaining;
 212
 213         /*
 214          * Context switch check
 215          *
 216          * non-urgent flags don't affect kernel threads, so upgrade to urgent
 217          * to ensure that rebalancing and non-recommendation kick in quickly.
 218          */
 219
 220         ast_t check_reason = AST_QUANTUM;
 221         if (thread->task == kernel_task)
 222                 check_reason |= AST_URGENT;
 223
 224         if ((preempt = csw_check(processor, check_reason)) != AST_NONE)
 225                 ast_on(preempt);
 226
 227         /*
 228          * AST_KEVENT does not send an IPI when setting the AST,
 229          * to avoid waiting for the next context switch to propagate the AST,
 230          * the AST is propagated here at quantum expiration.
 231          */
 232         ast_propagate(thread);
 233
 234         thread_unlock(thread);
 235
 236         timer_call_quantum_timer_enter(&processor->quantum_timer, thread,
 237                 processor->quantum_end, ctime);
 238
 239         /* Tell platform layer that we are still running this thread */
 240         urgency = thread_get_urgency(thread, &ignore1, &ignore2);
 241         machine_thread_going_on_core(thread, urgency, 0, 0, ctime);
 242         machine_switch_perfcontrol_state_update(QUANTUM_EXPIRY, ctime,
 243                 0, thread);
 244
 245 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
 246         sched_timeshare_consider_maintenance(ctime);
 247 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
 248
 249 #if __arm__ || __arm64__
 250         if (thread->sched_mode == TH_MODE_REALTIME)
 251                 sched_consider_recommended_cores(ctime, thread);
 252 #endif /* __arm__ || __arm64__ */
 253
 254         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_QUANTUM_EXPIRED) | DBG_FUNC_END, preempt, 0, 0, 0, 0);
 255 }
 256
 257 /*
 258  *      sched_set_thread_base_priority:
 259  *
 260  *      Set the base priority of the thread
 261  *      and reset its scheduled priority.
 262  *
 263  *      This is the only path to change base_pri.
 264  *
 265  *      Called with the thread locked.
 266  */
 267 void
 268 sched_set_thread_base_priority(thread_t thread, int priority)
 269 {
 270         assert(priority >= MINPRI);
 271         uint64_t ctime = 0;
 272
 273         if (thread->sched_mode == TH_MODE_REALTIME)
 274                 assert(priority <= BASEPRI_RTQUEUES);
 275         else
 276                 assert(priority < BASEPRI_RTQUEUES);
 277
 278         int old_base_pri = thread->base_pri;
 279         thread->base_pri = priority;
 280
 281         if ((thread->state & TH_RUN) == TH_RUN) {
 282                 assert(thread->last_made_runnable_time != THREAD_NOT_RUNNABLE);
 283                 ctime = mach_approximate_time();
 284                 thread->last_basepri_change_time = ctime;
 285         } else {
 286                 assert(thread->last_basepri_change_time == THREAD_NOT_RUNNABLE);
 287                 assert(thread->last_made_runnable_time == THREAD_NOT_RUNNABLE);
 288         }
 289
 290         /*
 291          * Currently the perfcontrol_attr depends on the base pri of the
 292          * thread. Therefore, we use this function as the hook for the
 293          * perfcontrol callout.
 294          */
 295         if (thread == current_thread() && old_base_pri != priority) {
 296                 if (!ctime) {
 297                     ctime = mach_approximate_time();
 298                 }
 299                 machine_switch_perfcontrol_state_update(PERFCONTROL_ATTR_UPDATE,
 300                         ctime, PERFCONTROL_CALLOUT_WAKE_UNSAFE, thread);
 301         }
 302         sched_update_thread_bucket(thread);
 303
 304         thread_recompute_sched_pri(thread, FALSE);
 305 }
 306
 307 /*
 308  *      thread_recompute_sched_pri:
 309  *
 310  *      Reset the scheduled priority of the thread
 311  *      according to its base priority if the
 312  *      thread has not been promoted or depressed.
 313  *
 314  *      This is the standard way to push base_pri changes into sched_pri,
 315  *      or to recalculate the appropriate sched_pri after clearing
 316  *      a promotion or depression.
 317  *
 318  *      Called at splsched with the thread locked.
 319  */
 320 void
 321 thread_recompute_sched_pri(
 322                            thread_t thread,
 323                            boolean_t override_depress)
 324 {
 325         int priority;
 326
 327         if (thread->sched_mode == TH_MODE_TIMESHARE)
 328                 priority = SCHED(compute_timeshare_priority)(thread);
 329         else
 330                 priority = thread->base_pri;
 331
 332         if ((!(thread->sched_flags & TH_SFLAG_PROMOTED_MASK)  || (priority > thread->sched_pri)) &&
 333             (!(thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) || override_depress)) {
 334                 set_sched_pri(thread, priority);
 335         }
 336 }
 337
 338 void
 339 sched_default_quantum_expire(thread_t thread __unused)
 340 {
 341       /*
 342        * No special behavior when a timeshare, fixed, or realtime thread
 343        * uses up its entire quantum
 344        */
 345 }
 346
 347 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
 348
 349 /*
 350  *      lightweight_update_priority:
 351  *
 352  *      Update the scheduled priority for
 353  *      a timesharing thread.
 354  *
 355  *      Only for use on the current thread.
 356  *
 357  *      Called with the thread locked.
 358  */
 359 void
 360 lightweight_update_priority(thread_t thread)
 361 {
 362         assert(thread->runq == PROCESSOR_NULL);
 363         assert(thread == current_thread());
 364
 365         if (thread->sched_mode == TH_MODE_TIMESHARE) {
 366                 int priority;
 367                 uint32_t delta;
 368
 369                 thread_timer_delta(thread, delta);
 370
 371                 /*
 372                  *      Accumulate timesharing usage only
 373                  *      during contention for processor
 374                  *      resources.
 375                  */
 376                 if (thread->pri_shift < INT8_MAX)
 377                         thread->sched_usage += delta;
 378
 379                 thread->cpu_delta += delta;
 380
 381                 priority = sched_compute_timeshare_priority(thread);
 382
 383                 /*
 384                  * Adjust the scheduled priority like thread_recompute_sched_pri,
 385                  * except with the benefit of knowing the thread is on this core.
 386                  */
 387                 if ((!(thread->sched_flags & TH_SFLAG_PROMOTED_MASK)  || (priority > thread->sched_pri)) &&
 388                     (!(thread->sched_flags & TH_SFLAG_DEPRESSED_MASK)) &&
 389                     priority != thread->sched_pri) {
 390
 391                         thread->sched_pri = priority;
 392
 393                         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
 394                                               (uintptr_t)thread_tid(thread),
 395                                               thread->base_pri,
 396                                               thread->sched_pri,
 397                                               0, /* eventually, 'reason' */
 398                                               0);
 399                 }
 400         }
 401 }
 402
 403 /*
 404  *      Define shifts for simulating (5/8) ** n
 405  *
 406  *      Shift structures for holding update shifts.  Actual computation
 407  *      is  usage = (usage >> shift1) +/- (usage >> abs(shift2))  where the
 408  *      +/- is determined by the sign of shift 2.
 409  */
 410 struct shift_data {
 411         int     shift1;
 412         int     shift2;
 413 };
 414
 415 #define SCHED_DECAY_TICKS       32
 416 static struct shift_data        sched_decay_shifts[SCHED_DECAY_TICKS] = {
 417         {1,1},{1,3},{1,-3},{2,-7},{3,5},{3,-5},{4,-8},{5,7},
 418         {5,-7},{6,-10},{7,10},{7,-9},{8,-11},{9,12},{9,-11},{10,-13},
 419         {11,14},{11,-13},{12,-15},{13,17},{13,-15},{14,-17},{15,19},{16,18},
 420         {16,-19},{17,22},{18,20},{18,-20},{19,26},{20,22},{20,-22},{21,-27}
 421 };
 422
 423 /*
 424  *      sched_compute_timeshare_priority:
 425  *
 426  *      Calculate the timesharing priority based upon usage and load.
 427  */
 428 extern int sched_pri_decay_band_limit;
 429
 430 #ifdef CONFIG_EMBEDDED
 431
 432 int
 433 sched_compute_timeshare_priority(thread_t thread)
 434 {
 435         int decay_amount = (thread->sched_usage >> thread->pri_shift);
 436         int decay_limit = sched_pri_decay_band_limit;
 437
 438         if (thread->base_pri > BASEPRI_FOREGROUND) {
 439                 decay_limit += (thread->base_pri - BASEPRI_FOREGROUND);
 440         }
 441
 442         if (decay_amount > decay_limit) {
 443                 decay_amount = decay_limit;
 444         }
 445
 446         /* start with base priority */
 447         int priority = thread->base_pri - decay_amount;
 448
 449         if (priority < MAXPRI_THROTTLE) {
 450                 if (thread->task->max_priority > MAXPRI_THROTTLE) {
 451                         priority = MAXPRI_THROTTLE;
 452                 } else if (priority < MINPRI_USER) {
 453                         priority = MINPRI_USER;
 454                 }
 455         } else if (priority > MAXPRI_KERNEL) {
 456                 priority = MAXPRI_KERNEL;
 457         }
 458
 459         return priority;
 460 }
 461
 462 #else /* CONFIG_EMBEDDED */
 463
 464 int
 465 sched_compute_timeshare_priority(thread_t thread)
 466 {
 467         /* start with base priority */
 468         int priority = thread->base_pri - (thread->sched_usage >> thread->pri_shift);
 469
 470         if (priority < MINPRI_USER)
 471                 priority = MINPRI_USER;
 472         else if (priority > MAXPRI_KERNEL)
 473                 priority = MAXPRI_KERNEL;
 474
 475         return priority;
 476 }
 477
 478 #endif /* CONFIG_EMBEDDED */
 479
 480 /*
 481  *      can_update_priority
 482  *
 483  *      Make sure we don't do re-dispatches more frequently than a scheduler tick.
 484  *
 485  *      Called with the thread locked.
 486  */
 487 boolean_t
 488 can_update_priority(
 489                                         thread_t        thread)
 490 {
 491         if (sched_tick == thread->sched_stamp)
 492                 return (FALSE);
 493         else
 494                 return (TRUE);
 495 }
 496
 497 /*
 498  *      update_priority
 499  *
 500  *      Perform housekeeping operations driven by scheduler tick.
 501  *
 502  *      Called with the thread locked.
 503  */
 504 void
 505 update_priority(
 506         thread_t        thread)
 507 {
 508         uint32_t ticks, delta;
 509
 510         ticks = sched_tick - thread->sched_stamp;
 511         assert(ticks != 0);
 512
 513         thread->sched_stamp += ticks;
 514
 515         thread->pri_shift = sched_pri_shifts[thread->th_sched_bucket];
 516
 517         /* If requested, accelerate aging of sched_usage */
 518         if (sched_decay_usage_age_factor > 1)
 519                 ticks *= sched_decay_usage_age_factor;
 520
 521         /*
 522          *      Gather cpu usage data.
 523          */
 524         thread_timer_delta(thread, delta);
 525         if (ticks < SCHED_DECAY_TICKS) {
 526                 /*
 527                  *      Accumulate timesharing usage only
 528                  *      during contention for processor
 529                  *      resources.
 530                  */
 531                 if (thread->pri_shift < INT8_MAX)
 532                         thread->sched_usage += delta;
 533
 534                 thread->cpu_usage += delta + thread->cpu_delta;
 535                 thread->cpu_delta = 0;
 536
 537                 struct shift_data *shiftp = &sched_decay_shifts[ticks];
 538
 539                 if (shiftp->shift2 > 0) {
 540                         thread->cpu_usage =   (thread->cpu_usage >> shiftp->shift1) +
 541                                               (thread->cpu_usage >> shiftp->shift2);
 542                         thread->sched_usage = (thread->sched_usage >> shiftp->shift1) +
 543                                               (thread->sched_usage >> shiftp->shift2);
 544                 } else {
 545                         thread->cpu_usage =   (thread->cpu_usage >>   shiftp->shift1) -
 546                                               (thread->cpu_usage >> -(shiftp->shift2));
 547                         thread->sched_usage = (thread->sched_usage >>   shiftp->shift1) -
 548                                               (thread->sched_usage >> -(shiftp->shift2));
 549                 }
 550         } else {
 551                 thread->cpu_usage = thread->cpu_delta = 0;
 552                 thread->sched_usage = 0;
 553         }
 554
 555         /*
 556          *      Check for fail-safe release.
 557          */
 558         if ((thread->sched_flags & TH_SFLAG_FAILSAFE) &&
 559             mach_absolute_time() >= thread->safe_release) {
 560                 sched_thread_mode_undemote(thread, TH_SFLAG_FAILSAFE);
 561         }
 562
 563         /*
 564          *      Recompute scheduled priority if appropriate.
 565          */
 566         if (thread->sched_mode == TH_MODE_TIMESHARE) {
 567                 int priority = sched_compute_timeshare_priority(thread);
 568
 569                 /*
 570                  * Adjust the scheduled priority like thread_recompute_sched_pri,
 571                  * except without setting an AST.
 572                  */
 573                 if ((!(thread->sched_flags & TH_SFLAG_PROMOTED_MASK)  || (priority > thread->sched_pri)) &&
 574                     (!(thread->sched_flags & TH_SFLAG_DEPRESSED_MASK)) &&
 575                     priority != thread->sched_pri) {
 576
 577                         boolean_t removed = thread_run_queue_remove(thread);
 578
 579                         thread->sched_pri = priority;
 580
 581                         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
 582                                               (uintptr_t)thread_tid(thread),
 583                                               thread->base_pri,
 584                                               thread->sched_pri,
 585                                               0, /* eventually, 'reason' */
 586                                               0);
 587
 588                         if (removed)
 589                                 thread_run_queue_reinsert(thread, SCHED_TAILQ);
 590                 }
 591         }
 592
 593         return;
 594 }
 595
 596 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
 597
 598
 599 /*
 600  * TH_BUCKET_RUN is a count of *all* runnable non-idle threads.
 601  * Each other bucket is a count of the runnable non-idle threads
 602  * with that property.
 603  */
 604 volatile uint32_t       sched_run_buckets[TH_BUCKET_MAX];
 605
 606 static void
 607 sched_incr_bucket(sched_bucket_t bucket)
 608 {
 609         assert(bucket >= TH_BUCKET_FIXPRI &&
 610                bucket <= TH_BUCKET_SHARE_BG);
 611
 612         hw_atomic_add(&sched_run_buckets[bucket], 1);
 613 }
 614
 615 static void
 616 sched_decr_bucket(sched_bucket_t bucket)
 617 {
 618         assert(bucket >= TH_BUCKET_FIXPRI &&
 619                bucket <= TH_BUCKET_SHARE_BG);
 620
 621         assert(sched_run_buckets[bucket] > 0);
 622
 623         hw_atomic_sub(&sched_run_buckets[bucket], 1);
 624 }
 625
 626 /* TH_RUN & !TH_IDLE controls whether a thread has a run count */
 627
 628 uint32_t
 629 sched_run_incr(thread_t thread)
 630 {
 631         assert((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN);
 632
 633         uint32_t new_count = hw_atomic_add(&sched_run_buckets[TH_BUCKET_RUN], 1);
 634
 635         sched_incr_bucket(thread->th_sched_bucket);
 636
 637         return new_count;
 638 }
 639
 640 uint32_t
 641 sched_run_decr(thread_t thread)
 642 {
 643         assert((thread->state & (TH_RUN|TH_IDLE)) != TH_RUN);
 644
 645         sched_decr_bucket(thread->th_sched_bucket);
 646
 647         uint32_t new_count = hw_atomic_sub(&sched_run_buckets[TH_BUCKET_RUN], 1);
 648
 649         return new_count;
 650 }
 651
 652 static void
 653 sched_update_thread_bucket(thread_t thread)
 654 {
 655         sched_bucket_t old_bucket = thread->th_sched_bucket;
 656         sched_bucket_t new_bucket = TH_BUCKET_RUN;
 657
 658         switch (thread->sched_mode) {
 659         case TH_MODE_FIXED:
 660         case TH_MODE_REALTIME:
 661                 new_bucket = TH_BUCKET_FIXPRI;
 662                 break;
 663
 664         case TH_MODE_TIMESHARE:
 665                 if (thread->base_pri > BASEPRI_UTILITY)
 666                         new_bucket = TH_BUCKET_SHARE_FG;
 667                 else if (thread->base_pri > MAXPRI_THROTTLE)
 668                         new_bucket = TH_BUCKET_SHARE_UT;
 669                 else
 670                         new_bucket = TH_BUCKET_SHARE_BG;
 671                 break;
 672
 673         default:
 674                 panic("unexpected mode: %d", thread->sched_mode);
 675                 break;
 676         }
 677
 678         if (old_bucket != new_bucket) {
 679                 thread->th_sched_bucket = new_bucket;
 680                 thread->pri_shift = sched_pri_shifts[new_bucket];
 681
 682                 if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) {
 683                         sched_decr_bucket(old_bucket);
 684                         sched_incr_bucket(new_bucket);
 685                 }
 686         }
 687 }
 688
 689 /*
 690  * Set the thread's true scheduling mode
 691  * Called with thread mutex and thread locked
 692  * The thread has already been removed from the runqueue.
 693  *
 694  * (saved_mode is handled before this point)
 695  */
 696 void
 697 sched_set_thread_mode(thread_t thread, sched_mode_t new_mode)
 698 {
 699         assert(thread->runq == PROCESSOR_NULL);
 700
 701         switch (new_mode) {
 702         case TH_MODE_FIXED:
 703         case TH_MODE_REALTIME:
 704         case TH_MODE_TIMESHARE:
 705                 break;
 706
 707         default:
 708                 panic("unexpected mode: %d", new_mode);
 709                 break;
 710         }
 711
 712         thread->sched_mode = new_mode;
 713
 714         sched_update_thread_bucket(thread);
 715 }
 716
 717 /*
 718  * Demote the true scheduler mode to timeshare (called with the thread locked)
 719  */
 720 void
 721 sched_thread_mode_demote(thread_t thread, uint32_t reason)
 722 {
 723         assert(reason & TH_SFLAG_DEMOTED_MASK);
 724         assert((thread->sched_flags & reason) != reason);
 725
 726         if (thread->policy_reset)
 727                 return;
 728
 729         if (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) {
 730                 /* Another demotion reason is already active */
 731                 thread->sched_flags |= reason;
 732                 return;
 733         }
 734
 735         assert(thread->saved_mode == TH_MODE_NONE);
 736
 737         boolean_t removed = thread_run_queue_remove(thread);
 738
 739         thread->sched_flags |= reason;
 740
 741         thread->saved_mode = thread->sched_mode;
 742
 743         sched_set_thread_mode(thread, TH_MODE_TIMESHARE);
 744
 745         thread_recompute_priority(thread);
 746
 747         if (removed)
 748                 thread_run_queue_reinsert(thread, SCHED_TAILQ);
 749 }
 750
 751 /*
 752  * Un-demote the true scheduler mode back to the saved mode (called with the thread locked)
 753  */
 754 void
 755 sched_thread_mode_undemote(thread_t thread, uint32_t reason)
 756 {
 757         assert(reason & TH_SFLAG_DEMOTED_MASK);
 758         assert((thread->sched_flags & reason) == reason);
 759         assert(thread->saved_mode != TH_MODE_NONE);
 760         assert(thread->sched_mode == TH_MODE_TIMESHARE);
 761         assert(thread->policy_reset == 0);
 762
 763         thread->sched_flags &= ~reason;
 764
 765         if (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) {
 766                 /* Another demotion reason is still active */
 767                 return;
 768         }
 769
 770         boolean_t removed = thread_run_queue_remove(thread);
 771
 772         sched_set_thread_mode(thread, thread->saved_mode);
 773
 774         thread->saved_mode = TH_MODE_NONE;
 775
 776         thread_recompute_priority(thread);
 777
 778         if (removed)
 779                 thread_run_queue_reinsert(thread, SCHED_TAILQ);
 780 }
 781
 782