/*
 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
 * School of Computer Science
 * Carnegie Mellon University
 * Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 * File: priority.c
 * Author: Avadis Tevanian, Jr.
 * Date: 1986
 *
 * Priority related scheduler bits.
 */

#include <mach/boolean.h>
#include <mach/kern_return.h>
#include <mach/machine.h>
#include <kern/host.h>
#include <kern/mach_param.h>
#include <kern/sched.h>
#include <sys/kdebug.h>
#include <kern/spl.h>
#include <kern/thread.h>
#include <kern/processor.h>
#include <kern/ledger.h>
#include <machine/machparam.h>
#include <kern/machine.h>
#include <kern/policy_internal.h>
#include <kern/sched_clutch.h>

#ifdef CONFIG_MACH_APPROXIMATE_TIME
#include <machine/commpage.h> /* for commpage_update_mach_approximate_time */
#endif

#if MONOTONIC
#include <kern/monotonic.h>
#endif /* MONOTONIC */

/*
 * thread_quantum_expire:
 *
 * Recalculate the quantum and priority for a thread.
 *
 * Called at splsched.
 */
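/*
 * In outline: bill the expiring quantum to the thread and task ledgers, age
 * the thread's timesharing usage and recompute its priority, hand out a
 * fresh quantum, and run a context-switch check to decide whether an AST
 * should be posted.
 */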

void
thread_quantum_expire(
	timer_call_param_t p0,
	timer_call_param_t p1)
{
	processor_t processor = p0;
	thread_t thread = p1;
	ast_t preempt;
	uint64_t ctime;

	assert(processor == current_processor());
	assert(thread == current_thread());

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_QUANTUM_EXPIRED) | DBG_FUNC_START, 0, 0, 0, 0, 0);

	SCHED_STATS_QUANTUM_TIMER_EXPIRATION(processor);

	/*
	 * We bill CPU time to both the individual thread and its task.
	 *
	 * Because this balance adjustment could potentially attempt to wake this
	 * very thread, we must credit the ledger before taking the thread lock.
	 * The ledger pointers are only manipulated by the thread itself at the ast
	 * boundary.
	 *
	 * TODO: This fails to account for the time between when the timer was
	 * armed and when it fired. It should be based on the system_timer and
	 * running a timer_update operation here.
	 */
	ledger_credit(thread->t_ledger, task_ledgers.cpu_time, thread->quantum_remaining);
	ledger_credit(thread->t_threadledger, thread_ledgers.cpu_time, thread->quantum_remaining);
	if (thread->t_bankledger) {
		ledger_credit(thread->t_bankledger, bank_ledgers.cpu_time,
		    (thread->quantum_remaining - thread->t_deduct_bank_ledger_time));
	}
	thread->t_deduct_bank_ledger_time = 0;

	ctime = mach_absolute_time();

#ifdef CONFIG_MACH_APPROXIMATE_TIME
	commpage_update_mach_approximate_time(ctime);
#endif

#if MONOTONIC
	mt_sched_update(thread);
#endif /* MONOTONIC */

	thread_lock(thread);

	/*
	 * We've run up until our quantum expiration, and will (potentially)
	 * continue without re-entering the scheduler, so update this now.
	 */
	processor->last_dispatch = ctime;
	thread->last_run_time = ctime;

	/*
	 * Check for fail-safe trip.
	 */
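	/*
	 * The fail-safe demotes a fixed- or realtime-priority thread that has
	 * been computing for longer than max_unsafe_computation without being
	 * promoted or marked system-critical.  The demotion is lifted by
	 * update_priority() via sched_thread_mode_undemote() once safe_release
	 * has passed.
	 */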
	if ((thread->sched_mode == TH_MODE_REALTIME || thread->sched_mode == TH_MODE_FIXED) &&
	    !(thread->sched_flags & TH_SFLAG_PROMOTED) &&
	    thread->kern_promotion_schedpri == 0 &&
	    !(thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) &&
	    !(thread->options & TH_OPT_SYSTEM_CRITICAL)) {
		uint64_t new_computation;

		new_computation = ctime - thread->computation_epoch;
		new_computation += thread->computation_metered;
		if (new_computation > max_unsafe_computation) {
			KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_FAILSAFE) | DBG_FUNC_NONE,
			    (uintptr_t)thread->sched_pri, (uintptr_t)thread->sched_mode, 0, 0, 0);

			thread->safe_release = ctime + sched_safe_duration;

			sched_thread_mode_demote(thread, TH_SFLAG_FAILSAFE);
		}
	}

	/*
	 * Recompute scheduled priority if appropriate.
	 */
	if (SCHED(can_update_priority)(thread)) {
		SCHED(update_priority)(thread);
	} else {
		SCHED(lightweight_update_priority)(thread);
	}

	if (thread->sched_mode != TH_MODE_REALTIME) {
		SCHED(quantum_expire)(thread);
	}

	/*
	 * This quantum is up; give this thread another.
	 */
	processor->first_timeslice = FALSE;

	thread_quantum_init(thread);

	/* Reload precise timing global policy to thread-local policy */
	thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);

	/*
	 * Since non-precise user/kernel time doesn't update the state/thread timer
	 * during privilege transitions, synthesize an event now.
	 */
	if (!thread->precise_user_kernel_time) {
		timer_update(PROCESSOR_DATA(processor, current_state), ctime);
		timer_update(PROCESSOR_DATA(processor, thread_timer), ctime);
		timer_update(&thread->runnable_timer, ctime);
	}

	processor->quantum_end = ctime + thread->quantum_remaining;

	/*
	 * Context switch check
	 *
	 * non-urgent flags don't affect kernel threads, so upgrade to urgent
	 * to ensure that rebalancing and non-recommendation kick in quickly.
	 */

	ast_t check_reason = AST_QUANTUM;
	if (thread->task == kernel_task) {
		check_reason |= AST_URGENT;
	}

	if ((preempt = csw_check(thread, processor, check_reason)) != AST_NONE) {
		ast_on(preempt);
	}

	/*
	 * AST_KEVENT does not send an IPI when setting the AST.  To avoid
	 * waiting for the next context switch to propagate it, the AST is
	 * propagated here at quantum expiration.
	 */
	ast_propagate(thread);

	thread_unlock(thread);

	timer_call_quantum_timer_enter(&processor->quantum_timer, thread,
	    processor->quantum_end, ctime);

	/* Tell platform layer that we are still running this thread */
	thread_urgency_t urgency = thread_get_urgency(thread, NULL, NULL);
	machine_thread_going_on_core(thread, urgency, 0, 0, ctime);
	machine_switch_perfcontrol_state_update(QUANTUM_EXPIRY, ctime,
	    0, thread);

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	sched_timeshare_consider_maintenance(ctime);
#endif /* CONFIG_SCHED_TIMESHARE_CORE */

#if __arm__ || __arm64__
	if (thread->sched_mode == TH_MODE_REALTIME) {
		sched_consider_recommended_cores(ctime, thread);
	}
#endif /* __arm__ || __arm64__ */

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_QUANTUM_EXPIRED) | DBG_FUNC_END, preempt, 0, 0, 0, 0);
}

/*
 * sched_set_thread_base_priority:
 *
 * Set the base priority of the thread
 * and reset its scheduled priority.
 *
 * This is the only path to change base_pri.
 *
 * Called with the thread locked.
 */
void
sched_set_thread_base_priority(thread_t thread, int priority)
{
	assert(priority >= MINPRI);
	uint64_t ctime = 0;

	if (thread->sched_mode == TH_MODE_REALTIME) {
		assert(priority <= BASEPRI_RTQUEUES);
	} else {
		assert(priority < BASEPRI_RTQUEUES);
	}

	int old_base_pri = thread->base_pri;
	thread->req_base_pri = priority;
	if (thread->sched_flags & TH_SFLAG_BASE_PRI_FROZEN) {
		priority = MAX(priority, old_base_pri);
	}
	thread->base_pri = priority;

	if ((thread->state & TH_RUN) == TH_RUN) {
		assert(thread->last_made_runnable_time != THREAD_NOT_RUNNABLE);
		ctime = mach_approximate_time();
		thread->last_basepri_change_time = ctime;
	} else {
		assert(thread->last_basepri_change_time == THREAD_NOT_RUNNABLE);
		assert(thread->last_made_runnable_time == THREAD_NOT_RUNNABLE);
	}

	/*
	 * Currently the perfcontrol_attr depends on the base pri of the
	 * thread. Therefore, we use this function as the hook for the
	 * perfcontrol callout.
	 */
	if (thread == current_thread() && old_base_pri != priority) {
		if (!ctime) {
			ctime = mach_approximate_time();
		}
		machine_switch_perfcontrol_state_update(PERFCONTROL_ATTR_UPDATE,
		    ctime, PERFCONTROL_CALLOUT_WAKE_UNSAFE, thread);
	}
#if !CONFIG_SCHED_CLUTCH
	/* For the clutch scheduler, this operation is done in set_sched_pri() */
	SCHED(update_thread_bucket)(thread);
#endif /* !CONFIG_SCHED_CLUTCH */

	thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
}

/*
 * sched_set_kernel_thread_priority:
 *
 * Set the absolute base priority of the thread
 * and reset its scheduled priority.
 *
 * Called with the thread unlocked.
 */
void
sched_set_kernel_thread_priority(thread_t thread, int new_priority)
{
	spl_t s = splsched();

	thread_lock(thread);

	assert(thread->sched_mode != TH_MODE_REALTIME);
	assert(thread->effective_policy.thep_qos == THREAD_QOS_UNSPECIFIED);

	if (new_priority > thread->max_priority) {
		new_priority = thread->max_priority;
	}
#if CONFIG_EMBEDDED
	if (new_priority < MAXPRI_THROTTLE) {
		new_priority = MAXPRI_THROTTLE;
	}
#endif /* CONFIG_EMBEDDED */

	thread->importance = new_priority - thread->task_priority;

	sched_set_thread_base_priority(thread, new_priority);

	thread_unlock(thread);
	splx(s);
}

/*
 * thread_recompute_sched_pri:
 *
 * Reset the scheduled priority of the thread
 * according to its base priority if the
 * thread has not been promoted or depressed.
 *
 * This is the only way to push base_pri changes into sched_pri,
 * or to recalculate the appropriate sched_pri after changing
 * a promotion or depression.
 *
 * Called at splsched with the thread locked.
 *
 * TODO: Add an 'update urgency' flag to avoid urgency callouts on every rwlock operation
 */
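/*
 * Resolution order, as implemented below: TH_SFLAG_DEPRESS forces DEPRESSPRI
 * and overrides all promotions; TH_SFLAG_POLLDEPRESS also starts from
 * DEPRESSPRI but can be raised again by the promotions and floors that
 * follow; otherwise the base (or computed timeshare) priority is raised to
 * any kernel promotion, TH_SFLAG_PROMOTED promotion, and RW-lock / waitq /
 * exec floors, with non-realtime threads capped at MAXPRI_PROMOTE.
 */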
void
thread_recompute_sched_pri(thread_t thread, set_sched_pri_options_t options)
{
	uint32_t sched_flags = thread->sched_flags;
	sched_mode_t sched_mode = thread->sched_mode;

	int priority = thread->base_pri;

	if (sched_mode == TH_MODE_TIMESHARE) {
		priority = SCHED(compute_timeshare_priority)(thread);
	}

	if (sched_flags & TH_SFLAG_DEPRESS) {
		/* thread_yield_internal overrides kernel mutex promotion */
		priority = DEPRESSPRI;
	} else {
		/* poll-depress is overridden by mutex promotion and promote-reasons */
		if ((sched_flags & TH_SFLAG_POLLDEPRESS)) {
			priority = DEPRESSPRI;
		}

		if (thread->kern_promotion_schedpri > 0) {
			priority = MAX(priority, thread->kern_promotion_schedpri);

			if (sched_mode != TH_MODE_REALTIME) {
				priority = MIN(priority, MAXPRI_PROMOTE);
			}
		}

		if (sched_flags & TH_SFLAG_PROMOTED) {
			priority = MAX(priority, thread->promotion_priority);

			if (sched_mode != TH_MODE_REALTIME) {
				priority = MIN(priority, MAXPRI_PROMOTE);
			}
		}

		if (sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) {
			if (sched_flags & TH_SFLAG_RW_PROMOTED) {
				priority = MAX(priority, MINPRI_RWLOCK);
			}

			if (sched_flags & TH_SFLAG_WAITQ_PROMOTED) {
				priority = MAX(priority, MINPRI_WAITQ);
			}

			if (sched_flags & TH_SFLAG_EXEC_PROMOTED) {
				priority = MAX(priority, MINPRI_EXEC);
			}
		}
	}

	set_sched_pri(thread, priority, options);
}

void
sched_default_quantum_expire(thread_t thread __unused)
{
	/*
	 * No special behavior when a timeshare, fixed, or realtime thread
	 * uses up its entire quantum
	 */
}

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

/*
 * lightweight_update_priority:
 *
 * Update the scheduled priority for
 * a timesharing thread.
 *
 * Only for use on the current thread.
 *
 * Called with the thread locked.
 */
void
lightweight_update_priority(thread_t thread)
{
	assert(thread->runq == PROCESSOR_NULL);
	assert(thread == current_thread());

	if (thread->sched_mode == TH_MODE_TIMESHARE) {
		int priority;
		uint32_t delta;

		thread_timer_delta(thread, delta);

		/*
		 * Accumulate timesharing usage only
		 * during contention for processor
		 * resources.
		 */
		if (thread->pri_shift < INT8_MAX) {
			thread->sched_usage += delta;
		}

		thread->cpu_delta += delta;

#if CONFIG_SCHED_CLUTCH
		/*
		 * Update the CPU usage for the thread group to which the thread belongs.
		 * The implementation assumes that the thread ran for the entire delta
		 * as part of the same thread group.
		 */
		sched_clutch_cpu_usage_update(thread, delta);
#endif /* CONFIG_SCHED_CLUTCH */

		priority = sched_compute_timeshare_priority(thread);

		if (priority != thread->sched_pri) {
			thread_recompute_sched_pri(thread, SETPRI_LAZY);
		}
	}
}

/*
 * Define shifts for simulating (5/8) ** n
 *
 * Shift structures for holding update shifts.  The actual computation
 * is usage = (usage >> shift1) +/- (usage >> abs(shift2)), where the
 * +/- is determined by the sign of shift2.
 */
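/*
 * For example, the n = 1 entry { .shift1 = 1, .shift2 = 3 } approximates
 * usage * (5/8) as (usage >> 1) + (usage >> 3), i.e. 0.5 + 0.125 = 0.625 of
 * the original value, and the n = 3 entry { .shift1 = 2, .shift2 = -7 }
 * gives (usage >> 2) - (usage >> 7) ~= 0.242, close to (5/8)^3 ~= 0.244.
 */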

const struct shift_data sched_decay_shifts[SCHED_DECAY_TICKS] = {
	{ .shift1 = 1, .shift2 = 1 },
	{ .shift1 = 1, .shift2 = 3 },
	{ .shift1 = 1, .shift2 = -3 },
	{ .shift1 = 2, .shift2 = -7 },
	{ .shift1 = 3, .shift2 = 5 },
	{ .shift1 = 3, .shift2 = -5 },
	{ .shift1 = 4, .shift2 = -8 },
	{ .shift1 = 5, .shift2 = 7 },
	{ .shift1 = 5, .shift2 = -7 },
	{ .shift1 = 6, .shift2 = -10 },
	{ .shift1 = 7, .shift2 = 10 },
	{ .shift1 = 7, .shift2 = -9 },
	{ .shift1 = 8, .shift2 = -11 },
	{ .shift1 = 9, .shift2 = 12 },
	{ .shift1 = 9, .shift2 = -11 },
	{ .shift1 = 10, .shift2 = -13 },
	{ .shift1 = 11, .shift2 = 14 },
	{ .shift1 = 11, .shift2 = -13 },
	{ .shift1 = 12, .shift2 = -15 },
	{ .shift1 = 13, .shift2 = 17 },
	{ .shift1 = 13, .shift2 = -15 },
	{ .shift1 = 14, .shift2 = -17 },
	{ .shift1 = 15, .shift2 = 19 },
	{ .shift1 = 16, .shift2 = 18 },
	{ .shift1 = 16, .shift2 = -19 },
	{ .shift1 = 17, .shift2 = 22 },
	{ .shift1 = 18, .shift2 = 20 },
	{ .shift1 = 18, .shift2 = -20 },
	{ .shift1 = 19, .shift2 = 26 },
	{ .shift1 = 20, .shift2 = 22 },
	{ .shift1 = 20, .shift2 = -22 },
	{ .shift1 = 21, .shift2 = -27 }
};

/*
 * sched_compute_timeshare_priority:
 *
 * Calculate the timesharing priority based upon usage and load.
 */
extern int sched_pri_decay_band_limit;

/* Only use the decay floor logic on embedded non-clutch schedulers */
#if CONFIG_EMBEDDED && !CONFIG_SCHED_CLUTCH

int
sched_compute_timeshare_priority(thread_t thread)
{
	int decay_amount = (thread->sched_usage >> thread->pri_shift);
	int decay_limit = sched_pri_decay_band_limit;

	if (thread->base_pri > BASEPRI_FOREGROUND) {
		decay_limit += (thread->base_pri - BASEPRI_FOREGROUND);
	}

	if (decay_amount > decay_limit) {
		decay_amount = decay_limit;
	}

	/* start with base priority */
	int priority = thread->base_pri - decay_amount;

	if (priority < MAXPRI_THROTTLE) {
		if (thread->task->max_priority > MAXPRI_THROTTLE) {
			priority = MAXPRI_THROTTLE;
		} else if (priority < MINPRI_USER) {
			priority = MINPRI_USER;
		}
	} else if (priority > MAXPRI_KERNEL) {
		priority = MAXPRI_KERNEL;
	}

	return priority;
}

#else /* CONFIG_EMBEDDED && !CONFIG_SCHED_CLUTCH */

int
sched_compute_timeshare_priority(thread_t thread)
{
	/* start with base priority */
	int priority = thread->base_pri - (thread->sched_usage >> thread->pri_shift);

	if (priority < MINPRI_USER) {
		priority = MINPRI_USER;
	} else if (priority > MAXPRI_KERNEL) {
		priority = MAXPRI_KERNEL;
	}

	return priority;
}

#endif /* CONFIG_EMBEDDED && !CONFIG_SCHED_CLUTCH */
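
/*
 * Illustrative example for the non-embedded variant above: a timeshare
 * thread with base_pri == BASEPRI_DEFAULT whose (sched_usage >> pri_shift)
 * currently evaluates to 12 is scheduled at BASEPRI_DEFAULT - 12, with the
 * result always clamped into the [MINPRI_USER, MAXPRI_KERNEL] band.
 */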

/*
 * can_update_priority
 *
 * Make sure we don't do re-dispatches more frequently than a scheduler tick.
 *
 * Called with the thread locked.
 */
boolean_t
can_update_priority(
	thread_t thread)
{
	if (sched_tick == thread->sched_stamp) {
		return FALSE;
	} else {
		return TRUE;
	}
}

/*
 * update_priority
 *
 * Perform housekeeping operations driven by scheduler tick.
 *
 * Called with the thread locked.
 */
void
update_priority(
	thread_t thread)
{
	uint32_t ticks, delta;

	ticks = sched_tick - thread->sched_stamp;
	assert(ticks != 0);

	thread->sched_stamp += ticks;

	/* If requested, accelerate aging of sched_usage */
	if (sched_decay_usage_age_factor > 1) {
		ticks *= sched_decay_usage_age_factor;
	}

	/*
	 * Gather cpu usage data.
	 */
	thread_timer_delta(thread, delta);
	if (ticks < SCHED_DECAY_TICKS) {
		/*
		 * Accumulate timesharing usage only during contention for processor
		 * resources. Use the pri_shift from the previous tick window to
		 * determine if the system was in a contended state.
		 */
		if (thread->pri_shift < INT8_MAX) {
			thread->sched_usage += delta;
		}

		thread->cpu_usage += delta + thread->cpu_delta;
		thread->cpu_delta = 0;

#if CONFIG_SCHED_CLUTCH
		/*
		 * Update the CPU usage for the thread group to which the thread belongs.
		 * The implementation assumes that the thread ran for the entire delta
		 * as part of the same thread group.
		 */
		sched_clutch_cpu_usage_update(thread, delta);
#endif /* CONFIG_SCHED_CLUTCH */

		const struct shift_data *shiftp = &sched_decay_shifts[ticks];

		if (shiftp->shift2 > 0) {
			thread->cpu_usage = (thread->cpu_usage >> shiftp->shift1) +
			    (thread->cpu_usage >> shiftp->shift2);
			thread->sched_usage = (thread->sched_usage >> shiftp->shift1) +
			    (thread->sched_usage >> shiftp->shift2);
		} else {
			thread->cpu_usage = (thread->cpu_usage >> shiftp->shift1) -
			    (thread->cpu_usage >> -(shiftp->shift2));
			thread->sched_usage = (thread->sched_usage >> shiftp->shift1) -
			    (thread->sched_usage >> -(shiftp->shift2));
		}
	} else {
		thread->cpu_usage = thread->cpu_delta = 0;
		thread->sched_usage = 0;
	}

	/*
	 * Check for fail-safe release.
	 */
	if ((thread->sched_flags & TH_SFLAG_FAILSAFE) &&
	    mach_absolute_time() >= thread->safe_release) {
		sched_thread_mode_undemote(thread, TH_SFLAG_FAILSAFE);
	}

	/*
	 * Now that the thread's CPU usage has been accumulated and aged
	 * based on contention of the previous tick window, update the
	 * pri_shift of the thread to match the current global load/shift
	 * values. The updated pri_shift would be used to calculate the
	 * new priority of the thread.
	 */
#if CONFIG_SCHED_CLUTCH
	thread->pri_shift = sched_clutch_thread_pri_shift(thread, thread->th_sched_bucket);
#else /* CONFIG_SCHED_CLUTCH */
	thread->pri_shift = sched_pri_shifts[thread->th_sched_bucket];
#endif /* CONFIG_SCHED_CLUTCH */

	/* Recompute scheduled priority if appropriate. */
	if (thread->sched_mode == TH_MODE_TIMESHARE) {
		thread_recompute_sched_pri(thread, SETPRI_LAZY);
	}
}
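
/*
 * Worked example: if update_priority() runs 3 scheduler ticks after the
 * thread's last update (sched_tick - sched_stamp == 3), usage is scaled by
 * sched_decay_shifts[3] = { .shift1 = 2, .shift2 = -7 }, i.e.
 * usage = (usage >> 2) - (usage >> 7) ~= 0.242 * usage, approximating
 * (5/8)^3 ~= 0.244.  After SCHED_DECAY_TICKS or more ticks, the usage is
 * simply reset to zero.
 */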

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

/*
 * TH_BUCKET_RUN is a count of *all* runnable non-idle threads.
 * Each other bucket is a count of the runnable non-idle threads
 * with that property. All updates to these counts should be
 * performed with os_atomic_* operations.
 *
 * For the clutch scheduler, this global bucket is used only for
 * keeping the total global run count.
 */
uint32_t sched_run_buckets[TH_BUCKET_MAX];

static void
sched_incr_bucket(sched_bucket_t bucket)
{
	assert(bucket >= TH_BUCKET_FIXPRI &&
	    bucket <= TH_BUCKET_SHARE_BG);

	os_atomic_inc(&sched_run_buckets[bucket], relaxed);
}

static void
sched_decr_bucket(sched_bucket_t bucket)
{
	assert(bucket >= TH_BUCKET_FIXPRI &&
	    bucket <= TH_BUCKET_SHARE_BG);

	assert(os_atomic_load(&sched_run_buckets[bucket], relaxed) > 0);

	os_atomic_dec(&sched_run_buckets[bucket], relaxed);
}

uint32_t
sched_run_incr(thread_t thread)
{
	assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);

	uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);

	sched_incr_bucket(thread->th_sched_bucket);

	return new_count;
}

uint32_t
sched_run_decr(thread_t thread)
{
	assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);

	sched_decr_bucket(thread->th_sched_bucket);

	uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);

	return new_count;
}
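
/*
 * Every runnable non-idle thread is counted both in TH_BUCKET_RUN and in
 * exactly one per-class bucket, so outside the clutch scheduler (which keeps
 * its own per-bucket state) the TH_BUCKET_RUN count should equal the sum of
 * the other bucket counts.
 */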

void
sched_update_thread_bucket(thread_t thread)
{
	sched_bucket_t old_bucket = thread->th_sched_bucket;
	sched_bucket_t new_bucket = TH_BUCKET_RUN;

	switch (thread->sched_mode) {
	case TH_MODE_FIXED:
	case TH_MODE_REALTIME:
		new_bucket = TH_BUCKET_FIXPRI;
		break;

	case TH_MODE_TIMESHARE:
		if (thread->base_pri > BASEPRI_DEFAULT) {
			new_bucket = TH_BUCKET_SHARE_FG;
		} else if (thread->base_pri > BASEPRI_UTILITY) {
			new_bucket = TH_BUCKET_SHARE_DF;
		} else if (thread->base_pri > MAXPRI_THROTTLE) {
			new_bucket = TH_BUCKET_SHARE_UT;
		} else {
			new_bucket = TH_BUCKET_SHARE_BG;
		}
		break;

	default:
		panic("unexpected mode: %d", thread->sched_mode);
		break;
	}

	if (old_bucket != new_bucket) {
		thread->th_sched_bucket = new_bucket;
		thread->pri_shift = sched_pri_shifts[new_bucket];

		if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
			sched_decr_bucket(old_bucket);
			sched_incr_bucket(new_bucket);
		}
	}
}
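
/*
 * Timeshare threads are classified by base priority: above BASEPRI_DEFAULT
 * is foreground, above BASEPRI_UTILITY is default, above MAXPRI_THROTTLE is
 * utility, and everything else is background; fixed and realtime threads
 * always use TH_BUCKET_FIXPRI.  The thread's pri_shift is refreshed here and
 * again from update_priority() on the scheduler-tick path.
 */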

/*
 * Set the thread's true scheduling mode
 * Called with thread mutex and thread locked
 * The thread has already been removed from the runqueue.
 *
 * (saved_mode is handled before this point)
 */
void
sched_set_thread_mode(thread_t thread, sched_mode_t new_mode)
{
	assert(thread->runq == PROCESSOR_NULL);

	switch (new_mode) {
	case TH_MODE_FIXED:
	case TH_MODE_REALTIME:
	case TH_MODE_TIMESHARE:
		break;

	default:
		panic("unexpected mode: %d", new_mode);
		break;
	}

	thread->sched_mode = new_mode;

	SCHED(update_thread_bucket)(thread);
}

/*
 * Demote the true scheduler mode to timeshare (called with the thread locked)
 */
void
sched_thread_mode_demote(thread_t thread, uint32_t reason)
{
	assert(reason & TH_SFLAG_DEMOTED_MASK);
	assert((thread->sched_flags & reason) != reason);

	if (thread->policy_reset) {
		return;
	}

	if (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) {
		/* Another demotion reason is already active */
		thread->sched_flags |= reason;
		return;
	}

	assert(thread->saved_mode == TH_MODE_NONE);

	boolean_t removed = thread_run_queue_remove(thread);

	thread->sched_flags |= reason;

	thread->saved_mode = thread->sched_mode;

	sched_set_thread_mode(thread, TH_MODE_TIMESHARE);

	thread_recompute_priority(thread);

	if (removed) {
		thread_run_queue_reinsert(thread, SCHED_TAILQ);
	}
}

/*
 * Un-demote the true scheduler mode back to the saved mode (called with the thread locked)
 */
void
sched_thread_mode_undemote(thread_t thread, uint32_t reason)
{
	assert(reason & TH_SFLAG_DEMOTED_MASK);
	assert((thread->sched_flags & reason) == reason);
	assert(thread->saved_mode != TH_MODE_NONE);
	assert(thread->sched_mode == TH_MODE_TIMESHARE);
	assert(thread->policy_reset == 0);

	thread->sched_flags &= ~reason;

	if (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) {
		/* Another demotion reason is still active */
		return;
	}

	boolean_t removed = thread_run_queue_remove(thread);

	sched_set_thread_mode(thread, thread->saved_mode);

	thread->saved_mode = TH_MODE_NONE;

	thread_recompute_priority(thread);

	if (removed) {
		thread_run_queue_reinsert(thread, SCHED_TAILQ);
	}
}
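
/*
 * These demote/undemote entry points are how the TH_SFLAG_FAILSAFE trip in
 * thread_quantum_expire() is applied and, once safe_release has passed,
 * cleared again from update_priority().  Multiple demotion reasons stack:
 * the saved mode is restored only when the last reason is removed.
 */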

/*
 * Promote a thread to have a sched pri floor for a specific reason.
 *
 * The promotion must not last past the syscall boundary.  Clients must
 * always pair promote and unpromote 1:1; handling nesting of the same
 * promote reason is the client's responsibility.
 *
 * Called at splsched with the thread locked.
 */
void
sched_thread_promote_reason(thread_t thread,
    uint32_t reason,
    __kdebug_only uintptr_t trace_obj /* already unslid */)
{
	assert(reason & TH_SFLAG_PROMOTE_REASON_MASK);
	assert((thread->sched_flags & reason) != reason);

	switch (reason) {
	case TH_SFLAG_RW_PROMOTED:
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_PROMOTE),
		    thread_tid(thread), thread->sched_pri,
		    thread->base_pri, trace_obj);
		break;
	case TH_SFLAG_WAITQ_PROMOTED:
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_PROMOTE),
		    thread_tid(thread), thread->sched_pri,
		    thread->base_pri, trace_obj);
		break;
	case TH_SFLAG_EXEC_PROMOTED:
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_EXEC_PROMOTE),
		    thread_tid(thread), thread->sched_pri,
		    thread->base_pri, trace_obj);
		break;
	}

	thread->sched_flags |= reason;

	thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
}
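
/*
 * Illustrative pairing (a hypothetical caller, holding the thread lock at
 * splsched around each call):
 *
 *	sched_thread_promote_reason(thread, TH_SFLAG_RW_PROMOTED, trace_obj);
 *	...				(thread runs with the priority floor)
 *	sched_thread_unpromote_reason(thread, TH_SFLAG_RW_PROMOTED, trace_obj);
 *
 * The same reason flag must be used for both calls, and the pair must
 * complete before the syscall returns.
 */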

/*
 * End a specific promotion reason
 * Demotes a thread back to its expected priority without the promotion in place
 *
 * Called at splsched with thread locked
 */
void
sched_thread_unpromote_reason(thread_t thread,
    uint32_t reason,
    __kdebug_only uintptr_t trace_obj /* already unslid */)
{
	assert(reason & TH_SFLAG_PROMOTE_REASON_MASK);
	assert((thread->sched_flags & reason) == reason);

	switch (reason) {
	case TH_SFLAG_RW_PROMOTED:
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE),
		    thread_tid(thread), thread->sched_pri,
		    thread->base_pri, trace_obj);
		break;
	case TH_SFLAG_WAITQ_PROMOTED:
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_DEMOTE),
		    thread_tid(thread), thread->sched_pri,
		    thread->base_pri, trace_obj);
		break;
	case TH_SFLAG_EXEC_PROMOTED:
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_EXEC_DEMOTE),
		    thread_tid(thread), thread->sched_pri,
		    thread->base_pri, trace_obj);
		break;
	}

	thread->sched_flags &= ~reason;

	thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
967 }