osfmk/kern/priority.c

   1 /*
   2  * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   priority.c
  60  *      Author: Avadis Tevanian, Jr.
  61  *      Date:   1986
  62  *
  63  *      Priority related scheduler bits.
  64  */
  65
  66 #include <mach/boolean.h>
  67 #include <mach/kern_return.h>
  68 #include <mach/machine.h>
  69 #include <kern/host.h>
  70 #include <kern/mach_param.h>
  71 #include <kern/sched.h>
  72 #include <sys/kdebug.h>
  73 #include <kern/spl.h>
  74 #include <kern/thread.h>
  75 #include <kern/processor.h>
  76 #include <kern/ledger.h>
  77 #include <machine/machparam.h>
  78 #include <kern/machine.h>
  79 #include <kern/policy_internal.h>
  80 #include <kern/sched_clutch.h>
  81
  82 #ifdef CONFIG_MACH_APPROXIMATE_TIME
  83 #include <machine/commpage.h>  /* for commpage_update_mach_approximate_time */
  84 #endif
  85
  86 #if MONOTONIC
  87 #include <kern/monotonic.h>
  88 #endif /* MONOTONIC */
  89
  90 /*
  91  *      thread_quantum_expire:
  92  *
  93  *      Recalculate the quantum and priority for a thread.
  94  *
  95  *      Called at splsched.
  96  */
  97
  98 void
  99 thread_quantum_expire(
 100         timer_call_param_t      p0,
 101         timer_call_param_t      p1)
 102 {
 103         processor_t                     processor = p0;
 104         thread_t                        thread = p1;
 105         ast_t                           preempt;
 106         uint64_t                        ctime;
 107
 108         assert(processor == current_processor());
 109         assert(thread == current_thread());
 110
 111         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_QUANTUM_EXPIRED) | DBG_FUNC_START, 0, 0, 0, 0, 0);
 112
 113         SCHED_STATS_INC(quantum_timer_expirations);
 114
 115         /*
 116          * We bill CPU time to both the individual thread and its task.
 117          *
 118          * Because this balance adjustment could potentially attempt to wake this
 119          * very thread, we must credit the ledger before taking the thread lock.
 120          * The ledger pointers are only manipulated by the thread itself at the ast
 121          * boundary.
 122          *
 123          * TODO: This fails to account for the time between when the timer was
 124          * armed and when it fired.  It should be based on the system_timer and
 125          * running a timer_update operation here.
 126          */
 127         ledger_credit(thread->t_ledger, task_ledgers.cpu_time, thread->quantum_remaining);
 128         ledger_credit(thread->t_threadledger, thread_ledgers.cpu_time, thread->quantum_remaining);
 129         if (thread->t_bankledger) {
 130                 ledger_credit(thread->t_bankledger, bank_ledgers.cpu_time,
 131                     (thread->quantum_remaining - thread->t_deduct_bank_ledger_time));
 132         }
 133         thread->t_deduct_bank_ledger_time = 0;
 134         ctime = mach_absolute_time();
 135
 136 #ifdef CONFIG_MACH_APPROXIMATE_TIME
 137         commpage_update_mach_approximate_time(ctime);
 138 #endif
 139         sched_update_pset_avg_execution_time(processor->processor_set, thread->quantum_remaining, ctime, thread->th_sched_bucket);
 140
 141 #if MONOTONIC
 142         mt_sched_update(thread);
 143 #endif /* MONOTONIC */
 144
 145         thread_lock(thread);
 146
 147         /*
 148          * We've run up until our quantum expiration, and will (potentially)
 149          * continue without re-entering the scheduler, so update this now.
 150          */
 151         processor->last_dispatch = ctime;
 152         thread->last_run_time = ctime;
 153
 154         /*
 155          *      Check for fail-safe trip.
 156          */
 157         if ((thread->sched_mode == TH_MODE_REALTIME || thread->sched_mode == TH_MODE_FIXED) &&
 158             !(thread->sched_flags & TH_SFLAG_PROMOTED) &&
 159             !(thread->kern_promotion_schedpri != 0) &&
 160             !(thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) &&
 161             !(thread->options & TH_OPT_SYSTEM_CRITICAL)) {
 162                 uint64_t new_computation;
 163
 164                 new_computation = ctime - thread->computation_epoch;
 165                 new_computation += thread->computation_metered;
 166                 if (new_computation > max_unsafe_computation) {
 167                         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_FAILSAFE) | DBG_FUNC_NONE,
 168                             (uintptr_t)thread->sched_pri, (uintptr_t)thread->sched_mode, 0, 0, 0);
 169
 170                         thread->safe_release = ctime + sched_safe_duration;
 171
 172                         sched_thread_mode_demote(thread, TH_SFLAG_FAILSAFE);
 173                 }
 174         }
 175
 176         /*
 177          *      Recompute scheduled priority if appropriate.
 178          */
 179         if (SCHED(can_update_priority)(thread)) {
 180                 SCHED(update_priority)(thread);
 181         } else {
 182                 SCHED(lightweight_update_priority)(thread);
 183         }
 184
 185         if (thread->sched_mode != TH_MODE_REALTIME) {
 186                 SCHED(quantum_expire)(thread);
 187         }
 188
 189         /*
 190          *      This quantum is up, give this thread another.
 191          */
 192         processor->first_timeslice = FALSE;
 193
 194         thread_quantum_init(thread);
 195
 196         /* Reload precise timing global policy to thread-local policy */
 197         thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
 198
 199         /*
 200          * Since non-precise user/kernel time doesn't update the state/thread timer
 201          * during privilege transitions, synthesize an event now.
 202          */
 203         if (!thread->precise_user_kernel_time) {
 204                 timer_update(processor->current_state, ctime);
 205                 timer_update(processor->thread_timer, ctime);
 206                 timer_update(&thread->runnable_timer, ctime);
 207         }
 208
 209
 210         processor->quantum_end = ctime + thread->quantum_remaining;
 211
 212         /*
 213          * Context switch check
 214          *
 215          * non-urgent flags don't affect kernel threads, so upgrade to urgent
 216          * to ensure that rebalancing and non-recommendation kick in quickly.
 217          */
 218
 219         ast_t check_reason = AST_QUANTUM;
 220         if (thread->task == kernel_task) {
 221                 check_reason |= AST_URGENT;
 222         }
 223
 224         if ((preempt = csw_check(thread, processor, check_reason)) != AST_NONE) {
 225                 ast_on(preempt);
 226         }
 227
 228         /*
 229          * AST_KEVENT does not send an IPI when setting the AST,
 230          * to avoid waiting for the next context switch to propagate the AST,
 231          * the AST is propagated here at quantum expiration.
 232          */
 233         ast_propagate(thread);
 234
 235         thread_unlock(thread);
 236         running_timer_enter(processor, RUNNING_TIMER_QUANTUM, thread,
 237             processor->quantum_end, ctime);
 238
 239         /* Tell platform layer that we are still running this thread */
 240         thread_urgency_t urgency = thread_get_urgency(thread, NULL, NULL);
 241         machine_thread_going_on_core(thread, urgency, 0, 0, ctime);
 242         machine_switch_perfcontrol_state_update(QUANTUM_EXPIRY, ctime,
 243             0, thread);
 244
 245 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
 246         sched_timeshare_consider_maintenance(ctime);
 247 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
 248
 249 #if __arm__ || __arm64__
 250         if (thread->sched_mode == TH_MODE_REALTIME) {
 251                 sched_consider_recommended_cores(ctime, thread);
 252         }
 253 #endif /* __arm__ || __arm64__ */
 254
 255         KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_QUANTUM_EXPIRED) | DBG_FUNC_END, preempt, 0, 0, 0, 0);
 256 }
 257
 258 /*
 259  *      sched_set_thread_base_priority:
 260  *
 261  *      Set the base priority of the thread
 262  *      and reset its scheduled priority.
 263  *
 264  *      This is the only path to change base_pri.
 265  *
 266  *      Called with the thread locked.
 267  */
 268 void
 269 sched_set_thread_base_priority(thread_t thread, int priority)
 270 {
 271         assert(priority >= MINPRI);
 272         uint64_t ctime = 0;
 273
 274         if (thread->sched_mode == TH_MODE_REALTIME) {
 275                 assert(priority <= BASEPRI_RTQUEUES);
 276         } else {
 277                 assert(priority < BASEPRI_RTQUEUES);
 278         }
 279
 280         int old_base_pri = thread->base_pri;
 281         thread->req_base_pri = (int16_t)priority;
 282         if (thread->sched_flags & TH_SFLAG_BASE_PRI_FROZEN) {
 283                 priority = MAX(priority, old_base_pri);
 284         }
 285         thread->base_pri = (int16_t)priority;
 286
 287         if ((thread->state & TH_RUN) == TH_RUN) {
 288                 assert(thread->last_made_runnable_time != THREAD_NOT_RUNNABLE);
 289                 ctime = mach_approximate_time();
 290                 thread->last_basepri_change_time = ctime;
 291         } else {
 292                 assert(thread->last_basepri_change_time == THREAD_NOT_RUNNABLE);
 293                 assert(thread->last_made_runnable_time == THREAD_NOT_RUNNABLE);
 294         }
 295
 296         /*
 297          * Currently the perfcontrol_attr depends on the base pri of the
 298          * thread. Therefore, we use this function as the hook for the
 299          * perfcontrol callout.
 300          */
 301         if (thread == current_thread() && old_base_pri != priority) {
 302                 if (!ctime) {
 303                         ctime = mach_approximate_time();
 304                 }
 305                 machine_switch_perfcontrol_state_update(PERFCONTROL_ATTR_UPDATE,
 306                     ctime, PERFCONTROL_CALLOUT_WAKE_UNSAFE, thread);
 307         }
 308 #if !CONFIG_SCHED_CLUTCH
 309         /* For the clutch scheduler, this operation is done in set_sched_pri() */
 310         SCHED(update_thread_bucket)(thread);
 311 #endif /* !CONFIG_SCHED_CLUTCH */
 312
 313         thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
 314 }
 315
 316 /*
 317  *      sched_set_kernel_thread_priority:
 318  *
 319  *      Set the absolute base priority of the thread
 320  *      and reset its scheduled priority.
 321  *
 322  *      Called with the thread unlocked.
 323  */
 324 void
 325 sched_set_kernel_thread_priority(thread_t thread, int new_priority)
 326 {
 327         spl_t s = splsched();
 328
 329         thread_lock(thread);
 330
 331         assert(thread->sched_mode != TH_MODE_REALTIME);
 332         assert(thread->effective_policy.thep_qos == THREAD_QOS_UNSPECIFIED);
 333
 334         if (new_priority > thread->max_priority) {
 335                 new_priority = thread->max_priority;
 336         }
 337 #if !defined(XNU_TARGET_OS_OSX)
 338         if (new_priority < MAXPRI_THROTTLE) {
 339                 new_priority = MAXPRI_THROTTLE;
 340         }
 341 #endif /* !defined(XNU_TARGET_OS_OSX) */
 342
 343         thread->importance = new_priority - thread->task_priority;
 344
 345         sched_set_thread_base_priority(thread, new_priority);
 346
 347         thread_unlock(thread);
 348         splx(s);
 349 }
 350
 351 /*
 352  *      thread_recompute_sched_pri:
 353  *
 354  *      Reset the scheduled priority of the thread
 355  *      according to its base priority if the
 356  *      thread has not been promoted or depressed.
 357  *
 358  *      This is the only way to push base_pri changes into sched_pri,
 359  *      or to recalculate the appropriate sched_pri after changing
 360  *      a promotion or depression.
 361  *
 362  *      Called at splsched with the thread locked.
 363  *
 364  *      TODO: Add an 'update urgency' flag to avoid urgency callouts on every rwlock operation
 365  */
 366 void
 367 thread_recompute_sched_pri(thread_t thread, set_sched_pri_options_t options)
 368 {
 369         uint32_t     sched_flags = thread->sched_flags;
 370         sched_mode_t sched_mode  = thread->sched_mode;
 371
 372         int16_t priority = thread->base_pri;
 373
 374         if (sched_mode == TH_MODE_TIMESHARE) {
 375                 priority = (int16_t)SCHED(compute_timeshare_priority)(thread);
 376         }
 377
 378         if (sched_flags & TH_SFLAG_DEPRESS) {
 379                 /* thread_yield_internal overrides kernel mutex promotion */
 380                 priority = DEPRESSPRI;
 381         } else {
 382                 /* poll-depress is overridden by mutex promotion and promote-reasons */
 383                 if ((sched_flags & TH_SFLAG_POLLDEPRESS)) {
 384                         priority = DEPRESSPRI;
 385                 }
 386
 387                 if (thread->kern_promotion_schedpri > 0) {
 388                         priority = MAX(priority, thread->kern_promotion_schedpri);
 389
 390                         if (sched_mode != TH_MODE_REALTIME) {
 391                                 priority = MIN(priority, MAXPRI_PROMOTE);
 392                         }
 393                 }
 394
 395                 if (sched_flags & TH_SFLAG_PROMOTED) {
 396                         priority = MAX(priority, thread->promotion_priority);
 397
 398                         if (sched_mode != TH_MODE_REALTIME) {
 399                                 priority = MIN(priority, MAXPRI_PROMOTE);
 400                         }
 401                 }
 402
 403                 if (sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) {
 404                         if (sched_flags & TH_SFLAG_RW_PROMOTED) {
 405                                 priority = MAX(priority, MINPRI_RWLOCK);
 406                         }
 407
 408                         if (sched_flags & TH_SFLAG_WAITQ_PROMOTED) {
 409                                 priority = MAX(priority, MINPRI_WAITQ);
 410                         }
 411
 412                         if (sched_flags & TH_SFLAG_EXEC_PROMOTED) {
 413                                 priority = MAX(priority, MINPRI_EXEC);
 414                         }
 415                 }
 416         }
 417
 418         set_sched_pri(thread, priority, options);
 419 }
 420
 421 void
 422 sched_default_quantum_expire(thread_t thread __unused)
 423 {
 424         /*
 425          * No special behavior when a timeshare, fixed, or realtime thread
 426          * uses up its entire quantum
 427          */
 428 }
 429
/*
 * Non-zero enables the SMT adjustments in this file: threads for which
 * thread_no_smt() is true are charged extra timeshare usage and count
 * with a run weight of 2 instead of 1.
 */
int smt_timeshare_enabled = 1;
/*
 * Size of that extra usage charge, in sixteenths of the measured delta
 * (applied as (delta * smt_sched_bonus_16ths) >> 4).
 */
int smt_sched_bonus_16ths = 8;
 432
 433 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
 434
 435 /*
 436  *      lightweight_update_priority:
 437  *
 438  *      Update the scheduled priority for
 439  *      a timesharing thread.
 440  *
 441  *      Only for use on the current thread.
 442  *
 443  *      Called with the thread locked.
 444  */
 445 void
 446 lightweight_update_priority(thread_t thread)
 447 {
 448         assert(thread->runq == PROCESSOR_NULL);
 449         assert(thread == current_thread());
 450
 451         if (thread->sched_mode == TH_MODE_TIMESHARE) {
 452                 int priority;
 453                 uint32_t delta;
 454
 455                 thread_timer_delta(thread, delta);
 456
 457                 /*
 458                  *      Accumulate timesharing usage only
 459                  *      during contention for processor
 460                  *      resources.
 461                  */
 462                 if (thread->pri_shift < INT8_MAX) {
 463                         if (thread_no_smt(thread) && smt_timeshare_enabled) {
 464                                 thread->sched_usage += (delta + ((delta * smt_sched_bonus_16ths) >> 4));
 465                         } else {
 466                                 thread->sched_usage += delta;
 467                         }
 468                 }
 469
 470                 thread->cpu_delta += delta;
 471
 472 #if CONFIG_SCHED_CLUTCH
 473                 /*
 474                  * Update the CPU usage for the thread group to which the thread belongs.
 475                  * The implementation assumes that the thread ran for the entire delta
 476                  * as part of the same thread group.
 477                  */
 478                 sched_clutch_cpu_usage_update(thread, delta);
 479 #endif /* CONFIG_SCHED_CLUTCH */
 480
 481                 priority = sched_compute_timeshare_priority(thread);
 482
 483                 if (priority != thread->sched_pri) {
 484                         thread_recompute_sched_pri(thread, SETPRI_LAZY);
 485                 }
 486         }
 487 }
 488
 489 /*
 490  *      Define shifts for simulating (5/8) ** n
 491  *
 492  *      Shift structures for holding update shifts.  Actual computation
 493  *      is  usage = (usage >> shift1) +/- (usage >> abs(shift2))  where the
 494  *      +/- is determined by the sign of shift 2.
 495  */
 496
 497 const struct shift_data        sched_decay_shifts[SCHED_DECAY_TICKS] = {
 498         { .shift1 = 1, .shift2 = 1 },
 499         { .shift1 = 1, .shift2 = 3 },
 500         { .shift1 = 1, .shift2 = -3 },
 501         { .shift1 = 2, .shift2 = -7 },
 502         { .shift1 = 3, .shift2 = 5 },
 503         { .shift1 = 3, .shift2 = -5 },
 504         { .shift1 = 4, .shift2 = -8 },
 505         { .shift1 = 5, .shift2 = 7 },
 506         { .shift1 = 5, .shift2 = -7 },
 507         { .shift1 = 6, .shift2 = -10 },
 508         { .shift1 = 7, .shift2 = 10 },
 509         { .shift1 = 7, .shift2 = -9 },
 510         { .shift1 = 8, .shift2 = -11 },
 511         { .shift1 = 9, .shift2 = 12 },
 512         { .shift1 = 9, .shift2 = -11 },
 513         { .shift1 = 10, .shift2 = -13 },
 514         { .shift1 = 11, .shift2 = 14 },
 515         { .shift1 = 11, .shift2 = -13 },
 516         { .shift1 = 12, .shift2 = -15 },
 517         { .shift1 = 13, .shift2 = 17 },
 518         { .shift1 = 13, .shift2 = -15 },
 519         { .shift1 = 14, .shift2 = -17 },
 520         { .shift1 = 15, .shift2 = 19 },
 521         { .shift1 = 16, .shift2 = 18 },
 522         { .shift1 = 16, .shift2 = -19 },
 523         { .shift1 = 17, .shift2 = 22 },
 524         { .shift1 = 18, .shift2 = 20 },
 525         { .shift1 = 18, .shift2 = -20 },
 526         { .shift1 = 19, .shift2 = 26 },
 527         { .shift1 = 20, .shift2 = 22 },
 528         { .shift1 = 20, .shift2 = -22 },
 529         { .shift1 = 21, .shift2 = -27 }
 530 };
 531
/*
 *      sched_compute_timeshare_priority:
 *
 *      Calculate the timesharing priority based upon usage and load.
 *      The priority is the thread's base priority lowered by its
 *      sched_usage shifted right by pri_shift; a pri_shift of
 *      INT8_MAX means no decay is applied.
 */
/* Defined elsewhere; read only by the decay-floor variant of the function. */
extern int sched_pri_decay_band_limit;
 538
 539
 540 /* Only use the decay floor logic on non-macOS and non-clutch schedulers */
 541 #if !defined(XNU_TARGET_OS_OSX) && !CONFIG_SCHED_CLUTCH
 542
 543 int
 544 sched_compute_timeshare_priority(thread_t thread)
 545 {
 546         int decay_amount;
 547         int decay_limit = sched_pri_decay_band_limit;
 548
 549         if (thread->base_pri > BASEPRI_FOREGROUND) {
 550                 decay_limit += (thread->base_pri - BASEPRI_FOREGROUND);
 551         }
 552
 553         if (thread->pri_shift == INT8_MAX) {
 554                 decay_amount = 0;
 555         } else {
 556                 decay_amount = (thread->sched_usage >> thread->pri_shift);
 557         }
 558
 559         if (decay_amount > decay_limit) {
 560                 decay_amount = decay_limit;
 561         }
 562
 563         /* start with base priority */
 564         int priority = thread->base_pri - decay_amount;
 565
 566         if (priority < MAXPRI_THROTTLE) {
 567                 if (thread->task->max_priority > MAXPRI_THROTTLE) {
 568                         priority = MAXPRI_THROTTLE;
 569                 } else if (priority < MINPRI_USER) {
 570                         priority = MINPRI_USER;
 571                 }
 572         } else if (priority > MAXPRI_KERNEL) {
 573                 priority = MAXPRI_KERNEL;
 574         }
 575
 576         return priority;
 577 }
 578
 579 #else /* !defined(XNU_TARGET_OS_OSX) && !CONFIG_SCHED_CLUTCH */
 580
 581 int
 582 sched_compute_timeshare_priority(thread_t thread)
 583 {
 584         /* start with base priority */
 585         int priority = thread->base_pri;
 586
 587         if (thread->pri_shift != INT8_MAX) {
 588                 priority -= (thread->sched_usage >> thread->pri_shift);
 589         }
 590
 591         if (priority < MINPRI_USER) {
 592                 priority = MINPRI_USER;
 593         } else if (priority > MAXPRI_KERNEL) {
 594                 priority = MAXPRI_KERNEL;
 595         }
 596
 597         return priority;
 598 }
 599
 600 #endif /* !defined(XNU_TARGET_OS_OSX) && !CONFIG_SCHED_CLUTCH */
 601
 602 /*
 603  *      can_update_priority
 604  *
 605  *      Make sure we don't do re-dispatches more frequently than a scheduler tick.
 606  *
 607  *      Called with the thread locked.
 608  */
 609 boolean_t
 610 can_update_priority(
 611         thread_t        thread)
 612 {
 613         if (sched_tick == thread->sched_stamp) {
 614                 return FALSE;
 615         } else {
 616                 return TRUE;
 617         }
 618 }
 619
 620 /*
 621  *      update_priority
 622  *
 623  *      Perform housekeeping operations driven by scheduler tick.
 624  *
 625  *      Called with the thread locked.
 626  */
 627 void
 628 update_priority(
 629         thread_t        thread)
 630 {
 631         uint32_t ticks, delta;
 632
 633         ticks = sched_tick - thread->sched_stamp;
 634         assert(ticks != 0);
 635
 636         thread->sched_stamp += ticks;
 637
 638         /* If requested, accelerate aging of sched_usage */
 639         if (sched_decay_usage_age_factor > 1) {
 640                 ticks *= sched_decay_usage_age_factor;
 641         }
 642
 643         /*
 644          *      Gather cpu usage data.
 645          */
 646         thread_timer_delta(thread, delta);
 647         if (ticks < SCHED_DECAY_TICKS) {
 648                 /*
 649                  *      Accumulate timesharing usage only during contention for processor
 650                  *      resources. Use the pri_shift from the previous tick window to
 651                  *      determine if the system was in a contended state.
 652                  */
 653                 if (thread->pri_shift < INT8_MAX) {
 654                         if (thread_no_smt(thread) && smt_timeshare_enabled) {
 655                                 thread->sched_usage += (delta + ((delta * smt_sched_bonus_16ths) >> 4));
 656                         } else {
 657                                 thread->sched_usage += delta;
 658                         }
 659                 }
 660
 661                 thread->cpu_usage += delta + thread->cpu_delta;
 662                 thread->cpu_delta = 0;
 663
 664 #if CONFIG_SCHED_CLUTCH
 665                 /*
 666                  * Update the CPU usage for the thread group to which the thread belongs.
 667                  * The implementation assumes that the thread ran for the entire delta
 668                  * as part of the same thread group.
 669                  */
 670                 sched_clutch_cpu_usage_update(thread, delta);
 671 #endif /* CONFIG_SCHED_CLUTCH */
 672
 673                 const struct shift_data *shiftp = &sched_decay_shifts[ticks];
 674
 675                 if (shiftp->shift2 > 0) {
 676                         thread->cpu_usage =   (thread->cpu_usage >> shiftp->shift1) +
 677                             (thread->cpu_usage >> shiftp->shift2);
 678                         thread->sched_usage = (thread->sched_usage >> shiftp->shift1) +
 679                             (thread->sched_usage >> shiftp->shift2);
 680                 } else {
 681                         thread->cpu_usage =   (thread->cpu_usage >>   shiftp->shift1) -
 682                             (thread->cpu_usage >> -(shiftp->shift2));
 683                         thread->sched_usage = (thread->sched_usage >>   shiftp->shift1) -
 684                             (thread->sched_usage >> -(shiftp->shift2));
 685                 }
 686         } else {
 687                 thread->cpu_usage = thread->cpu_delta = 0;
 688                 thread->sched_usage = 0;
 689         }
 690
 691         /*
 692          *      Check for fail-safe release.
 693          */
 694         if ((thread->sched_flags & TH_SFLAG_FAILSAFE) &&
 695             mach_absolute_time() >= thread->safe_release) {
 696                 sched_thread_mode_undemote(thread, TH_SFLAG_FAILSAFE);
 697         }
 698
 699         /*
 700          * Now that the thread's CPU usage has been accumulated and aged
 701          * based on contention of the previous tick window, update the
 702          * pri_shift of the thread to match the current global load/shift
 703          * values. The updated pri_shift would be used to calculate the
 704          * new priority of the thread.
 705          */
 706 #if CONFIG_SCHED_CLUTCH
 707         thread->pri_shift = sched_clutch_thread_pri_shift(thread, thread->th_sched_bucket);
 708 #else /* CONFIG_SCHED_CLUTCH */
 709         thread->pri_shift = sched_pri_shifts[thread->th_sched_bucket];
 710 #endif /* CONFIG_SCHED_CLUTCH */
 711
 712         /* Recompute scheduled priority if appropriate. */
 713         if (thread->sched_mode == TH_MODE_TIMESHARE) {
 714                 thread_recompute_sched_pri(thread, SETPRI_LAZY);
 715         }
 716 }
 717
 718 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
 719
 720
/*
 * Run counts, indexed by sched_bucket_t.
 *
 * TH_BUCKET_RUN is a count of *all* runnable non-idle threads.
 * Each other bucket is a count of the runnable non-idle threads
 * with that property. All updates to these counts should be
 * performed with os_atomic_* operations.
 *
 * The sched_smt_* routines add and subtract a per-thread run weight
 * (1 or 2) instead of 1, so under those routines the values are
 * weighted counts.
 *
 * For the clutch scheduler, this global bucket is used only for
 * keeping the total global run count.
 */
uint32_t       sched_run_buckets[TH_BUCKET_MAX];
 731
 732 static void
 733 sched_incr_bucket(sched_bucket_t bucket)
 734 {
 735         assert(bucket >= TH_BUCKET_FIXPRI &&
 736             bucket <= TH_BUCKET_SHARE_BG);
 737
 738         os_atomic_inc(&sched_run_buckets[bucket], relaxed);
 739 }
 740
 741 static void
 742 sched_decr_bucket(sched_bucket_t bucket)
 743 {
 744         assert(bucket >= TH_BUCKET_FIXPRI &&
 745             bucket <= TH_BUCKET_SHARE_BG);
 746
 747         assert(os_atomic_load(&sched_run_buckets[bucket], relaxed) > 0);
 748
 749         os_atomic_dec(&sched_run_buckets[bucket], relaxed);
 750 }
 751
 752 static void
 753 sched_add_bucket(sched_bucket_t bucket, uint8_t run_weight)
 754 {
 755         assert(bucket >= TH_BUCKET_FIXPRI &&
 756             bucket <= TH_BUCKET_SHARE_BG);
 757
 758         os_atomic_add(&sched_run_buckets[bucket], run_weight, relaxed);
 759 }
 760
 761 static void
 762 sched_sub_bucket(sched_bucket_t bucket, uint8_t run_weight)
 763 {
 764         assert(bucket >= TH_BUCKET_FIXPRI &&
 765             bucket <= TH_BUCKET_SHARE_BG);
 766
 767         assert(os_atomic_load(&sched_run_buckets[bucket], relaxed) > 0);
 768
 769         os_atomic_sub(&sched_run_buckets[bucket], run_weight, relaxed);
 770 }
 771
 772 uint32_t
 773 sched_run_incr(thread_t thread)
 774 {
 775         assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
 776
 777         uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
 778
 779         sched_incr_bucket(thread->th_sched_bucket);
 780
 781         return new_count;
 782 }
 783
 784 uint32_t
 785 sched_run_decr(thread_t thread)
 786 {
 787         assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
 788
 789         sched_decr_bucket(thread->th_sched_bucket);
 790
 791         uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
 792
 793         return new_count;
 794 }
 795
 796 uint32_t
 797 sched_smt_run_incr(thread_t thread)
 798 {
 799         assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);
 800
 801         uint8_t run_weight = (thread_no_smt(thread) && smt_timeshare_enabled) ? 2 : 1;
 802         thread->sched_saved_run_weight = run_weight;
 803
 804         uint32_t new_count = os_atomic_add(&sched_run_buckets[TH_BUCKET_RUN], run_weight, relaxed);
 805
 806         sched_add_bucket(thread->th_sched_bucket, run_weight);
 807
 808         return new_count;
 809 }
 810
 811 uint32_t
 812 sched_smt_run_decr(thread_t thread)
 813 {
 814         assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);
 815
 816         uint8_t run_weight = thread->sched_saved_run_weight;
 817
 818         sched_sub_bucket(thread->th_sched_bucket, run_weight);
 819
 820         uint32_t new_count = os_atomic_sub(&sched_run_buckets[TH_BUCKET_RUN], run_weight, relaxed);
 821
 822         return new_count;
 823 }
 824
 825 void
 826 sched_update_thread_bucket(thread_t thread)
 827 {
 828         sched_bucket_t old_bucket = thread->th_sched_bucket;
 829         sched_bucket_t new_bucket = TH_BUCKET_RUN;
 830
 831         switch (thread->sched_mode) {
 832         case TH_MODE_FIXED:
 833         case TH_MODE_REALTIME:
 834                 new_bucket = TH_BUCKET_FIXPRI;
 835                 break;
 836
 837         case TH_MODE_TIMESHARE:
 838                 if (thread->base_pri > BASEPRI_DEFAULT) {
 839                         new_bucket = TH_BUCKET_SHARE_FG;
 840                 } else if (thread->base_pri > BASEPRI_UTILITY) {
 841                         new_bucket = TH_BUCKET_SHARE_DF;
 842                 } else if (thread->base_pri > MAXPRI_THROTTLE) {
 843                         new_bucket = TH_BUCKET_SHARE_UT;
 844                 } else {
 845                         new_bucket = TH_BUCKET_SHARE_BG;
 846                 }
 847                 break;
 848
 849         default:
 850                 panic("unexpected mode: %d", thread->sched_mode);
 851                 break;
 852         }
 853
 854         if (old_bucket != new_bucket) {
 855                 thread->th_sched_bucket = new_bucket;
 856                 thread->pri_shift = sched_pri_shifts[new_bucket];
 857
 858                 if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
 859                         sched_decr_bucket(old_bucket);
 860                         sched_incr_bucket(new_bucket);
 861                 }
 862         }
 863 }
 864
 865 void
 866 sched_smt_update_thread_bucket(thread_t thread)
 867 {
 868         sched_bucket_t old_bucket = thread->th_sched_bucket;
 869         sched_bucket_t new_bucket = TH_BUCKET_RUN;
 870
 871         switch (thread->sched_mode) {
 872         case TH_MODE_FIXED:
 873         case TH_MODE_REALTIME:
 874                 new_bucket = TH_BUCKET_FIXPRI;
 875                 break;
 876
 877         case TH_MODE_TIMESHARE:
 878                 if (thread->base_pri > BASEPRI_DEFAULT) {
 879                         new_bucket = TH_BUCKET_SHARE_FG;
 880                 } else if (thread->base_pri > BASEPRI_UTILITY) {
 881                         new_bucket = TH_BUCKET_SHARE_DF;
 882                 } else if (thread->base_pri > MAXPRI_THROTTLE) {
 883                         new_bucket = TH_BUCKET_SHARE_UT;
 884                 } else {
 885                         new_bucket = TH_BUCKET_SHARE_BG;
 886                 }
 887                 break;
 888
 889         default:
 890                 panic("unexpected mode: %d", thread->sched_mode);
 891                 break;
 892         }
 893
 894         if (old_bucket != new_bucket) {
 895                 thread->th_sched_bucket = new_bucket;
 896                 thread->pri_shift = sched_pri_shifts[new_bucket];
 897
 898                 if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
 899                         sched_sub_bucket(old_bucket, thread->sched_saved_run_weight);
 900                         sched_add_bucket(new_bucket, thread->sched_saved_run_weight);
 901                 }
 902         }
 903 }
 904
 905 /*
 906  * Set the thread's true scheduling mode
 907  * Called with thread mutex and thread locked
 908  * The thread has already been removed from the runqueue.
 909  *
 910  * (saved_mode is handled before this point)
 911  */
 912 void
 913 sched_set_thread_mode(thread_t thread, sched_mode_t new_mode)
 914 {
 915         assert(thread->runq == PROCESSOR_NULL);
 916
 917         switch (new_mode) {
 918         case TH_MODE_FIXED:
 919         case TH_MODE_REALTIME:
 920         case TH_MODE_TIMESHARE:
 921                 break;
 922
 923         default:
 924                 panic("unexpected mode: %d", new_mode);
 925                 break;
 926         }
 927
 928 #if CONFIG_SCHED_AUTO_JOIN
 929         /*
 930          * Realtime threads might have auto-joined a work interval based on
 931          * make runnable relationships. If such an RT thread is now being demoted
 932          * to non-RT, unjoin the thread from the work interval.
 933          */
 934         if ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) && (new_mode != TH_MODE_REALTIME)) {
 935                 assert((thread->sched_mode == TH_MODE_REALTIME) || (thread->th_work_interval_flags & TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK));
 936                 work_interval_auto_join_demote(thread);
 937         }
 938 #endif /* CONFIG_SCHED_AUTO_JOIN */
 939
 940         thread->sched_mode = new_mode;
 941
 942         SCHED(update_thread_bucket)(thread);
 943 }
 944
 945 /*
 946  * Demote the true scheduler mode to timeshare (called with the thread locked)
 947  */
 948 void
 949 sched_thread_mode_demote(thread_t thread, uint32_t reason)
 950 {
 951         assert(reason & TH_SFLAG_DEMOTED_MASK);
 952         assert((thread->sched_flags & reason) != reason);
 953
 954         if (thread->policy_reset) {
 955                 return;
 956         }
 957
 958         if (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) {
 959                 /* Another demotion reason is already active */
 960                 thread->sched_flags |= reason;
 961                 return;
 962         }
 963
 964         assert(thread->saved_mode == TH_MODE_NONE);
 965
 966         boolean_t removed = thread_run_queue_remove(thread);
 967
 968         thread->sched_flags |= reason;
 969
 970         thread->saved_mode = thread->sched_mode;
 971
 972         sched_set_thread_mode(thread, TH_MODE_TIMESHARE);
 973
 974         thread_recompute_priority(thread);
 975
 976         if (removed) {
 977                 thread_run_queue_reinsert(thread, SCHED_TAILQ);
 978         }
 979 }
 980
 981 /*
 982  * Un-demote the true scheduler mode back to the saved mode (called with the thread locked)
 983  */
 984 void
 985 sched_thread_mode_undemote(thread_t thread, uint32_t reason)
 986 {
 987         assert(reason & TH_SFLAG_DEMOTED_MASK);
 988         assert((thread->sched_flags & reason) == reason);
 989         assert(thread->saved_mode != TH_MODE_NONE);
 990         assert(thread->sched_mode == TH_MODE_TIMESHARE);
 991         assert(thread->policy_reset == 0);
 992
 993         thread->sched_flags &= ~reason;
 994
 995         if (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) {
 996                 /* Another demotion reason is still active */
 997                 return;
 998         }
 999
1000         boolean_t removed = thread_run_queue_remove(thread);
1001
1002         sched_set_thread_mode(thread, thread->saved_mode);
1003
1004         thread->saved_mode = TH_MODE_NONE;
1005
1006         thread_recompute_priority(thread);
1007
1008         if (removed) {
1009                 thread_run_queue_reinsert(thread, SCHED_TAILQ);
1010         }
1011 }
1012
1013 /*
1014  * Promote thread to have a sched pri floor for a specific reason
1015  *
1016  * Promotion must not last past syscall boundary
1017  * Clients must always pair promote and demote 1:1,
1018  * Handling nesting of the same promote reason is the client's responsibility
1019  *
1020  * Called at splsched with thread locked
1021  */
1022 void
1023 sched_thread_promote_reason(thread_t    thread,
1024     uint32_t    reason,
1025     __kdebug_only uintptr_t   trace_obj /* already unslid */)
1026 {
1027         assert(reason & TH_SFLAG_PROMOTE_REASON_MASK);
1028         assert((thread->sched_flags & reason) != reason);
1029
1030         switch (reason) {
1031         case TH_SFLAG_RW_PROMOTED:
1032                 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_PROMOTE),
1033                     thread_tid(thread), thread->sched_pri,
1034                     thread->base_pri, trace_obj);
1035                 break;
1036         case TH_SFLAG_WAITQ_PROMOTED:
1037                 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_PROMOTE),
1038                     thread_tid(thread), thread->sched_pri,
1039                     thread->base_pri, trace_obj);
1040                 break;
1041         case TH_SFLAG_EXEC_PROMOTED:
1042                 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_EXEC_PROMOTE),
1043                     thread_tid(thread), thread->sched_pri,
1044                     thread->base_pri, trace_obj);
1045                 break;
1046         }
1047
1048         thread->sched_flags |= reason;
1049
1050         thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
1051 }
1052
1053 /*
1054  * End a specific promotion reason
1055  * Demotes a thread back to its expected priority without the promotion in place
1056  *
1057  * Called at splsched with thread locked
1058  */
1059 void
1060 sched_thread_unpromote_reason(thread_t  thread,
1061     uint32_t  reason,
1062     __kdebug_only uintptr_t trace_obj /* already unslid */)
1063 {
1064         assert(reason & TH_SFLAG_PROMOTE_REASON_MASK);
1065         assert((thread->sched_flags & reason) == reason);
1066
1067         switch (reason) {
1068         case TH_SFLAG_RW_PROMOTED:
1069                 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE),
1070                     thread_tid(thread), thread->sched_pri,
1071                     thread->base_pri, trace_obj);
1072                 break;
1073         case TH_SFLAG_WAITQ_PROMOTED:
1074                 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_DEMOTE),
1075                     thread_tid(thread), thread->sched_pri,
1076                     thread->base_pri, trace_obj);
1077                 break;
1078         case TH_SFLAG_EXEC_PROMOTED:
1079                 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_EXEC_DEMOTE),
1080                     thread_tid(thread), thread->sched_pri,
1081                     thread->base_pri, trace_obj);
1082                 break;
1083         }
1084
1085         thread->sched_flags &= ~reason;
1086
1087         thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
1088 }