osfmk/kern/sched_average.c

   1 /*
   2  * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      Author: Avadis Tevanian, Jr.
  60  *      Date:   1986
  61  *
  62  *      Compute various averages.
  63  */
  64
  65 #include <mach/mach_types.h>
  66
  67 #include <kern/sched.h>
  68 #include <kern/assert.h>
  69 #include <kern/processor.h>
  70 #include <kern/thread.h>
  71 #if CONFIG_TELEMETRY
  72 #include <kern/telemetry.h>
  73 #endif
  74
  75 #include <sys/kdebug.h>
  76
  77 uint32_t        avenrun[3] = {0, 0, 0};
  78 uint32_t        mach_factor[3] = {0, 0, 0};
  79
  80 uint32_t        sched_load_average, sched_mach_factor;
  81
  82 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
  83 /*
  84  * Values are scaled by LOAD_SCALE, defined in processor_info.h
  85  */
  86 #define base(n)         ((n) << SCHED_TICK_SHIFT)
  87 #define frac(n)         (((base(n) - 1) * LOAD_SCALE) / base(n))
  88
  89 static uint32_t         fract[3] = {
  90         frac(5),                /* 5 second average */
  91         frac(30),               /* 30 second average */
  92         frac(60),               /* 1 minute average */
  93 };
  94
  95 #undef base
  96 #undef frac
  97
  98 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
  99
 100 static unsigned int             sched_nrun;
 101
 102 typedef void    (*sched_avg_comp_t)(
 103         void                    *param);
 104
 105 static struct sched_average {
 106         sched_avg_comp_t        comp;
 107         void                    *param;
 108         int                     period; /* in seconds */
 109         uint64_t                deadline;
 110 } sched_average[] = {
 111         { compute_averunnable, &sched_nrun, 5, 0 },
 112         { compute_stack_target, NULL, 5, 1 },
 113         { compute_pageout_gc_throttle, NULL, 1, 0 },
 114         { compute_pmap_gc_throttle, NULL, 60, 0 },
 115 #if CONFIG_TELEMETRY
 116         { compute_telemetry, NULL, 1, 0 },
 117 #endif
 118         { NULL, NULL, 0, 0 }
 119 };
 120
 121 typedef struct sched_average    *sched_average_t;
 122
 123 /*
 124  * Scheduler load calculation algorithm
 125  *
 126  * The scheduler load values provide an estimate of the number of runnable
 127  * timeshare threads in the system at various priority bands. The load
 128  * ultimately affects the priority shifts applied to all threads in a band
 129  * causing them to timeshare with other threads in the system. The load is
 130  * maintained in buckets, with each bucket corresponding to a priority band.
 131  *
 132  * Each runnable thread on the system contributes its load to its priority
 133  * band and to the bands above it. The contribution of a thread to the bands
 134  * above it is not strictly 1:1 and is weighted based on the priority band
 135  * of the thread. The rules of thread load contribution to each of its higher
 136  * bands are as follows:
 137  *
  138  * - DF threads: Up to (2 * NCPUs) threads
  139  * - UT threads: Up to NCPUs threads
  140  * - BG threads: Up to 1 thread
 141  *
 142  * To calculate the load values, the various run buckets are sampled (every
  143  * sched_load_compute_interval_abs) and the weighted contributions of the
 144  * lower bucket threads are added. The resultant value is plugged into an
 145  * exponentially weighted moving average formula:
 146  *      new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count
 147  *      (where, alpha < 1)
 148  * The calculations for the scheduler load are done using fixpoint math with
 149  * a scale factor of 16 to avoid expensive divides and floating point
 150  * operations. The final load values are a smooth curve representative of
 151  * the actual number of runnable threads in a priority band.
 152  */
 153
 154 /* Maintains the current (scaled for fixpoint) load in various buckets */
 155 uint32_t sched_load[TH_BUCKET_MAX];
 156
 157 /*
  158  * Alpha factor for the EWMA algorithm. The current values are chosen as
 159  * 6:10 ("old load":"new samples") to make sure the scheduler reacts fast
 160  * enough to changing system load but does not see too many spikes from bursty
 161  * activity. The current values ensure that the scheduler would converge
 162  * to the latest load in 2-3 sched_load_compute_interval_abs intervals
 163  * (which amounts to ~30-45ms with current values).
 164  */
 165 #define SCHED_LOAD_EWMA_ALPHA_OLD      6
 166 #define SCHED_LOAD_EWMA_ALPHA_NEW      10
 167 #define SCHED_LOAD_EWMA_ALPHA_SHIFT    4
 168 static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT));
 169
 170 /* For fixpoint EWMA, roundup the load to make it converge */
 171 #define SCHED_LOAD_EWMA_ROUNDUP(load)   (((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0)
 172
 173 /* Macro to convert scaled sched load to a real load value */
 174 #define SCHED_LOAD_EWMA_UNSCALE(load)   (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load))
 175
 176 /*
 177  * Routine to capture the latest runnable counts and update sched_load (only used for non-clutch schedulers)
 178  */
 179 void
 180 compute_sched_load(void)
 181 {
 182         /*
 183          * Retrieve a snapshot of the current run counts.
 184          *
 185          * Why not a bcopy()? Because we need atomic word-sized reads of sched_run_buckets,
 186          * not byte-by-byte copy.
 187          */
 188         uint32_t ncpus = processor_avail_count;
 189         uint32_t load_now[TH_BUCKET_MAX];
 190
 191         load_now[TH_BUCKET_RUN]      = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
 192         load_now[TH_BUCKET_FIXPRI]   = os_atomic_load(&sched_run_buckets[TH_BUCKET_FIXPRI], relaxed);
 193         load_now[TH_BUCKET_SHARE_FG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_FG], relaxed);
 194         load_now[TH_BUCKET_SHARE_DF] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_DF], relaxed);
 195         load_now[TH_BUCKET_SHARE_UT] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_UT], relaxed);
 196         load_now[TH_BUCKET_SHARE_BG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_BG], relaxed);
 197
 198         assert(load_now[TH_BUCKET_RUN] >= 0);
 199         assert(load_now[TH_BUCKET_FIXPRI] >= 0);
 200
 201         uint32_t nthreads = load_now[TH_BUCKET_RUN];
 202         uint32_t nfixpri  = load_now[TH_BUCKET_FIXPRI];
 203
 204         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
 205             MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) | DBG_FUNC_NONE,
 206             load_now[TH_BUCKET_FIXPRI], (load_now[TH_BUCKET_SHARE_FG] + load_now[TH_BUCKET_SHARE_DF]),
 207             load_now[TH_BUCKET_SHARE_BG], load_now[TH_BUCKET_SHARE_UT], 0);
 208
 209         /*
 210          * Compute the timeshare priority conversion factor based on loading.
 211          * Because our counters may be incremented and accessed
 212          * concurrently with respect to each other, we may have
 213          * windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut)
 214          * is broken, so truncate values in these cases.
 215          */
 216         uint32_t timeshare_threads = (nthreads - nfixpri);
 217         for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
 218                 if (load_now[i] > timeshare_threads) {
 219                         load_now[i] = timeshare_threads;
 220                 }
 221         }
 222
 223         /*
 224          * Default threads contribute up to (NCPUS * 2) of load to FG threads
 225          */
 226         if (load_now[TH_BUCKET_SHARE_DF] <= (ncpus * 2)) {
 227                 load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_DF];
 228         } else {
 229                 load_now[TH_BUCKET_SHARE_FG] += (ncpus * 2);
 230         }
 231
 232         /*
 233          * Utility threads contribute up to NCPUS of load to FG & DF threads
 234          */
 235         if (load_now[TH_BUCKET_SHARE_UT] <= ncpus) {
 236                 load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_UT];
 237                 load_now[TH_BUCKET_SHARE_DF] += load_now[TH_BUCKET_SHARE_UT];
 238         } else {
 239                 load_now[TH_BUCKET_SHARE_FG] += ncpus;
 240                 load_now[TH_BUCKET_SHARE_DF] += ncpus;
 241         }
 242
 243         /*
 244          * BG threads contribute up to 1 thread worth of load to FG, DF and UT threads
 245          */
 246         if (load_now[TH_BUCKET_SHARE_BG] > 0) {
 247                 load_now[TH_BUCKET_SHARE_FG] += 1;
 248                 load_now[TH_BUCKET_SHARE_DF] += 1;
 249                 load_now[TH_BUCKET_SHARE_UT] += 1;
 250         }
 251
 252         /*
 253          * The conversion factor consists of two components:
 254          * a fixed value based on the absolute time unit (sched_fixed_shift),
 255          * and a dynamic portion based on load (sched_load_shifts).
 256          *
 257          * Zero load results in a out of range shift count.
 258          */
 259
 260         for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
 261                 uint32_t bucket_load = 0;
 262
 263                 if (load_now[i] > ncpus) {
 264                         /* Normalize the load to number of CPUs */
 265                         if (ncpus > 1) {
 266                                 bucket_load = load_now[i] / ncpus;
 267                         } else {
 268                                 bucket_load = load_now[i];
 269                         }
 270
 271                         if (bucket_load > MAX_LOAD) {
 272                                 bucket_load = MAX_LOAD;
 273                         }
 274                 }
 275                 /* Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load */
 276                 sched_load[i] = (sched_load[i] * SCHED_LOAD_EWMA_ALPHA_OLD) + ((bucket_load << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW);
 277                 sched_load[i] = sched_load[i] >> SCHED_LOAD_EWMA_ALPHA_SHIFT;
 278         }
 279
 280         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
 281             MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD_EFFECTIVE) | DBG_FUNC_NONE,
 282             SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_FG]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_DF]),
 283             SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_UT]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_BG]), 0);
 284 }
 285
 286 void
 287 compute_averages(uint64_t stdelta)
 288 {
 289         uint32_t nthreads = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) - 1;
 290         uint32_t ncpus = processor_avail_count;
 291
 292         /* Update the global pri_shifts based on the latest values */
 293         for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
 294                 uint32_t bucket_load = SCHED_LOAD_EWMA_UNSCALE(sched_load[i]);
 295                 uint32_t shift = sched_fixed_shift - sched_load_shifts[bucket_load];
 296
 297                 if (shift > SCHED_PRI_SHIFT_MAX) {
 298                         sched_pri_shifts[i] = INT8_MAX;
 299                 } else {
 300                         sched_pri_shifts[i] = shift;
 301                 }
 302         }
 303
 304         /*
 305          * Sample total running threads for the load average calculation.
 306          */
 307         sched_nrun = nthreads;
 308
 309         /*
 310          * Load average and mach factor calculations for
 311          * those which ask about these things.
 312          */
 313         uint32_t average_now = nthreads * LOAD_SCALE;
 314         uint32_t factor_now;
 315
 316         if (nthreads > ncpus) {
 317                 factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1);
 318         } else {
 319                 factor_now = (ncpus - nthreads) * LOAD_SCALE;
 320         }
 321
 322         /*
 323          * For those statistics that formerly relied on being recomputed
 324          * on timer ticks, advance by the approximate number of corresponding
 325          * elapsed intervals, thus compensating for potential idle intervals.
 326          */
 327         for (uint32_t index = 0; index < stdelta; index++) {
 328                 sched_mach_factor = ((sched_mach_factor << 2) + factor_now) / 5;
 329                 sched_load_average = ((sched_load_average << 2) + average_now) / 5;
 330         }
 331
 332         /*
 333          * Compute old-style Mach load averages.
 334          */
 335         for (uint32_t index = 0; index < stdelta; index++) {
 336                 for (uint32_t i = 0; i < 3; i++) {
 337                         mach_factor[i] = ((mach_factor[i] * fract[i]) +
 338                             (factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
 339
 340                         avenrun[i] = ((avenrun[i] * fract[i]) +
 341                             (average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
 342                 }
 343         }
 344
 345         /*
 346          * Compute averages in other components.
 347          */
 348         uint64_t abstime = mach_absolute_time();
 349
 350         for (sched_average_t avg = sched_average; avg->comp != NULL; ++avg) {
 351                 if (abstime >= avg->deadline) {
 352                         uint64_t period_abs = (avg->period * sched_one_second_interval);
 353                         uint64_t ninvokes = 1;
 354
 355                         ninvokes += (abstime - avg->deadline) / period_abs;
 356                         ninvokes = MIN(ninvokes, SCHED_TICK_MAX_DELTA);
 357
 358                         for (uint32_t index = 0; index < ninvokes; index++) {
 359                                 (*avg->comp)(avg->param);
 360                         }
 361                         avg->deadline = abstime + period_abs;
 362                 }
 363         }
 364 }