X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/8ad349bb6ed4a0be06e34c92be0d98b92e078db4..0a7de7458d150b5d4dffc935ba399be265ef0a1a:/osfmk/kern/sched_average.c diff --git a/osfmk/kern/sched_average.c b/osfmk/kern/sched_average.c index bfa1d2ef0..709803b9e 100644 --- a/osfmk/kern/sched_average.c +++ b/osfmk/kern/sched_average.c @@ -1,57 +1,55 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved. * - * @APPLE_LICENSE_OSREFERENCE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the - * License may not be used to create, or enable the creation or - * redistribution of, unlawful or unlicensed copies of an Apple operating - * system, or to circumvent, violate, or enable the circumvention or - * violation of, any terms of an Apple operating system software license - * agreement. + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and * limitations under the License. * - * @APPLE_LICENSE_OSREFERENCE_HEADER_END@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * @OSF_COPYRIGHT@ */ -/* +/* * Mach Operating System * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University * All Rights Reserved. - * + * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. - * + * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * + * * Carnegie Mellon requests users of this software to return to - * + * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 - * + * * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. */ @@ -70,141 +68,290 @@ #include #include #include - -uint32_t avenrun[3] = {0, 0, 0}; -uint32_t mach_factor[3] = {0, 0, 0}; +#if CONFIG_TELEMETRY +#include +#endif + +#include + +uint32_t avenrun[3] = {0, 0, 0}; +uint32_t mach_factor[3] = {0, 0, 0}; +uint32_t sched_load_average, sched_mach_factor; + +#if defined(CONFIG_SCHED_TIMESHARE_CORE) /* * Values are scaled by LOAD_SCALE, defined in processor_info.h */ -#define base(n) ((n) << SCHED_TICK_SHIFT) -#define frac(n) (((base(n) - 1) * LOAD_SCALE) / base(n)) +#define base(n) ((n) << SCHED_TICK_SHIFT) +#define frac(n) (((base(n) - 1) * LOAD_SCALE) / base(n)) -static uint32_t fract[3] = { - frac(5), /* 5 second average */ - frac(30), /* 30 second average */ - frac(60), /* 1 minute average */ +static uint32_t fract[3] = { + frac(5), /* 5 second average */ + frac(30), /* 30 second average */ + frac(60), /* 1 minute average */ }; #undef base #undef frac -static unsigned int sched_nrun; +#endif /* CONFIG_SCHED_TIMESHARE_CORE */ -typedef void (*sched_avg_comp_t)( - void *param); +static unsigned int sched_nrun; -#define SCHED_AVG_SECS(n) ((n) << SCHED_TICK_SHIFT) +typedef void (*sched_avg_comp_t)( + void *param); static struct sched_average { - sched_avg_comp_t comp; - void *param; - int period; - int tick; + sched_avg_comp_t comp; + void *param; + int period; /* in seconds */ + uint64_t deadline; } sched_average[] = { - { compute_averunnable, &sched_nrun, SCHED_AVG_SECS(5), 0 }, - { compute_stack_target, NULL, SCHED_AVG_SECS(5), 1 }, + { compute_averunnable, &sched_nrun, 5, 0 }, + { compute_stack_target, NULL, 5, 1 }, + { compute_pageout_gc_throttle, NULL, 1, 0 }, + { compute_pmap_gc_throttle, NULL, 60, 0 }, +#if CONFIG_TELEMETRY + { compute_telemetry, NULL, 1, 0 }, +#endif { NULL, NULL, 0, 0 } }; -typedef struct sched_average *sched_average_t; +typedef struct sched_average *sched_average_t; + +/* + * Scheduler load calculation algorithm + * + * The scheduler load values provide an estimate of the number of runnable + * timeshare threads in the system at various priority bands. The load + * ultimately affects the priority shifts applied to all threads in a band + * causing them to timeshare with other threads in the system. The load is + * maintained in buckets, with each bucket corresponding to a priority band. + * + * Each runnable thread on the system contributes its load to its priority + * band and to the bands above it. The contribution of a thread to the bands + * above it is not strictly 1:1 and is weighted based on the priority band + * of the thread. The rules of thread load contribution to each of its higher + * bands are as follows: + * + * - DF threads: Upto (2 * NCPUs) threads + * - UT threads: Upto NCPUs threads + * - BG threads: Upto 1 thread + * + * To calculate the load values, the various run buckets are sampled (every + * sched_load_compute_interval_abs) and the weighted contributions of the the + * lower bucket threads are added. The resultant value is plugged into an + * exponentially weighted moving average formula: + * new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count + * (where, alpha < 1) + * The calculations for the scheduler load are done using fixpoint math with + * a scale factor of 16 to avoid expensive divides and floating point + * operations. The final load values are a smooth curve representative of + * the actual number of runnable threads in a priority band. + */ + +/* Maintains the current (scaled for fixpoint) load in various buckets */ +uint32_t sched_load[TH_BUCKET_MAX]; +/* + * Alpha factor for the EWMA alogrithm. The current values are chosen as + * 6:10 ("old load":"new samples") to make sure the scheduler reacts fast + * enough to changing system load but does not see too many spikes from bursty + * activity. The current values ensure that the scheduler would converge + * to the latest load in 2-3 sched_load_compute_interval_abs intervals + * (which amounts to ~30-45ms with current values). + */ +#define SCHED_LOAD_EWMA_ALPHA_OLD 6 +#define SCHED_LOAD_EWMA_ALPHA_NEW 10 +#define SCHED_LOAD_EWMA_ALPHA_SHIFT 4 +static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT)); + +/* For fixpoint EWMA, roundup the load to make it converge */ +#define SCHED_LOAD_EWMA_ROUNDUP(load) (((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0) + +/* Macro to convert scaled sched load to a real load value */ +#define SCHED_LOAD_EWMA_UNSCALE(load) (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load)) + +/* + * Routine to capture the latest runnable counts and update sched_load */ void -compute_averages(void) +compute_sched_load(void) { - register processor_set_t pset = &default_pset; - register int ncpus; - register int nthreads, nshared; - sched_average_t avg; - register uint32_t factor_now = 0; - register uint32_t average_now = 0; - register uint32_t load_now = 0; - - if ((ncpus = pset->processor_count) > 0) { - /* - * Retrieve counts, ignoring - * the current thread. - */ - nthreads = pset->run_count - 1; - nshared = pset->share_count; - - /* - * Load average and mach factor calculations for - * those which ask about these things. - */ - average_now = nthreads * LOAD_SCALE; - - if (nthreads > ncpus) - factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1); - else - factor_now = (ncpus - nthreads) * LOAD_SCALE; - - pset->mach_factor = ((pset->mach_factor << 2) + factor_now) / 5; - pset->load_average = ((pset->load_average << 2) + average_now) / 5; - - /* - * Compute the timeshare priority - * conversion factor based on loading. - */ - if (nshared > nthreads) - nshared = nthreads; - - if (nshared > ncpus) { - if (ncpus > 1) - load_now = nshared / ncpus; - else - load_now = nshared; - - if (load_now > NRQS - 1) - load_now = NRQS - 1; + /* + * Retrieve a snapshot of the current run counts. + * + * Why not a bcopy()? Because we need atomic word-sized reads of sched_run_buckets, + * not byte-by-byte copy. + */ + uint32_t ncpus = processor_avail_count; + uint32_t load_now[TH_BUCKET_MAX]; + + load_now[TH_BUCKET_RUN] = sched_run_buckets[TH_BUCKET_RUN]; + load_now[TH_BUCKET_FIXPRI] = sched_run_buckets[TH_BUCKET_FIXPRI]; + load_now[TH_BUCKET_SHARE_FG] = sched_run_buckets[TH_BUCKET_SHARE_FG]; + load_now[TH_BUCKET_SHARE_DF] = sched_run_buckets[TH_BUCKET_SHARE_DF]; + load_now[TH_BUCKET_SHARE_UT] = sched_run_buckets[TH_BUCKET_SHARE_UT]; + load_now[TH_BUCKET_SHARE_BG] = sched_run_buckets[TH_BUCKET_SHARE_BG]; + + assert(load_now[TH_BUCKET_RUN] >= 0); + assert(load_now[TH_BUCKET_FIXPRI] >= 0); + + uint32_t nthreads = load_now[TH_BUCKET_RUN]; + uint32_t nfixpri = load_now[TH_BUCKET_FIXPRI]; + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) | DBG_FUNC_NONE, + load_now[TH_BUCKET_FIXPRI], (load_now[TH_BUCKET_SHARE_FG] + load_now[TH_BUCKET_SHARE_DF]), + load_now[TH_BUCKET_SHARE_BG], load_now[TH_BUCKET_SHARE_UT], 0); + + /* + * Compute the timeshare priority conversion factor based on loading. + * Because our counters may be incremented and accessed + * concurrently with respect to each other, we may have + * windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut) + * is broken, so truncate values in these cases. + */ + uint32_t timeshare_threads = (nthreads - nfixpri); + for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) { + if (load_now[i] > timeshare_threads) { + load_now[i] = timeshare_threads; } + } - /* - * The conversion factor consists of - * two components: a fixed value based - * on the absolute time unit, and a - * dynamic portion based on loading. - * - * Zero loading results in a out of range - * shift count. Accumulated usage is ignored - * during conversion and new usage deltas - * are discarded. - */ - pset->pri_shift = sched_pri_shift - sched_load_shifts[load_now]; + /* + * Default threads contribute up to (NCPUS * 2) of load to FG threads + */ + if (load_now[TH_BUCKET_SHARE_DF] <= (ncpus * 2)) { + load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_DF]; + } else { + load_now[TH_BUCKET_SHARE_FG] += (ncpus * 2); } - else { - pset->mach_factor = pset->load_average = 0; - pset->pri_shift = INT8_MAX; - nthreads = pset->run_count; + + /* + * Utility threads contribute up to NCPUS of load to FG & DF threads + */ + if (load_now[TH_BUCKET_SHARE_UT] <= ncpus) { + load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_UT]; + load_now[TH_BUCKET_SHARE_DF] += load_now[TH_BUCKET_SHARE_UT]; + } else { + load_now[TH_BUCKET_SHARE_FG] += ncpus; + load_now[TH_BUCKET_SHARE_DF] += ncpus; } /* - * Sample total running threads. + * BG threads contribute up to 1 thread worth of load to FG, DF and UT threads + */ + if (load_now[TH_BUCKET_SHARE_BG] > 0) { + load_now[TH_BUCKET_SHARE_FG] += 1; + load_now[TH_BUCKET_SHARE_DF] += 1; + load_now[TH_BUCKET_SHARE_UT] += 1; + } + + /* + * The conversion factor consists of two components: + * a fixed value based on the absolute time unit (sched_fixed_shift), + * and a dynamic portion based on load (sched_load_shifts). + * + * Zero load results in a out of range shift count. + */ + + for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) { + uint32_t bucket_load = 0; + + if (load_now[i] > ncpus) { + /* Normalize the load to number of CPUs */ + if (ncpus > 1) { + bucket_load = load_now[i] / ncpus; + } else { + bucket_load = load_now[i]; + } + + if (bucket_load > MAX_LOAD) { + bucket_load = MAX_LOAD; + } + } + /* Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load */ + sched_load[i] = (sched_load[i] * SCHED_LOAD_EWMA_ALPHA_OLD) + ((bucket_load << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW); + sched_load[i] = sched_load[i] >> SCHED_LOAD_EWMA_ALPHA_SHIFT; + } + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD_EFFECTIVE) | DBG_FUNC_NONE, + SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_FG]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_DF]), + SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_UT]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_BG]), 0); +} + +void +compute_averages(uint64_t stdelta) +{ + uint32_t nthreads = sched_run_buckets[TH_BUCKET_RUN] - 1; + uint32_t ncpus = processor_avail_count; + + /* Update the global pri_shifts based on the latest values */ + for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) { + uint32_t bucket_load = SCHED_LOAD_EWMA_UNSCALE(sched_load[i]); + sched_pri_shifts[i] = sched_fixed_shift - sched_load_shifts[bucket_load]; + } + + /* + * Sample total running threads for the load average calculation. */ sched_nrun = nthreads; /* - * Compute old-style Mach load averages. + * Load average and mach factor calculations for + * those which ask about these things. */ - { - register int i; + uint32_t average_now = nthreads * LOAD_SCALE; + uint32_t factor_now; - for (i = 0; i < 3; i++) { + if (nthreads > ncpus) { + factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1); + } else { + factor_now = (ncpus - nthreads) * LOAD_SCALE; + } + + /* + * For those statistics that formerly relied on being recomputed + * on timer ticks, advance by the approximate number of corresponding + * elapsed intervals, thus compensating for potential idle intervals. + */ + for (uint32_t index = 0; index < stdelta; index++) { + sched_mach_factor = ((sched_mach_factor << 2) + factor_now) / 5; + sched_load_average = ((sched_load_average << 2) + average_now) / 5; + } + + /* + * Compute old-style Mach load averages. + */ + for (uint32_t index = 0; index < stdelta; index++) { + for (uint32_t i = 0; i < 3; i++) { mach_factor[i] = ((mach_factor[i] * fract[i]) + - (factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE; + (factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE; avenrun[i] = ((avenrun[i] * fract[i]) + - (average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE; + (average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE; } } /* - * Compute averages in other components. + * Compute averages in other components. */ - for (avg = sched_average; avg->comp != NULL; ++avg) { - if (++avg->tick >= avg->period) { - (*avg->comp)(avg->param); - avg->tick = 0; + uint64_t abstime = mach_absolute_time(); + + for (sched_average_t avg = sched_average; avg->comp != NULL; ++avg) { + if (abstime >= avg->deadline) { + uint64_t period_abs = (avg->period * sched_one_second_interval); + uint64_t ninvokes = 1; + + ninvokes += (abstime - avg->deadline) / period_abs; + ninvokes = MIN(ninvokes, SCHED_TICK_MAX_DELTA); + + for (uint32_t index = 0; index < ninvokes; index++) { + (*avg->comp)(avg->param); + } + avg->deadline = abstime + period_abs; } } }