X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/8ad349bb6ed4a0be06e34c92be0d98b92e078db4..0a7de7458d150b5d4dffc935ba399be265ef0a1a:/osfmk/kern/sched_average.c

diff --git a/osfmk/kern/sched_average.c b/osfmk/kern/sched_average.c
index bfa1d2ef0..709803b9e 100644
--- a/osfmk/kern/sched_average.c
+++ b/osfmk/kern/sched_average.c
@@ -1,57 +1,55 @@
 /*
- * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
  *
- * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code 
- * as defined in and that are subject to the Apple Public Source License 
- * Version 2.0 (the 'License'). You may not use this file except in 
- * compliance with the License.  The rights granted to you under the 
- * License may not be used to create, or enable the creation or 
- * redistribution of, unlawful or unlicensed copies of an Apple operating 
- * system, or to circumvent, violate, or enable the circumvention or 
- * violation of, any terms of an Apple operating system software license 
- * agreement.
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
- * Please obtain a copy of the License at 
- * http://www.opensource.apple.com/apsl/ and read it before using this 
- * file.
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
  *
- * The Original Code and all software distributed under the License are 
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
  * limitations under the License.
  *
- * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /*
  * @OSF_COPYRIGHT@
  */
-/* 
+/*
  * Mach Operating System
  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  * All Rights Reserved.
- * 
+ *
  * Permission to use, copy, modify and distribute this software and its
  * documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
- * 
+ *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
- * 
+ *
  * Carnegie Mellon requests users of this software to return to
- * 
+ *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
- * 
+ *
  * any improvements or extensions that they make and grant Carnegie Mellon
  * the rights to redistribute these changes.
  */
@@ -70,141 +68,290 @@
 #include <kern/assert.h>
 #include <kern/processor.h>
 #include <kern/thread.h>
-	
-uint32_t	avenrun[3] = {0, 0, 0};
-uint32_t	mach_factor[3] = {0, 0, 0};
+#if CONFIG_TELEMETRY
+#include <kern/telemetry.h>
+#endif
+
+#include <sys/kdebug.h>
+
+uint32_t        avenrun[3] = {0, 0, 0};
+uint32_t        mach_factor[3] = {0, 0, 0};
 
+uint32_t        sched_load_average, sched_mach_factor;
+
+#if defined(CONFIG_SCHED_TIMESHARE_CORE)
 /*
  * Values are scaled by LOAD_SCALE, defined in processor_info.h
  */
-#define base(n)		((n) << SCHED_TICK_SHIFT)
-#define frac(n)		(((base(n) - 1) * LOAD_SCALE) /	base(n))
+#define base(n)         ((n) << SCHED_TICK_SHIFT)
+#define frac(n)         (((base(n) - 1) * LOAD_SCALE) /	base(n))
 
-static uint32_t		fract[3] = {
-	frac(5),		/* 5 second average */
-	frac(30),		/* 30 second average */
-	frac(60),		/* 1 minute average */
+static uint32_t         fract[3] = {
+	frac(5),                /* 5 second average */
+	frac(30),               /* 30 second average */
+	frac(60),               /* 1 minute average */
 };
 
 #undef base
 #undef frac
 
-static unsigned int		sched_nrun;
+#endif /* CONFIG_SCHED_TIMESHARE_CORE */
 
-typedef void	(*sched_avg_comp_t)(
-					void			*param);
+static unsigned int             sched_nrun;
 
-#define SCHED_AVG_SECS(n)	((n) << SCHED_TICK_SHIFT)
+typedef void    (*sched_avg_comp_t)(
+	void                    *param);
 
 static struct sched_average {
-	sched_avg_comp_t	comp;
-	void				*param;
-	int					period;
-	int					tick;			
+	sched_avg_comp_t        comp;
+	void                    *param;
+	int                     period; /* in seconds */
+	uint64_t                deadline;
 } sched_average[] = {
-	{ compute_averunnable, &sched_nrun, SCHED_AVG_SECS(5), 0 },
-	{ compute_stack_target, NULL, SCHED_AVG_SECS(5), 1 },
+	{ compute_averunnable, &sched_nrun, 5, 0 },
+	{ compute_stack_target, NULL, 5, 1 },
+	{ compute_pageout_gc_throttle, NULL, 1, 0 },
+	{ compute_pmap_gc_throttle, NULL, 60, 0 },
+#if CONFIG_TELEMETRY
+	{ compute_telemetry, NULL, 1, 0 },
+#endif
 	{ NULL, NULL, 0, 0 }
 };
 
-typedef struct sched_average	*sched_average_t;
+typedef struct sched_average    *sched_average_t;
+
+/*
+ * Scheduler load calculation algorithm
+ *
+ * The scheduler load values provide an estimate of the number of runnable
+ * timeshare threads in the system at various priority bands. The load
+ * ultimately affects the priority shifts applied to all threads in a band
+ * causing them to timeshare with other threads in the system. The load is
+ * maintained in buckets, with each bucket corresponding to a priority band.
+ *
+ * Each runnable thread on the system contributes its load to its priority
+ * band and to the bands above it. The contribution of a thread to the bands
+ * above it is not strictly 1:1 and is weighted based on the priority band
+ * of the thread. The rules of thread load contribution to each of its higher
+ * bands are as follows:
+ *
+ * - DF threads: Up to (2 * NCPUs) threads
+ * - UT threads: Up to NCPUs threads
+ * - BG threads: Up to 1 thread
+ *
+ * To calculate the load values, the various run buckets are sampled (every
+ * sched_load_compute_interval_abs) and the weighted contributions of the
+ * lower bucket threads are added. The resultant value is plugged into an
+ * exponentially weighted moving average formula:
+ *      new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count
+ *      (where, alpha < 1)
+ * The calculations for the scheduler load are done using fixpoint math with
+ * a scale factor of 16 to avoid expensive divides and floating point
+ * operations. The final load values are a smooth curve representative of
+ * the actual number of runnable threads in a priority band.
+ */
+
+/* Maintains the current (scaled for fixpoint) load in various buckets */
+uint32_t sched_load[TH_BUCKET_MAX];
 
+/*
+ * Alpha factor for the EWMA algorithm. The current values are chosen as
+ * 6:10 ("old load":"new samples") to make sure the scheduler reacts fast
+ * enough to changing system load but does not see too many spikes from bursty
+ * activity. The current values ensure that the scheduler would converge
+ * to the latest load in 2-3 sched_load_compute_interval_abs intervals
+ * (which amounts to ~30-45ms with current values).
+ */
+#define SCHED_LOAD_EWMA_ALPHA_OLD      6
+#define SCHED_LOAD_EWMA_ALPHA_NEW      10
+#define SCHED_LOAD_EWMA_ALPHA_SHIFT    4
+static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT));
+
+/* For fixed-point EWMA, round up the load to make it converge */
+#define SCHED_LOAD_EWMA_ROUNDUP(load)   (((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0)
+
+/* Macro to convert scaled sched load to a real load value */
+#define SCHED_LOAD_EWMA_UNSCALE(load)   (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load))
+
+/*
+ * Routine to capture the latest runnable counts and update sched_load */
 void
-compute_averages(void)
+compute_sched_load(void)
 {
-	register processor_set_t	pset = &default_pset;
-	register int				ncpus;
-	register int				nthreads, nshared;
-	sched_average_t				avg;
-	register uint32_t			factor_now = 0;
-	register uint32_t			average_now = 0;
-	register uint32_t			load_now = 0;
-
-	if ((ncpus = pset->processor_count) > 0) {
-		/*
-		 *	Retrieve counts, ignoring
-		 *	the current thread.
-		 */
-		nthreads = pset->run_count - 1;
-		nshared = pset->share_count;
-
-		/*
-		 *	Load average and mach factor calculations for
-		 *	those which ask about these things.
-		 */
-		average_now = nthreads * LOAD_SCALE;
-
-		if (nthreads > ncpus)
-			factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1);
-		else
-			factor_now = (ncpus - nthreads) * LOAD_SCALE;
-
-		pset->mach_factor =	((pset->mach_factor << 2) + factor_now) / 5;
-		pset->load_average = ((pset->load_average << 2) + average_now) / 5;
-
-		/*
-		 *	Compute the timeshare priority
-		 *	conversion factor based on loading.
-		 */
-		if (nshared > nthreads)
-			nshared = nthreads;
-
-		if (nshared > ncpus) {
-			if (ncpus > 1)
-				load_now = nshared / ncpus;
-			else
-				load_now = nshared;
-
-			if (load_now > NRQS - 1)
-				load_now = NRQS - 1;
+	/*
+	 * Retrieve a snapshot of the current run counts.
+	 *
+	 * Why not a bcopy()? Because we need atomic word-sized reads of sched_run_buckets,
+	 * not byte-by-byte copy.
+	 */
+	uint32_t ncpus = processor_avail_count;
+	uint32_t load_now[TH_BUCKET_MAX];
+
+	load_now[TH_BUCKET_RUN]      = sched_run_buckets[TH_BUCKET_RUN];
+	load_now[TH_BUCKET_FIXPRI]   = sched_run_buckets[TH_BUCKET_FIXPRI];
+	load_now[TH_BUCKET_SHARE_FG] = sched_run_buckets[TH_BUCKET_SHARE_FG];
+	load_now[TH_BUCKET_SHARE_DF] = sched_run_buckets[TH_BUCKET_SHARE_DF];
+	load_now[TH_BUCKET_SHARE_UT] = sched_run_buckets[TH_BUCKET_SHARE_UT];
+	load_now[TH_BUCKET_SHARE_BG] = sched_run_buckets[TH_BUCKET_SHARE_BG];
+
+	assert(load_now[TH_BUCKET_RUN] >= 0);
+	assert(load_now[TH_BUCKET_FIXPRI] >= 0);
+
+	uint32_t nthreads = load_now[TH_BUCKET_RUN];
+	uint32_t nfixpri  = load_now[TH_BUCKET_FIXPRI];
+
+	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) | DBG_FUNC_NONE,
+	    load_now[TH_BUCKET_FIXPRI], (load_now[TH_BUCKET_SHARE_FG] + load_now[TH_BUCKET_SHARE_DF]),
+	    load_now[TH_BUCKET_SHARE_BG], load_now[TH_BUCKET_SHARE_UT], 0);
+
+	/*
+	 * Compute the timeshare priority conversion factor based on loading.
+	 * Because our counters may be incremented and accessed
+	 * concurrently with respect to each other, we may have
+	 * windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut)
+	 * is broken, so truncate values in these cases.
+	 */
+	uint32_t timeshare_threads = (nthreads - nfixpri);
+	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
+		if (load_now[i] > timeshare_threads) {
+			load_now[i] = timeshare_threads;
 		}
+	}
 
-		/*
-		 *	The conversion factor consists of
-		 *	two components: a fixed value based
-		 *	on the absolute time unit, and a
-		 *	dynamic portion based on loading.
-		 *
-		 *	Zero loading results in a out of range
-		 *	shift count.  Accumulated usage is ignored
-		 *	during conversion and new usage deltas
-		 *	are discarded.
-		 */
-		pset->pri_shift = sched_pri_shift - sched_load_shifts[load_now];
+	/*
+	 * Default threads contribute up to (NCPUS * 2) of load to FG threads
+	 */
+	if (load_now[TH_BUCKET_SHARE_DF] <= (ncpus * 2)) {
+		load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_DF];
+	} else {
+		load_now[TH_BUCKET_SHARE_FG] += (ncpus * 2);
 	}
-	else {
-		pset->mach_factor = pset->load_average = 0;
-		pset->pri_shift = INT8_MAX;
-		nthreads = pset->run_count;
+
+	/*
+	 * Utility threads contribute up to NCPUS of load to FG & DF threads
+	 */
+	if (load_now[TH_BUCKET_SHARE_UT] <= ncpus) {
+		load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_UT];
+		load_now[TH_BUCKET_SHARE_DF] += load_now[TH_BUCKET_SHARE_UT];
+	} else {
+		load_now[TH_BUCKET_SHARE_FG] += ncpus;
+		load_now[TH_BUCKET_SHARE_DF] += ncpus;
 	}
 
 	/*
-	 *	Sample total running threads.
+	 * BG threads contribute up to 1 thread worth of load to FG, DF and UT threads
+	 */
+	if (load_now[TH_BUCKET_SHARE_BG] > 0) {
+		load_now[TH_BUCKET_SHARE_FG] += 1;
+		load_now[TH_BUCKET_SHARE_DF] += 1;
+		load_now[TH_BUCKET_SHARE_UT] += 1;
+	}
+
+	/*
+	 * The conversion factor consists of two components:
+	 * a fixed value based on the absolute time unit (sched_fixed_shift),
+	 * and a dynamic portion based on load (sched_load_shifts).
+	 *
+	 * Zero load results in an out-of-range shift count.
+	 */
+
+	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
+		uint32_t bucket_load = 0;
+
+		if (load_now[i] > ncpus) {
+			/* Normalize the load to number of CPUs */
+			if (ncpus > 1) {
+				bucket_load = load_now[i] / ncpus;
+			} else {
+				bucket_load = load_now[i];
+			}
+
+			if (bucket_load > MAX_LOAD) {
+				bucket_load = MAX_LOAD;
+			}
+		}
+		/* Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load */
+		sched_load[i] = (sched_load[i] * SCHED_LOAD_EWMA_ALPHA_OLD) + ((bucket_load << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW);
+		sched_load[i] = sched_load[i] >> SCHED_LOAD_EWMA_ALPHA_SHIFT;
+	}
+
+	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD_EFFECTIVE) | DBG_FUNC_NONE,
+	    SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_FG]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_DF]),
+	    SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_UT]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_BG]), 0);
+}
+
+void
+compute_averages(uint64_t stdelta)
+{
+	uint32_t nthreads = sched_run_buckets[TH_BUCKET_RUN] - 1;
+	uint32_t ncpus = processor_avail_count;
+
+	/* Update the global pri_shifts based on the latest values */
+	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
+		uint32_t bucket_load = SCHED_LOAD_EWMA_UNSCALE(sched_load[i]);
+		sched_pri_shifts[i] = sched_fixed_shift - sched_load_shifts[bucket_load];
+	}
+
+	/*
+	 * Sample total running threads for the load average calculation.
 	 */
 	sched_nrun = nthreads;
 
 	/*
-	 * Compute old-style Mach load averages.
+	 * Load average and mach factor calculations for
+	 * those which ask about these things.
 	 */
-	{
-		register int		i;
+	uint32_t average_now = nthreads * LOAD_SCALE;
+	uint32_t factor_now;
 
-		for (i = 0; i < 3; i++) {
+	if (nthreads > ncpus) {
+		factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1);
+	} else {
+		factor_now = (ncpus - nthreads) * LOAD_SCALE;
+	}
+
+	/*
+	 * For those statistics that formerly relied on being recomputed
+	 * on timer ticks, advance by the approximate number of corresponding
+	 * elapsed intervals, thus compensating for potential idle intervals.
+	 */
+	for (uint32_t index = 0; index < stdelta; index++) {
+		sched_mach_factor = ((sched_mach_factor << 2) + factor_now) / 5;
+		sched_load_average = ((sched_load_average << 2) + average_now) / 5;
+	}
+
+	/*
+	 * Compute old-style Mach load averages.
+	 */
+	for (uint32_t index = 0; index < stdelta; index++) {
+		for (uint32_t i = 0; i < 3; i++) {
 			mach_factor[i] = ((mach_factor[i] * fract[i]) +
-						(factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
+			    (factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
 
 			avenrun[i] = ((avenrun[i] * fract[i]) +
-						(average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
+			    (average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
 		}
 	}
 
 	/*
-	 *	Compute averages in other components.
+	 * Compute averages in other components.
 	 */
-	for (avg = sched_average; avg->comp != NULL; ++avg) {
-		if (++avg->tick >= avg->period) {
-			(*avg->comp)(avg->param);
-			avg->tick = 0;
+	uint64_t abstime = mach_absolute_time();
+
+	for (sched_average_t avg = sched_average; avg->comp != NULL; ++avg) {
+		if (abstime >= avg->deadline) {
+			uint64_t period_abs = (avg->period * sched_one_second_interval);
+			uint64_t ninvokes = 1;
+
+			ninvokes += (abstime - avg->deadline) / period_abs;
+			ninvokes = MIN(ninvokes, SCHED_TICK_MAX_DELTA);
+
+			for (uint32_t index = 0; index < ninvokes; index++) {
+				(*avg->comp)(avg->param);
+			}
+			avg->deadline = abstime + period_abs;
 		}
 	}
 }