[apple/xnu.git] / osfmk / kern / sched_average.c

/*
 * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	Author:	Avadis Tevanian, Jr.
 *	Date:	1986
 *
 *	Compute various averages.
 */

#include <mach/mach_types.h>

#include <kern/sched.h>
#include <kern/assert.h>
#include <kern/processor.h>
#include <kern/thread.h>
#if CONFIG_TELEMETRY
#include <kern/telemetry.h>
#endif

#include <sys/kdebug.h>

/*
 * Old-style load averages and Mach factors. compute_averages() updates
 * entry i of each array using fract[i] (5 second, 30 second and 1 minute
 * averages); samples are multiplied by LOAD_SCALE before being blended in.
 */
uint32_t avenrun[3] = { 0, 0, 0 };
uint32_t mach_factor[3] = { 0, 0, 0 };

/*
 * Short-horizon load average and Mach factor. compute_averages() blends
 * each new sample in with a weight of one fifth.
 */
uint32_t sched_load_average;
uint32_t sched_mach_factor;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
/*
 * Decay weights for the old-style Mach averages, scaled by LOAD_SCALE
 * (defined in processor_info.h). For an n second average the weight kept
 * from the previous value is (b - 1) / b with b = n << SCHED_TICK_SHIFT;
 * compute_averages() gives the new sample the remaining
 * (LOAD_SCALE - weight).
 */
#define avg_base(secs)          ((secs) << SCHED_TICK_SHIFT)
#define avg_decay(secs)         (((avg_base(secs) - 1) * LOAD_SCALE) / avg_base(secs))

static uint32_t         fract[3] = {
	[0] = avg_decay(5),     /* 5 second average */
	[1] = avg_decay(30),    /* 30 second average */
	[2] = avg_decay(60),    /* 1 minute average */
};

/* The helpers are only needed to build the table above. */
#undef avg_base
#undef avg_decay

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

/*
 * Run-count sample taken by compute_averages(); its address is handed to
 * compute_averunnable() through the table below.
 */
static unsigned int             sched_nrun;

/* Signature shared by every periodic computation listed in the table. */
typedef void    (*sched_avg_comp_t)(void *param);

/*
 * Periodic computations driven from compute_averages(). An entry is run
 * once the current absolute time reaches its deadline, after which the
 * deadline is moved one period ahead. The table ends at the first entry
 * whose comp is NULL.
 */
static struct sched_average {
	sched_avg_comp_t        comp;           /* routine to invoke */
	void                    *param;         /* argument passed to comp */
	int                     period;         /* in seconds */
	uint64_t                deadline;       /* absolute time of the next run */
} sched_average[] = {
	{ .comp = compute_averunnable,         .param = &sched_nrun, .period = 5,  .deadline = 0 },
	{ .comp = compute_stack_target,        .param = NULL,        .period = 5,  .deadline = 1 },
	{ .comp = compute_pageout_gc_throttle, .param = NULL,        .period = 1,  .deadline = 0 },
	{ .comp = compute_pmap_gc_throttle,    .param = NULL,        .period = 60, .deadline = 0 },
#if CONFIG_TELEMETRY
	{ .comp = compute_telemetry,           .param = NULL,        .period = 1,  .deadline = 0 },
#endif
	{ .comp = NULL,                        .param = NULL,        .period = 0,  .deadline = 0 }  /* terminator */
};

typedef struct sched_average    *sched_average_t;

/*
 * Scheduler load calculation algorithm
 *
 * The scheduler load values provide an estimate of the number of runnable
 * timeshare threads in the system at various priority bands. The load
 * ultimately affects the priority shifts applied to all threads in a band
 * causing them to timeshare with other threads in the system. The load is
 * maintained in buckets, with each bucket corresponding to a priority band.
 *
 * Each runnable thread on the system contributes its load to its priority
 * band and to the bands above it. The contribution of a thread to the bands
 * above it is not strictly 1:1 and is weighted based on the priority band
 * of the thread. The rules of thread load contribution to each of its higher
 * bands are as follows:
 *
 * - DF threads: Up to (2 * NCPUs) threads
 * - UT threads: Up to NCPUs threads
 * - BG threads: Up to 1 thread
 *
 * To calculate the load values, the various run buckets are sampled (every
 * sched_load_compute_interval_abs) and the weighted contributions of the
 * lower bucket threads are added. The resultant value is plugged into an
 * exponentially weighted moving average formula:
 *      new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count
 *      (where, alpha < 1)
 * The calculations for the scheduler load are done using fixpoint math with
 * a scale factor of 16 to avoid expensive divides and floating point
 * operations. The final load values are a smooth curve representative of
 * the actual number of runnable threads in a priority band.
 */

/*
 * Maintains the current (scaled for fixpoint) load in various buckets.
 *
 * Indexed by thread bucket (TH_BUCKET_*). compute_sched_load() updates the
 * TH_BUCKET_SHARE_FG through TH_BUCKET_SHARE_BG entries, and
 * compute_averages() reads them back through SCHED_LOAD_EWMA_UNSCALE().
 */
uint32_t sched_load[TH_BUCKET_MAX];

/*
 * Alpha factor for the EWMA algorithm. The current values are chosen as
 * 6:10 ("old load":"new samples") to make sure the scheduler reacts fast
 * enough to changing system load but does not see too many spikes from bursty
 * activity. The current values ensure that the scheduler would converge
 * to the latest load in 2-3 sched_load_compute_interval_abs intervals
 * (which amounts to ~30-45ms with current values).
 *
 * The two weights are expressed in units of 1 / (1 << SCHED_LOAD_EWMA_ALPHA_SHIFT),
 * so they must add up to (1 << SCHED_LOAD_EWMA_ALPHA_SHIFT); the static_assert
 * below enforces that.
 */
#define SCHED_LOAD_EWMA_ALPHA_OLD      6
#define SCHED_LOAD_EWMA_ALPHA_NEW      10
#define SCHED_LOAD_EWMA_ALPHA_SHIFT    4
static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT));

/*
 * For fixpoint EWMA, roundup the load to make it converge.
 * Evaluates to 1 when the most significant fractional bit of the scaled
 * load (bit SCHED_LOAD_EWMA_ALPHA_SHIFT - 1) is set, and to 0 otherwise.
 */
#define SCHED_LOAD_EWMA_ROUNDUP(load)   (((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0)

/*
 * Macro to convert scaled sched load to a real load value.
 * Drops the SCHED_LOAD_EWMA_ALPHA_SHIFT fractional bits and adds the
 * rounding term from SCHED_LOAD_EWMA_ROUNDUP(). Note that the argument is
 * evaluated twice.
 */
#define SCHED_LOAD_EWMA_UNSCALE(load)   (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load))

/*
 * Routine to capture the latest runnable counts and update sched_load (only used for non-clutch schedulers)
 *
 * Samples sched_run_buckets, folds the capped contributions of the lower
 * priority bands into the bands above them, normalizes each timeshare
 * bucket's load by processor_avail_count (capped at MAX_LOAD) and feeds the
 * result into the fixpoint EWMA kept in sched_load[]. Takes no arguments and
 * returns nothing; its outputs are sched_load[] and two kdebug trace events
 * (the raw sample and the resulting effective load).
 */
void
compute_sched_load(void)
{
	/*
	 * Retrieve a snapshot of the current run counts.
	 *
	 * Why not a bcopy()? Because we need atomic word-sized reads of sched_run_buckets,
	 * not byte-by-byte copy.
	 */
	uint32_t ncpus = processor_avail_count;
	uint32_t load_now[TH_BUCKET_MAX];

	load_now[TH_BUCKET_RUN]      = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
	load_now[TH_BUCKET_FIXPRI]   = os_atomic_load(&sched_run_buckets[TH_BUCKET_FIXPRI], relaxed);
	load_now[TH_BUCKET_SHARE_FG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_FG], relaxed);
	load_now[TH_BUCKET_SHARE_DF] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_DF], relaxed);
	load_now[TH_BUCKET_SHARE_UT] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_UT], relaxed);
	load_now[TH_BUCKET_SHARE_BG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_BG], relaxed);

	uint32_t nthreads = load_now[TH_BUCKET_RUN];
	uint32_t nfixpri  = load_now[TH_BUCKET_FIXPRI];

	/* Trace the raw sample before any truncation or cross-band weighting. */
	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) | DBG_FUNC_NONE,
	    load_now[TH_BUCKET_FIXPRI], (load_now[TH_BUCKET_SHARE_FG] + load_now[TH_BUCKET_SHARE_DF]),
	    load_now[TH_BUCKET_SHARE_BG], load_now[TH_BUCKET_SHARE_UT], 0);

	/*
	 * Compute the timeshare priority conversion factor based on loading.
	 * Because our counters may be incremented and accessed
	 * concurrently with respect to each other, we may have
	 * windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut)
	 * is broken, so truncate values in these cases.
	 *
	 * The same window can leave nfixpri momentarily larger than nthreads.
	 * Saturate the difference at zero: an unsigned wrap would produce a
	 * huge bound and silently disable the truncation below.
	 */
	uint32_t timeshare_threads = (nthreads > nfixpri) ? (nthreads - nfixpri) : 0;
	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
		if (load_now[i] > timeshare_threads) {
			load_now[i] = timeshare_threads;
		}
	}

	/*
	 * Default threads contribute up to (NCPUS * 2) of load to FG threads
	 */
	uint32_t df_limit = ncpus * 2;
	uint32_t df_share = (load_now[TH_BUCKET_SHARE_DF] <= df_limit) ?
	    load_now[TH_BUCKET_SHARE_DF] : df_limit;
	load_now[TH_BUCKET_SHARE_FG] += df_share;

	/*
	 * Utility threads contribute up to NCPUS of load to FG & DF threads.
	 * The share is taken from the UT count before the BG contribution
	 * below is added to it.
	 */
	uint32_t ut_share = (load_now[TH_BUCKET_SHARE_UT] <= ncpus) ?
	    load_now[TH_BUCKET_SHARE_UT] : ncpus;
	load_now[TH_BUCKET_SHARE_FG] += ut_share;
	load_now[TH_BUCKET_SHARE_DF] += ut_share;

	/*
	 * BG threads contribute up to 1 thread worth of load to FG, DF and UT threads
	 */
	if (load_now[TH_BUCKET_SHARE_BG] > 0) {
		load_now[TH_BUCKET_SHARE_FG] += 1;
		load_now[TH_BUCKET_SHARE_DF] += 1;
		load_now[TH_BUCKET_SHARE_UT] += 1;
	}

	/*
	 * The conversion factor consists of two components:
	 * a fixed value based on the absolute time unit (sched_fixed_shift),
	 * and a dynamic portion based on load (sched_load_shifts).
	 *
	 * Zero load results in an out of range shift count.
	 */

	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
		/* A bucket whose load does not exceed ncpus is sampled as zero. */
		uint32_t bucket_load = 0;

		if (load_now[i] > ncpus) {
			/*
			 * Normalize the load to number of CPUs. With ncpus of 0 or 1
			 * the load is used as-is, which also avoids dividing by zero.
			 */
			if (ncpus > 1) {
				bucket_load = load_now[i] / ncpus;
			} else {
				bucket_load = load_now[i];
			}

			if (bucket_load > MAX_LOAD) {
				bucket_load = MAX_LOAD;
			}
		}
		/* Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load */
		sched_load[i] = (sched_load[i] * SCHED_LOAD_EWMA_ALPHA_OLD) + ((bucket_load << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW);
		sched_load[i] = sched_load[i] >> SCHED_LOAD_EWMA_ALPHA_SHIFT;
	}

	/* Trace the smoothed, unscaled load of each timeshare bucket. */
	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD_EFFECTIVE) | DBG_FUNC_NONE,
	    SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_FG]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_DF]),
	    SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_UT]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_BG]), 0);
}

/*
 * Refresh the statistics derived from the scheduler's run counts.
 *
 * In order, this routine:
 *  - converts the smoothed per-bucket loads in sched_load[] into the global
 *    sched_pri_shifts[] values,
 *  - samples the run count into sched_nrun,
 *  - advances sched_mach_factor / sched_load_average and the old-style
 *    mach_factor[] / avenrun[] averages by stdelta steps,
 *  - runs every entry of the sched_average[] table whose deadline has passed.
 *
 * stdelta: number of update intervals to advance the averages by.
 */
void
compute_averages(uint64_t stdelta)
{
	/*
	 * One thread is subtracted from the run count (presumably the caller
	 * itself -- TODO confirm). Saturate at zero: if the count ever read 0,
	 * the unsigned wrap would make (nthreads + 1) below evaluate to 0 and
	 * divide by zero.
	 */
	uint32_t nrunning = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
	uint32_t nthreads = (nrunning > 0) ? (nrunning - 1) : 0;
	uint32_t ncpus = processor_avail_count;

	/* Update the global pri_shifts based on the latest values */
	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
		uint32_t bucket_load = SCHED_LOAD_EWMA_UNSCALE(sched_load[i]);
		uint32_t shift = sched_fixed_shift - sched_load_shifts[bucket_load];

		/* Shifts beyond SCHED_PRI_SHIFT_MAX are stored as INT8_MAX. */
		if (shift > SCHED_PRI_SHIFT_MAX) {
			sched_pri_shifts[i] = INT8_MAX;
		} else {
			sched_pri_shifts[i] = shift;
		}
	}

	/*
	 * Sample total running threads for the load average calculation.
	 */
	sched_nrun = nthreads;

	/*
	 * Load average and mach factor calculations for
	 * those which ask about these things.
	 */
	uint32_t average_now = nthreads * LOAD_SCALE;
	uint32_t factor_now;

	if (nthreads > ncpus) {
		factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1);
	} else {
		factor_now = (ncpus - nthreads) * LOAD_SCALE;
	}

	/*
	 * For those statistics that formerly relied on being recomputed
	 * on timer ticks, advance by the approximate number of corresponding
	 * elapsed intervals, thus compensating for potential idle intervals.
	 *
	 * The loop index is as wide as stdelta so the comparison cannot be
	 * defeated by a 32-bit index wrapping.
	 */
	for (uint64_t index = 0; index < stdelta; index++) {
		sched_mach_factor = ((sched_mach_factor << 2) + factor_now) / 5;
		sched_load_average = ((sched_load_average << 2) + average_now) / 5;
	}

	/*
	 * Compute old-style Mach load averages.
	 */
	for (uint64_t index = 0; index < stdelta; index++) {
		for (uint32_t i = 0; i < 3; i++) {
			mach_factor[i] = ((mach_factor[i] * fract[i]) +
			    (factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;

			avenrun[i] = ((avenrun[i] * fract[i]) +
			    (average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
		}
	}

	/*
	 * Compute averages in other components.
	 */
	uint64_t abstime = mach_absolute_time();

	for (sched_average_t avg = sched_average; avg->comp != NULL; ++avg) {
		if (abstime >= avg->deadline) {
			/* Force a 64-bit product regardless of the operand types. */
			uint64_t period_abs = ((uint64_t)avg->period * sched_one_second_interval);
			uint64_t ninvokes = 1;

			/*
			 * Invoke once for the deadline that just passed plus once for
			 * every further whole period missed, capped at SCHED_TICK_MAX_DELTA.
			 */
			ninvokes += (abstime - avg->deadline) / period_abs;
			ninvokes = MIN(ninvokes, SCHED_TICK_MAX_DELTA);

			for (uint32_t index = 0; index < ninvokes; index++) {
				(*avg->comp)(avg->param);
			}
			avg->deadline = abstime + period_abs;
		}
	}
}
Commit	Line	Data
1c79356b	1	/*
2d21ac55	2	* Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
1c79356b	3	*
2d21ac55	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
0a7de745	5	*
2d21ac55 A	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
0a7de745	14	*
2d21ac55 A	15	* Please obtain a copy of the License at
2d21ac55 A	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
0a7de745	17	*
2d21ac55 A	18	* The Original Code and all software distributed under the License are
2d21ac55 A	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5 A	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
8f6c56a5 A	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55 A	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
0a7de745	25	*
2d21ac55	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b A	27	*/
	28	/*
	29	* @OSF_COPYRIGHT@
	30	*/
0a7de745	31	/*
1c79356b A	32	* Mach Operating System
	33	* Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
	34	* All Rights Reserved.
0a7de745	35	*
1c79356b A	36	* Permission to use, copy, modify and distribute this software and its
	37	* documentation is hereby granted, provided that both the copyright
	38	* notice and this permission notice appear in all copies of the
	39	* software, derivative works or modified versions, and any portions
	40	* thereof, and that both notices appear in supporting documentation.
0a7de745	41	*
1c79356b A	42	* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
	43	* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
	44	* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
0a7de745	45	*
1c79356b	46	* Carnegie Mellon requests users of this software to return to
0a7de745	47	*
1c79356b A	48	* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
	49	* School of Computer Science
	50	* Carnegie Mellon University
	51	* Pittsburgh PA 15213-3890
0a7de745	52	*
1c79356b A	53	* any improvements or extensions that they make and grant Carnegie Mellon
	54	* the rights to redistribute these changes.
	55	*/
	56	/*
	57	*/
	58	/*
1c79356b A	59	* Author: Avadis Tevanian, Jr.
	60	* Date: 1986
	61	*
91447636	62	* Compute various averages.
1c79356b A	63	*/
1c79356b A	64
91447636	65	#include <mach/mach_types.h>
1c79356b	66
1c79356b A	67	#include <kern/sched.h>
	68	#include <kern/assert.h>
	69	#include <kern/processor.h>
	70	#include <kern/thread.h>
39236c6e A	71	#if CONFIG_TELEMETRY
	72	#include <kern/telemetry.h>
	73	#endif
490019cf A	74
	75	#include <sys/kdebug.h>
	76
0a7de745 A	77	uint32_t avenrun[3] = {0, 0, 0};
0a7de745 A	78	uint32_t mach_factor[3] = {0, 0, 0};
1c79356b	79
0a7de745	80	uint32_t sched_load_average, sched_mach_factor;
39037602	81
3e170ce0	82	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
1c79356b A	83	/*
	84	* Values are scaled by LOAD_SCALE, defined in processor_info.h
	85	*/
0a7de745 A	86	#define base(n) ((n) << SCHED_TICK_SHIFT)
0a7de745 A	87	#define frac(n) (((base(n) - 1) * LOAD_SCALE) / base(n))
0b4e3aa0	88
0a7de745 A	89	static uint32_t fract[3] = {
	90	frac(5), /* 5 second average */
	91	frac(30), /* 30 second average */
	92	frac(60), /* 1 minute average */
1c79356b	93	};
9bccf70c	94
0b4e3aa0 A	95	#undef base
0b4e3aa0 A	96	#undef frac
1c79356b	97
3e170ce0	98	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
6d2010ae	99
0a7de745	100	static unsigned int sched_nrun;
91447636	101
0a7de745 A	102	typedef void (*sched_avg_comp_t)(
0a7de745 A	103	void *param);
91447636	104
91447636	105	static struct sched_average {
0a7de745 A	106	sched_avg_comp_t comp;
	107	void *param;
	108	int period; /* in seconds */
	109	uint64_t deadline;
91447636	110	} sched_average[] = {
6d2010ae A	111	{ compute_averunnable, &sched_nrun, 5, 0 },
6d2010ae A	112	{ compute_stack_target, NULL, 5, 1 },
316670eb	113	{ compute_pageout_gc_throttle, NULL, 1, 0 },
6d2010ae	114	{ compute_pmap_gc_throttle, NULL, 60, 0 },
39236c6e A	115	#if CONFIG_TELEMETRY
	116	{ compute_telemetry, NULL, 1, 0 },
	117	#endif
91447636 A	118	{ NULL, NULL, 0, 0 }
	119	};
	120
0a7de745	121	typedef struct sched_average *sched_average_t;
91447636	122
d9a64523 A	123	/*
	124	* Scheduler load calculation algorithm
	125	*
0a7de745 A	126	* The scheduler load values provide an estimate of the number of runnable
	127	* timeshare threads in the system at various priority bands. The load
	128	* ultimately affects the priority shifts applied to all threads in a band
	129	* causing them to timeshare with other threads in the system. The load is
d9a64523 A	130	* maintained in buckets, with each bucket corresponding to a priority band.
d9a64523 A	131	*
0a7de745 A	132	* Each runnable thread on the system contributes its load to its priority
	133	* band and to the bands above it. The contribution of a thread to the bands
	134	* above it is not strictly 1:1 and is weighted based on the priority band
	135	* of the thread. The rules of thread load contribution to each of its higher
d9a64523 A	136	* bands are as follows:
	137	*
	138	* - DF threads: Upto (2 * NCPUs) threads
	139	* - UT threads: Upto NCPUs threads
	140	* - BG threads: Upto 1 thread
	141	*
0a7de745	142	* To calculate the load values, the various run buckets are sampled (every
d9a64523	143	* sched_load_compute_interval_abs) and the weighted contributions of the the
0a7de745 A	144	* lower bucket threads are added. The resultant value is plugged into an
	145	* exponentially weighted moving average formula:
	146	* new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count
	147	* (where, alpha < 1)
	148	* The calculations for the scheduler load are done using fixpoint math with
	149	* a scale factor of 16 to avoid expensive divides and floating point
	150	* operations. The final load values are a smooth curve representative of
d9a64523 A	151	* the actual number of runnable threads in a priority band.
	152	*/
	153
	154	/* Maintains the current (scaled for fixpoint) load in various buckets */
	155	uint32_t sched_load[TH_BUCKET_MAX];
39037602	156
0a7de745 A	157	/*
	158	* Alpha factor for the EWMA alogrithm. The current values are chosen as
	159	* 6:10 ("old load":"new samples") to make sure the scheduler reacts fast
	160	* enough to changing system load but does not see too many spikes from bursty
	161	* activity. The current values ensure that the scheduler would converge
	162	* to the latest load in 2-3 sched_load_compute_interval_abs intervals
d9a64523	163	* (which amounts to ~30-45ms with current values).
39236c6e	164	*/
d9a64523 A	165	#define SCHED_LOAD_EWMA_ALPHA_OLD 6
	166	#define SCHED_LOAD_EWMA_ALPHA_NEW 10
	167	#define SCHED_LOAD_EWMA_ALPHA_SHIFT 4
	168	static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT));
	169
	170	/* For fixpoint EWMA, roundup the load to make it converge */
0a7de745	171	#define SCHED_LOAD_EWMA_ROUNDUP(load) (((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0)
d9a64523 A	172
d9a64523 A	173	/* Macro to convert scaled sched load to a real load value */
0a7de745	174	#define SCHED_LOAD_EWMA_UNSCALE(load) (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load))
39236c6e	175
d9a64523	176	/*
cb323159 A	177	* Routine to capture the latest runnable counts and update sched_load (only used for non-clutch schedulers)
cb323159 A	178	*/
1c79356b	179	void
d9a64523	180	compute_sched_load(void)
1c79356b	181	{
2d21ac55	182	/*
39037602 A	183	* Retrieve a snapshot of the current run counts.
	184	*
	185	* Why not a bcopy()? Because we need atomic word-sized reads of sched_run_buckets,
	186	* not byte-by-byte copy.
2d21ac55	187	*/
39037602	188	uint32_t ncpus = processor_avail_count;
d9a64523	189	uint32_t load_now[TH_BUCKET_MAX];
2d21ac55	190
cb323159 A	191	load_now[TH_BUCKET_RUN] = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
	192	load_now[TH_BUCKET_FIXPRI] = os_atomic_load(&sched_run_buckets[TH_BUCKET_FIXPRI], relaxed);
	193	load_now[TH_BUCKET_SHARE_FG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_FG], relaxed);
	194	load_now[TH_BUCKET_SHARE_DF] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_DF], relaxed);
	195	load_now[TH_BUCKET_SHARE_UT] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_UT], relaxed);
	196	load_now[TH_BUCKET_SHARE_BG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_BG], relaxed);
2d21ac55	197
39037602 A	198	assert(load_now[TH_BUCKET_RUN] >= 0);
	199	assert(load_now[TH_BUCKET_FIXPRI] >= 0);
	200
d9a64523 A	201	uint32_t nthreads = load_now[TH_BUCKET_RUN];
d9a64523 A	202	uint32_t nfixpri = load_now[TH_BUCKET_FIXPRI];
39037602 A	203
39037602 A	204	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
0a7de745 A	205	MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) \| DBG_FUNC_NONE,
	206	load_now[TH_BUCKET_FIXPRI], (load_now[TH_BUCKET_SHARE_FG] + load_now[TH_BUCKET_SHARE_DF]),
	207	load_now[TH_BUCKET_SHARE_BG], load_now[TH_BUCKET_SHARE_UT], 0);
2d21ac55	208
2d21ac55	209	/*
39236c6e A	210	* Compute the timeshare priority conversion factor based on loading.
	211	* Because our counters may be incremented and accessed
	212	* concurrently with respect to each other, we may have
d9a64523	213	* windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut)
39236c6e	214	* is broken, so truncate values in these cases.
2d21ac55	215	*/
39037602	216	uint32_t timeshare_threads = (nthreads - nfixpri);
0a7de745 A	217	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
0a7de745 A	218	if (load_now[i] > timeshare_threads) {
39037602	219	load_now[i] = timeshare_threads;
0a7de745	220	}
39037602	221	}
39236c6e	222
0a7de745 A	223	/*
0a7de745 A	224	* Default threads contribute up to (NCPUS * 2) of load to FG threads
d9a64523 A	225	*/
	226	if (load_now[TH_BUCKET_SHARE_DF] <= (ncpus * 2)) {
	227	load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_DF];
	228	} else {
	229	load_now[TH_BUCKET_SHARE_FG] += (ncpus * 2);
	230	}
0a7de745	231
39037602	232	/*
d9a64523	233	* Utility threads contribute up to NCPUS of load to FG & DF threads
39037602 A	234	*/
	235	if (load_now[TH_BUCKET_SHARE_UT] <= ncpus) {
	236	load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_UT];
d9a64523	237	load_now[TH_BUCKET_SHARE_DF] += load_now[TH_BUCKET_SHARE_UT];
39037602 A	238	} else {
39037602 A	239	load_now[TH_BUCKET_SHARE_FG] += ncpus;
d9a64523	240	load_now[TH_BUCKET_SHARE_DF] += ncpus;
39037602	241	}
91447636	242
39037602	243	/*
d9a64523	244	* BG threads contribute up to 1 thread worth of load to FG, DF and UT threads
39037602 A	245	*/
	246	if (load_now[TH_BUCKET_SHARE_BG] > 0) {
	247	load_now[TH_BUCKET_SHARE_FG] += 1;
d9a64523	248	load_now[TH_BUCKET_SHARE_DF] += 1;
39037602	249	load_now[TH_BUCKET_SHARE_UT] += 1;
1c79356b A	250	}
1c79356b A	251
39037602 A	252	/*
	253	* The conversion factor consists of two components:
	254	* a fixed value based on the absolute time unit (sched_fixed_shift),
	255	* and a dynamic portion based on load (sched_load_shifts).
	256	*
	257	* Zero load results in a out of range shift count.
	258	*/
39236c6e	259
0a7de745	260	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
39037602	261	uint32_t bucket_load = 0;
39236c6e	262
39037602	263	if (load_now[i] > ncpus) {
d9a64523	264	/* Normalize the load to number of CPUs */
0a7de745	265	if (ncpus > 1) {
39037602	266	bucket_load = load_now[i] / ncpus;
0a7de745	267	} else {
39037602	268	bucket_load = load_now[i];
0a7de745	269	}
39236c6e	270
0a7de745	271	if (bucket_load > MAX_LOAD) {
39037602	272	bucket_load = MAX_LOAD;
0a7de745	273	}
39037602	274	}
d9a64523 A	275	/* Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load */
	276	sched_load[i] = (sched_load[i] * SCHED_LOAD_EWMA_ALPHA_OLD) + ((bucket_load << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW);
	277	sched_load[i] = sched_load[i] >> SCHED_LOAD_EWMA_ALPHA_SHIFT;
	278	}
39236c6e	279
d9a64523	280	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
0a7de745 A	281	MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD_EFFECTIVE) \| DBG_FUNC_NONE,
	282	SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_FG]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_DF]),
	283	SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_UT]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_BG]), 0);
d9a64523 A	284	}
	285
	286	void
	287	compute_averages(uint64_t stdelta)
	288	{
cb323159	289	uint32_t nthreads = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) - 1;
d9a64523	290	uint32_t ncpus = processor_avail_count;
0a7de745	291
d9a64523	292	/* Update the global pri_shifts based on the latest values */
0a7de745	293	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
d9a64523	294	uint32_t bucket_load = SCHED_LOAD_EWMA_UNSCALE(sched_load[i]);
f427ee49 A	295	uint32_t shift = sched_fixed_shift - sched_load_shifts[bucket_load];
	296
	297	if (shift > SCHED_PRI_SHIFT_MAX) {
	298	sched_pri_shifts[i] = INT8_MAX;
	299	} else {
	300	sched_pri_shifts[i] = shift;
	301	}
39037602	302	}
490019cf	303
6d2010ae	304	/*
39037602	305	* Sample total running threads for the load average calculation.
6d2010ae A	306	*/
6d2010ae A	307	sched_nrun = nthreads;
6d2010ae	308
2d21ac55	309	/*
39037602 A	310	* Load average and mach factor calculations for
39037602 A	311	* those which ask about these things.
2d21ac55	312	*/
39037602 A	313	uint32_t average_now = nthreads * LOAD_SCALE;
	314	uint32_t factor_now;
	315
0a7de745	316	if (nthreads > ncpus) {
39037602	317	factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1);
0a7de745	318	} else {
39037602	319	factor_now = (ncpus - nthreads) * LOAD_SCALE;
0a7de745	320	}
2d21ac55	321
1c79356b	322	/*
39037602 A	323	* For those statistics that formerly relied on being recomputed
	324	* on timer ticks, advance by the approximate number of corresponding
	325	* elapsed intervals, thus compensating for potential idle intervals.
1c79356b	326	*/
39037602 A	327	for (uint32_t index = 0; index < stdelta; index++) {
	328	sched_mach_factor = ((sched_mach_factor << 2) + factor_now) / 5;
	329	sched_load_average = ((sched_load_average << 2) + average_now) / 5;
	330	}
39236c6e	331
39037602 A	332	/*
	333	* Compute old-style Mach load averages.
	334	*/
	335	for (uint32_t index = 0; index < stdelta; index++) {
	336	for (uint32_t i = 0; i < 3; i++) {
1c79356b	337	mach_factor[i] = ((mach_factor[i] * fract[i]) +
0a7de745	338	(factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
1c79356b A	339
1c79356b A	340	avenrun[i] = ((avenrun[i] * fract[i]) +
0a7de745	341	(average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
1c79356b A	342	}
1c79356b A	343	}
9bccf70c A	344
9bccf70c A	345	/*
39037602	346	* Compute averages in other components.
9bccf70c	347	*/
39037602 A	348	uint64_t abstime = mach_absolute_time();
	349
	350	for (sched_average_t avg = sched_average; avg->comp != NULL; ++avg) {
6d2010ae	351	if (abstime >= avg->deadline) {
39236c6e A	352	uint64_t period_abs = (avg->period * sched_one_second_interval);
	353	uint64_t ninvokes = 1;
	354
	355	ninvokes += (abstime - avg->deadline) / period_abs;
	356	ninvokes = MIN(ninvokes, SCHED_TICK_MAX_DELTA);
	357
39037602	358	for (uint32_t index = 0; index < ninvokes; index++) {
39236c6e A	359	(*avg->comp)(avg->param);
	360	}
	361	avg->deadline = abstime + period_abs;
91447636	362	}
9bccf70c	363	}
1c79356b	364	}