/*
 * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 * Author: Avadis Tevanian, Jr.
 * Date:   1986
 *
 * Compute various averages.
 */

#include <mach/mach_types.h>

#include <kern/sched.h>
#include <kern/assert.h>
#include <kern/processor.h>
#include <kern/thread.h>
#if CONFIG_TELEMETRY
#include <kern/telemetry.h>
#endif
#include <kern/zalloc_internal.h>

#include <sys/kdebug.h>

uint32_t avenrun[3] = {0, 0, 0};
uint32_t mach_factor[3] = {0, 0, 0};

uint32_t sched_load_average, sched_mach_factor;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
/*
 * Values are scaled by LOAD_SCALE, defined in processor_info.h
 */
#define base(n) ((n) << SCHED_TICK_SHIFT)
#define frac(n) (((base(n) - 1) * LOAD_SCALE) / base(n))

static uint32_t fract[3] = {
        frac(5),                /* 5 second average */
        frac(30),               /* 30 second average */
        frac(60),               /* 1 minute average */
};
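
/*
 * Worked example (values illustrative): frac(n) is the per-tick decay factor
 * for an n-second average.  Assuming SCHED_TICK_SHIFT is 3 and LOAD_SCALE is
 * 1000 (both defined outside this file), base(5) = 40 and
 * frac(5) = (39 * 1000) / 40 = 975, i.e. roughly 97.5% of the previous
 * 5-second average is retained on each of the 8 ticks per second.
 */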

#undef base
#undef frac

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

static unsigned int sched_nrun;

typedef void (*sched_avg_comp_t)(
        void *param);

static struct sched_average {
        sched_avg_comp_t comp;
        void *param;
        int period;     /* in seconds */
        uint64_t deadline;
} sched_average[] = {
        { compute_averunnable, &sched_nrun, 5, 0 },
        { compute_stack_target, NULL, 5, 1 },
        { compute_pageout_gc_throttle, NULL, 1, 0 },
        { compute_pmap_gc_throttle, NULL, 60, 0 },
        { compute_zone_working_set_size, NULL, ZONE_WSS_UPDATE_PERIOD, 0 },
#if CONFIG_TELEMETRY
        { compute_telemetry, NULL, 1, 0 },
#endif
        { NULL, NULL, 0, 0 }
};

typedef struct sched_average *sched_average_t;

/*
 * Scheduler load calculation algorithm
 *
 * The scheduler load values provide an estimate of the number of runnable
 * timeshare threads in the system at various priority bands. The load
 * ultimately affects the priority shifts applied to all threads in a band
 * causing them to timeshare with other threads in the system. The load is
 * maintained in buckets, with each bucket corresponding to a priority band.
 *
 * Each runnable thread on the system contributes its load to its priority
 * band and to the bands above it. The contribution of a thread to the bands
 * above it is not strictly 1:1 and is weighted based on the priority band
 * of the thread. The rules of thread load contribution to each of its higher
 * bands are as follows:
 *
 * - DF threads: Up to (2 * NCPUs) threads
 * - UT threads: Up to NCPUs threads
 * - BG threads: Up to 1 thread
 *
 * To calculate the load values, the various run buckets are sampled (every
 * sched_load_compute_interval_abs) and the weighted contributions of the
 * lower bucket threads are added. The resultant value is plugged into an
 * exponentially weighted moving average formula:
 *      new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count
 *      (where alpha < 1)
 * The calculations for the scheduler load are done using fixpoint math with
 * a scale factor of 16 to avoid expensive divides and floating point
 * operations. The final load values are a smooth curve representative of
 * the actual number of runnable threads in a priority band.
 */
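
/*
 * Worked example (numbers illustrative): with the constants below, alpha
 * works out to 6/16 = 0.375.  A bucket whose previous load is 2.0 and which
 * samples 4 runnable threads therefore moves to
 *      new-load = 0.375 * 2.0 + 0.625 * 4 = 3.25
 * so a sustained change in the runnable count is mostly absorbed within a
 * few sched_load_compute_interval_abs periods.
 */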

/* Maintains the current (scaled for fixpoint) load in various buckets */
uint32_t sched_load[TH_BUCKET_MAX];

/*
 * Alpha factor for the EWMA algorithm. The current values are chosen as
 * 6:10 ("old load":"new samples") to make sure the scheduler reacts fast
 * enough to changing system load but does not see too many spikes from bursty
 * activity. These values ensure that the scheduler converges to the latest
 * load within 2-3 sched_load_compute_interval_abs intervals (~30-45ms).
 */
#define SCHED_LOAD_EWMA_ALPHA_OLD      6
#define SCHED_LOAD_EWMA_ALPHA_NEW      10
#define SCHED_LOAD_EWMA_ALPHA_SHIFT    4
static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT));

/* For fixpoint EWMA, round up the load to make it converge */
#define SCHED_LOAD_EWMA_ROUNDUP(load)   (((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0)

/* Macro to convert scaled sched load to a real load value */
#define SCHED_LOAD_EWMA_UNSCALE(load)   (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load))
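
/*
 * For example (values illustrative): a scaled load of 52 (3.25 threads)
 * unscales to (52 >> 4) + 0 = 3 because bit 3 of 52 is clear, while a scaled
 * load of 40 (2.5 threads) unscales to (40 >> 4) + 1 = 3 because bit 3 is
 * set, i.e. the unscaled value rounds to the nearest thread rather than
 * always truncating.
 */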

/*
 * Routine to capture the latest runnable counts and update sched_load (only used for non-clutch schedulers)
 */
void
compute_sched_load(void)
{
        /*
         * Retrieve a snapshot of the current run counts.
         *
         * Why not a bcopy()? Because we need atomic word-sized reads of
         * sched_run_buckets, not a byte-by-byte copy.
         */
        uint32_t ncpus = processor_avail_count;
        uint32_t load_now[TH_BUCKET_MAX];

        load_now[TH_BUCKET_RUN] = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
        load_now[TH_BUCKET_FIXPRI] = os_atomic_load(&sched_run_buckets[TH_BUCKET_FIXPRI], relaxed);
        load_now[TH_BUCKET_SHARE_FG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_FG], relaxed);
        load_now[TH_BUCKET_SHARE_DF] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_DF], relaxed);
        load_now[TH_BUCKET_SHARE_UT] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_UT], relaxed);
        load_now[TH_BUCKET_SHARE_BG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_BG], relaxed);

        assert(load_now[TH_BUCKET_RUN] >= 0);
        assert(load_now[TH_BUCKET_FIXPRI] >= 0);

        uint32_t nthreads = load_now[TH_BUCKET_RUN];
        uint32_t nfixpri = load_now[TH_BUCKET_FIXPRI];

        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
            MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) | DBG_FUNC_NONE,
            load_now[TH_BUCKET_FIXPRI], (load_now[TH_BUCKET_SHARE_FG] + load_now[TH_BUCKET_SHARE_DF]),
            load_now[TH_BUCKET_SHARE_BG], load_now[TH_BUCKET_SHARE_UT], 0);

        /*
         * Compute the timeshare priority conversion factor based on loading.
         * Because our counters may be incremented and accessed
         * concurrently with respect to each other, we may have
         * windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut)
         * is broken, so truncate values in these cases.
         */
        uint32_t timeshare_threads = (nthreads - nfixpri);
        for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
                if (load_now[i] > timeshare_threads) {
                        load_now[i] = timeshare_threads;
                }
        }

        /*
         * Default threads contribute up to (NCPUS * 2) of load to FG threads
         */
        if (load_now[TH_BUCKET_SHARE_DF] <= (ncpus * 2)) {
                load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_DF];
        } else {
                load_now[TH_BUCKET_SHARE_FG] += (ncpus * 2);
        }

        /*
         * Utility threads contribute up to NCPUS of load to FG & DF threads
         */
        if (load_now[TH_BUCKET_SHARE_UT] <= ncpus) {
                load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_UT];
                load_now[TH_BUCKET_SHARE_DF] += load_now[TH_BUCKET_SHARE_UT];
        } else {
                load_now[TH_BUCKET_SHARE_FG] += ncpus;
                load_now[TH_BUCKET_SHARE_DF] += ncpus;
        }

        /*
         * BG threads contribute up to 1 thread worth of load to FG, DF and UT threads
         */
        if (load_now[TH_BUCKET_SHARE_BG] > 0) {
                load_now[TH_BUCKET_SHARE_FG] += 1;
                load_now[TH_BUCKET_SHARE_DF] += 1;
                load_now[TH_BUCKET_SHARE_UT] += 1;
        }

        /*
         * The conversion factor consists of two components:
         * a fixed value based on the absolute time unit (sched_fixed_shift),
         * and a dynamic portion based on load (sched_load_shifts).
         *
         * Zero load results in an out-of-range shift count.
         */

        for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
                uint32_t bucket_load = 0;

                if (load_now[i] > ncpus) {
                        /* Normalize the load to number of CPUs */
                        if (ncpus > 1) {
                                bucket_load = load_now[i] / ncpus;
                        } else {
                                bucket_load = load_now[i];
                        }

                        if (bucket_load > MAX_LOAD) {
                                bucket_load = MAX_LOAD;
                        }
                }
                /* Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load */
                sched_load[i] = (sched_load[i] * SCHED_LOAD_EWMA_ALPHA_OLD) + ((bucket_load << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW);
                sched_load[i] = sched_load[i] >> SCHED_LOAD_EWMA_ALPHA_SHIFT;
        }

        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
            MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD_EFFECTIVE) | DBG_FUNC_NONE,
            SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_FG]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_DF]),
            SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_UT]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_BG]), 0);
}

void
compute_averages(uint64_t stdelta)
{
        uint32_t nthreads = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) - 1;
        uint32_t ncpus = processor_avail_count;

        /* Update the global pri_shifts based on the latest values */
        for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
                uint32_t bucket_load = SCHED_LOAD_EWMA_UNSCALE(sched_load[i]);
                uint32_t shift = sched_fixed_shift - sched_load_shifts[bucket_load];

                if (shift > SCHED_PRI_SHIFT_MAX) {
                        sched_pri_shifts[i] = INT8_MAX;
                } else {
                        sched_pri_shifts[i] = shift;
                }
        }

        /*
         * Sample total running threads for the load average calculation.
         */
        sched_nrun = nthreads;

        /*
         * Load average and mach factor calculations for
         * those who ask about these things.
         */
        uint32_t average_now = nthreads * LOAD_SCALE;
        uint32_t factor_now;

        if (nthreads > ncpus) {
                factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1);
        } else {
                factor_now = (ncpus - nthreads) * LOAD_SCALE;
        }
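
        /*
         * For example, assuming LOAD_SCALE is 1000 (see processor_info.h) and
         * 8 available CPUs: 3 runnable threads give
         * factor_now = (8 - 3) * 1000 = 5000, while 16 runnable threads give
         * factor_now = (8 * 1000) / 17 ~= 470, so the mach factor shrinks
         * toward zero as the system becomes oversubscribed.
         */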

        /*
         * For those statistics that formerly relied on being recomputed
         * on timer ticks, advance by the approximate number of corresponding
         * elapsed intervals, thus compensating for potential idle intervals.
         */
        for (uint32_t index = 0; index < stdelta; index++) {
                sched_mach_factor = ((sched_mach_factor << 2) + factor_now) / 5;
                sched_load_average = ((sched_load_average << 2) + average_now) / 5;
        }

        /*
         * Compute old-style Mach load averages.
         */
        for (uint32_t index = 0; index < stdelta; index++) {
                for (uint32_t i = 0; i < 3; i++) {
                        mach_factor[i] = ((mach_factor[i] * fract[i]) +
                            (factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;

                        avenrun[i] = ((avenrun[i] * fract[i]) +
                            (average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
                }
        }

        /*
         * Compute averages in other components.
         */
        uint64_t abstime = mach_absolute_time();

        for (sched_average_t avg = sched_average; avg->comp != NULL; ++avg) {
                if (abstime >= avg->deadline) {
                        uint64_t period_abs = (avg->period * sched_one_second_interval);
                        uint64_t ninvokes = 1;

                        ninvokes += (abstime - avg->deadline) / period_abs;
                        ninvokes = MIN(ninvokes, SCHED_TICK_MAX_DELTA);

                        for (uint32_t index = 0; index < ninvokes; index++) {
                                (*avg->comp)(avg->param);
                        }
                        avg->deadline = abstime + period_abs;
                }
        }
}