/*
 * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 * Author: Avadis Tevanian, Jr.
 * Date: 1986
 *
 * Compute various averages.
 */

#include <mach/mach_types.h>

#include <kern/sched.h>
#include <kern/assert.h>
#include <kern/processor.h>
#include <kern/thread.h>
#if CONFIG_TELEMETRY
#include <kern/telemetry.h>
#endif
#include <kern/zalloc_internal.h>

#include <sys/kdebug.h>

uint32_t avenrun[3] = {0, 0, 0};
uint32_t mach_factor[3] = {0, 0, 0};

uint32_t sched_load_average, sched_mach_factor;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
/*
 * Values are scaled by LOAD_SCALE, defined in processor_info.h
 */
#define base(n)     ((n) << SCHED_TICK_SHIFT)
#define frac(n)     (((base(n) - 1) * LOAD_SCALE) / base(n))

static uint32_t fract[3] = {
    frac(5),            /* 5 second average */
    frac(30),           /* 30 second average */
    frac(60),           /* 1 minute average */
};
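
/*
 * Worked example, assuming SCHED_TICK_SHIFT == 3 (an 8 Hz scheduler tick)
 * and LOAD_SCALE == 1000 as defined elsewhere in this kernel: frac(n) is the
 * per-tick decay fraction, scaled by LOAD_SCALE, for an n-second average:
 *     base(5)  = 5 << 3  = 40 ticks,   frac(5)  = (39 * 1000) / 40   = 975
 *     base(30) = 30 << 3 = 240 ticks,  frac(30) = (239 * 1000) / 240 = 995
 *     base(60) = 60 << 3 = 480 ticks,  frac(60) = (479 * 1000) / 480 = 997
 * Each tick, the corresponding average keeps fract[i]/LOAD_SCALE of its old
 * value and blends in the remainder from the new sample (see the old-style
 * Mach load average loop in compute_averages() below).
 */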

#undef base
#undef frac

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

static unsigned int sched_nrun;

typedef void (*sched_avg_comp_t)(
    void *param);

static struct sched_average {
    sched_avg_comp_t    comp;
    void                *param;
    int                 period; /* in seconds */
    uint64_t            deadline;
} sched_average[] = {
    { compute_averunnable, &sched_nrun, 5, 0 },
    { compute_stack_target, NULL, 5, 1 },
    { compute_pageout_gc_throttle, NULL, 1, 0 },
    { compute_pmap_gc_throttle, NULL, 60, 0 },
    { compute_zone_working_set_size, NULL, ZONE_WSS_UPDATE_PERIOD, 0 },
#if CONFIG_TELEMETRY
    { compute_telemetry, NULL, 1, 0 },
#endif
    { NULL, NULL, 0, 0 }
};

typedef struct sched_average *sched_average_t;

/*
 * Scheduler load calculation algorithm
 *
 * The scheduler load values provide an estimate of the number of runnable
 * timeshare threads in the system at various priority bands. The load
 * ultimately affects the priority shifts applied to all threads in a band,
 * causing them to timeshare with other threads in the system. The load is
 * maintained in buckets, with each bucket corresponding to a priority band.
 *
 * Each runnable thread on the system contributes its load to its priority
 * band and to the bands above it. The contribution of a thread to the bands
 * above it is not strictly 1:1 and is weighted based on the priority band
 * of the thread. The rules of thread load contribution to each of its higher
 * bands are as follows:
 *
 * - DF threads: Up to (2 * NCPUs) threads
 * - UT threads: Up to NCPUs threads
 * - BG threads: Up to 1 thread
 *
 * To calculate the load values, the various run buckets are sampled (every
 * sched_load_compute_interval_abs) and the weighted contributions of the
 * lower bucket threads are added. The resultant value is plugged into an
 * exponentially weighted moving average formula:
 *      new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count
 *      (where alpha < 1)
 * The calculations for the scheduler load are done using fixpoint math with
 * a scale factor of 16 to avoid expensive divides and floating point
 * operations. The final load values are a smooth curve representative of
 * the actual number of runnable threads in a priority band.
 */
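
/*
 * Worked example of the contribution rules above (hypothetical numbers, for
 * illustration only): on a 4-CPU system with 3 FG, 10 DF, 2 UT and 5 BG
 * runnable timeshare threads, the per-bucket load seen by compute_sched_load()
 * before normalization would be:
 *      FG: 3 + min(10, 2 * 4) + min(2, 4) + 1 = 14
 *      DF: 10 + min(2, 4) + 1                 = 13
 *      UT: 2 + 1                              = 3
 *      BG: 5                                  = 5
 * i.e. lower bands add load to the bands above them, but only up to the
 * caps listed above.
 */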

/* Maintains the current (scaled for fixpoint) load in various buckets */
uint32_t sched_load[TH_BUCKET_MAX];

/*
 * Alpha factor for the EWMA algorithm. The current values are chosen as
 * 6:10 ("old load":"new samples") to make sure the scheduler reacts fast
 * enough to changing system load but does not see too many spikes from bursty
 * activity. The current values ensure that the scheduler would converge
 * to the latest load in 2-3 sched_load_compute_interval_abs intervals
 * (which amounts to ~30-45ms with current values).
 */
#define SCHED_LOAD_EWMA_ALPHA_OLD       6
#define SCHED_LOAD_EWMA_ALPHA_NEW       10
#define SCHED_LOAD_EWMA_ALPHA_SHIFT     4
static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT));

/* For fixpoint EWMA, round up the load to make it converge */
#define SCHED_LOAD_EWMA_ROUNDUP(load)   (((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0)

/* Macro to convert scaled sched load to a real load value */
#define SCHED_LOAD_EWMA_UNSCALE(load)   (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load))
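
/*
 * Worked example of the fixpoint EWMA (hypothetical numbers, for illustration
 * only): suppose a bucket's sampled load jumps from 0 to a steady 3. With the
 * 6:10 weights and a shift of 4, the scaled load evolves as
 *      0 -> (0 * 6  + (3 << 4) * 10) >> 4 = 30   (unscales to ~2)
 *        -> (30 * 6 + (3 << 4) * 10) >> 4 = 41   (unscales to 3)
 *        -> (41 * 6 + (3 << 4) * 10) >> 4 = 45   (unscales to 3)
 * converging towards the steady-state scaled value of 3 << 4 = 48, i.e. the
 * unscaled load reaches the new value within 2-3 intervals, as noted above.
 */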

/*
 * Routine to capture the latest runnable counts and update sched_load (only used for non-clutch schedulers)
 */
void
compute_sched_load(void)
{
    /*
     * Retrieve a snapshot of the current run counts.
     *
     * Why not a bcopy()? Because we need atomic word-sized reads of sched_run_buckets,
     * not a byte-by-byte copy.
     */
    uint32_t ncpus = processor_avail_count;
    uint32_t load_now[TH_BUCKET_MAX];

    load_now[TH_BUCKET_RUN]      = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
    load_now[TH_BUCKET_FIXPRI]   = os_atomic_load(&sched_run_buckets[TH_BUCKET_FIXPRI], relaxed);
    load_now[TH_BUCKET_SHARE_FG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_FG], relaxed);
    load_now[TH_BUCKET_SHARE_DF] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_DF], relaxed);
    load_now[TH_BUCKET_SHARE_UT] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_UT], relaxed);
    load_now[TH_BUCKET_SHARE_BG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_BG], relaxed);

    assert(load_now[TH_BUCKET_RUN] >= 0);
    assert(load_now[TH_BUCKET_FIXPRI] >= 0);

    uint32_t nthreads = load_now[TH_BUCKET_RUN];
    uint32_t nfixpri  = load_now[TH_BUCKET_FIXPRI];

    KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
        MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) | DBG_FUNC_NONE,
        load_now[TH_BUCKET_FIXPRI], (load_now[TH_BUCKET_SHARE_FG] + load_now[TH_BUCKET_SHARE_DF]),
        load_now[TH_BUCKET_SHARE_BG], load_now[TH_BUCKET_SHARE_UT], 0);

    /*
     * Compute the timeshare priority conversion factor based on loading.
     * Because our counters may be incremented and accessed
     * concurrently with respect to each other, we may have
     * windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut)
     * is broken, so truncate values in these cases.
     */
    uint32_t timeshare_threads = (nthreads - nfixpri);
    for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
        if (load_now[i] > timeshare_threads) {
            load_now[i] = timeshare_threads;
        }
    }

    /*
     * Default threads contribute up to (NCPUS * 2) of load to FG threads
     */
    if (load_now[TH_BUCKET_SHARE_DF] <= (ncpus * 2)) {
        load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_DF];
    } else {
        load_now[TH_BUCKET_SHARE_FG] += (ncpus * 2);
    }

    /*
     * Utility threads contribute up to NCPUS of load to FG & DF threads
     */
    if (load_now[TH_BUCKET_SHARE_UT] <= ncpus) {
        load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_UT];
        load_now[TH_BUCKET_SHARE_DF] += load_now[TH_BUCKET_SHARE_UT];
    } else {
        load_now[TH_BUCKET_SHARE_FG] += ncpus;
        load_now[TH_BUCKET_SHARE_DF] += ncpus;
    }

    /*
     * BG threads contribute up to 1 thread worth of load to FG, DF and UT threads
     */
    if (load_now[TH_BUCKET_SHARE_BG] > 0) {
        load_now[TH_BUCKET_SHARE_FG] += 1;
        load_now[TH_BUCKET_SHARE_DF] += 1;
        load_now[TH_BUCKET_SHARE_UT] += 1;
    }

    /*
     * The conversion factor consists of two components:
     * a fixed value based on the absolute time unit (sched_fixed_shift),
     * and a dynamic portion based on load (sched_load_shifts).
     *
     * Zero load results in an out-of-range shift count.
     */

    for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
        uint32_t bucket_load = 0;

        if (load_now[i] > ncpus) {
            /* Normalize the load to the number of CPUs */
            if (ncpus > 1) {
                bucket_load = load_now[i] / ncpus;
            } else {
                bucket_load = load_now[i];
            }

            if (bucket_load > MAX_LOAD) {
                bucket_load = MAX_LOAD;
            }
        }
        /* Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load */
        sched_load[i] = (sched_load[i] * SCHED_LOAD_EWMA_ALPHA_OLD) + ((bucket_load << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW);
        sched_load[i] = sched_load[i] >> SCHED_LOAD_EWMA_ALPHA_SHIFT;
    }

    KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
        MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD_EFFECTIVE) | DBG_FUNC_NONE,
        SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_FG]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_DF]),
        SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_UT]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_BG]), 0);
}

void
compute_averages(uint64_t stdelta)
{
    uint32_t nthreads = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) - 1;
    uint32_t ncpus = processor_avail_count;

    /* Update the global pri_shifts based on the latest values */
    for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
        uint32_t bucket_load = SCHED_LOAD_EWMA_UNSCALE(sched_load[i]);
        uint32_t shift = sched_fixed_shift - sched_load_shifts[bucket_load];

        if (shift > SCHED_PRI_SHIFT_MAX) {
            sched_pri_shifts[i] = INT8_MAX;
        } else {
            sched_pri_shifts[i] = shift;
        }
    }

    /*
     * Sample total running threads for the load average calculation.
     */
    sched_nrun = nthreads;

    /*
     * Load average and mach factor calculations for
     * those which ask about these things.
     */
    uint32_t average_now = nthreads * LOAD_SCALE;
    uint32_t factor_now;

    if (nthreads > ncpus) {
        factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1);
    } else {
        factor_now = (ncpus - nthreads) * LOAD_SCALE;
    }
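
    /*
     * Rough illustration (hypothetical numbers): the mach factor expresses
     * remaining CPU capacity, scaled by LOAD_SCALE. On a 4-CPU system with
     * 2 runnable threads, factor_now = (4 - 2) * LOAD_SCALE; with 8 runnable
     * threads it drops to (4 * LOAD_SCALE) / 9, i.e. well under one CPU's
     * worth of headroom, while average_now = 8 * LOAD_SCALE.
     */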

    /*
     * For those statistics that formerly relied on being recomputed
     * on timer ticks, advance by the approximate number of corresponding
     * elapsed intervals, thus compensating for potential idle intervals.
     */
    for (uint32_t index = 0; index < stdelta; index++) {
        sched_mach_factor = ((sched_mach_factor << 2) + factor_now) / 5;
        sched_load_average = ((sched_load_average << 2) + average_now) / 5;
    }

    /*
     * Compute old-style Mach load averages.
     */
    for (uint32_t index = 0; index < stdelta; index++) {
        for (uint32_t i = 0; i < 3; i++) {
            mach_factor[i] = ((mach_factor[i] * fract[i]) +
                (factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;

            avenrun[i] = ((avenrun[i] * fract[i]) +
                (average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
        }
    }

    /*
     * Compute averages in other components.
     */
    uint64_t abstime = mach_absolute_time();

    for (sched_average_t avg = sched_average; avg->comp != NULL; ++avg) {
        if (abstime >= avg->deadline) {
            uint64_t period_abs = (avg->period * sched_one_second_interval);
            uint64_t ninvokes = 1;

            ninvokes += (abstime - avg->deadline) / period_abs;
            ninvokes = MIN(ninvokes, SCHED_TICK_MAX_DELTA);
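
            /*
             * Catch-up example (hypothetical numbers): a callback with a
             * 5-second period whose deadline passed ~12 seconds ago is
             * invoked 1 + (12 / 5) = 3 times here, so the statistic it
             * maintains does not fall behind across idle stretches; the
             * catch-up is capped at SCHED_TICK_MAX_DELTA invocations.
             */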

            for (uint32_t index = 0; index < ninvokes; index++) {
                (*avg->comp)(avg->param);
            }
            avg->deadline = abstime + period_abs;
        }
    }
}