/*
 * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 * Author: Avadis Tevanian, Jr.
 * Date: 1986
 *
 * Compute various averages.
 */

#include <mach/mach_types.h>

#include <kern/sched.h>
#include <kern/assert.h>
#include <kern/processor.h>
#include <kern/thread.h>
#if CONFIG_TELEMETRY
#include <kern/telemetry.h>
#endif
#include <kern/zalloc_internal.h>

#include <sys/kdebug.h>

uint32_t avenrun[3] = {0, 0, 0};
uint32_t mach_factor[3] = {0, 0, 0};

uint32_t sched_load_average, sched_mach_factor;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
/*
 * Values are scaled by LOAD_SCALE, defined in processor_info.h
 */
#define base(n)     ((n) << SCHED_TICK_SHIFT)
#define frac(n)     (((base(n) - 1) * LOAD_SCALE) / base(n))

static uint32_t fract[3] = {
    frac(5),            /* 5 second average */
    frac(30),           /* 30 second average */
    frac(60),           /* 1 minute average */
};
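
/*
 * Worked example, assuming SCHED_TICK_SHIFT == 3 (an 8 Hz scheduler tick)
 * and LOAD_SCALE == 1000 as defined elsewhere in this kernel: frac(n) is the
 * per-tick decay fraction, scaled by LOAD_SCALE, for an n-second average:
 *     base(5)  = 5 << 3  = 40 ticks,   frac(5)  = (39 * 1000) / 40   = 975
 *     base(30) = 30 << 3 = 240 ticks,  frac(30) = (239 * 1000) / 240 = 995
 *     base(60) = 60 << 3 = 480 ticks,  frac(60) = (479 * 1000) / 480 = 997
 * Each tick, the corresponding average keeps fract[i]/LOAD_SCALE of its old
 * value and blends in the remainder from the new sample (see the old-style
 * Mach load average loop in compute_averages() below).
 */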

#undef base
#undef frac

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

static unsigned int sched_nrun;

typedef void (*sched_avg_comp_t)(
    void *param);

static struct sched_average {
    sched_avg_comp_t    comp;
    void                *param;
    int                 period; /* in seconds */
    uint64_t            deadline;
} sched_average[] = {
    { compute_averunnable, &sched_nrun, 5, 0 },
    { compute_stack_target, NULL, 5, 1 },
    { compute_pageout_gc_throttle, NULL, 1, 0 },
    { compute_pmap_gc_throttle, NULL, 60, 0 },
    { compute_zone_working_set_size, NULL, ZONE_WSS_UPDATE_PERIOD, 0 },
#if CONFIG_TELEMETRY
    { compute_telemetry, NULL, 1, 0 },
#endif
    { NULL, NULL, 0, 0 }
};

typedef struct sched_average *sched_average_t;

/*
 * Scheduler load calculation algorithm
 *
 * The scheduler load values provide an estimate of the number of runnable
 * timeshare threads in the system at various priority bands. The load
 * ultimately affects the priority shifts applied to all threads in a band,
 * causing them to timeshare with other threads in the system. The load is
 * maintained in buckets, with each bucket corresponding to a priority band.
 *
 * Each runnable thread on the system contributes its load to its priority
 * band and to the bands above it. The contribution of a thread to the bands
 * above it is not strictly 1:1 and is weighted based on the priority band
 * of the thread. The rules of thread load contribution to each of its higher
 * bands are as follows:
 *
 * - DF threads: Up to (2 * NCPUs) threads
 * - UT threads: Up to NCPUs threads
 * - BG threads: Up to 1 thread
 *
 * To calculate the load values, the various run buckets are sampled (every
 * sched_load_compute_interval_abs) and the weighted contributions of the
 * lower bucket threads are added. The resultant value is plugged into an
 * exponentially weighted moving average formula:
 *      new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count
 *      (where alpha < 1)
 * The calculations for the scheduler load are done using fixpoint math with
 * a scale factor of 16 to avoid expensive divides and floating point
 * operations. The final load values are a smooth curve representative of
 * the actual number of runnable threads in a priority band.
 */
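
/*
 * Worked example of the contribution rules above (hypothetical numbers, for
 * illustration only): on a 4-CPU system with 3 FG, 10 DF, 2 UT and 5 BG
 * runnable timeshare threads, the per-bucket load seen by compute_sched_load()
 * before normalization would be:
 *      FG: 3 + min(10, 2 * 4) + min(2, 4) + 1 = 14
 *      DF: 10 + min(2, 4) + 1                 = 13
 *      UT: 2 + 1                              = 3
 *      BG: 5                                  = 5
 * i.e. lower bands add load to the bands above them, but only up to the
 * caps listed above.
 */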

/* Maintains the current (scaled for fixpoint) load in various buckets */
uint32_t sched_load[TH_BUCKET_MAX];

/*
 * Alpha factor for the EWMA algorithm. The current values are chosen as
 * 6:10 ("old load":"new samples") to make sure the scheduler reacts fast
 * enough to changing system load but does not see too many spikes from bursty
 * activity. The current values ensure that the scheduler would converge
 * to the latest load in 2-3 sched_load_compute_interval_abs intervals
 * (which amounts to ~30-45ms with current values).
 */
#define SCHED_LOAD_EWMA_ALPHA_OLD       6
#define SCHED_LOAD_EWMA_ALPHA_NEW       10
#define SCHED_LOAD_EWMA_ALPHA_SHIFT     4
static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT));

/* For fixpoint EWMA, round up the load to make it converge */
#define SCHED_LOAD_EWMA_ROUNDUP(load)   (((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0)

/* Macro to convert scaled sched load to a real load value */
#define SCHED_LOAD_EWMA_UNSCALE(load)   (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load))
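
/*
 * Worked example of the fixpoint EWMA (hypothetical numbers, for illustration
 * only): suppose a bucket's sampled load jumps from 0 to a steady 3. With the
 * 6:10 weights and a shift of 4, the scaled load evolves as
 *      0 -> (0 * 6  + (3 << 4) * 10) >> 4 = 30   (unscales to ~2)
 *        -> (30 * 6 + (3 << 4) * 10) >> 4 = 41   (unscales to 3)
 *        -> (41 * 6 + (3 << 4) * 10) >> 4 = 45   (unscales to 3)
 * converging towards the steady-state scaled value of 3 << 4 = 48, i.e. the
 * unscaled load reaches the new value within 2-3 intervals, as noted above.
 */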

/*
 * Routine to capture the latest runnable counts and update sched_load (only used for non-clutch schedulers)
 */
void
compute_sched_load(void)
{
    /*
     * Retrieve a snapshot of the current run counts.
     *
     * Why not a bcopy()? Because we need atomic word-sized reads of sched_run_buckets,
     * not a byte-by-byte copy.
     */
    uint32_t ncpus = processor_avail_count;
    uint32_t load_now[TH_BUCKET_MAX];

    load_now[TH_BUCKET_RUN]      = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
    load_now[TH_BUCKET_FIXPRI]   = os_atomic_load(&sched_run_buckets[TH_BUCKET_FIXPRI], relaxed);
    load_now[TH_BUCKET_SHARE_FG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_FG], relaxed);
    load_now[TH_BUCKET_SHARE_DF] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_DF], relaxed);
    load_now[TH_BUCKET_SHARE_UT] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_UT], relaxed);
    load_now[TH_BUCKET_SHARE_BG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_BG], relaxed);

    assert(load_now[TH_BUCKET_RUN] >= 0);
    assert(load_now[TH_BUCKET_FIXPRI] >= 0);

    uint32_t nthreads = load_now[TH_BUCKET_RUN];
    uint32_t nfixpri  = load_now[TH_BUCKET_FIXPRI];

    KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
        MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) | DBG_FUNC_NONE,
        load_now[TH_BUCKET_FIXPRI], (load_now[TH_BUCKET_SHARE_FG] + load_now[TH_BUCKET_SHARE_DF]),
        load_now[TH_BUCKET_SHARE_BG], load_now[TH_BUCKET_SHARE_UT], 0);

    /*
     * Compute the timeshare priority conversion factor based on loading.
     * Because our counters may be incremented and accessed
     * concurrently with respect to each other, we may have
     * windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut)
     * is broken, so truncate values in these cases.
     */
    uint32_t timeshare_threads = (nthreads - nfixpri);
    for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
        if (load_now[i] > timeshare_threads) {
            load_now[i] = timeshare_threads;
        }
    }

    /*
     * Default threads contribute up to (NCPUS * 2) of load to FG threads
     */
    if (load_now[TH_BUCKET_SHARE_DF] <= (ncpus * 2)) {
        load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_DF];
    } else {
        load_now[TH_BUCKET_SHARE_FG] += (ncpus * 2);
    }

    /*
     * Utility threads contribute up to NCPUS of load to FG & DF threads
     */
    if (load_now[TH_BUCKET_SHARE_UT] <= ncpus) {
        load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_UT];
        load_now[TH_BUCKET_SHARE_DF] += load_now[TH_BUCKET_SHARE_UT];
    } else {
        load_now[TH_BUCKET_SHARE_FG] += ncpus;
        load_now[TH_BUCKET_SHARE_DF] += ncpus;
    }

    /*
     * BG threads contribute up to 1 thread worth of load to FG, DF and UT threads
     */
    if (load_now[TH_BUCKET_SHARE_BG] > 0) {
        load_now[TH_BUCKET_SHARE_FG] += 1;
        load_now[TH_BUCKET_SHARE_DF] += 1;
        load_now[TH_BUCKET_SHARE_UT] += 1;
    }

    /*
     * The conversion factor consists of two components:
     * a fixed value based on the absolute time unit (sched_fixed_shift),
     * and a dynamic portion based on load (sched_load_shifts).
     *
     * Zero load results in an out-of-range shift count.
     */

    for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
        uint32_t bucket_load = 0;

        if (load_now[i] > ncpus) {
            /* Normalize the load to the number of CPUs */
            if (ncpus > 1) {
                bucket_load = load_now[i] / ncpus;
            } else {
                bucket_load = load_now[i];
            }

            if (bucket_load > MAX_LOAD) {
                bucket_load = MAX_LOAD;
            }
        }
        /* Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load */
        sched_load[i] = (sched_load[i] * SCHED_LOAD_EWMA_ALPHA_OLD) + ((bucket_load << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW);
        sched_load[i] = sched_load[i] >> SCHED_LOAD_EWMA_ALPHA_SHIFT;
    }

    KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
        MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD_EFFECTIVE) | DBG_FUNC_NONE,
        SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_FG]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_DF]),
        SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_UT]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_BG]), 0);
}

void
compute_averages(uint64_t stdelta)
{
    uint32_t nthreads = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) - 1;
    uint32_t ncpus = processor_avail_count;

    /* Update the global pri_shifts based on the latest values */
    for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
        uint32_t bucket_load = SCHED_LOAD_EWMA_UNSCALE(sched_load[i]);
        uint32_t shift = sched_fixed_shift - sched_load_shifts[bucket_load];

        if (shift > SCHED_PRI_SHIFT_MAX) {
            sched_pri_shifts[i] = INT8_MAX;
        } else {
            sched_pri_shifts[i] = shift;
        }
    }

    /*
     * Sample total running threads for the load average calculation.
     */
    sched_nrun = nthreads;

    /*
     * Load average and mach factor calculations for
     * those which ask about these things.
     */
    uint32_t average_now = nthreads * LOAD_SCALE;
    uint32_t factor_now;

    if (nthreads > ncpus) {
        factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1);
    } else {
        factor_now = (ncpus - nthreads) * LOAD_SCALE;
    }
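
    /*
     * Rough illustration (hypothetical numbers): the mach factor expresses
     * remaining CPU capacity, scaled by LOAD_SCALE. On a 4-CPU system with
     * 2 runnable threads, factor_now = (4 - 2) * LOAD_SCALE; with 8 runnable
     * threads it drops to (4 * LOAD_SCALE) / 9, i.e. well under one CPU's
     * worth of headroom, while average_now = 8 * LOAD_SCALE.
     */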

    /*
     * For those statistics that formerly relied on being recomputed
     * on timer ticks, advance by the approximate number of corresponding
     * elapsed intervals, thus compensating for potential idle intervals.
     */
    for (uint32_t index = 0; index < stdelta; index++) {
        sched_mach_factor = ((sched_mach_factor << 2) + factor_now) / 5;
        sched_load_average = ((sched_load_average << 2) + average_now) / 5;
    }

    /*
     * Compute old-style Mach load averages.
     */
    for (uint32_t index = 0; index < stdelta; index++) {
        for (uint32_t i = 0; i < 3; i++) {
            mach_factor[i] = ((mach_factor[i] * fract[i]) +
                (factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;

            avenrun[i] = ((avenrun[i] * fract[i]) +
                (average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
        }
    }

    /*
     * Compute averages in other components.
     */
    uint64_t abstime = mach_absolute_time();

    for (sched_average_t avg = sched_average; avg->comp != NULL; ++avg) {
        if (abstime >= avg->deadline) {
            uint64_t period_abs = (avg->period * sched_one_second_interval);
            uint64_t ninvokes = 1;

            ninvokes += (abstime - avg->deadline) / period_abs;
            ninvokes = MIN(ninvokes, SCHED_TICK_MAX_DELTA);
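
            /*
             * Catch-up example (hypothetical numbers): a callback with a
             * 5-second period whose deadline passed ~12 seconds ago is
             * invoked 1 + (12 / 5) = 3 times here, so the statistic it
             * maintains does not fall behind across idle stretches; the
             * catch-up is capped at SCHED_TICK_MAX_DELTA invocations.
             */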

            for (uint32_t index = 0; index < ninvokes; index++) {
                (*avg->comp)(avg->param);
            }
            avg->deadline = abstime + period_abs;
        }
    }
}