/*
 * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *  Author: Avadis Tevanian, Jr.
 *  Date:   1986
 *
 *  Compute various averages.
 */

#include <mach/mach_types.h>

#include <kern/sched.h>
#include <kern/assert.h>
#include <kern/processor.h>
#include <kern/thread.h>
#if CONFIG_TELEMETRY
#include <kern/telemetry.h>
#endif

#include <sys/kdebug.h>

uint32_t        avenrun[3] = {0, 0, 0};
uint32_t        mach_factor[3] = {0, 0, 0};

uint32_t        sched_load_average, sched_mach_factor;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
/*
 * Values are scaled by LOAD_SCALE, defined in processor_info.h
 */
#define base(n)         ((n) << SCHED_TICK_SHIFT)
#define frac(n)         (((base(n) - 1) * LOAD_SCALE) / base(n))

static uint32_t         fract[3] = {
        frac(5),                /* 5 second average */
        frac(30),               /* 30 second average */
        frac(60),               /* 1 minute average */
};
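
/*
 * Illustrative expansion of the table above (a sketch, assuming
 * SCHED_TICK_SHIFT == 3 and LOAD_SCALE == 1000; see sched.h and
 * processor_info.h for the authoritative values):
 *
 *      base(5) == 5 << 3 == 40 sched_ticks
 *      frac(5) == (39 * LOAD_SCALE) / 40 == 975
 *
 * i.e. each tick the 5-second average keeps 975/1000 of its old value and
 * blends in 25/1000 of the new sample, giving a time constant of roughly
 * 5 seconds.
 */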

#undef base
#undef frac

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

static unsigned int     sched_nrun;

typedef void (*sched_avg_comp_t)(
        void            *param);

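/*
 * Table of periodic averaging/maintenance callbacks driven from
 * compute_averages(): each entry's comp routine is invoked with param
 * roughly every `period' seconds once its deadline has passed (see the
 * loop at the bottom of compute_averages()).  For example, the first
 * entry below calls compute_averunnable(&sched_nrun) every 5 seconds.
 */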
static struct sched_average {
        sched_avg_comp_t        comp;
        void                    *param;
        int                     period;         /* in seconds */
        uint64_t                deadline;
} sched_average[] = {
        { compute_averunnable, &sched_nrun, 5, 0 },
        { compute_stack_target, NULL, 5, 1 },
        { compute_pageout_gc_throttle, NULL, 1, 0 },
        { compute_pmap_gc_throttle, NULL, 60, 0 },
#if CONFIG_TELEMETRY
        { compute_telemetry, NULL, 1, 0 },
#endif
        { NULL, NULL, 0, 0 }
};

typedef struct sched_average *sched_average_t;

/*
 * Scheduler load calculation algorithm
 *
 * The scheduler load values provide an estimate of the number of runnable
 * timeshare threads in the system at various priority bands. The load
 * ultimately affects the priority shifts applied to all threads in a band,
 * causing them to timeshare with other threads in the system. The load is
 * maintained in buckets, with each bucket corresponding to a priority band.
 *
 * Each runnable thread on the system contributes its load to its priority
 * band and to the bands above it. The contribution of a thread to the bands
 * above it is not strictly 1:1 and is weighted based on the priority band
 * of the thread. The rules of thread load contribution to each of its higher
 * bands are as follows (see the worked example after this comment):
 *
 * - DF threads: Up to (2 * NCPUs) threads
 * - UT threads: Up to NCPUs threads
 * - BG threads: Up to 1 thread
 *
 * To calculate the load values, the various run buckets are sampled (every
 * sched_load_compute_interval_abs) and the weighted contributions of the
 * lower bucket threads are added. The resultant value is plugged into an
 * exponentially weighted moving average formula:
 *      new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count
 *      (where alpha < 1)
 * The calculations for the scheduler load are done using fixpoint math with
 * a scale factor of 16 to avoid expensive divides and floating point
 * operations. The final load values are a smooth curve representative of
 * the actual number of runnable threads in a priority band.
 */
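
/*
 * Worked example of the contribution rules above (illustrative only, for a
 * hypothetical 4-CPU system): suppose a sample reads FG = 3, DF = 12,
 * UT = 2 and BG = 1 runnable timeshare threads.  Then:
 *
 *      FG load = 3 + min(12, 2 * 4) + min(2, 4) + 1 = 14
 *      DF load = 12 + min(2, 4) + 1                 = 15
 *      UT load = 2 + 1                              = 3
 *      BG load = 1
 *
 * Each per-band total is then normalized to the CPU count and fed into the
 * EWMA described above.
 */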

/* Maintains the current (scaled for fixpoint) load in various buckets */
uint32_t sched_load[TH_BUCKET_MAX];

/*
 * Alpha factor for the EWMA algorithm. The current values are chosen as
 * 6:10 ("old load":"new samples") to make sure the scheduler reacts fast
 * enough to changing system load but does not see too many spikes from bursty
 * activity. The current values ensure that the scheduler would converge
 * to the latest load in 2-3 sched_load_compute_interval_abs intervals
 * (which amounts to ~30-45ms with current values).
 */
#define SCHED_LOAD_EWMA_ALPHA_OLD       6
#define SCHED_LOAD_EWMA_ALPHA_NEW       10
#define SCHED_LOAD_EWMA_ALPHA_SHIFT     4
static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT));

/* For fixpoint EWMA, round up the load to make it converge */
#define SCHED_LOAD_EWMA_ROUNDUP(load)   (((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0)

/* Macro to convert scaled sched load to a real load value */
#define SCHED_LOAD_EWMA_UNSCALE(load)   (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load))
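
/*
 * Worked fixpoint example (a sketch with made-up numbers): with
 * SCHED_LOAD_EWMA_ALPHA_OLD == 6, SCHED_LOAD_EWMA_ALPHA_NEW == 10 and a
 * shift of 4, a scaled load of 32 (i.e. 2.0) and a new sample of 5 update as
 *
 *      (32 * 6 + (5 << 4) * 10) >> 4  ==  (192 + 800) >> 4  ==  62
 *
 * which SCHED_LOAD_EWMA_UNSCALE converts back to (62 >> 4) + 1 == 4, the
 * +1 coming from SCHED_LOAD_EWMA_ROUNDUP because bit 3 of 62 is set.
 */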

/*
 * Routine to capture the latest runnable counts and update sched_load.
 */
void
compute_sched_load(void)
{
        /*
         * Retrieve a snapshot of the current run counts.
         *
         * Why not a bcopy()? Because we need atomic word-sized reads of
         * sched_run_buckets, not a byte-by-byte copy.
         */
        uint32_t ncpus = processor_avail_count;
        uint32_t load_now[TH_BUCKET_MAX];

        load_now[TH_BUCKET_RUN]      = sched_run_buckets[TH_BUCKET_RUN];
        load_now[TH_BUCKET_FIXPRI]   = sched_run_buckets[TH_BUCKET_FIXPRI];
        load_now[TH_BUCKET_SHARE_FG] = sched_run_buckets[TH_BUCKET_SHARE_FG];
        load_now[TH_BUCKET_SHARE_DF] = sched_run_buckets[TH_BUCKET_SHARE_DF];
        load_now[TH_BUCKET_SHARE_UT] = sched_run_buckets[TH_BUCKET_SHARE_UT];
        load_now[TH_BUCKET_SHARE_BG] = sched_run_buckets[TH_BUCKET_SHARE_BG];

        assert(load_now[TH_BUCKET_RUN] >= 0);
        assert(load_now[TH_BUCKET_FIXPRI] >= 0);

        uint32_t nthreads = load_now[TH_BUCKET_RUN];
        uint32_t nfixpri  = load_now[TH_BUCKET_FIXPRI];

        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
            MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) | DBG_FUNC_NONE,
            load_now[TH_BUCKET_FIXPRI], (load_now[TH_BUCKET_SHARE_FG] + load_now[TH_BUCKET_SHARE_DF]),
            load_now[TH_BUCKET_SHARE_BG], load_now[TH_BUCKET_SHARE_UT], 0);

        /*
         * Compute the timeshare priority conversion factor based on loading.
         * Because our counters may be incremented and accessed
         * concurrently with respect to each other, we may have
         * windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut)
         * is broken, so truncate values in these cases.
         */
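        /*
         * For instance (illustrative numbers only): if nthreads == 10 and
         * nfixpri == 3, then timeshare_threads == 7, and any bucket that
         * momentarily reads higher than 7 is clamped down to 7.
         */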
        uint32_t timeshare_threads = (nthreads - nfixpri);
        for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
                if (load_now[i] > timeshare_threads) {
                        load_now[i] = timeshare_threads;
                }
        }

        /*
         * Default (DF) threads contribute up to (NCPUS * 2) threads worth of
         * load to the FG bucket.
         */
        if (load_now[TH_BUCKET_SHARE_DF] <= (ncpus * 2)) {
                load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_DF];
        } else {
                load_now[TH_BUCKET_SHARE_FG] += (ncpus * 2);
        }

        /*
         * Utility threads contribute up to NCPUS threads worth of load to the
         * FG and DF buckets.
         */
        if (load_now[TH_BUCKET_SHARE_UT] <= ncpus) {
                load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_UT];
                load_now[TH_BUCKET_SHARE_DF] += load_now[TH_BUCKET_SHARE_UT];
        } else {
                load_now[TH_BUCKET_SHARE_FG] += ncpus;
                load_now[TH_BUCKET_SHARE_DF] += ncpus;
        }

        /*
         * BG threads contribute up to 1 thread worth of load to the FG, DF
         * and UT buckets.
         */
        if (load_now[TH_BUCKET_SHARE_BG] > 0) {
                load_now[TH_BUCKET_SHARE_FG] += 1;
                load_now[TH_BUCKET_SHARE_DF] += 1;
                load_now[TH_BUCKET_SHARE_UT] += 1;
        }

        /*
         * The conversion factor consists of two components:
         * a fixed value based on the absolute time unit (sched_fixed_shift),
         * and a dynamic portion based on load (sched_load_shifts).
         *
         * Zero load results in an out-of-range shift count.
         */

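        /*
         * Normalize each band's load to the number of CPUs before feeding it
         * into the EWMA below.  A band with no more runnable threads than
         * CPUs contributes zero load; otherwise its per-CPU load is clamped
         * to MAX_LOAD (believed to be the largest valid index into
         * sched_load_shifts[]).
         */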
        for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
                uint32_t bucket_load = 0;

                if (load_now[i] > ncpus) {
                        /* Normalize the load to the number of CPUs */
                        if (ncpus > 1) {
                                bucket_load = load_now[i] / ncpus;
                        } else {
                                bucket_load = load_now[i];
                        }

                        if (bucket_load > MAX_LOAD) {
                                bucket_load = MAX_LOAD;
                        }
                }
                /* Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load */
                sched_load[i] = (sched_load[i] * SCHED_LOAD_EWMA_ALPHA_OLD) + ((bucket_load << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW);
                sched_load[i] = sched_load[i] >> SCHED_LOAD_EWMA_ALPHA_SHIFT;
        }

        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
            MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD_EFFECTIVE) | DBG_FUNC_NONE,
            SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_FG]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_DF]),
            SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_UT]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_BG]), 0);
}

void
compute_averages(uint64_t stdelta)
{
        uint32_t nthreads = sched_run_buckets[TH_BUCKET_RUN] - 1;
        uint32_t ncpus = processor_avail_count;

        /* Update the global pri_shifts based on the latest values */
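        /*
         * (The resulting sched_pri_shifts[] values are consumed by the
         * timeshare priority calculation elsewhere in the scheduler; a
         * heavier load produces a smaller shift and therefore a larger
         * priority depression for a given amount of CPU usage.)
         */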
        for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
                uint32_t bucket_load = SCHED_LOAD_EWMA_UNSCALE(sched_load[i]);
                sched_pri_shifts[i] = sched_fixed_shift - sched_load_shifts[bucket_load];
        }

        /*
         * Sample total running threads for the load average calculation.
         */
        sched_nrun = nthreads;

        /*
         * Load average and Mach factor calculations for
         * the clients that ask about these statistics.
         */
        uint32_t average_now = nthreads * LOAD_SCALE;
        uint32_t factor_now;

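        /*
         * The Mach factor is a measure of spare CPU capacity: while CPUs are
         * idle it is (idle CPUs * LOAD_SCALE); once runnable threads
         * outnumber CPUs it shrinks as (ncpus * LOAD_SCALE) / (nthreads + 1).
         * For example (illustrative numbers, assuming LOAD_SCALE == 1000):
         * 4 CPUs with 2 runnable threads give 2000, while 4 CPUs with 10
         * runnable threads give 4000 / 11 ~= 363.
         */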
        if (nthreads > ncpus) {
                factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1);
        } else {
                factor_now = (ncpus - nthreads) * LOAD_SCALE;
        }

        /*
         * For those statistics that formerly relied on being recomputed
         * on timer ticks, advance by the approximate number of corresponding
         * elapsed intervals, thus compensating for potential idle intervals.
         */
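        /*
         * (Each pass below applies new = (4 * old + sample) / 5, i.e. an
         * exponential average that retains 4/5 of the previous value per
         * compensated interval.)
         */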
        for (uint32_t index = 0; index < stdelta; index++) {
                sched_mach_factor = ((sched_mach_factor << 2) + factor_now) / 5;
                sched_load_average = ((sched_load_average << 2) + average_now) / 5;
        }

        /*
         * Compute old-style Mach load averages.
         */
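        /*
         * avenrun[] and mach_factor[] decay toward the current sample using
         * the fract[] factors defined above; e.g. the 5-second average keeps
         * fract[0]/LOAD_SCALE (roughly 975/1000 under the assumptions noted
         * next to fract[]) of its old value on each iteration.
         */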
        for (uint32_t index = 0; index < stdelta; index++) {
                for (uint32_t i = 0; i < 3; i++) {
                        mach_factor[i] = ((mach_factor[i] * fract[i]) +
                            (factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;

                        avenrun[i] = ((avenrun[i] * fract[i]) +
                            (average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
                }
        }

        /*
         * Compute averages in other components.
         */
        uint64_t abstime = mach_absolute_time();

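        /*
         * Walk the sched_average[] table: any callback whose deadline has
         * passed is invoked once per elapsed period (capped at
         * SCHED_TICK_MAX_DELTA invocations, so a long idle stretch does not
         * trigger an unbounded number of catch-up calls), and its deadline
         * is then pushed one period past the current time.
         */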
        for (sched_average_t avg = sched_average; avg->comp != NULL; ++avg) {
                if (abstime >= avg->deadline) {
                        uint64_t period_abs = (avg->period * sched_one_second_interval);
                        uint64_t ninvokes = 1;

                        ninvokes += (abstime - avg->deadline) / period_abs;
                        ninvokes = MIN(ninvokes, SCHED_TICK_MAX_DELTA);

                        for (uint32_t index = 0; index < ninvokes; index++) {
                                (*avg->comp)(avg->param);
                        }
                        avg->deadline = abstime + period_abs;
                }
        }
}