/*
 * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 * Author: Avadis Tevanian, Jr.
 * Date:   1986
 *
 * Compute various averages.
 */

#include <mach/mach_types.h>

#include <kern/sched.h>
#include <kern/assert.h>
#include <kern/processor.h>
#include <kern/thread.h>
#if CONFIG_TELEMETRY
#include <kern/telemetry.h>
#endif
#include <kern/zalloc_internal.h>

#include <sys/kdebug.h>

uint32_t	avenrun[3] = {0, 0, 0};
uint32_t	mach_factor[3] = {0, 0, 0};

uint32_t	sched_load_average, sched_mach_factor;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
/*
 * Values are scaled by LOAD_SCALE, defined in processor_info.h
 */
#define base(n)		((n) << SCHED_TICK_SHIFT)
#define frac(n)		(((base(n) - 1) * LOAD_SCALE) / base(n))

static uint32_t		fract[3] = {
	frac(5),		/* 5 second average */
	frac(30),		/* 30 second average */
	frac(60),		/* 1 minute average */
};

#undef base
#undef frac

#endif /* CONFIG_SCHED_TIMESHARE_CORE */
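
/*
 * For illustration: frac(n) is the per-tick decay factor, scaled by
 * LOAD_SCALE, for an n-second average. Assuming the common values
 * SCHED_TICK_SHIFT == 3 (8 scheduler ticks per second) and
 * LOAD_SCALE == 1000:
 *
 *	base(5) = 5 << 3 = 40 ticks
 *	frac(5) = ((40 - 1) * 1000) / 40 = 975
 *
 * so each tick of the 5 second average keeps 975/1000 of the old value
 * and blends in 25/1000 of the new sample; see the avenrun/mach_factor
 * updates in compute_averages() below.
 */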

static unsigned int	sched_nrun;

typedef void	(*sched_avg_comp_t)(
	void		*param);

static struct sched_average {
	sched_avg_comp_t	comp;
	void			*param;
	int			period; /* in seconds */
	uint64_t		deadline;
} sched_average[] = {
	{ compute_averunnable, &sched_nrun, 5, 0 },
	{ compute_stack_target, NULL, 5, 1 },
	{ compute_pageout_gc_throttle, NULL, 1, 0 },
	{ compute_pmap_gc_throttle, NULL, 60, 0 },
	{ compute_zone_working_set_size, NULL, ZONE_WSS_UPDATE_PERIOD, 0 },
#if CONFIG_TELEMETRY
	{ compute_telemetry, NULL, 1, 0 },
#endif
	{ NULL, NULL, 0, 0 }
};

typedef struct sched_average *sched_average_t;

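/*
 * As a usage sketch (hypothetical entry, not part of the table above):
 * adding { compute_foo, NULL, 10, 0 }, with compute_foo a placeholder
 * routine of type sched_avg_comp_t, would cause compute_foo(NULL) to be
 * invoked roughly every 10 seconds from compute_averages(), with missed
 * periods replayed after idle; see the deadline loop at the end of this
 * file.
 */
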
/*
 * Scheduler load calculation algorithm
 *
 * The scheduler load values provide an estimate of the number of runnable
 * timeshare threads in the system at various priority bands. The load
 * ultimately affects the priority shifts applied to all threads in a band,
 * causing them to timeshare with other threads in the system. The load is
 * maintained in buckets, with each bucket corresponding to a priority band.
 *
 * Each runnable thread on the system contributes its load to its priority
 * band and to the bands above it. The contribution of a thread to the bands
 * above it is not strictly 1:1 and is weighted based on the priority band
 * of the thread. The rules of thread load contribution to each of its higher
 * bands are as follows:
 *
 * - DF threads: Up to (2 * NCPUs) threads
 * - UT threads: Up to NCPUs threads
 * - BG threads: Up to 1 thread
 *
 * To calculate the load values, the various run buckets are sampled (every
 * sched_load_compute_interval_abs) and the weighted contributions of the
 * lower bucket threads are added. The resultant value is plugged into an
 * exponentially weighted moving average formula:
 *
 *	new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count
 *	(where alpha < 1)
 *
 * The calculations for the scheduler load are done using fixpoint math with
 * a scale factor of 16 to avoid expensive divides and floating point
 * operations. The final load values are a smooth curve representative of
 * the actual number of runnable threads in a priority band.
 */
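
/*
 * Worked example of the contribution rules above (illustrative numbers,
 * not taken from a real trace): on a 4-CPU system with 3 FG, 10 DF, 5 UT
 * and 2 BG runnable timeshare threads, the sampled loads become:
 *
 *	FG: 3 + min(10, 2 * 4) + min(5, 4) + 1 = 16
 *	DF: 10 + min(5, 4) + 1                 = 15
 *	UT: 5 + 1                              = 6
 *	BG: 2
 *
 * matching the cascading additions performed in compute_sched_load().
 */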

/* Maintains the current (scaled for fixpoint) load in various buckets */
uint32_t sched_load[TH_BUCKET_MAX];

/*
 * Alpha factor for the EWMA algorithm. The current values are chosen as
 * 6:10 ("old load":"new samples") to make sure the scheduler reacts fast
 * enough to changing system load but does not see too many spikes from bursty
 * activity. The current values ensure that the scheduler would converge
 * to the latest load in 2-3 sched_load_compute_interval_abs intervals
 * (which amounts to ~30-45ms with current values).
 */
#define SCHED_LOAD_EWMA_ALPHA_OLD	6
#define SCHED_LOAD_EWMA_ALPHA_NEW	10
#define SCHED_LOAD_EWMA_ALPHA_SHIFT	4
static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT));

/* For fixpoint EWMA, round up the load to make it converge */
#define SCHED_LOAD_EWMA_ROUNDUP(load)	(((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0)

/* Macro to convert scaled sched load to a real load value */
#define SCHED_LOAD_EWMA_UNSCALE(load)	(((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load))
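
/*
 * For illustration, the fixpoint update performed in compute_sched_load()
 * is:
 *
 *	sched_load = (sched_load * 6 + (sample << 4) * 10) >> 4
 *
 * Starting from sched_load == 0 with a steady sample of 4 threads, the
 * scaled value converges as 0 -> 40 -> 55 -> 60 -> 62 -> 63, i.e. to
 * roughly (4 << 4). SCHED_LOAD_EWMA_UNSCALE(63) = (63 >> 4) + 1 = 4,
 * which is why the roundup term is needed for the unscaled load to
 * converge to the sample value instead of truncating to 3.
 */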

/*
 * Routine to capture the latest runnable counts and update sched_load (only used for non-clutch schedulers)
 */
void
compute_sched_load(void)
{
	/*
	 * Retrieve a snapshot of the current run counts.
	 *
	 * Why not a bcopy()? Because we need atomic word-sized reads of
	 * sched_run_buckets, not a byte-by-byte copy.
	 */
	uint32_t ncpus = processor_avail_count;
	uint32_t load_now[TH_BUCKET_MAX];

	load_now[TH_BUCKET_RUN] = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
	load_now[TH_BUCKET_FIXPRI] = os_atomic_load(&sched_run_buckets[TH_BUCKET_FIXPRI], relaxed);
	load_now[TH_BUCKET_SHARE_FG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_FG], relaxed);
	load_now[TH_BUCKET_SHARE_DF] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_DF], relaxed);
	load_now[TH_BUCKET_SHARE_UT] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_UT], relaxed);
	load_now[TH_BUCKET_SHARE_BG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_BG], relaxed);

	assert(load_now[TH_BUCKET_RUN] >= 0);
	assert(load_now[TH_BUCKET_FIXPRI] >= 0);

	uint32_t nthreads = load_now[TH_BUCKET_RUN];
	uint32_t nfixpri = load_now[TH_BUCKET_FIXPRI];

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) | DBG_FUNC_NONE,
	    load_now[TH_BUCKET_FIXPRI], (load_now[TH_BUCKET_SHARE_FG] + load_now[TH_BUCKET_SHARE_DF]),
	    load_now[TH_BUCKET_SHARE_BG], load_now[TH_BUCKET_SHARE_UT], 0);

	/*
	 * Compute the timeshare priority conversion factor based on loading.
	 * Because our counters may be incremented and accessed
	 * concurrently with respect to each other, we may have
	 * windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut)
	 * is broken, so truncate values in these cases.
	 */
	uint32_t timeshare_threads = (nthreads - nfixpri);
	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
		if (load_now[i] > timeshare_threads) {
			load_now[i] = timeshare_threads;
		}
	}

	/*
	 * Default threads contribute up to (NCPUS * 2) of load to FG threads
	 */
	if (load_now[TH_BUCKET_SHARE_DF] <= (ncpus * 2)) {
		load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_DF];
	} else {
		load_now[TH_BUCKET_SHARE_FG] += (ncpus * 2);
	}

	/*
	 * Utility threads contribute up to NCPUS of load to FG & DF threads
	 */
	if (load_now[TH_BUCKET_SHARE_UT] <= ncpus) {
		load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_UT];
		load_now[TH_BUCKET_SHARE_DF] += load_now[TH_BUCKET_SHARE_UT];
	} else {
		load_now[TH_BUCKET_SHARE_FG] += ncpus;
		load_now[TH_BUCKET_SHARE_DF] += ncpus;
	}

	/*
	 * BG threads contribute up to 1 thread worth of load to FG, DF and UT threads
	 */
	if (load_now[TH_BUCKET_SHARE_BG] > 0) {
		load_now[TH_BUCKET_SHARE_FG] += 1;
		load_now[TH_BUCKET_SHARE_DF] += 1;
		load_now[TH_BUCKET_SHARE_UT] += 1;
	}

	/*
	 * The conversion factor consists of two components:
	 * a fixed value based on the absolute time unit (sched_fixed_shift),
	 * and a dynamic portion based on load (sched_load_shifts).
	 *
	 * Zero load results in an out-of-range shift count.
	 */

	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
		uint32_t bucket_load = 0;

		if (load_now[i] > ncpus) {
			/* Normalize the load to number of CPUs */
			if (ncpus > 1) {
				bucket_load = load_now[i] / ncpus;
			} else {
				bucket_load = load_now[i];
			}

			if (bucket_load > MAX_LOAD) {
				bucket_load = MAX_LOAD;
			}
		}
		/* Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load */
		sched_load[i] = (sched_load[i] * SCHED_LOAD_EWMA_ALPHA_OLD) + ((bucket_load << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW);
		sched_load[i] = sched_load[i] >> SCHED_LOAD_EWMA_ALPHA_SHIFT;
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD_EFFECTIVE) | DBG_FUNC_NONE,
	    SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_FG]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_DF]),
	    SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_UT]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_BG]), 0);
}

void
compute_averages(uint64_t stdelta)
{
	uint32_t nthreads = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) - 1;
	uint32_t ncpus = processor_avail_count;

	/* Update the global pri_shifts based on the latest values */
	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
		uint32_t bucket_load = SCHED_LOAD_EWMA_UNSCALE(sched_load[i]);
		uint32_t shift = sched_fixed_shift - sched_load_shifts[bucket_load];

		if (shift > SCHED_PRI_SHIFT_MAX) {
			sched_pri_shifts[i] = INT8_MAX;
		} else {
			sched_pri_shifts[i] = shift;
		}
	}
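	/*
	 * Note: a heavier bucket load yields a smaller shift. The timeshare
	 * scheduler elsewhere subtracts (sched_usage >> pri_shift) from a
	 * thread's base priority, so smaller shifts depress timeshare
	 * priorities more aggressively under load.
	 */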

	/*
	 * Sample total running threads for the load average calculation.
	 */
	sched_nrun = nthreads;

	/*
	 * Load average and mach factor calculations for
	 * those who ask about these things.
	 */
	uint32_t average_now = nthreads * LOAD_SCALE;
	uint32_t factor_now;

	if (nthreads > ncpus) {
		factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1);
	} else {
		factor_now = (ncpus - nthreads) * LOAD_SCALE;
	}
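
	/*
	 * Illustrative numbers (assuming LOAD_SCALE == 1000, its value in
	 * mach/processor_info.h): with 4 CPUs and 2 runnable threads,
	 * factor_now = (4 - 2) * 1000 = 2000, i.e. two "free" CPUs; with
	 * 8 runnable threads it becomes (4 * 1000) / 9 ~= 444.
	 */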

	/*
	 * For those statistics that formerly relied on being recomputed
	 * on timer ticks, advance by the approximate number of corresponding
	 * elapsed intervals, thus compensating for potential idle intervals.
	 */
	for (uint32_t index = 0; index < stdelta; index++) {
		sched_mach_factor = ((sched_mach_factor << 2) + factor_now) / 5;
		sched_load_average = ((sched_load_average << 2) + average_now) / 5;
	}

	/*
	 * Compute old-style Mach load averages.
	 */
	for (uint32_t index = 0; index < stdelta; index++) {
		for (uint32_t i = 0; i < 3; i++) {
			mach_factor[i] = ((mach_factor[i] * fract[i]) +
			    (factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;

			avenrun[i] = ((avenrun[i] * fract[i]) +
			    (average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
		}
	}

	/*
	 * Compute averages in other components.
	 */
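	/*
	 * For example, an entry with a 5 second period whose deadline
	 * passed 12 seconds ago is invoked 1 + (12 / 5) = 3 times below,
	 * replaying the periods missed while idle (capped at
	 * SCHED_TICK_MAX_DELTA).
	 */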
	uint64_t abstime = mach_absolute_time();

	for (sched_average_t avg = sched_average; avg->comp != NULL; ++avg) {
		if (abstime >= avg->deadline) {
			uint64_t period_abs = (avg->period * sched_one_second_interval);
			uint64_t ninvokes = 1;

			ninvokes += (abstime - avg->deadline) / period_abs;
			ninvokes = MIN(ninvokes, SCHED_TICK_MAX_DELTA);

			for (uint32_t index = 0; index < ninvokes; index++) {
				(*avg->comp)(avg->param);
			}
			avg->deadline = abstime + period_abs;
		}
	}
}