/*
 * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 *	Author: Avadis Tevanian, Jr.
 *
 *	Compute various averages.
 */
65 #include <mach/mach_types.h>
67 #include <kern/sched.h>
68 #include <kern/assert.h>
69 #include <kern/processor.h>
70 #include <kern/thread.h>
72 #include <kern/telemetry.h>
75 #include <sys/kdebug.h>
77 uint32_t avenrun
[3] = {0, 0, 0};
78 uint32_t mach_factor
[3] = {0, 0, 0};
80 uint32_t sched_load_average
, sched_mach_factor
;
#if defined(CONFIG_SCHED_TIMESHARE_CORE)
/*
 * Per-tick exponential decay weights for the 5s/30s/1m averages.
 * Values are scaled by LOAD_SCALE, defined in processor_info.h.
 * base(n) converts an n-second window into scheduler ticks; frac(n) is
 * the fixed-point fraction (base-1)/base retained from the old average
 * on each tick, so an n-second window decays at the right rate.
 */
#define base(n)         ((n) << SCHED_TICK_SHIFT)
#define frac(n)         (((base(n) - 1) * LOAD_SCALE) / base(n))

static uint32_t fract[3] = {
	frac(5),                /* 5 second average */
	frac(30),               /* 30 second average */
	frac(60),               /* 1 minute average */
};

/* Helper macros are file-internal; don't leak the lowercase names. */
#undef base
#undef frac

#endif /* CONFIG_SCHED_TIMESHARE_CORE */
100 static unsigned int sched_nrun
;
102 typedef void (*sched_avg_comp_t
)(
105 static struct sched_average
{
106 sched_avg_comp_t comp
;
108 int period
; /* in seconds */
110 } sched_average
[] = {
111 { compute_averunnable
, &sched_nrun
, 5, 0 },
112 { compute_stack_target
, NULL
, 5, 1 },
113 { compute_pageout_gc_throttle
, NULL
, 1, 0 },
114 { compute_pmap_gc_throttle
, NULL
, 60, 0 },
116 { compute_telemetry
, NULL
, 1, 0 },
121 typedef struct sched_average
*sched_average_t
;
124 * Scheduler load calculation algorithm
126 * The scheduler load values provide an estimate of the number of runnable
127 * timeshare threads in the system at various priority bands. The load
128 * ultimately affects the priority shifts applied to all threads in a band
129 * causing them to timeshare with other threads in the system. The load is
130 * maintained in buckets, with each bucket corresponding to a priority band.
132 * Each runnable thread on the system contributes its load to its priority
133 * band and to the bands above it. The contribution of a thread to the bands
134 * above it is not strictly 1:1 and is weighted based on the priority band
135 * of the thread. The rules of thread load contribution to each of its higher
136 * bands are as follows:
138 * - DF threads: Upto (2 * NCPUs) threads
139 * - UT threads: Upto NCPUs threads
140 * - BG threads: Upto 1 thread
142 * To calculate the load values, the various run buckets are sampled (every
143 * sched_load_compute_interval_abs) and the weighted contributions of the the
144 * lower bucket threads are added. The resultant value is plugged into an
145 * exponentially weighted moving average formula:
146 * new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count
148 * The calculations for the scheduler load are done using fixpoint math with
149 * a scale factor of 16 to avoid expensive divides and floating point
150 * operations. The final load values are a smooth curve representative of
151 * the actual number of runnable threads in a priority band.
154 /* Maintains the current (scaled for fixpoint) load in various buckets */
155 uint32_t sched_load
[TH_BUCKET_MAX
];
158 * Alpha factor for the EWMA alogrithm. The current values are chosen as
159 * 6:10 ("old load":"new samples") to make sure the scheduler reacts fast
160 * enough to changing system load but does not see too many spikes from bursty
161 * activity. The current values ensure that the scheduler would converge
162 * to the latest load in 2-3 sched_load_compute_interval_abs intervals
163 * (which amounts to ~30-45ms with current values).
165 #define SCHED_LOAD_EWMA_ALPHA_OLD 6
166 #define SCHED_LOAD_EWMA_ALPHA_NEW 10
167 #define SCHED_LOAD_EWMA_ALPHA_SHIFT 4
168 static_assert((SCHED_LOAD_EWMA_ALPHA_OLD
+ SCHED_LOAD_EWMA_ALPHA_NEW
) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT
));
170 /* For fixpoint EWMA, roundup the load to make it converge */
171 #define SCHED_LOAD_EWMA_ROUNDUP(load) (((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0)
173 /* Macro to convert scaled sched load to a real load value */
174 #define SCHED_LOAD_EWMA_UNSCALE(load) (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load))
177 * Routine to capture the latest runnable counts and update sched_load (only used for non-clutch schedulers)
180 compute_sched_load(void)
183 * Retrieve a snapshot of the current run counts.
185 * Why not a bcopy()? Because we need atomic word-sized reads of sched_run_buckets,
186 * not byte-by-byte copy.
188 uint32_t ncpus
= processor_avail_count
;
189 uint32_t load_now
[TH_BUCKET_MAX
];
191 load_now
[TH_BUCKET_RUN
] = os_atomic_load(&sched_run_buckets
[TH_BUCKET_RUN
], relaxed
);
192 load_now
[TH_BUCKET_FIXPRI
] = os_atomic_load(&sched_run_buckets
[TH_BUCKET_FIXPRI
], relaxed
);
193 load_now
[TH_BUCKET_SHARE_FG
] = os_atomic_load(&sched_run_buckets
[TH_BUCKET_SHARE_FG
], relaxed
);
194 load_now
[TH_BUCKET_SHARE_DF
] = os_atomic_load(&sched_run_buckets
[TH_BUCKET_SHARE_DF
], relaxed
);
195 load_now
[TH_BUCKET_SHARE_UT
] = os_atomic_load(&sched_run_buckets
[TH_BUCKET_SHARE_UT
], relaxed
);
196 load_now
[TH_BUCKET_SHARE_BG
] = os_atomic_load(&sched_run_buckets
[TH_BUCKET_SHARE_BG
], relaxed
);
198 assert(load_now
[TH_BUCKET_RUN
] >= 0);
199 assert(load_now
[TH_BUCKET_FIXPRI
] >= 0);
201 uint32_t nthreads
= load_now
[TH_BUCKET_RUN
];
202 uint32_t nfixpri
= load_now
[TH_BUCKET_FIXPRI
];
204 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE
,
205 MACHDBG_CODE(DBG_MACH_SCHED
, MACH_SCHED_LOAD
) | DBG_FUNC_NONE
,
206 load_now
[TH_BUCKET_FIXPRI
], (load_now
[TH_BUCKET_SHARE_FG
] + load_now
[TH_BUCKET_SHARE_DF
]),
207 load_now
[TH_BUCKET_SHARE_BG
], load_now
[TH_BUCKET_SHARE_UT
], 0);
210 * Compute the timeshare priority conversion factor based on loading.
211 * Because our counters may be incremented and accessed
212 * concurrently with respect to each other, we may have
213 * windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut)
214 * is broken, so truncate values in these cases.
216 uint32_t timeshare_threads
= (nthreads
- nfixpri
);
217 for (uint32_t i
= TH_BUCKET_SHARE_FG
; i
<= TH_BUCKET_SHARE_BG
; i
++) {
218 if (load_now
[i
] > timeshare_threads
) {
219 load_now
[i
] = timeshare_threads
;
224 * Default threads contribute up to (NCPUS * 2) of load to FG threads
226 if (load_now
[TH_BUCKET_SHARE_DF
] <= (ncpus
* 2)) {
227 load_now
[TH_BUCKET_SHARE_FG
] += load_now
[TH_BUCKET_SHARE_DF
];
229 load_now
[TH_BUCKET_SHARE_FG
] += (ncpus
* 2);
233 * Utility threads contribute up to NCPUS of load to FG & DF threads
235 if (load_now
[TH_BUCKET_SHARE_UT
] <= ncpus
) {
236 load_now
[TH_BUCKET_SHARE_FG
] += load_now
[TH_BUCKET_SHARE_UT
];
237 load_now
[TH_BUCKET_SHARE_DF
] += load_now
[TH_BUCKET_SHARE_UT
];
239 load_now
[TH_BUCKET_SHARE_FG
] += ncpus
;
240 load_now
[TH_BUCKET_SHARE_DF
] += ncpus
;
244 * BG threads contribute up to 1 thread worth of load to FG, DF and UT threads
246 if (load_now
[TH_BUCKET_SHARE_BG
] > 0) {
247 load_now
[TH_BUCKET_SHARE_FG
] += 1;
248 load_now
[TH_BUCKET_SHARE_DF
] += 1;
249 load_now
[TH_BUCKET_SHARE_UT
] += 1;
253 * The conversion factor consists of two components:
254 * a fixed value based on the absolute time unit (sched_fixed_shift),
255 * and a dynamic portion based on load (sched_load_shifts).
257 * Zero load results in a out of range shift count.
260 for (uint32_t i
= TH_BUCKET_SHARE_FG
; i
<= TH_BUCKET_SHARE_BG
; i
++) {
261 uint32_t bucket_load
= 0;
263 if (load_now
[i
] > ncpus
) {
264 /* Normalize the load to number of CPUs */
266 bucket_load
= load_now
[i
] / ncpus
;
268 bucket_load
= load_now
[i
];
271 if (bucket_load
> MAX_LOAD
) {
272 bucket_load
= MAX_LOAD
;
275 /* Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load */
276 sched_load
[i
] = (sched_load
[i
] * SCHED_LOAD_EWMA_ALPHA_OLD
) + ((bucket_load
<< SCHED_LOAD_EWMA_ALPHA_SHIFT
) * SCHED_LOAD_EWMA_ALPHA_NEW
);
277 sched_load
[i
] = sched_load
[i
] >> SCHED_LOAD_EWMA_ALPHA_SHIFT
;
280 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE
,
281 MACHDBG_CODE(DBG_MACH_SCHED
, MACH_SCHED_LOAD_EFFECTIVE
) | DBG_FUNC_NONE
,
282 SCHED_LOAD_EWMA_UNSCALE(sched_load
[TH_BUCKET_SHARE_FG
]), SCHED_LOAD_EWMA_UNSCALE(sched_load
[TH_BUCKET_SHARE_DF
]),
283 SCHED_LOAD_EWMA_UNSCALE(sched_load
[TH_BUCKET_SHARE_UT
]), SCHED_LOAD_EWMA_UNSCALE(sched_load
[TH_BUCKET_SHARE_BG
]), 0);
287 compute_averages(uint64_t stdelta
)
289 uint32_t nthreads
= os_atomic_load(&sched_run_buckets
[TH_BUCKET_RUN
], relaxed
) - 1;
290 uint32_t ncpus
= processor_avail_count
;
292 /* Update the global pri_shifts based on the latest values */
293 for (uint32_t i
= TH_BUCKET_SHARE_FG
; i
<= TH_BUCKET_SHARE_BG
; i
++) {
294 uint32_t bucket_load
= SCHED_LOAD_EWMA_UNSCALE(sched_load
[i
]);
295 sched_pri_shifts
[i
] = sched_fixed_shift
- sched_load_shifts
[bucket_load
];
299 * Sample total running threads for the load average calculation.
301 sched_nrun
= nthreads
;
304 * Load average and mach factor calculations for
305 * those which ask about these things.
307 uint32_t average_now
= nthreads
* LOAD_SCALE
;
310 if (nthreads
> ncpus
) {
311 factor_now
= (ncpus
* LOAD_SCALE
) / (nthreads
+ 1);
313 factor_now
= (ncpus
- nthreads
) * LOAD_SCALE
;
317 * For those statistics that formerly relied on being recomputed
318 * on timer ticks, advance by the approximate number of corresponding
319 * elapsed intervals, thus compensating for potential idle intervals.
321 for (uint32_t index
= 0; index
< stdelta
; index
++) {
322 sched_mach_factor
= ((sched_mach_factor
<< 2) + factor_now
) / 5;
323 sched_load_average
= ((sched_load_average
<< 2) + average_now
) / 5;
327 * Compute old-style Mach load averages.
329 for (uint32_t index
= 0; index
< stdelta
; index
++) {
330 for (uint32_t i
= 0; i
< 3; i
++) {
331 mach_factor
[i
] = ((mach_factor
[i
] * fract
[i
]) +
332 (factor_now
* (LOAD_SCALE
- fract
[i
]))) / LOAD_SCALE
;
334 avenrun
[i
] = ((avenrun
[i
] * fract
[i
]) +
335 (average_now
* (LOAD_SCALE
- fract
[i
]))) / LOAD_SCALE
;
340 * Compute averages in other components.
342 uint64_t abstime
= mach_absolute_time();
344 for (sched_average_t avg
= sched_average
; avg
->comp
!= NULL
; ++avg
) {
345 if (abstime
>= avg
->deadline
) {
346 uint64_t period_abs
= (avg
->period
* sched_one_second_interval
);
347 uint64_t ninvokes
= 1;
349 ninvokes
+= (abstime
- avg
->deadline
) / period_abs
;
350 ninvokes
= MIN(ninvokes
, SCHED_TICK_MAX_DELTA
);
352 for (uint32_t index
= 0; index
< ninvokes
; index
++) {
353 (*avg
->comp
)(avg
->param
);
355 avg
->deadline
= abstime
+ period_abs
;