2 * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
46 * Carnegie Mellon requests users of this software to return to
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
59 * Author: Avadis Tevanian, Jr.
62 * Compute various averages.
65 #include <mach/mach_types.h>
67 #include <kern/sched.h>
68 #include <kern/assert.h>
69 #include <kern/processor.h>
70 #include <kern/thread.h>
72 #include <kern/telemetry.h>
75 #include <sys/kdebug.h>
/*
 * Load statistics maintained by compute_averages(). The samples folded into
 * them are multiplied by LOAD_SCALE, so the stored values are scaled likewise.
 *
 * avenrun[] and mach_factor[] hold the old-style Mach load average and Mach
 * factor. Slot i is decayed using fract[i], whose entries are the 5 second,
 * 30 second and 1 minute averaging factors.
 */
77 uint32_t avenrun
[3] = {0, 0, 0};
78 uint32_t mach_factor
[3] = {0, 0, 0};
/*
 * Short-term smoothed load average and Mach factor. For each elapsed
 * interval compute_averages() weights the previous value 4:1 against the
 * current sample: new = ((old << 2) + sample) / 5.
 */
80 uint32_t sched_load_average
, sched_mach_factor
;
82 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
84 * Values are scaled by LOAD_SCALE, defined in processor_info.h
86 #define base(n) ((n) << SCHED_TICK_SHIFT)
87 #define frac(n) (((base(n) - 1) * LOAD_SCALE) / base(n))
89 static uint32_t fract
[3] = {
90 frac(5), /* 5 second average */
91 frac(30), /* 30 second average */
92 frac(60), /* 1 minute average */
98 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
/*
 * Most recent running-thread sample taken by compute_averages():
 * sched_run_buckets[TH_BUCKET_RUN] minus one. Its address appears as the
 * second initializer of the compute_averunnable entry in sched_average[].
 */
100 static unsigned int sched_nrun
;
102 typedef void (*sched_avg_comp_t
)(
105 static struct sched_average
{
106 sched_avg_comp_t comp
;
108 int period
; /* in seconds */
110 } sched_average
[] = {
111 { compute_averunnable
, &sched_nrun
, 5, 0 },
112 { compute_stack_target
, NULL
, 5, 1 },
113 { compute_pageout_gc_throttle
, NULL
, 1, 0 },
114 { compute_pmap_gc_throttle
, NULL
, 60, 0 },
116 { compute_telemetry
, NULL
, 1, 0 },
121 typedef struct sched_average
*sched_average_t
;
124 * Scheduler load calculation algorithm
126 * The scheduler load values provide an estimate of the number of runnable
127 * timeshare threads in the system at various priority bands. The load
128 * ultimately affects the priority shifts applied to all threads in a band
129 * causing them to timeshare with other threads in the system. The load is
130 * maintained in buckets, with each bucket corresponding to a priority band.
132 * Each runnable thread on the system contributes its load to its priority
133 * band and to the bands above it. The contribution of a thread to the bands
134 * above it is not strictly 1:1 and is weighted based on the priority band
135 * of the thread. The rules of thread load contribution to each of its higher
136 * bands are as follows:
138 * - DF threads: Up to (2 * NCPUs) threads
139 * - UT threads: Up to NCPUs threads
140 * - BG threads: Up to 1 thread
142 * To calculate the load values, the various run buckets are sampled (every
143 * sched_load_compute_interval_abs) and the weighted contributions of the
144 * lower bucket threads are added. The resultant value is plugged into an
145 * exponentially weighted moving average formula:
146 * new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count
148 * The calculations for the scheduler load are done using fixpoint math with
149 * a scale factor of 16 to avoid expensive divides and floating point
150 * operations. The final load values are a smooth curve representative of
151 * the actual number of runnable threads in a priority band.
154 /* Maintains the current (scaled for fixpoint) load in various buckets */
155 uint32_t sched_load
[TH_BUCKET_MAX
];
158 * Alpha factor for the EWMA algorithm. The current values are chosen as
159 * 6:10 ("old load":"new samples") to make sure the scheduler reacts fast
160 * enough to changing system load but does not see too many spikes from bursty
161 * activity. The current values ensure that the scheduler would converge
162 * to the latest load in 2-3 sched_load_compute_interval_abs intervals
163 * (which amounts to ~30-45ms with current values).
165 #define SCHED_LOAD_EWMA_ALPHA_OLD 6
166 #define SCHED_LOAD_EWMA_ALPHA_NEW 10
167 #define SCHED_LOAD_EWMA_ALPHA_SHIFT 4
168 static_assert((SCHED_LOAD_EWMA_ALPHA_OLD
+ SCHED_LOAD_EWMA_ALPHA_NEW
) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT
));
/*
 * SCHED_LOAD_EWMA_ROUNDUP(load) evaluates to 1 when bit
 * (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1) of the scaled load is set, i.e. when the
 * fractional part is at least one half, and to 0 otherwise.
 */
170 /* For fixpoint EWMA, round up the load to make it converge */
171 #define SCHED_LOAD_EWMA_ROUNDUP(load) (((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0)
173 /* Macro to convert scaled sched load to a real load value: the integer part (load >> shift) plus the round-up bit */
174 #define SCHED_LOAD_EWMA_UNSCALE(load) (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load))
177 * Routine to capture the latest runnable counts and update sched_load */
179 compute_sched_load(void)
182 * Retrieve a snapshot of the current run counts.
184 * Why not a bcopy()? Because we need atomic word-sized reads of sched_run_buckets,
185 * not a byte-by-byte copy.
187 uint32_t ncpus
= processor_avail_count
;
188 uint32_t load_now
[TH_BUCKET_MAX
];
190 load_now
[TH_BUCKET_RUN
] = sched_run_buckets
[TH_BUCKET_RUN
];
191 load_now
[TH_BUCKET_FIXPRI
] = sched_run_buckets
[TH_BUCKET_FIXPRI
];
192 load_now
[TH_BUCKET_SHARE_FG
] = sched_run_buckets
[TH_BUCKET_SHARE_FG
];
193 load_now
[TH_BUCKET_SHARE_DF
] = sched_run_buckets
[TH_BUCKET_SHARE_DF
];
194 load_now
[TH_BUCKET_SHARE_UT
] = sched_run_buckets
[TH_BUCKET_SHARE_UT
];
195 load_now
[TH_BUCKET_SHARE_BG
] = sched_run_buckets
[TH_BUCKET_SHARE_BG
];
197 assert(load_now
[TH_BUCKET_RUN
] >= 0);
198 assert(load_now
[TH_BUCKET_FIXPRI
] >= 0);
200 uint32_t nthreads
= load_now
[TH_BUCKET_RUN
];
201 uint32_t nfixpri
= load_now
[TH_BUCKET_FIXPRI
];
203 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE
,
204 MACHDBG_CODE(DBG_MACH_SCHED
, MACH_SCHED_LOAD
) | DBG_FUNC_NONE
,
205 load_now
[TH_BUCKET_FIXPRI
], (load_now
[TH_BUCKET_SHARE_FG
] + load_now
[TH_BUCKET_SHARE_DF
]),
206 load_now
[TH_BUCKET_SHARE_BG
], load_now
[TH_BUCKET_SHARE_UT
], 0);
209 * Compute the timeshare priority conversion factor based on loading.
210 * Because our counters may be incremented and accessed
211 * concurrently with respect to each other, we may have
212 * windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut)
213 * is broken, so truncate values in these cases.
215 uint32_t timeshare_threads
= (nthreads
- nfixpri
);
216 for (uint32_t i
= TH_BUCKET_SHARE_FG
; i
<= TH_BUCKET_SHARE_BG
; i
++) {
217 if (load_now
[i
] > timeshare_threads
)
218 load_now
[i
] = timeshare_threads
;
222 * Default threads contribute up to (NCPUS * 2) of load to FG threads
224 if (load_now
[TH_BUCKET_SHARE_DF
] <= (ncpus
* 2)) {
225 load_now
[TH_BUCKET_SHARE_FG
] += load_now
[TH_BUCKET_SHARE_DF
];
227 load_now
[TH_BUCKET_SHARE_FG
] += (ncpus
* 2);
231 * Utility threads contribute up to NCPUS of load to FG & DF threads
233 if (load_now
[TH_BUCKET_SHARE_UT
] <= ncpus
) {
234 load_now
[TH_BUCKET_SHARE_FG
] += load_now
[TH_BUCKET_SHARE_UT
];
235 load_now
[TH_BUCKET_SHARE_DF
] += load_now
[TH_BUCKET_SHARE_UT
];
237 load_now
[TH_BUCKET_SHARE_FG
] += ncpus
;
238 load_now
[TH_BUCKET_SHARE_DF
] += ncpus
;
242 * BG threads contribute up to 1 thread worth of load to FG, DF and UT threads
244 if (load_now
[TH_BUCKET_SHARE_BG
] > 0) {
245 load_now
[TH_BUCKET_SHARE_FG
] += 1;
246 load_now
[TH_BUCKET_SHARE_DF
] += 1;
247 load_now
[TH_BUCKET_SHARE_UT
] += 1;
251 * The conversion factor consists of two components:
252 * a fixed value based on the absolute time unit (sched_fixed_shift),
253 * and a dynamic portion based on load (sched_load_shifts).
255 * Zero load results in an out-of-range shift count.
258 for (uint32_t i
= TH_BUCKET_SHARE_FG
; i
<= TH_BUCKET_SHARE_BG
; i
++) {
259 uint32_t bucket_load
= 0;
261 if (load_now
[i
] > ncpus
) {
262 /* Normalize the load to number of CPUs */
264 bucket_load
= load_now
[i
] / ncpus
;
266 bucket_load
= load_now
[i
];
268 if (bucket_load
> MAX_LOAD
)
269 bucket_load
= MAX_LOAD
;
271 /* Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load */
272 sched_load
[i
] = (sched_load
[i
] * SCHED_LOAD_EWMA_ALPHA_OLD
) + ((bucket_load
<< SCHED_LOAD_EWMA_ALPHA_SHIFT
) * SCHED_LOAD_EWMA_ALPHA_NEW
);
273 sched_load
[i
] = sched_load
[i
] >> SCHED_LOAD_EWMA_ALPHA_SHIFT
;
276 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE
,
277 MACHDBG_CODE(DBG_MACH_SCHED
, MACH_SCHED_LOAD_EFFECTIVE
) | DBG_FUNC_NONE
,
278 SCHED_LOAD_EWMA_UNSCALE(sched_load
[TH_BUCKET_SHARE_FG
]), SCHED_LOAD_EWMA_UNSCALE(sched_load
[TH_BUCKET_SHARE_DF
]),
279 SCHED_LOAD_EWMA_UNSCALE(sched_load
[TH_BUCKET_SHARE_UT
]), SCHED_LOAD_EWMA_UNSCALE(sched_load
[TH_BUCKET_SHARE_BG
]), 0);
283 compute_averages(uint64_t stdelta
)
286 uint32_t nthreads
= sched_run_buckets
[TH_BUCKET_RUN
] - 1;
287 uint32_t ncpus
= processor_avail_count
;
289 /* Update the global pri_shifts based on the latest values */
290 for (uint32_t i
= TH_BUCKET_SHARE_FG
; i
<= TH_BUCKET_SHARE_BG
; i
++) {
291 uint32_t bucket_load
= SCHED_LOAD_EWMA_UNSCALE(sched_load
[i
]);
292 sched_pri_shifts
[i
] = sched_fixed_shift
- sched_load_shifts
[bucket_load
];
296 * Sample total running threads for the load average calculation.
298 sched_nrun
= nthreads
;
301 * Load average and mach factor calculations for
302 * those which ask about these things.
304 uint32_t average_now
= nthreads
* LOAD_SCALE
;
307 if (nthreads
> ncpus
)
308 factor_now
= (ncpus
* LOAD_SCALE
) / (nthreads
+ 1);
310 factor_now
= (ncpus
- nthreads
) * LOAD_SCALE
;
313 * For those statistics that formerly relied on being recomputed
314 * on timer ticks, advance by the approximate number of corresponding
315 * elapsed intervals, thus compensating for potential idle intervals.
317 for (uint32_t index
= 0; index
< stdelta
; index
++) {
318 sched_mach_factor
= ((sched_mach_factor
<< 2) + factor_now
) / 5;
319 sched_load_average
= ((sched_load_average
<< 2) + average_now
) / 5;
323 * Compute old-style Mach load averages.
325 for (uint32_t index
= 0; index
< stdelta
; index
++) {
326 for (uint32_t i
= 0; i
< 3; i
++) {
327 mach_factor
[i
] = ((mach_factor
[i
] * fract
[i
]) +
328 (factor_now
* (LOAD_SCALE
- fract
[i
]))) / LOAD_SCALE
;
330 avenrun
[i
] = ((avenrun
[i
] * fract
[i
]) +
331 (average_now
* (LOAD_SCALE
- fract
[i
]))) / LOAD_SCALE
;
336 * Compute averages in other components.
338 uint64_t abstime
= mach_absolute_time();
340 for (sched_average_t avg
= sched_average
; avg
->comp
!= NULL
; ++avg
) {
341 if (abstime
>= avg
->deadline
) {
342 uint64_t period_abs
= (avg
->period
* sched_one_second_interval
);
343 uint64_t ninvokes
= 1;
345 ninvokes
+= (abstime
- avg
->deadline
) / period_abs
;
346 ninvokes
= MIN(ninvokes
, SCHED_TICK_MAX_DELTA
);
348 for (uint32_t index
= 0; index
< ninvokes
; index
++) {
349 (*avg
->comp
)(avg
->param
);
351 avg
->deadline
= abstime
+ period_abs
;