/*
 * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 *	Author:	Avadis Tevanian, Jr.
 *
 *	Compute various averages.
 */
#include <mach/mach_types.h>

#include <kern/sched.h>
#include <kern/assert.h>
#include <kern/processor.h>
#include <kern/thread.h>

#include <kern/telemetry.h>

#include <kern/zalloc_internal.h>

#include <sys/kdebug.h>
uint32_t avenrun[3] = {0, 0, 0};
uint32_t mach_factor[3] = {0, 0, 0};

uint32_t sched_load_average, sched_mach_factor;
#if defined(CONFIG_SCHED_TIMESHARE_CORE)
/*
 * Values are scaled by LOAD_SCALE, defined in processor_info.h
 */
#define base(n)    ((n) << SCHED_TICK_SHIFT)
#define frac(n)    (((base(n) - 1) * LOAD_SCALE) / base(n))

static uint32_t fract[3] = {
    frac(5),                /* 5 second average */
    frac(30),               /* 30 second average */
    frac(60),               /* 1 minute average */
};

#endif /* CONFIG_SCHED_TIMESHARE_CORE */
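/*
 * Worked example (a sketch, assuming the usual SCHED_TICK_SHIFT of 3 and the
 * LOAD_SCALE of 1000 from processor_info.h): base(5) = 5 << 3 = 40, so
 * frac(5) = ((40 - 1) * 1000) / 40 = 975. Each scheduler tick then keeps
 * 975/1000 of the old 5-second average and blends in 25/1000 of the new
 * sample, which is how the avenrun/mach_factor loops in compute_averages()
 * below decay toward the current load.
 */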
static unsigned int sched_nrun;
typedef void (*sched_avg_comp_t)(
    void        *param);

static struct sched_average {
    sched_avg_comp_t    comp;
    void                *param;
    int                 period; /* in seconds */
    uint64_t            deadline;
} sched_average[] = {
    { compute_averunnable, &sched_nrun, 5, 0 },
    { compute_stack_target, NULL, 5, 1 },
    { compute_pageout_gc_throttle, NULL, 1, 0 },
    { compute_pmap_gc_throttle, NULL, 60, 0 },
    { compute_zone_working_set_size, NULL, ZONE_WSS_UPDATE_PERIOD, 0 },
    { compute_telemetry, NULL, 1, 0 },
    { NULL, NULL, 0, 0 },
};

typedef struct sched_average *sched_average_t;
/*
 * Scheduler load calculation algorithm
 *
 * The scheduler load values provide an estimate of the number of runnable
 * timeshare threads in the system at various priority bands. The load
 * ultimately affects the priority shifts applied to all threads in a band,
 * causing them to timeshare with other threads in the system. The load is
 * maintained in buckets, with each bucket corresponding to a priority band.
 *
 * Each runnable thread on the system contributes its load to its priority
 * band and to the bands above it. The contribution of a thread to the bands
 * above it is not strictly 1:1 and is weighted based on the priority band
 * of the thread. The rules of thread load contribution to each of its higher
 * bands are as follows:
 *
 * - DF threads: up to (2 * NCPUs) threads
 * - UT threads: up to NCPUs threads
 * - BG threads: up to 1 thread
 *
 * To calculate the load values, the various run buckets are sampled (every
 * sched_load_compute_interval_abs) and the weighted contributions of the
 * lower bucket threads are added. The resultant value is plugged into an
 * exponentially weighted moving average formula:
 *      new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count
 * (where alpha < 1)
 * The calculations for the scheduler load are done using fixpoint math with
 * a scale factor of 16 to avoid expensive divides and floating point
 * operations. The final load values are a smooth curve representative of
 * the actual number of runnable threads in a priority band.
 */
/* Maintains the current (scaled for fixpoint) load in various buckets */
uint32_t sched_load[TH_BUCKET_MAX];
/*
 * Alpha factor for the EWMA algorithm. The current values are chosen as
 * 6:10 ("old load":"new samples") to make sure the scheduler reacts fast
 * enough to changing system load but does not see too many spikes from bursty
 * activity. The current values ensure that the scheduler would converge
 * to the latest load in 2-3 sched_load_compute_interval_abs intervals
 * (which amounts to ~30-45ms with current values).
 */
#define SCHED_LOAD_EWMA_ALPHA_OLD      6
#define SCHED_LOAD_EWMA_ALPHA_NEW      10
#define SCHED_LOAD_EWMA_ALPHA_SHIFT    4
static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT));

/* For fixpoint EWMA, round up the load to make it converge */
#define SCHED_LOAD_EWMA_ROUNDUP(load)   (((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0)

/* Macro to convert scaled sched load to a real load value */
#define SCHED_LOAD_EWMA_UNSCALE(load)   (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load))
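/*
 * Illustrative sketch only (not part of the scheduler, and the function name
 * is hypothetical): one EWMA update step with the 6:10 fixpoint weighting
 * described above. Starting from a scaled load of 32 (i.e. 2 threads) and a
 * sample of 5 runnable threads, it returns ((32 * 6) + ((5 << 4) * 10)) >> 4
 * = 62, and SCHED_LOAD_EWMA_UNSCALE(62) then reports a load of 4, converging
 * toward 5 over the next few intervals.
 */
static inline uint32_t
sched_load_ewma_step_example(uint32_t old_load_scaled, uint32_t new_sample)
{
    /* Weight the old (already scaled) load and the newly scaled sample, then renormalize */
    uint32_t next = (old_load_scaled * SCHED_LOAD_EWMA_ALPHA_OLD) +
        ((new_sample << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW);
    return next >> SCHED_LOAD_EWMA_ALPHA_SHIFT;
}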
/*
 * Routine to capture the latest runnable counts and update sched_load (only used for non-clutch schedulers)
 */
void
compute_sched_load(void)
{
    /*
     * Retrieve a snapshot of the current run counts.
     *
     * Why not a bcopy()? Because we need atomic word-sized reads of sched_run_buckets,
     * not byte-by-byte copy.
     */
    uint32_t ncpus = processor_avail_count;
    uint32_t load_now[TH_BUCKET_MAX];

    load_now[TH_BUCKET_RUN]      = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
    load_now[TH_BUCKET_FIXPRI]   = os_atomic_load(&sched_run_buckets[TH_BUCKET_FIXPRI], relaxed);
    load_now[TH_BUCKET_SHARE_FG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_FG], relaxed);
    load_now[TH_BUCKET_SHARE_DF] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_DF], relaxed);
    load_now[TH_BUCKET_SHARE_UT] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_UT], relaxed);
    load_now[TH_BUCKET_SHARE_BG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_BG], relaxed);
    assert(load_now[TH_BUCKET_RUN] >= 0);
    assert(load_now[TH_BUCKET_FIXPRI] >= 0);

    uint32_t nthreads = load_now[TH_BUCKET_RUN];
    uint32_t nfixpri  = load_now[TH_BUCKET_FIXPRI];

    KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
        MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) | DBG_FUNC_NONE,
        load_now[TH_BUCKET_FIXPRI], (load_now[TH_BUCKET_SHARE_FG] + load_now[TH_BUCKET_SHARE_DF]),
        load_now[TH_BUCKET_SHARE_BG], load_now[TH_BUCKET_SHARE_UT], 0);
    /*
     * Compute the timeshare priority conversion factor based on loading.
     * Because our counters may be incremented and accessed
     * concurrently with respect to each other, we may have
     * windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut)
     * is broken, so truncate values in these cases.
     */
    uint32_t timeshare_threads = (nthreads - nfixpri);
    for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
        if (load_now[i] > timeshare_threads) {
            load_now[i] = timeshare_threads;
        }
    }
    /*
     * Default threads contribute up to (NCPUS * 2) of load to FG threads
     */
    if (load_now[TH_BUCKET_SHARE_DF] <= (ncpus * 2)) {
        load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_DF];
    } else {
        load_now[TH_BUCKET_SHARE_FG] += (ncpus * 2);
    }
    /*
     * Utility threads contribute up to NCPUS of load to FG & DF threads
     */
    if (load_now[TH_BUCKET_SHARE_UT] <= ncpus) {
        load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_UT];
        load_now[TH_BUCKET_SHARE_DF] += load_now[TH_BUCKET_SHARE_UT];
    } else {
        load_now[TH_BUCKET_SHARE_FG] += ncpus;
        load_now[TH_BUCKET_SHARE_DF] += ncpus;
    }
    /*
     * BG threads contribute up to 1 thread worth of load to FG, DF and UT threads
     */
    if (load_now[TH_BUCKET_SHARE_BG] > 0) {
        load_now[TH_BUCKET_SHARE_FG] += 1;
        load_now[TH_BUCKET_SHARE_DF] += 1;
        load_now[TH_BUCKET_SHARE_UT] += 1;
    }
    /*
     * The conversion factor consists of two components:
     * a fixed value based on the absolute time unit (sched_fixed_shift),
     * and a dynamic portion based on load (sched_load_shifts).
     *
     * Zero load results in an out-of-range shift count.
     */

    for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
        uint32_t bucket_load = 0;

        if (load_now[i] > ncpus) {
            /* Normalize the load to the number of CPUs */
            if (ncpus > 1) {
                bucket_load = load_now[i] / ncpus;
            } else {
                bucket_load = load_now[i];
            }

            if (bucket_load > MAX_LOAD) {
                bucket_load = MAX_LOAD;
            }
        }

        /* Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load */
        sched_load[i] = (sched_load[i] * SCHED_LOAD_EWMA_ALPHA_OLD) + ((bucket_load << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW);
        sched_load[i] = sched_load[i] >> SCHED_LOAD_EWMA_ALPHA_SHIFT;
    }
    KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
        MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD_EFFECTIVE) | DBG_FUNC_NONE,
        SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_FG]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_DF]),
        SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_UT]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_BG]), 0);
}
void
compute_averages(uint64_t stdelta)
{
    uint32_t nthreads = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) - 1;
    uint32_t ncpus = processor_avail_count;
    /* Update the global pri_shifts based on the latest values */
    for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
        uint32_t bucket_load = SCHED_LOAD_EWMA_UNSCALE(sched_load[i]);
        uint32_t shift = sched_fixed_shift - sched_load_shifts[bucket_load];

        if (shift > SCHED_PRI_SHIFT_MAX) {
            sched_pri_shifts[i] = INT8_MAX;
        } else {
            sched_pri_shifts[i] = shift;
        }
    }
    /*
     * Sample total running threads for the load average calculation.
     */
    sched_nrun = nthreads;
    /*
     * Load average and mach factor calculations for
     * those which ask about these things.
     */
    uint32_t average_now = nthreads * LOAD_SCALE;
    uint32_t factor_now;

    if (nthreads > ncpus) {
        factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1);
    } else {
        factor_now = (ncpus - nthreads) * LOAD_SCALE;
    }
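    /*
     * Example (hypothetical numbers): with ncpus = 4 and nthreads = 6 the
     * system is overcommitted, so factor_now = (4 * LOAD_SCALE) / 7, i.e. 571
     * with the usual LOAD_SCALE of 1000; with nthreads = 2 there is spare
     * capacity and factor_now = (4 - 2) * LOAD_SCALE = 2000. average_now is
     * simply the runnable thread count scaled by LOAD_SCALE.
     */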
    /*
     * For those statistics that formerly relied on being recomputed
     * on timer ticks, advance by the approximate number of corresponding
     * elapsed intervals, thus compensating for potential idle intervals.
     */
    for (uint32_t index = 0; index < stdelta; index++) {
        sched_mach_factor = ((sched_mach_factor << 2) + factor_now) / 5;
        sched_load_average = ((sched_load_average << 2) + average_now) / 5;
    }
    /*
     * Compute old-style Mach load averages.
     */
    for (uint32_t index = 0; index < stdelta; index++) {
        for (uint32_t i = 0; i < 3; i++) {
            mach_factor[i] = ((mach_factor[i] * fract[i]) +
                (factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;

            avenrun[i] = ((avenrun[i] * fract[i]) +
                (average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
        }
    }
    /*
     * Compute averages in other components.
     */
    uint64_t abstime = mach_absolute_time();

    for (sched_average_t avg = sched_average; avg->comp != NULL; ++avg) {
        if (abstime >= avg->deadline) {
            uint64_t period_abs = (avg->period * sched_one_second_interval);
            uint64_t ninvokes = 1;

            ninvokes += (abstime - avg->deadline) / period_abs;
            ninvokes = MIN(ninvokes, SCHED_TICK_MAX_DELTA);

            for (uint32_t index = 0; index < ninvokes; index++) {
                (*avg->comp)(avg->param);
            }
            avg->deadline = abstime + period_abs;
        }
    }
}