/*
 * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 *	Author:	Avadis Tevanian, Jr.
 *
 *	Compute various averages.
 */
#include <mach/mach_types.h>

#include <kern/sched.h>
#include <kern/assert.h>
#include <kern/processor.h>
#include <kern/thread.h>

#include <kern/telemetry.h>

#include <kern/zalloc_internal.h>

#include <sys/kdebug.h>
uint32_t avenrun[3] = {0, 0, 0};
uint32_t mach_factor[3] = {0, 0, 0};

uint32_t sched_load_average, sched_mach_factor;
#if defined(CONFIG_SCHED_TIMESHARE_CORE)
/*
 * Values are scaled by LOAD_SCALE, defined in processor_info.h
 */
#define base(n)    ((n) << SCHED_TICK_SHIFT)
#define frac(n)    (((base(n) - 1) * LOAD_SCALE) / base(n))

static uint32_t fract[3] = {
    frac(5),                /* 5 second average */
    frac(30),               /* 30 second average */
    frac(60),               /* 1 minute average */
};

#endif /* CONFIG_SCHED_TIMESHARE_CORE */
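/*
 * Worked example (a sketch, assuming the usual SCHED_TICK_SHIFT of 3 and the
 * LOAD_SCALE of 1000 from processor_info.h): base(5) = 5 << 3 = 40, so
 * frac(5) = ((40 - 1) * 1000) / 40 = 975. Each scheduler tick then keeps
 * 975/1000 of the old 5-second average and blends in 25/1000 of the new
 * sample, which is how the avenrun/mach_factor loops in compute_averages()
 * below decay toward the current load.
 */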
static unsigned int sched_nrun;
typedef void (*sched_avg_comp_t)(
    void        *param);

static struct sched_average {
    sched_avg_comp_t    comp;
    void                *param;
    int                 period; /* in seconds */
    uint64_t            deadline;
} sched_average[] = {
    { compute_averunnable, &sched_nrun, 5, 0 },
    { compute_stack_target, NULL, 5, 1 },
    { compute_pageout_gc_throttle, NULL, 1, 0 },
    { compute_pmap_gc_throttle, NULL, 60, 0 },
    { compute_zone_working_set_size, NULL, ZONE_WSS_UPDATE_PERIOD, 0 },
    { compute_telemetry, NULL, 1, 0 },
    { NULL, NULL, 0, 0 },
};

typedef struct sched_average *sched_average_t;
/*
 * Scheduler load calculation algorithm
 *
 * The scheduler load values provide an estimate of the number of runnable
 * timeshare threads in the system at various priority bands. The load
 * ultimately affects the priority shifts applied to all threads in a band,
 * causing them to timeshare with other threads in the system. The load is
 * maintained in buckets, with each bucket corresponding to a priority band.
 *
 * Each runnable thread on the system contributes its load to its priority
 * band and to the bands above it. The contribution of a thread to the bands
 * above it is not strictly 1:1 and is weighted based on the priority band
 * of the thread. The rules of thread load contribution to each of its higher
 * bands are as follows:
 *
 * - DF threads: up to (2 * NCPUs) threads
 * - UT threads: up to NCPUs threads
 * - BG threads: up to 1 thread
 *
 * To calculate the load values, the various run buckets are sampled (every
 * sched_load_compute_interval_abs) and the weighted contributions of the
 * lower bucket threads are added. The resultant value is plugged into an
 * exponentially weighted moving average formula:
 *      new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count
 * (where alpha < 1)
 * The calculations for the scheduler load are done using fixpoint math with
 * a scale factor of 16 to avoid expensive divides and floating point
 * operations. The final load values are a smooth curve representative of
 * the actual number of runnable threads in a priority band.
 */
/* Maintains the current (scaled for fixpoint) load in various buckets */
uint32_t sched_load[TH_BUCKET_MAX];
/*
 * Alpha factor for the EWMA algorithm. The current values are chosen as
 * 6:10 ("old load":"new samples") to make sure the scheduler reacts fast
 * enough to changing system load but does not see too many spikes from bursty
 * activity. The current values ensure that the scheduler would converge
 * to the latest load in 2-3 sched_load_compute_interval_abs intervals
 * (which amounts to ~30-45ms with current values).
 */
#define SCHED_LOAD_EWMA_ALPHA_OLD      6
#define SCHED_LOAD_EWMA_ALPHA_NEW      10
#define SCHED_LOAD_EWMA_ALPHA_SHIFT    4
static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT));

/* For fixpoint EWMA, round up the load to make it converge */
#define SCHED_LOAD_EWMA_ROUNDUP(load)   (((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0)

/* Macro to convert scaled sched load to a real load value */
#define SCHED_LOAD_EWMA_UNSCALE(load)   (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load))
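/*
 * Illustrative sketch only (not part of the scheduler, and the function name
 * is hypothetical): one EWMA update step with the 6:10 fixpoint weighting
 * described above. Starting from a scaled load of 32 (i.e. 2 threads) and a
 * sample of 5 runnable threads, it returns ((32 * 6) + ((5 << 4) * 10)) >> 4
 * = 62, and SCHED_LOAD_EWMA_UNSCALE(62) then reports a load of 4, converging
 * toward 5 over the next few intervals.
 */
static inline uint32_t
sched_load_ewma_step_example(uint32_t old_load_scaled, uint32_t new_sample)
{
    /* Weight the old (already scaled) load and the newly scaled sample, then renormalize */
    uint32_t next = (old_load_scaled * SCHED_LOAD_EWMA_ALPHA_OLD) +
        ((new_sample << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW);
    return next >> SCHED_LOAD_EWMA_ALPHA_SHIFT;
}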
/*
 * Routine to capture the latest runnable counts and update sched_load (only used for non-clutch schedulers)
 */
void
compute_sched_load(void)
{
    /*
     * Retrieve a snapshot of the current run counts.
     *
     * Why not a bcopy()? Because we need atomic word-sized reads of sched_run_buckets,
     * not byte-by-byte copy.
     */
    uint32_t ncpus = processor_avail_count;
    uint32_t load_now[TH_BUCKET_MAX];

    load_now[TH_BUCKET_RUN]      = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
    load_now[TH_BUCKET_FIXPRI]   = os_atomic_load(&sched_run_buckets[TH_BUCKET_FIXPRI], relaxed);
    load_now[TH_BUCKET_SHARE_FG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_FG], relaxed);
    load_now[TH_BUCKET_SHARE_DF] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_DF], relaxed);
    load_now[TH_BUCKET_SHARE_UT] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_UT], relaxed);
    load_now[TH_BUCKET_SHARE_BG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_BG], relaxed);
    assert(load_now[TH_BUCKET_RUN] >= 0);
    assert(load_now[TH_BUCKET_FIXPRI] >= 0);

    uint32_t nthreads = load_now[TH_BUCKET_RUN];
    uint32_t nfixpri  = load_now[TH_BUCKET_FIXPRI];

    KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
        MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) | DBG_FUNC_NONE,
        load_now[TH_BUCKET_FIXPRI], (load_now[TH_BUCKET_SHARE_FG] + load_now[TH_BUCKET_SHARE_DF]),
        load_now[TH_BUCKET_SHARE_BG], load_now[TH_BUCKET_SHARE_UT], 0);
    /*
     * Compute the timeshare priority conversion factor based on loading.
     * Because our counters may be incremented and accessed
     * concurrently with respect to each other, we may have
     * windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut)
     * is broken, so truncate values in these cases.
     */
    uint32_t timeshare_threads = (nthreads - nfixpri);
    for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
        if (load_now[i] > timeshare_threads) {
            load_now[i] = timeshare_threads;
        }
    }
    /*
     * Default threads contribute up to (NCPUS * 2) of load to FG threads
     */
    if (load_now[TH_BUCKET_SHARE_DF] <= (ncpus * 2)) {
        load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_DF];
    } else {
        load_now[TH_BUCKET_SHARE_FG] += (ncpus * 2);
    }
    /*
     * Utility threads contribute up to NCPUS of load to FG & DF threads
     */
    if (load_now[TH_BUCKET_SHARE_UT] <= ncpus) {
        load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_UT];
        load_now[TH_BUCKET_SHARE_DF] += load_now[TH_BUCKET_SHARE_UT];
    } else {
        load_now[TH_BUCKET_SHARE_FG] += ncpus;
        load_now[TH_BUCKET_SHARE_DF] += ncpus;
    }
    /*
     * BG threads contribute up to 1 thread worth of load to FG, DF and UT threads
     */
    if (load_now[TH_BUCKET_SHARE_BG] > 0) {
        load_now[TH_BUCKET_SHARE_FG] += 1;
        load_now[TH_BUCKET_SHARE_DF] += 1;
        load_now[TH_BUCKET_SHARE_UT] += 1;
    }
    /*
     * The conversion factor consists of two components:
     * a fixed value based on the absolute time unit (sched_fixed_shift),
     * and a dynamic portion based on load (sched_load_shifts).
     *
     * Zero load results in an out-of-range shift count.
     */

    for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
        uint32_t bucket_load = 0;

        if (load_now[i] > ncpus) {
            /* Normalize the load to the number of CPUs */
            if (ncpus > 1) {
                bucket_load = load_now[i] / ncpus;
            } else {
                bucket_load = load_now[i];
            }

            if (bucket_load > MAX_LOAD) {
                bucket_load = MAX_LOAD;
            }
        }

        /* Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load */
        sched_load[i] = (sched_load[i] * SCHED_LOAD_EWMA_ALPHA_OLD) + ((bucket_load << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW);
        sched_load[i] = sched_load[i] >> SCHED_LOAD_EWMA_ALPHA_SHIFT;
    }
    KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
        MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD_EFFECTIVE) | DBG_FUNC_NONE,
        SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_FG]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_DF]),
        SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_UT]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_BG]), 0);
}
void
compute_averages(uint64_t stdelta)
{
    uint32_t nthreads = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) - 1;
    uint32_t ncpus = processor_avail_count;
    /* Update the global pri_shifts based on the latest values */
    for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
        uint32_t bucket_load = SCHED_LOAD_EWMA_UNSCALE(sched_load[i]);
        uint32_t shift = sched_fixed_shift - sched_load_shifts[bucket_load];

        if (shift > SCHED_PRI_SHIFT_MAX) {
            sched_pri_shifts[i] = INT8_MAX;
        } else {
            sched_pri_shifts[i] = shift;
        }
    }
    /*
     * Sample total running threads for the load average calculation.
     */
    sched_nrun = nthreads;
    /*
     * Load average and mach factor calculations for
     * those which ask about these things.
     */
    uint32_t average_now = nthreads * LOAD_SCALE;
    uint32_t factor_now;

    if (nthreads > ncpus) {
        factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1);
    } else {
        factor_now = (ncpus - nthreads) * LOAD_SCALE;
    }
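    /*
     * Example (hypothetical numbers): with ncpus = 4 and nthreads = 6 the
     * system is overcommitted, so factor_now = (4 * LOAD_SCALE) / 7, i.e. 571
     * with the usual LOAD_SCALE of 1000; with nthreads = 2 there is spare
     * capacity and factor_now = (4 - 2) * LOAD_SCALE = 2000. average_now is
     * simply the runnable thread count scaled by LOAD_SCALE.
     */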
    /*
     * For those statistics that formerly relied on being recomputed
     * on timer ticks, advance by the approximate number of corresponding
     * elapsed intervals, thus compensating for potential idle intervals.
     */
    for (uint32_t index = 0; index < stdelta; index++) {
        sched_mach_factor = ((sched_mach_factor << 2) + factor_now) / 5;
        sched_load_average = ((sched_load_average << 2) + average_now) / 5;
    }
    /*
     * Compute old-style Mach load averages.
     */
    for (uint32_t index = 0; index < stdelta; index++) {
        for (uint32_t i = 0; i < 3; i++) {
            mach_factor[i] = ((mach_factor[i] * fract[i]) +
                (factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;

            avenrun[i] = ((avenrun[i] * fract[i]) +
                (average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
        }
    }
    /*
     * Compute averages in other components.
     */
    uint64_t abstime = mach_absolute_time();

    for (sched_average_t avg = sched_average; avg->comp != NULL; ++avg) {
        if (abstime >= avg->deadline) {
            uint64_t period_abs = (avg->period * sched_one_second_interval);
            uint64_t ninvokes = 1;

            ninvokes += (abstime - avg->deadline) / period_abs;
            ninvokes = MIN(ninvokes, SCHED_TICK_MAX_DELTA);

            for (uint32_t index = 0; index < ninvokes; index++) {
                (*avg->comp)(avg->param);
            }
            avg->deadline = abstime + period_abs;
        }
    }
}