[apple/xnu.git] / osfmk / kern / cpu_quiesce.c (xnu-7195.101.1)
/*
 * Copyright (c) 2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#ifdef __x86_64__
#error This file is only needed on weakly-ordered systems!
#endif

#include <machine/atomic.h>
#include <machine/commpage.h>
#include <machine/machine_cpu.h>

#include <kern/sched_prim.h>
#include <kern/percpu.h>
#include <kern/ast.h>

#include <kern/cpu_quiesce.h>

/*
 * CPU quiescing generation counter implemented with a checkin mask
 *
 * A tri-state bitfield, with 2 bits for each processor:
 * 1) 'checkin' bit, saying this processor has 'checked in', i.e. executed the acqrel barrier
 * 2) 'expected' bit, saying this processor is expected to check in, i.e. not idle.
 *
 * When a processor causes the 'expected' bits to equal the 'checkin' bits, which
 * indicates that all processors have executed the barrier, it ticks the algorithm
 * and resets the state.
 *
 * Idle CPUs won't check in, because they don't run, so the algorithm won't tick.
 * However, they can't do anything in userspace while idle, so we don't need
 * them to execute barriers, so we have them 'leave' the counter so that
 * they don't delay the tick while idle.
 *
 * This bitfield currently limits MAX_CPUS to 32 on LP64.
 * In the future, we can use double-wide atomics and int128 if we need 64 CPUs.
 *
 * The mask only guarantees ordering to code running in userspace.
 * We defer joining the counter until we actually reach userspace, allowing
 * processors that come out of idle and only run kernel code to avoid the overhead
 * of participation.
 *
 * We additionally defer updating the counter for a minimum interval to
 * reduce the frequency of executing the exclusive atomic operations.
 *
 * The longest delay between two checkins, assuming that at least one processor
 * joins, is <checkin delay> + (<thread quantum> * 2).
 */
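
/*
 * Editor's note: a worked example of one tick, assuming a hypothetical
 * 2-CPU system (CPU 0 uses bits 1:0, CPU 1 uses bits 3:2; odd bits are
 * 'expected', even bits are 'checkin'):
 *
 *   state 0b0000  no CPU participates
 *   state 0b0010  CPU 0 joins at the userspace boundary; as the first
 *                 joiner it sets only its expected bit to kickstart the
 *                 algorithm
 *   state 0b1110  CPU 1 joins; it sets both its expected and checkin bits
 *   state 0b1111  CPU 0 checks in; the checkin bits now mirror the
 *                 expected bits, so CPU 0 commits the tick: it increments
 *                 the commpage generation and clears all checkin bits
 *   state 0b1010  the next round begins with both CPUs expected again
 */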

typedef unsigned long checkin_mask_t;

static _Atomic checkin_mask_t cpu_quiescing_checkin_state;

static uint64_t cpu_checkin_last_commit;

struct cpu_quiesce {
        cpu_quiescent_state_t state;
        uint64_t last_checkin;
};

static struct cpu_quiesce PERCPU_DATA(cpu_quiesce);

#define CPU_CHECKIN_MIN_INTERVAL_US     4000 /* 4ms */
#define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC /* 1s */
static uint64_t cpu_checkin_min_interval;
static uint32_t cpu_checkin_min_interval_us;

#if __LP64__
#define CPU_CHECKIN_MASK_MAX_CPUS       32
#define CPU_CHECKIN_MASK                0x5555555555555555UL
#define CPU_EXPECTED_MASK               (~CPU_CHECKIN_MASK)
#else
/* Avoid double-wide CAS on 32-bit platforms by using a 32-bit state and mask */
#define CPU_CHECKIN_MASK_MAX_CPUS       16
#define CPU_CHECKIN_MASK                0x55555555UL
#define CPU_EXPECTED_MASK               (~CPU_CHECKIN_MASK)
#endif

static_assert(MAX_CPUS <= CPU_CHECKIN_MASK_MAX_CPUS);
static_assert(CPU_CHECKIN_MASK == CPU_EXPECTED_MASK >> 1);

static inline checkin_mask_t
cpu_checked_in_bit(int cpuid)
{
        return 1UL << (2 * cpuid);
}

static inline checkin_mask_t
cpu_expected_bit(int cpuid)
{
        return 1UL << (2 * cpuid + 1);
}

void
cpu_quiescent_counter_init(void)
{
        assert(CPU_CHECKIN_MASK & cpu_checked_in_bit(MAX_CPUS - 1));
        assert(CPU_EXPECTED_MASK & cpu_expected_bit(MAX_CPUS - 1));
        assert((CPU_CHECKIN_MASK & cpu_expected_bit(MAX_CPUS - 1)) == 0);
        assert((CPU_EXPECTED_MASK & cpu_checked_in_bit(MAX_CPUS - 1)) == 0);

        cpu_quiescent_counter_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US);
}

void
cpu_quiescent_counter_set_min_interval_us(uint32_t new_value_us)
{
        /* clamp to something vaguely sane */
        if (new_value_us > CPU_CHECKIN_MIN_INTERVAL_MAX_US) {
                new_value_us = CPU_CHECKIN_MIN_INTERVAL_MAX_US;
        }

        cpu_checkin_min_interval_us = new_value_us;

        uint64_t abstime = 0;
        clock_interval_to_absolutetime_interval(cpu_checkin_min_interval_us,
            NSEC_PER_USEC, &abstime);
        cpu_checkin_min_interval = abstime;
}

uint32_t
cpu_quiescent_counter_get_min_interval_us(void)
{
        return cpu_checkin_min_interval_us;
}


/*
 * Called when all running CPUs have checked in.
 *
 * The commpage increment is protected by the 'lock' of having caused the tick,
 * and it is published by the state reset release barrier.
 */
static void
cpu_quiescent_counter_commit(uint64_t ctime)
{
        __kdebug_only uint64_t old_gen;
        __kdebug_only checkin_mask_t old_state;

        old_gen = commpage_increment_cpu_quiescent_counter();

        cpu_checkin_last_commit = ctime;

        old_state = os_atomic_andnot(&cpu_quiescing_checkin_state, CPU_CHECKIN_MASK, release);

        KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER), old_gen, old_state, ctime, 0);
}

/*
 * Have all the expected CPUs checked in?
 */
static bool
cpu_quiescent_counter_needs_commit(checkin_mask_t state)
{
        return (state & CPU_CHECKIN_MASK) == ((state & CPU_EXPECTED_MASK) >> 1);
}
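
/*
 * Editor's note: an illustrative evaluation of the check above, again for a
 * hypothetical 2-CPU system.  With both CPUs expected but only CPU 1 checked
 * in, state == 0b1110:
 *
 *   state & CPU_CHECKIN_MASK          == 0b0100
 *   (state & CPU_EXPECTED_MASK) >> 1  == 0b0101   -> no commit yet
 *
 * Once CPU 0 also checks in, state == 0b1111:
 *
 *   state & CPU_CHECKIN_MASK          == 0b0101
 *   (state & CPU_EXPECTED_MASK) >> 1  == 0b0101   -> needs commit
 */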

/*
 * Called when a processor wants to start participating in the counter, e.g.
 * 1) when context switching away from the idle thread
 * 2) when coming up for the first time
 * 3) when coming up after a shutdown
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_join(__unused uint64_t ctime)
{
        struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
        __assert_only int cpuid = cpu_number();

        assert(cpuid < MAX_CPUS);
        assert(st->state == CPU_QUIESCE_COUNTER_NONE ||
            st->state == CPU_QUIESCE_COUNTER_LEFT);

        assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
            (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

        st->state = CPU_QUIESCE_COUNTER_PENDING_JOIN;

        /*
         * Mark the processor to call cpu_quiescent_counter_ast before it
         * ever returns to userspace.
         */
        ast_on(AST_UNQUIESCE);
}

/*
 * Called with interrupts disabled from the userspace boundary, at the AST_UNQUIESCE callback.
 * It needs to acquire the counter to see the data and counter value published by other CPUs.
 */
void
cpu_quiescent_counter_ast(void)
{
        struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
        int cpuid = cpu_number();

        assert(st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

        /* We had better not already be joined. */
        assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
            (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

        /*
         * No release barrier needed because we have no prior state to publish.
         * Acquire barrier needed because we need this processor to see
         * the latest counter value.
         *
         * The state may be in 'needs checkin' both before and after
         * this atomic OR.
         *
         * Additionally, if this is the first processor to come out of idle,
         * it may need to kickstart the algorithm, otherwise it would
         * stay in 'needs commit' perpetually with no processor assigned to
         * actually do the commit.  To do that, the first processor only adds
         * its expected bit.
         */

        st->state = CPU_QUIESCE_COUNTER_JOINED;
        st->last_checkin = mach_absolute_time();

        checkin_mask_t old_mask, new_mask;
        os_atomic_rmw_loop(&cpu_quiescing_checkin_state, old_mask, new_mask, acquire, {
                if (old_mask == 0) {
                        new_mask = old_mask | cpu_expected_bit(cpuid);
                } else {
                        new_mask = old_mask | cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid);
                }
        });
}

/*
 * Called when a processor no longer wants to participate in the counter,
 * i.e. when a processor is on its way to idle or shutdown.
 *
 * Called with interrupts disabled.
 *
 * The processor needs to remove itself from the expected mask, to allow the
 * algorithm to continue ticking without its participation.
 * However, it needs to ensure that anything it has done since the last time
 * it checked in has been published before the next tick is allowed to commit.
 */
void
cpu_quiescent_counter_leave(uint64_t ctime)
{
        struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
        int cpuid = cpu_number();

        assert(st->state == CPU_QUIESCE_COUNTER_JOINED ||
            st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

        /* We no longer need the cpu_quiescent_counter_ast callback to be armed */
        ast_off(AST_UNQUIESCE);

        if (st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN) {
                /* We never actually joined, so we don't have to do the work to leave. */
                st->state = CPU_QUIESCE_COUNTER_LEFT;
                return;
        }

        /* Leaving can't be deferred, even if we're within the min interval */
        st->last_checkin = ctime;

        checkin_mask_t mask = cpu_checked_in_bit(cpuid) | cpu_expected_bit(cpuid);

        checkin_mask_t orig_state = os_atomic_andnot_orig(&cpu_quiescing_checkin_state,
            mask, acq_rel);

        assert((orig_state & cpu_expected_bit(cpuid)));

        st->state = CPU_QUIESCE_COUNTER_LEFT;

        if (cpu_quiescent_counter_needs_commit(orig_state)) {
                /*
                 * The old state indicates someone else was already doing a commit
                 * but hadn't finished yet.  We successfully inserted the acq_rel
                 * before they finished the commit by resetting the bitfield,
                 * so we're done here.
                 */
                return;
        }

        checkin_mask_t new_state = orig_state & ~mask;

        if (cpu_quiescent_counter_needs_commit(new_state)) {
                cpu_quiescent_counter_commit(ctime);
        }
}

/*
 * Called when a processor wants to check in to the counter.
 * If it hasn't yet fully joined, it doesn't need to check in.
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_checkin(uint64_t ctime)
{
        struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
        int cpuid = cpu_number();

        assert(st->state != CPU_QUIESCE_COUNTER_NONE);

        /* If we're not joined yet, we don't need to check in */
        if (__probable(st->state != CPU_QUIESCE_COUNTER_JOINED)) {
                return;
        }

        /* If we've checked in recently, we don't need to check in yet. */
        if (__probable((ctime - st->last_checkin) <= cpu_checkin_min_interval)) {
                return;
        }

        st->last_checkin = ctime;

        checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);

        assert((state & cpu_expected_bit(cpuid)));

        if (__probable((state & cpu_checked_in_bit(cpuid)))) {
                /*
                 * Processor has already checked in for this round, no need to
                 * acquire the cacheline exclusive.
                 */
                return;
        }

        checkin_mask_t orig_state = os_atomic_or_orig(&cpu_quiescing_checkin_state,
            cpu_checked_in_bit(cpuid), acq_rel);

        checkin_mask_t new_state = orig_state | cpu_checked_in_bit(cpuid);

        if (cpu_quiescent_counter_needs_commit(new_state)) {
                assertf(!cpu_quiescent_counter_needs_commit(orig_state),
                    "old: 0x%lx, new: 0x%lx", orig_state, new_state);
                cpu_quiescent_counter_commit(ctime);
        }
}

#if MACH_ASSERT
/*
 * Called on all AST exits to userspace to assert this processor actually joined
 *
 * Called with interrupts disabled after the AST should have been handled
 */
void
cpu_quiescent_counter_assert_ast(void)
{
        struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
        int cpuid = cpu_number();

        assert(st->state == CPU_QUIESCE_COUNTER_JOINED);

        checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);
        assert((state & cpu_expected_bit(cpuid)));
}
#endif /* MACH_ASSERT */