/*
 * Copyright (c) 2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#ifdef __x86_64__
#error This file is only needed on weakly-ordered systems!
#endif

#include <machine/atomic.h>
#include <machine/commpage.h>
#include <machine/machine_cpu.h>

#include <kern/sched_prim.h>
#include <kern/percpu.h>
#include <kern/ast.h>

#include <kern/cpu_quiesce.h>

/*
 * CPU quiescing generation counter implemented with a checkin mask
 *
 * A tri-state bitfield, with 2 bits for each processor:
 * 1) 'checkin' bit, saying this processor has 'checked in', i.e. executed the acqrel barrier
 * 2) 'expected' bit, saying this processor is expected to check in, i.e. not idle.
 *
 * When a processor causes the 'expected' bits to equal the 'checkin' bits, which
 * indicates that all processors have executed the barrier, it ticks the algorithm
 * and resets the state.
 *
 * Idle CPUs won't check in, because they don't run, so the algorithm won't tick.
 * However, they can't do anything in userspace while idle, so we don't need
 * them to execute barriers, so we have them 'leave' the counter so that
 * they don't delay the tick while idle.
 *
 * This bitfield currently limits MAX_CPUS to 32 on LP64.
 * In the future, we can use double-wide atomics and int128 if we need 64 CPUs.
 *
 * The mask only guarantees ordering to code running in userspace.
 * We defer joining the counter until we actually reach userspace, allowing
 * processors that come out of idle and only run kernel code to avoid the overhead
 * of participation.
 *
 * We additionally defer updating the counter for a minimum interval to
 * reduce the frequency of executing the exclusive atomic operations.
 *
 * The longest delay between two checkins, assuming that at least one processor
 * joins, is <checkin delay> + (<thread quantum> * 2).
 */
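
/*
 * Worked example (illustrative): each CPU owns two adjacent bits,
 * bit (2 * cpuid) for 'checked in' and bit (2 * cpuid + 1) for 'expected'.
 * With only CPUs 0 and 1 participating, a fresh generation starts as
 * 0b1010 (both expected, neither checked in). Each CPU sets its checkin
 * bit when it checks in; the CPU whose checkin makes the state 0b1111
 * sees the checkin bits match the expected bits, commits the tick, and
 * clears the checkin bits back to 0b1010 for the next generation.
 */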

typedef unsigned long checkin_mask_t;

static _Atomic checkin_mask_t cpu_quiescing_checkin_state;

static uint64_t cpu_checkin_last_commit;

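/*
 * Per-CPU participation record: 'state' tracks where this CPU is in the
 * none / pending-join / joined / left lifecycle, and 'last_checkin' holds
 * the absolute time of its most recent checkin so that checkins can be
 * rate-limited to cpu_checkin_min_interval.
 */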
struct cpu_quiesce {
	cpu_quiescent_state_t state;
	uint64_t last_checkin;
};

static struct cpu_quiesce PERCPU_DATA(cpu_quiesce);

#define CPU_CHECKIN_MIN_INTERVAL_US 4000 /* 4ms */
#define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC /* 1s */
static uint64_t cpu_checkin_min_interval;
static uint32_t cpu_checkin_min_interval_us;

#if __LP64__
static_assert(MAX_CPUS <= 32);
#define CPU_CHECKIN_MASK 0x5555555555555555UL
#define CPU_EXPECTED_MASK (~CPU_CHECKIN_MASK)
#else
/* Avoid double-wide CAS on 32-bit platforms by using a 32-bit state and mask */
static_assert(MAX_CPUS <= 16);
#define CPU_CHECKIN_MASK 0x55555555UL
#define CPU_EXPECTED_MASK (~CPU_CHECKIN_MASK)
#endif

static_assert(CPU_CHECKIN_MASK == CPU_EXPECTED_MASK >> 1);

static inline checkin_mask_t
cpu_checked_in_bit(int cpuid)
{
	return 1UL << (2 * cpuid);
}

static inline checkin_mask_t
cpu_expected_bit(int cpuid)
{
	return 1UL << (2 * cpuid + 1);
}

void
cpu_quiescent_counter_init(void)
{
	assert(CPU_CHECKIN_MASK & cpu_checked_in_bit(MAX_CPUS));
	assert(CPU_EXPECTED_MASK & cpu_expected_bit(MAX_CPUS));
	assert((CPU_CHECKIN_MASK & cpu_expected_bit(MAX_CPUS)) == 0);
	assert((CPU_EXPECTED_MASK & cpu_checked_in_bit(MAX_CPUS)) == 0);

	cpu_quiescent_counter_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US);
}

void
cpu_quiescent_counter_set_min_interval_us(uint32_t new_value_us)
{
	/* clamp to something vaguely sane */
	if (new_value_us > CPU_CHECKIN_MIN_INTERVAL_MAX_US) {
		new_value_us = CPU_CHECKIN_MIN_INTERVAL_MAX_US;
	}

	cpu_checkin_min_interval_us = new_value_us;

	uint64_t abstime = 0;
	clock_interval_to_absolutetime_interval(cpu_checkin_min_interval_us,
	    NSEC_PER_USEC, &abstime);
	cpu_checkin_min_interval = abstime;
}

uint32_t
cpu_quiescent_counter_get_min_interval_us(void)
{
	return cpu_checkin_min_interval_us;
}


/*
 * Called when all running CPUs have checked in.
 *
 * The commpage increment is protected by the 'lock' of having caused the tick,
 * and it is published by the state reset release barrier.
 */
static void
cpu_quiescent_counter_commit(uint64_t ctime)
{
	__kdebug_only uint64_t old_gen;
	__kdebug_only checkin_mask_t old_state;

	old_gen = commpage_increment_cpu_quiescent_counter();

	cpu_checkin_last_commit = ctime;

	old_state = os_atomic_andnot(&cpu_quiescing_checkin_state, CPU_CHECKIN_MASK, release);

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER), old_gen, old_state, ctime, 0);
}

/*
 * Have all the expected CPUs checked in?
 */
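/*
 * For example (illustrative): with only CPUs 0 and 1 participating, a state
 * of 0b1111 satisfies the test, since (state & CPU_CHECKIN_MASK) == 0b0101
 * and ((state & CPU_EXPECTED_MASK) >> 1) == 0b0101, while 0b1011 (CPU 1
 * expected but not yet checked in) does not.
 */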
static bool
cpu_quiescent_counter_needs_commit(checkin_mask_t state)
{
	return (state & CPU_CHECKIN_MASK) == ((state & CPU_EXPECTED_MASK) >> 1);
}

/*
 * Called when a processor wants to start participating in the counter, e.g.
 * 1) when context switching away from the idle thread
 * 2) when coming up for the first time
 * 3) when coming up after a shutdown
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_join(__unused uint64_t ctime)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	__assert_only int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_NONE ||
	    st->state == CPU_QUIESCE_COUNTER_LEFT);

	assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
	    (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

	st->state = CPU_QUIESCE_COUNTER_PENDING_JOIN;

	/*
	 * Mark the processor to call cpu_quiescent_counter_ast before it
	 * ever returns to userspace.
	 */
	ast_on(AST_UNQUIESCE);
}

/*
 * Called with interrupts disabled from the userspace boundary at the AST_UNQUIESCE callback.
 * It needs to acquire the counter to see data and counter updates published by other CPUs.
 */
void
cpu_quiescent_counter_ast(void)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

	/* We had better not already be joined. */
	assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
	    (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

	/*
	 * No release barrier needed because we have no prior state to publish.
	 * Acquire barrier needed because we need this processor to see
	 * the latest counter value.
	 *
	 * The state may be in 'needs checkin' both before and after
	 * this atomic or.
	 *
	 * Additionally, if this is the first processor to come out of idle,
	 * it may need to kickstart the algorithm, otherwise it would
	 * stay in 'needs commit' perpetually with no processor assigned to
	 * actually do the commit. To do that, the first processor only adds
	 * its expected bit.
	 */

	st->state = CPU_QUIESCE_COUNTER_JOINED;
	st->last_checkin = mach_absolute_time();

	checkin_mask_t old_mask, new_mask;
	os_atomic_rmw_loop(&cpu_quiescing_checkin_state, old_mask, new_mask, acquire, {
		if (old_mask == 0) {
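			/*
			 * First processor to resume participation: kickstart
			 * the algorithm by publishing only the 'expected' bit,
			 * so that a later checkin performs the commit.
			 */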
			new_mask = old_mask | cpu_expected_bit(cpuid);
		} else {
			new_mask = old_mask | cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid);
		}
	});
}

/*
 * Called when a processor no longer wants to participate in the counter,
 * i.e. when a processor is on its way to idle or shutdown.
 *
 * Called with interrupts disabled.
 *
 * The processor needs to remove itself from the expected mask, to allow the
 * algorithm to continue ticking without its participation.
 * However, it needs to ensure that anything it has done since the last time
 * it checked in has been published before the next tick is allowed to commit.
 */
void
cpu_quiescent_counter_leave(uint64_t ctime)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_JOINED ||
	    st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

	/* We no longer need the cpu_quiescent_counter_ast callback to be armed */
	ast_off(AST_UNQUIESCE);

	if (st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN) {
		/* We never actually joined, so we don't have to do the work to leave. */
		st->state = CPU_QUIESCE_COUNTER_LEFT;
		return;
	}

	/* Leaving can't be deferred, even if we're within the min interval */
	st->last_checkin = ctime;

	checkin_mask_t mask = cpu_checked_in_bit(cpuid) | cpu_expected_bit(cpuid);

	checkin_mask_t orig_state = os_atomic_andnot_orig(&cpu_quiescing_checkin_state,
	    mask, acq_rel);

	assert((orig_state & cpu_expected_bit(cpuid)));

	st->state = CPU_QUIESCE_COUNTER_LEFT;

	if (cpu_quiescent_counter_needs_commit(orig_state)) {
		/*
		 * the old state indicates someone else was already doing a commit
		 * but hadn't finished yet. We successfully inserted the acq_rel
		 * before they finished the commit by resetting the bitfield,
		 * so we're done here.
		 */
		return;
	}

	checkin_mask_t new_state = orig_state & ~mask;

	if (cpu_quiescent_counter_needs_commit(new_state)) {
		cpu_quiescent_counter_commit(ctime);
	}
}

/*
 * Called when a processor wants to check in to the counter.
 * If it hasn't yet fully joined, it doesn't need to check in.
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_checkin(uint64_t ctime)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state != CPU_QUIESCE_COUNTER_NONE);

	/* If we're not joined yet, we don't need to check in */
	if (__probable(st->state != CPU_QUIESCE_COUNTER_JOINED)) {
		return;
	}

	/* If we've checked in recently, we don't need to check in yet. */
	if (__probable((ctime - st->last_checkin) <= cpu_checkin_min_interval)) {
		return;
	}

	st->last_checkin = ctime;

	checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);

	assert((state & cpu_expected_bit(cpuid)));

	if (__probable((state & cpu_checked_in_bit(cpuid)))) {
		/*
		 * Processor has already checked in for this round, no need to
		 * acquire the cacheline exclusive.
		 */
		return;
	}

	checkin_mask_t orig_state = os_atomic_or_orig(&cpu_quiescing_checkin_state,
	    cpu_checked_in_bit(cpuid), acq_rel);

	checkin_mask_t new_state = orig_state | cpu_checked_in_bit(cpuid);

	if (cpu_quiescent_counter_needs_commit(new_state)) {
		assertf(!cpu_quiescent_counter_needs_commit(orig_state),
		    "old: 0x%lx, new: 0x%lx", orig_state, new_state);
		cpu_quiescent_counter_commit(ctime);
	}
}

#if MACH_ASSERT
/*
 * Called on all AST exits to userspace to assert this processor actually joined
 *
 * Called with interrupts disabled after the AST should have been handled
 */
void
cpu_quiescent_counter_assert_ast(void)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_JOINED);

	checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);
	assert((state & cpu_expected_bit(cpuid)));
}
#endif /* MACH_ASSERT */
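
#if 0
/*
 * Illustrative sketch only, not compiled: a hypothetical caller showing the
 * call ordering the routines above expect, pieced together from their
 * comments. The function name and the placement of each call are assumptions
 * for illustration, not the scheduler's actual code paths.
 */
static void
example_cpu_quiesce_lifecycle(void)
{
	/* Leaving idle (or coming up): declare intent to participate and arm AST_UNQUIESCE. */
	cpu_quiescent_counter_join(mach_absolute_time());

	/* At the userspace boundary, the AST fires and the CPU actually joins the mask. */
	cpu_quiescent_counter_ast();

	/* Periodically thereafter (rate-limited internally), publish a checkin. */
	cpu_quiescent_counter_checkin(mach_absolute_time());

	/* Heading back to idle or shutting down: stop participating. */
	cpu_quiescent_counter_leave(mach_absolute_time());
}
#endif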