/*
 * Copyright (c) 2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#ifdef __x86_64__
#error This file is only needed on weakly-ordered systems!
#endif

#include <machine/atomic.h>
#include <machine/commpage.h>
#include <machine/machine_cpu.h>

#include <kern/sched_prim.h>
#include <kern/percpu.h>
#include <kern/ast.h>

#include <kern/cpu_quiesce.h>
/*
 * CPU quiescing generation counter implemented with a checkin mask
 *
 * A tri-state bitfield, with 2 bits for each processor:
 * 1) 'checkin' bit, saying this processor has 'checked in', i.e. executed the acqrel barrier
 * 2) 'expected' bit, saying this processor is expected to check in, i.e. not idle.
 *
 * When a processor causes the 'expected' bits to equal the 'checkin' bits, which
 * indicates that all processors have executed the barrier, it ticks the algorithm
 * and resets the state.
 *
 * Idle CPUs won't check in, because they don't run, so the algorithm won't tick.
 * However, they can't do anything in userspace while idle, so we don't need
 * them to execute barriers, so we have them 'leave' the counter so that
 * they don't delay the tick while idle.
 *
 * This bitfield currently limits MAX_CPUS to 32 on LP64.
 * In the future, we can use double-wide atomics and int128 if we need 64 CPUs.
 *
 * The mask only guarantees ordering to code running in userspace.
 * We defer joining the counter until we actually reach userspace, allowing
 * processors that come out of idle and only run kernel code to avoid the overhead
 * of participation.
 *
 * We additionally defer updating the counter for a minimum interval to
 * reduce the frequency of executing the exclusive atomic operations.
 *
 * The longest delay between two checkins, assuming that at least one processor
 * joins, is <checkin delay> + (<thread quantum> * 2).
 */
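/*
 * Worked example (illustrative, not part of the original description): with
 * two participating CPUs, the layout is
 *
 *   bit 0: CPU 0 'checkin'    bit 1: CPU 0 'expected'
 *   bit 2: CPU 1 'checkin'    bit 3: CPU 1 'expected'
 *
 * Both CPUs joined and checked in gives state 0b1111; the CPU that set the
 * last 'checkin' bit ticks the generation and clears the 'checkin' bits,
 * leaving 0b1010 for the next round.
 */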
typedef unsigned long checkin_mask_t;

static _Atomic checkin_mask_t cpu_quiescing_checkin_state;

static uint64_t cpu_checkin_last_commit;

struct cpu_quiesce {
	cpu_quiescent_state_t   state;
	uint64_t                last_checkin;
};

static struct cpu_quiesce PERCPU_DATA(cpu_quiesce);

#define CPU_CHECKIN_MIN_INTERVAL_US     4000 /* 4ms */
#define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC /* 1s */
static uint64_t cpu_checkin_min_interval;
static uint32_t cpu_checkin_min_interval_us;
#if __LP64__
#define CPU_CHECKIN_MASK_MAX_CPUS       32
#define CPU_CHECKIN_MASK                0x5555555555555555UL
#define CPU_EXPECTED_MASK               (~CPU_CHECKIN_MASK)
#else
/* Avoid double-wide CAS on 32-bit platforms by using a 32-bit state and mask */
#define CPU_CHECKIN_MASK_MAX_CPUS       16
#define CPU_CHECKIN_MASK                0x55555555UL
#define CPU_EXPECTED_MASK               (~CPU_CHECKIN_MASK)
#endif

static_assert(MAX_CPUS <= CPU_CHECKIN_MASK_MAX_CPUS);
static_assert(CPU_CHECKIN_MASK == CPU_EXPECTED_MASK >> 1);
static inline checkin_mask_t
cpu_checked_in_bit(int cpuid)
{
	return 1UL << (2 * cpuid);
}

static inline checkin_mask_t
cpu_expected_bit(int cpuid)
{
	return 1UL << (2 * cpuid + 1);
}
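/*
 * For example (illustrative): cpu_checked_in_bit(3) is 1UL << 6 and
 * cpu_expected_bit(3) is 1UL << 7, which land in CPU_CHECKIN_MASK and
 * CPU_EXPECTED_MASK respectively, as the assertions in
 * cpu_quiescent_counter_init() check for the highest CPU id.
 */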
void
cpu_quiescent_counter_init(void)
{
	assert(CPU_CHECKIN_MASK & cpu_checked_in_bit(MAX_CPUS - 1));
	assert(CPU_EXPECTED_MASK & cpu_expected_bit(MAX_CPUS - 1));
	assert((CPU_CHECKIN_MASK & cpu_expected_bit(MAX_CPUS - 1)) == 0);
	assert((CPU_EXPECTED_MASK & cpu_checked_in_bit(MAX_CPUS - 1)) == 0);

	cpu_quiescent_counter_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US);
}
void
cpu_quiescent_counter_set_min_interval_us(uint32_t new_value_us)
{
	/* clamp to something vaguely sane */
	if (new_value_us > CPU_CHECKIN_MIN_INTERVAL_MAX_US) {
		new_value_us = CPU_CHECKIN_MIN_INTERVAL_MAX_US;
	}

	cpu_checkin_min_interval_us = new_value_us;

	uint64_t abstime = 0;
	clock_interval_to_absolutetime_interval(cpu_checkin_min_interval_us,
	    NSEC_PER_USEC, &abstime);
	cpu_checkin_min_interval = abstime;
}
uint32_t
cpu_quiescent_counter_get_min_interval_us(void)
{
	return cpu_checkin_min_interval_us;
}
/*
 * Called when all running CPUs have checked in.
 *
 * The commpage increment is protected by the 'lock' of having caused the tick,
 * and it is published by the state reset release barrier.
 */
static void
cpu_quiescent_counter_commit(uint64_t ctime)
{
	__kdebug_only uint64_t          old_gen;
	__kdebug_only checkin_mask_t    old_state;

	old_gen = commpage_increment_cpu_quiescent_counter();

	cpu_checkin_last_commit = ctime;

	old_state = os_atomic_andnot(&cpu_quiescing_checkin_state, CPU_CHECKIN_MASK, release);

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER), old_gen, old_state, ctime, 0);
}
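/*
 * Illustrative note: the andnot above clears only the 'checkin' bits, so a
 * two-CPU state of 0b1111 becomes 0b1010 and both CPUs remain 'expected' to
 * check in again during the next generation.
 */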
/*
 * Have all the expected CPUs checked in?
 */
static bool
cpu_quiescent_counter_needs_commit(checkin_mask_t state)
{
	return (state & CPU_CHECKIN_MASK) == ((state & CPU_EXPECTED_MASK) >> 1);
}
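/*
 * Worked example (illustrative): for two CPUs, state 0b1111 needs a commit
 * because (state & CPU_CHECKIN_MASK) == 0b0101 equals
 * ((state & CPU_EXPECTED_MASK) >> 1) == 0b0101; state 0b1110 does not,
 * because CPU 0 is expected but has not checked in yet.
 */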
/*
 * Called when a processor wants to start participating in the counter, e.g.
 * 1) when context switching away from the idle thread
 * 2) when coming up for the first time
 * 3) when coming up after a shutdown
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_join(__unused uint64_t ctime)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	__assert_only int cpuid = cpu_number();

	assert(cpuid < MAX_CPUS);
	assert(st->state == CPU_QUIESCE_COUNTER_NONE ||
	    st->state == CPU_QUIESCE_COUNTER_LEFT);

	assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
	    (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

	st->state = CPU_QUIESCE_COUNTER_PENDING_JOIN;

	/*
	 * Mark the processor to call cpu_quiescent_counter_ast before it
	 * ever returns to userspace.
	 */
	ast_on(AST_UNQUIESCE);
}
/*
 * Called with interrupts disabled from the userspace boundary at the AST_UNQUIESCE callback.
 * It needs to acquire the counter to see data and the counter published by other CPUs.
 */
void
cpu_quiescent_counter_ast(void)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

	/* We had better not already be joined. */
	assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
	    (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

	/*
	 * No release barrier needed because we have no prior state to publish.
	 * Acquire barrier needed because we need this processor to see
	 * the latest counter value.
	 *
	 * The state may be in 'needs checkin' both before and after
	 * this atomic.
	 *
	 * Additionally, if this is the first processor to come out of idle,
	 * it may need to kickstart the algorithm, otherwise it would
	 * stay in 'needs commit' perpetually with no processor assigned to
	 * actually do the commit. To do that, the first processor only adds
	 * its expected bit.
	 */

	st->state = CPU_QUIESCE_COUNTER_JOINED;
	st->last_checkin = mach_absolute_time();

	checkin_mask_t old_mask, new_mask;
	os_atomic_rmw_loop(&cpu_quiescing_checkin_state, old_mask, new_mask, acquire, {
		if (old_mask == 0) {
			new_mask = old_mask | cpu_expected_bit(cpuid);
		} else {
			new_mask = old_mask | cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid);
		}
	});
}
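/*
 * Illustrative example of the kickstart above: if this CPU is the only one
 * leaving idle (old_mask == 0), it sets just its 'expected' bit, so it is the
 * CPU that must check in later and perform the first commit. If other CPUs
 * are already participating, it also sets its 'checkin' bit so that joining
 * mid-round does not stall a tick that is already in progress.
 */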
/*
 * Called when a processor no longer wants to participate in the counter,
 * i.e. when a processor is on its way to idle or shutdown.
 *
 * Called with interrupts disabled.
 *
 * The processor needs to remove itself from the expected mask, to allow the
 * algorithm to continue ticking without its participation.
 * However, it needs to ensure that anything it has done since the last time
 * it checked in has been published before the next tick is allowed to commit.
 */
void
cpu_quiescent_counter_leave(uint64_t ctime)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_JOINED ||
	    st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

	/* We no longer need the cpu_quiescent_counter_ast callback to be armed */
	ast_off(AST_UNQUIESCE);

	if (st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN) {
		/* We never actually joined, so we don't have to do the work to leave. */
		st->state = CPU_QUIESCE_COUNTER_LEFT;
		return;
	}

	/* Leaving can't be deferred, even if we're within the min interval */
	st->last_checkin = ctime;

	checkin_mask_t mask = cpu_checked_in_bit(cpuid) | cpu_expected_bit(cpuid);

	checkin_mask_t orig_state = os_atomic_andnot_orig(&cpu_quiescing_checkin_state,
	    mask, acq_rel);

	assert((orig_state & cpu_expected_bit(cpuid)));

	st->state = CPU_QUIESCE_COUNTER_LEFT;

	if (cpu_quiescent_counter_needs_commit(orig_state)) {
		/*
		 * The old state indicates someone else was already doing a commit
		 * but hadn't finished yet. We successfully inserted the acq_rel
		 * before they finished the commit by resetting the bitfield,
		 * so we're done here.
		 */
		return;
	}

	checkin_mask_t new_state = orig_state & ~mask;

	if (cpu_quiescent_counter_needs_commit(new_state)) {
		cpu_quiescent_counter_commit(ctime);
	}
}
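/*
 * Worked example (illustrative) for two CPUs: if CPU 0 has already checked in
 * (state 0b1011) and CPU 1 leaves, the andnot produces 0b0011, which needs a
 * commit, so CPU 1 commits on its way out. If instead CPU 1 had checked in
 * but CPU 0 had not (state 0b1110), leaving produces 0b0010 and the commit is
 * left to CPU 0's next checkin.
 */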
/*
 * Called when a processor wants to check in to the counter.
 * If it hasn't yet fully joined, it doesn't need to check in.
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_checkin(uint64_t ctime)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state != CPU_QUIESCE_COUNTER_NONE);

	/* If we're not joined yet, we don't need to check in */
	if (__probable(st->state != CPU_QUIESCE_COUNTER_JOINED)) {
		return;
	}

	/* If we've checked in recently, we don't need to check in yet. */
	if (__probable((ctime - st->last_checkin) <= cpu_checkin_min_interval)) {
		return;
	}

	st->last_checkin = ctime;

	checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);

	assert((state & cpu_expected_bit(cpuid)));

	if (__probable((state & cpu_checked_in_bit(cpuid)))) {
		/*
		 * Processor has already checked in for this round, no need to
		 * acquire the cacheline exclusive.
		 */
		return;
	}

	checkin_mask_t orig_state = os_atomic_or_orig(&cpu_quiescing_checkin_state,
	    cpu_checked_in_bit(cpuid), acq_rel);

	checkin_mask_t new_state = orig_state | cpu_checked_in_bit(cpuid);

	if (cpu_quiescent_counter_needs_commit(new_state)) {
		assertf(!cpu_quiescent_counter_needs_commit(orig_state),
		    "old: 0x%lx, new: 0x%lx", orig_state, new_state);
		cpu_quiescent_counter_commit(ctime);
	}
}
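/*
 * Illustrative note: if this CPU sets the final missing 'checkin' bit
 * (e.g. two-CPU state 0b1110 becoming 0b1111), it is the one that caused the
 * tick and therefore calls cpu_quiescent_counter_commit(). The assertf holds
 * because orig_state still had this CPU's 'checkin' bit clear, so orig_state
 * could not already have satisfied cpu_quiescent_counter_needs_commit().
 */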
#if MACH_ASSERT
/*
 * Called on all AST exits to userspace to assert this processor actually joined.
 *
 * Called with interrupts disabled after the AST should have been handled.
 */
void
cpu_quiescent_counter_assert_ast(void)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_JOINED);

	checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);
	assert((state & cpu_expected_bit(cpuid)));
}
#endif /* MACH_ASSERT */