/*
 * Copyright (c) 2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#ifdef __x86_64__
#error This file is only needed on weakly-ordered systems!
#endif

#include <machine/atomic.h>
#include <machine/commpage.h>
#include <machine/machine_cpu.h>

#include <kern/sched_prim.h>
#include <kern/percpu.h>
#include <kern/ast.h>

#include <kern/cpu_quiesce.h>

/*
 * CPU quiescing generation counter implemented with a checkin mask
 *
 * A tri-state bitfield, with 2 bits for each processor:
 * 1) 'checkin' bit, saying this processor has 'checked in', i.e. executed the acqrel barrier
 * 2) 'expected' bit, saying this processor is expected to check in, i.e. not idle.
 *
 * When a processor causes the 'expected' bits to equal the 'checkin' bits, which
 * indicates that all processors have executed the barrier, it ticks the algorithm
 * and resets the state.
 *
 * Idle CPUs won't check in, because they don't run, so the algorithm won't tick.
 * However, they can't do anything in userspace while idle, so we don't need
 * them to execute barriers, so we have them 'leave' the counter so that
 * they don't delay the tick while idle.
 *
 * This bitfield currently limits MAX_CPUS to 32 on LP64.
 * In the future, we can use double-wide atomics and int128 if we need 64 CPUs.
 *
 * The mask only guarantees ordering to code running in userspace.
 * We defer joining the counter until we actually reach userspace, allowing
 * processors that come out of idle and only run kernel code to avoid the overhead
 * of participation.
 *
 * We additionally defer updating the counter for a minimum interval to
 * reduce the frequency of executing the exclusive atomic operations.
 *
 * The longest delay between two checkins, assuming that at least one processor
 * joins, is <checkin delay> + (<thread quantum> * 2).
 */

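/*
 * For illustration (values derived from the helpers below): on LP64, with
 * CPUs 0 and 2 participating and only CPU 0 checked in so far, the mask holds
 * cpu_expected_bit(0) | cpu_expected_bit(2) | cpu_checked_in_bit(0) == 0x23.
 * Once CPU 2 also checks in (0x23 -> 0x33), every expected CPU has checked in,
 * the commpage generation is ticked, and the checkin bits are cleared (0x22).
 */
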
typedef unsigned long checkin_mask_t;

static _Atomic checkin_mask_t cpu_quiescing_checkin_state;

static uint64_t cpu_checkin_last_commit;

struct cpu_quiesce {
        cpu_quiescent_state_t   state;
        uint64_t                last_checkin;
};

static struct cpu_quiesce PERCPU_DATA(cpu_quiesce);

#define CPU_CHECKIN_MIN_INTERVAL_US     4000 /* 4ms */
#define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC /* 1s */
static uint64_t cpu_checkin_min_interval;
static uint32_t cpu_checkin_min_interval_us;

#if __LP64__
#define CPU_CHECKIN_MASK_MAX_CPUS 32
#define CPU_CHECKIN_MASK          0x5555555555555555UL
#define CPU_EXPECTED_MASK         (~CPU_CHECKIN_MASK)
#else
/* Avoid double-wide CAS on 32-bit platforms by using a 32-bit state and mask */
#define CPU_CHECKIN_MASK_MAX_CPUS 16
#define CPU_CHECKIN_MASK          0x55555555UL
#define CPU_EXPECTED_MASK         (~CPU_CHECKIN_MASK)
#endif

static_assert(MAX_CPUS <= CPU_CHECKIN_MASK_MAX_CPUS);
static_assert(CPU_CHECKIN_MASK == CPU_EXPECTED_MASK >> 1);

static inline checkin_mask_t
cpu_checked_in_bit(int cpuid)
{
        return 1UL << (2 * cpuid);
}

static inline checkin_mask_t
cpu_expected_bit(int cpuid)
{
        return 1UL << (2 * cpuid + 1);
}

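/*
 * Example: for cpuid 2, cpu_checked_in_bit() is 1UL << 4 and
 * cpu_expected_bit() is 1UL << 5, so each CPU owns one adjacent
 * (checkin, expected) bit pair; CPU_CHECKIN_MASK selects the even bits
 * and CPU_EXPECTED_MASK the odd bits.
 */
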
void
cpu_quiescent_counter_init(void)
{
        assert(CPU_CHECKIN_MASK & cpu_checked_in_bit(MAX_CPUS - 1));
        assert(CPU_EXPECTED_MASK & cpu_expected_bit(MAX_CPUS - 1));
        assert((CPU_CHECKIN_MASK & cpu_expected_bit(MAX_CPUS - 1)) == 0);
        assert((CPU_EXPECTED_MASK & cpu_checked_in_bit(MAX_CPUS - 1)) == 0);

        cpu_quiescent_counter_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US);
}

void
cpu_quiescent_counter_set_min_interval_us(uint32_t new_value_us)
{
        /* clamp to something vaguely sane */
        if (new_value_us > CPU_CHECKIN_MIN_INTERVAL_MAX_US) {
                new_value_us = CPU_CHECKIN_MIN_INTERVAL_MAX_US;
        }

        cpu_checkin_min_interval_us = new_value_us;

        uint64_t abstime = 0;
        clock_interval_to_absolutetime_interval(cpu_checkin_min_interval_us,
            NSEC_PER_USEC, &abstime);
        cpu_checkin_min_interval = abstime;
}

uint32_t
cpu_quiescent_counter_get_min_interval_us(void)
{
        return cpu_checkin_min_interval_us;
}

/*
 * Called when all running CPUs have checked in.
 *
 * The commpage increment is protected by the 'lock' of having caused the tick,
 * and it is published by the state reset release barrier.
 */
static void
cpu_quiescent_counter_commit(uint64_t ctime)
{
        __kdebug_only uint64_t old_gen;
        __kdebug_only checkin_mask_t old_state;

        old_gen = commpage_increment_cpu_quiescent_counter();

        cpu_checkin_last_commit = ctime;

        old_state = os_atomic_andnot(&cpu_quiescing_checkin_state, CPU_CHECKIN_MASK, release);

        KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER), old_gen, old_state, ctime, 0);
}

/*
 * Have all the expected CPUs checked in?
 */
static bool
cpu_quiescent_counter_needs_commit(checkin_mask_t state)
{
        return (state & CPU_CHECKIN_MASK) == ((state & CPU_EXPECTED_MASK) >> 1);
}

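/*
 * Example for cpu_quiescent_counter_needs_commit() above: with only CPU 0
 * expected and not yet checked in (state == 0x2), the comparison is
 * 0x0 == 0x1 and no commit is needed; once CPU 0 checks in (state == 0x3)
 * it becomes 0x1 == 0x1 and the caller performs the commit.
 */
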
/*
 * Called when a processor wants to start participating in the counter, e.g.
 * 1) when context switching away from the idle thread
 * 2) when coming up for the first time
 * 3) when coming up after a shutdown
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_join(__unused uint64_t ctime)
{
        struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
        __assert_only int cpuid = cpu_number();

        assert(cpuid < MAX_CPUS);
        assert(st->state == CPU_QUIESCE_COUNTER_NONE ||
            st->state == CPU_QUIESCE_COUNTER_LEFT);

        assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
            (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

        st->state = CPU_QUIESCE_COUNTER_PENDING_JOIN;

        /*
         * Mark the processor to call cpu_quiescent_counter_ast before it
         * ever returns to userspace.
         */
        ast_on(AST_UNQUIESCE);
}

/*
 * Called with interrupts disabled from the userspace boundary at the AST_UNQUIESCE callback.
 * It needs to acquire the counter to see the data and the counter value published by other CPUs.
 */
void
cpu_quiescent_counter_ast(void)
{
        struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
        int cpuid = cpu_number();

        assert(st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

        /* We had better not already be joined. */
        assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
            (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

        /*
         * No release barrier needed because we have no prior state to publish.
         * Acquire barrier needed because we need this processor to see
         * the latest counter value.
         *
         * The state may be in 'needs checkin' both before and after
         * this atomic or.
         *
         * Additionally, if this is the first processor to come out of idle,
         * it may need to kickstart the algorithm, otherwise it would
         * stay in 'needs commit' perpetually with no processor assigned to
         * actually do the commit. To do that, the first processor only adds
         * its expected bit.
         */

        st->state = CPU_QUIESCE_COUNTER_JOINED;
        st->last_checkin = mach_absolute_time();

        checkin_mask_t old_mask, new_mask;
        os_atomic_rmw_loop(&cpu_quiescing_checkin_state, old_mask, new_mask, acquire, {
                if (old_mask == 0) {
                        new_mask = old_mask | cpu_expected_bit(cpuid);
                } else {
                        new_mask = old_mask | cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid);
                }
        });
}

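/*
 * Illustration of the kickstart above: if the mask is empty and CPU 1 joins,
 * setting both of its bits (0xc) would already satisfy
 * cpu_quiescent_counter_needs_commit() with no processor left to notice the
 * transition, so the state would never be reset. Setting only
 * cpu_expected_bit(1) (0x8) leaves the commit to CPU 1's own next call to
 * cpu_quiescent_counter_checkin().
 */
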
/*
 * Called when a processor no longer wants to participate in the counter,
 * i.e. when a processor is on its way to idle or shutdown.
 *
 * Called with interrupts disabled.
 *
 * The processor needs to remove itself from the expected mask, to allow the
 * algorithm to continue ticking without its participation.
 * However, it needs to ensure that anything it has done since the last time
 * it checked in has been published before the next tick is allowed to commit.
 */
void
cpu_quiescent_counter_leave(uint64_t ctime)
{
        struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
        int cpuid = cpu_number();

        assert(st->state == CPU_QUIESCE_COUNTER_JOINED ||
            st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

        /* We no longer need the cpu_quiescent_counter_ast callback to be armed */
        ast_off(AST_UNQUIESCE);

        if (st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN) {
                /* We never actually joined, so we don't have to do the work to leave. */
                st->state = CPU_QUIESCE_COUNTER_LEFT;
                return;
        }

        /* Leaving can't be deferred, even if we're within the min interval */
        st->last_checkin = ctime;

        checkin_mask_t mask = cpu_checked_in_bit(cpuid) | cpu_expected_bit(cpuid);

        checkin_mask_t orig_state = os_atomic_andnot_orig(&cpu_quiescing_checkin_state,
            mask, acq_rel);

        assert((orig_state & cpu_expected_bit(cpuid)));

        st->state = CPU_QUIESCE_COUNTER_LEFT;

        if (cpu_quiescent_counter_needs_commit(orig_state)) {
                /*
                 * the old state indicates someone else was already doing a commit
                 * but hadn't finished yet. We successfully inserted the acq_rel
                 * before they finished the commit by resetting the bitfield,
                 * so we're done here.
                 */
                return;
        }

        checkin_mask_t new_state = orig_state & ~mask;

        if (cpu_quiescent_counter_needs_commit(new_state)) {
                cpu_quiescent_counter_commit(ctime);
        }
}

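/*
 * Example for cpu_quiescent_counter_leave() above: with CPUs 0 and 1 expected
 * and only CPU 1 checked in (state == 0xe), CPU 0 leaving clears its two bits;
 * the new state 0xc now has every remaining expected CPU checked in, so the
 * departing CPU performs the commit itself.
 */
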
/*
 * Called when a processor wants to check in to the counter.
 * If it hasn't yet fully joined, it doesn't need to check in.
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_checkin(uint64_t ctime)
{
        struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
        int cpuid = cpu_number();

        assert(st->state != CPU_QUIESCE_COUNTER_NONE);

        /* If we're not joined yet, we don't need to check in */
        if (__probable(st->state != CPU_QUIESCE_COUNTER_JOINED)) {
                return;
        }

        /* If we've checked in recently, we don't need to check in yet. */
        if (__probable((ctime - st->last_checkin) <= cpu_checkin_min_interval)) {
                return;
        }

        st->last_checkin = ctime;

        checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);

        assert((state & cpu_expected_bit(cpuid)));

        if (__probable((state & cpu_checked_in_bit(cpuid)))) {
                /*
                 * Processor has already checked in for this round, no need to
                 * acquire the cacheline exclusive.
                 */
                return;
        }

        checkin_mask_t orig_state = os_atomic_or_orig(&cpu_quiescing_checkin_state,
            cpu_checked_in_bit(cpuid), acq_rel);

        checkin_mask_t new_state = orig_state | cpu_checked_in_bit(cpuid);

        if (cpu_quiescent_counter_needs_commit(new_state)) {
                assertf(!cpu_quiescent_counter_needs_commit(orig_state),
                    "old: 0x%lx, new: 0x%lx", orig_state, new_state);
                cpu_quiescent_counter_commit(ctime);
        }
}

#if MACH_ASSERT
/*
 * Called on all AST exits to userspace to assert this processor actually joined.
 *
 * Called with interrupts disabled after the AST should have been handled.
 */
void
cpu_quiescent_counter_assert_ast(void)
{
        struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
        int cpuid = cpu_number();

        assert(st->state == CPU_QUIESCE_COUNTER_JOINED);

        checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);
        assert((state & cpu_expected_bit(cpuid)));
}
#endif /* MACH_ASSERT */