/*
 * Copyright (c) 2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#ifdef __x86_64__
#error This file is only needed on weakly-ordered systems!
#endif

#include <machine/atomic.h>
#include <machine/commpage.h>
#include <machine/machine_cpu.h>

#include <kern/sched_prim.h>
#include <kern/processor.h>
#include <kern/ast.h>

#include <kern/cpu_quiesce.h>

/*
 * CPU quiescing generation counter implemented with a checkin mask
 *
 * A tri-state bitfield, with 2 bits for each processor:
 * 1) 'checkin' bit, saying this processor has 'checked in', i.e. executed the acqrel barrier
 * 2) 'expected' bit, saying this processor is expected to check in, i.e. not idle.
 *
 * When a processor causes the 'expected' bits to equal the 'checkin' bits, which
 * indicates that all processors have executed the barrier, it ticks the algorithm
 * and resets the state.
 *
 * Idle CPUs won't check in, because they don't run, so the algorithm won't tick.
 * However, they can't do anything in userspace while idle, so they don't need
 * to execute barriers; instead we have them 'leave' the counter so that
 * they don't delay the tick while idle.
 *
 * This bitfield currently limits MAX_CPUS to 32 on LP64.
 * In the future, we can use double-wide atomics and int128 if we need 64 CPUs.
 *
 * The mask only guarantees ordering to code running in userspace.
 * We defer joining the counter until we actually reach userspace, allowing
 * processors that come out of idle and only run kernel code to avoid the overhead
 * of participation.
 *
 * We additionally defer updating the counter for a minimum interval to
 * reduce the frequency of executing the exclusive atomic operations.
 *
 * The longest delay between two checkins, assuming that at least one processor
 * joins, is <checkin delay> + (<thread quantum> * 2).
 */
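/*
 * For illustration, on a hypothetical 4-CPU configuration the state is laid out as:
 *
 *     bit:  7   6   5   4   3   2   1   0
 *           E3  C3  E2  C2  E1  C1  E0  C0
 *
 * where Cn is cpu n's 'checkin' bit (1UL << (2 * n)) and En is its 'expected'
 * bit (1UL << (2 * n + 1)). The tick condition, cpu_quiescent_counter_needs_commit()
 * below, is that the checkin bits equal the expected bits shifted down by one.
 */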

typedef unsigned long checkin_mask_t;

static _Atomic checkin_mask_t cpu_quiescing_checkin_state;

static uint64_t cpu_checkin_last_commit;

#define CPU_CHECKIN_MIN_INTERVAL_US     4000 /* 4ms */
#define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC /* 1s */
static uint64_t cpu_checkin_min_interval;
static uint32_t cpu_checkin_min_interval_us;

#if __LP64__
static_assert(MAX_CPUS <= 32);
#define CPU_CHECKIN_MASK        0x5555555555555555UL
#define CPU_EXPECTED_MASK       (~CPU_CHECKIN_MASK)
#else
/* Avoid double-wide CAS on 32-bit platforms by using a 32-bit state and mask */
static_assert(MAX_CPUS <= 16);
#define CPU_CHECKIN_MASK        0x55555555UL
#define CPU_EXPECTED_MASK       (~CPU_CHECKIN_MASK)
#endif

static_assert(CPU_CHECKIN_MASK == CPU_EXPECTED_MASK >> 1);
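/* On LP64 this means CPU_EXPECTED_MASK == 0xaaaaaaaaaaaaaaaaUL, i.e. CPU_CHECKIN_MASK << 1. */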

static inline checkin_mask_t
cpu_checked_in_bit(int cpuid)
{
        return 1UL << (2 * cpuid);
}

static inline checkin_mask_t
cpu_expected_bit(int cpuid)
{
        return 1UL << (2 * cpuid + 1);
}
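/* For example, cpu_checked_in_bit(3) == 0x40 and cpu_expected_bit(3) == 0x80. */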

void
cpu_quiescent_counter_init(void)
{
        assert(CPU_CHECKIN_MASK & cpu_checked_in_bit(MAX_CPUS));
        assert(CPU_EXPECTED_MASK & cpu_expected_bit(MAX_CPUS));
        assert((CPU_CHECKIN_MASK & cpu_expected_bit(MAX_CPUS)) == 0);
        assert((CPU_EXPECTED_MASK & cpu_checked_in_bit(MAX_CPUS)) == 0);

        cpu_quiescent_counter_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US);
}

void
cpu_quiescent_counter_set_min_interval_us(uint32_t new_value_us)
{
        /* clamp to something vaguely sane */
        if (new_value_us > CPU_CHECKIN_MIN_INTERVAL_MAX_US) {
                new_value_us = CPU_CHECKIN_MIN_INTERVAL_MAX_US;
        }

        cpu_checkin_min_interval_us = new_value_us;

        uint64_t abstime = 0;
        clock_interval_to_absolutetime_interval(cpu_checkin_min_interval_us,
            NSEC_PER_USEC, &abstime);
        cpu_checkin_min_interval = abstime;
}

uint32_t
cpu_quiescent_counter_get_min_interval_us(void)
{
        return cpu_checkin_min_interval_us;
}

/*
 * Called when all running CPUs have checked in.
 *
 * The commpage increment is protected by the 'lock' of having caused the tick,
 * and it is published by the state reset release barrier.
 */
static void
cpu_quiescent_counter_commit(uint64_t ctime)
{
        __kdebug_only uint64_t old_gen;
        __kdebug_only checkin_mask_t old_state;

        old_gen = commpage_increment_cpu_quiescent_counter();

        cpu_checkin_last_commit = ctime;

        old_state = os_atomic_andnot(&cpu_quiescing_checkin_state, CPU_CHECKIN_MASK, release);

        KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER), old_gen, old_state, ctime, 0);
}

/*
 * Have all the expected CPUs checked in?
 */
static bool
cpu_quiescent_counter_needs_commit(checkin_mask_t state)
{
        return (state & CPU_CHECKIN_MASK) == ((state & CPU_EXPECTED_MASK) >> 1);
}
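/*
 * Worked example with cpus 0 and 1 participating: if both have joined and
 * checked in, state == 0xf, so (state & CPU_CHECKIN_MASK) == 0x5 and
 * ((state & CPU_EXPECTED_MASK) >> 1) == 0x5, and a commit is due.
 * If cpu 1 had not yet checked in, state == 0xb, the checkin half would be
 * 0x1 against an expected half of 0x5, and no commit would happen yet.
 */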

/*
 * Called when a processor wants to start participating in the counter, e.g.
 * 1) when context switching away from the idle thread
 * 2) when coming up for the first time
 * 3) when coming up after a shutdown
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_join(__unused uint64_t ctime)
{
        processor_t processor = current_processor();
        __assert_only int cpuid = processor->cpu_id;

        assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_NONE ||
            processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_LEFT);

        assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
            (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

        processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_PENDING_JOIN;

        /*
         * Mark the processor to call cpu_quiescent_counter_ast before it
         * ever returns to userspace.
         */
        ast_on(AST_UNQUIESCE);
}

/*
 * Called with interrupts disabled from the userspace boundary at the AST_UNQUIESCE callback.
 * It needs to acquire the counter state so that this processor sees the data and counter
 * values published by other CPUs.
 */
void
cpu_quiescent_counter_ast(void)
{
        processor_t processor = current_processor();
        int cpuid = processor->cpu_id;

        assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

        /* We had better not already be joined. */
        assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
            (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

        /*
         * No release barrier needed because we have no prior state to publish.
         * Acquire barrier needed because we need this processor to see
         * the latest counter value.
         *
         * The state may be in 'needs checkin' both before and after
         * this atomic or.
         *
         * Additionally, if this is the first processor to come out of idle,
         * it may need to kickstart the algorithm, otherwise it would
         * stay in 'needs commit' perpetually with no processor assigned to
         * actually do the commit. To do that, the first processor only adds
         * its expected bit.
         */
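        /*
         * For example, if cpu 0 is the first/only participant (old_mask == 0),
         * it sets only its expected bit (0x2) and leaves its checkin bit clear,
         * so that its next checkin can set 0x1 and perform the commit.
         * Otherwise it sets both of its bits (0x3 for cpu 0), which counts this
         * join as a checkin for the current round.
         */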

        processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_JOINED;
        processor->cpu_quiesce_last_checkin = mach_absolute_time();

        checkin_mask_t old_mask, new_mask;
        os_atomic_rmw_loop(&cpu_quiescing_checkin_state, old_mask, new_mask, acquire, {
                if (old_mask == 0) {
                        new_mask = old_mask | cpu_expected_bit(cpuid);
                } else {
                        new_mask = old_mask | cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid);
                }
        });
}

/*
 * Called when a processor no longer wants to participate in the counter,
 * i.e. when a processor is on its way to idle or shutdown.
 *
 * Called with interrupts disabled.
 *
 * The processor needs to remove itself from the expected mask, to allow the
 * algorithm to continue ticking without its participation.
 * However, it needs to ensure that anything it has done since the last time
 * it checked in has been published before the next tick is allowed to commit.
 */
void
cpu_quiescent_counter_leave(uint64_t ctime)
{
        processor_t processor = current_processor();
        int cpuid = processor->cpu_id;

        assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_JOINED ||
            processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

        /* We no longer need the cpu_quiescent_counter_ast callback to be armed */
        ast_off(AST_UNQUIESCE);

        if (processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_PENDING_JOIN) {
                /* We never actually joined, so we don't have to do the work to leave. */
                processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_LEFT;
                return;
        }

        /* Leaving can't be deferred, even if we're within the min interval */
        processor->cpu_quiesce_last_checkin = ctime;

        checkin_mask_t mask = cpu_checked_in_bit(cpuid) | cpu_expected_bit(cpuid);

        checkin_mask_t orig_state = os_atomic_andnot_orig(&cpu_quiescing_checkin_state,
            mask, acq_rel);

        assert((orig_state & cpu_expected_bit(cpuid)));

        processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_LEFT;

        if (cpu_quiescent_counter_needs_commit(orig_state)) {
                /*
                 * the old state indicates someone else was already doing a commit
                 * but hadn't finished yet. We successfully inserted the acq_rel
                 * before they finished the commit by resetting the bitfield,
                 * so we're done here.
                 */
                return;
        }

        checkin_mask_t new_state = orig_state & ~mask;

        if (cpu_quiescent_counter_needs_commit(new_state)) {
                cpu_quiescent_counter_commit(ctime);
        }
}

/*
 * Called when a processor wants to check in to the counter.
 * If it hasn't yet fully joined, it doesn't need to check in.
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_checkin(uint64_t ctime)
{
        processor_t processor = current_processor();
        int cpuid = processor->cpu_id;

        assert(processor->cpu_quiesce_state != CPU_QUIESCE_COUNTER_NONE);

        /* If we're not joined yet, we don't need to check in */
        if (__probable(processor->cpu_quiesce_state != CPU_QUIESCE_COUNTER_JOINED)) {
                return;
        }

        /* If we've checked in recently, we don't need to check in yet. */
        if (__probable((ctime - processor->cpu_quiesce_last_checkin) <= cpu_checkin_min_interval)) {
                return;
        }

        processor->cpu_quiesce_last_checkin = ctime;

        checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);

        assert((state & cpu_expected_bit(cpuid)));

        if (__probable((state & cpu_checked_in_bit(cpuid)))) {
                /*
                 * Processor has already checked in for this round, no need to
                 * acquire the cacheline exclusive.
                 */
                return;
        }

        checkin_mask_t orig_state = os_atomic_or_orig(&cpu_quiescing_checkin_state,
            cpu_checked_in_bit(cpuid), acq_rel);

        checkin_mask_t new_state = orig_state | cpu_checked_in_bit(cpuid);

        if (cpu_quiescent_counter_needs_commit(new_state)) {
                assertf(!cpu_quiescent_counter_needs_commit(orig_state),
                    "old: 0x%lx, new: 0x%lx", orig_state, new_state);
                cpu_quiescent_counter_commit(ctime);
        }
}

#if MACH_ASSERT
/*
 * Called on all AST exits to userspace to assert this processor actually joined.
 *
 * Called with interrupts disabled after the AST should have been handled.
 */
void
cpu_quiescent_counter_assert_ast(void)
{
        processor_t processor = current_processor();
        int cpuid = processor->cpu_id;

        assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_JOINED);

        checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);
        assert((state & cpu_expected_bit(cpuid)));
}
#endif /* MACH_ASSERT */