/*
 * Copyright (c) 2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#ifdef __x86_64__
#error This file is only needed on weakly-ordered systems!
#endif

#include <machine/atomic.h>
#include <machine/commpage.h>
#include <machine/machine_cpu.h>

#include <kern/sched_prim.h>
#include <kern/processor.h>
#include <kern/ast.h>

#include <kern/cpu_quiesce.h>

/*
 * CPU quiescing generation counter implemented with a checkin mask
 *
 * A tri-state bitfield, with 2 bits for each processor:
 * 1) 'checkin' bit, saying this processor has 'checked in', i.e. executed the acqrel barrier
 * 2) 'expected' bit, saying this processor is expected to check in, i.e. not idle.
 *
 * When a processor causes the 'expected' bits to equal the 'checkin' bits, which
 * indicates that all processors have executed the barrier, it ticks the algorithm
 * and resets the state.
 *
 * Idle CPUs won't check in, because they don't run, so the algorithm won't tick.
 * However, they can't do anything in userspace while idle, so we don't need
 * them to execute barriers, so we have them 'leave' the counter so that
 * they don't delay the tick while idle.
 *
 * This bitfield currently limits MAX_CPUS to 32 on LP64.
 * In the future, we can use double-wide atomics and int128 if we need 64 CPUs.
 *
 * The mask only guarantees ordering to code running in userspace.
 * We defer joining the counter until we actually reach userspace, allowing
 * processors that come out of idle and only run kernel code to avoid the overhead
 * of participation.
 *
 * We additionally defer updating the counter for a minimum interval to
 * reduce the frequency of executing the exclusive atomic operations.
 *
 * The longest delay between two checkins assuming that at least one processor
 * joins is <checkin delay> + (<thread quantum> * 2)
 */
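
/*
 * Worked example of the bound above, assuming the default 4ms checkin
 * interval and a hypothetical 10ms thread quantum: the longest gap between
 * two commits, given at least one joined processor, is roughly
 * 4ms + (10ms * 2) = 24ms.
 */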

typedef unsigned long checkin_mask_t;

static _Atomic checkin_mask_t cpu_quiescing_checkin_state;

static uint64_t cpu_checkin_last_commit;

#define CPU_CHECKIN_MIN_INTERVAL_US 4000 /* 4ms */
#define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC /* 1s */
static uint64_t cpu_checkin_min_interval;
uint32_t cpu_checkin_min_interval_us;

#if __LP64__
static_assert(MAX_CPUS <= 32);
#define CPU_CHECKIN_MASK 0x5555555555555555UL
#define CPU_EXPECTED_MASK (~CPU_CHECKIN_MASK)
#else
/* Avoid double-wide CAS on 32-bit platforms by using a 32-bit state and mask */
static_assert(MAX_CPUS <= 16);
#define CPU_CHECKIN_MASK 0x55555555UL
#define CPU_EXPECTED_MASK (~CPU_CHECKIN_MASK)
#endif

static_assert(CPU_CHECKIN_MASK == CPU_EXPECTED_MASK >> 1);

static inline checkin_mask_t
cpu_checked_in_bit(int cpuid)
{
	return 1UL << (2 * cpuid);
}

static inline checkin_mask_t
cpu_expected_bit(int cpuid)
{
	return 1UL << (2 * cpuid + 1);
}
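
/*
 * Illustrative layout for a hypothetical two-CPU configuration:
 *   cpu_checked_in_bit(0) == 0x1   cpu_expected_bit(0) == 0x2
 *   cpu_checked_in_bit(1) == 0x4   cpu_expected_bit(1) == 0x8
 * Both CPUs joined, neither checked in yet:  state == 0xA
 * Both CPUs joined and checked in:           state == 0xF
 */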

void
cpu_quiescent_counter_init(void)
{
	assert(CPU_CHECKIN_MASK & cpu_checked_in_bit(MAX_CPUS));
	assert(CPU_EXPECTED_MASK & cpu_expected_bit(MAX_CPUS));
	assert((CPU_CHECKIN_MASK & cpu_expected_bit(MAX_CPUS)) == 0);
	assert((CPU_EXPECTED_MASK & cpu_checked_in_bit(MAX_CPUS)) == 0);

	cpu_quiescent_counter_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US);
}

void
cpu_quiescent_counter_set_min_interval_us(uint32_t new_value_us)
{
	/* clamp to something vaguely sane */
	if (new_value_us > CPU_CHECKIN_MIN_INTERVAL_MAX_US)
		new_value_us = CPU_CHECKIN_MIN_INTERVAL_MAX_US;

	cpu_checkin_min_interval_us = new_value_us;

	uint64_t abstime = 0;
	clock_interval_to_absolutetime_interval(cpu_checkin_min_interval_us,
	    NSEC_PER_USEC, &abstime);
	cpu_checkin_min_interval = abstime;
}


/*
 * Called when all running CPUs have checked in.
 *
 * The commpage increment is protected by the 'lock' of having caused the tick,
 * and it is published by the state reset release barrier.
 */
static void
cpu_quiescent_counter_commit(uint64_t ctime)
{
	__kdebug_only uint64_t old_gen;
	__kdebug_only checkin_mask_t old_state;

	old_gen = commpage_increment_cpu_quiescent_counter();

	cpu_checkin_last_commit = ctime;

	old_state = os_atomic_and(&cpu_quiescing_checkin_state, ~CPU_CHECKIN_MASK, release);

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER), old_gen, old_state, ctime, 0);
}

/*
 * Have all the expected CPUs checked in?
 */
static bool
cpu_quiescent_counter_needs_commit(checkin_mask_t state)
{
	return (state & CPU_CHECKIN_MASK) == ((state & CPU_EXPECTED_MASK) >> 1);
}
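
/*
 * Illustrative worked example for a hypothetical two-CPU configuration,
 * restating the predicate above with literal states:
 * 0xA = both CPUs expected, neither checked in; 0xF = both checked in.
 */
static_assert((0xAUL & CPU_CHECKIN_MASK) != ((0xAUL & CPU_EXPECTED_MASK) >> 1)); /* no commit yet */
static_assert((0xFUL & CPU_CHECKIN_MASK) == ((0xFUL & CPU_EXPECTED_MASK) >> 1)); /* commit */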

/*
 * Called when a processor wants to start participating in the counter, e.g.
 * 1) when context switching away from the idle thread
 * 2) when coming up for the first time
 * 3) when coming up after a shutdown
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_join(__unused uint64_t ctime)
{
	processor_t processor = current_processor();
	__assert_only int cpuid = processor->cpu_id;

	assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_NONE ||
	    processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_LEFT);

	assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
	    (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

	processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_PENDING_JOIN;

	/*
	 * Mark the processor to call cpu_quiescent_counter_ast before it
	 * ever returns to userspace.
	 */
	ast_on(AST_UNQUIESCE);
}

/*
 * Called with interrupts disabled from the userspace boundary at the AST_UNQUIESCE callback.
 * It needs to acquire the counter to see data and the counter value published by other CPUs.
 */
void
cpu_quiescent_counter_ast(void)
{
	processor_t processor = current_processor();
	int cpuid = processor->cpu_id;

	assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

	/* We had better not already be joined. */
	assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
	    (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

	/*
	 * No release barrier needed because we have no prior state to publish.
	 * Acquire barrier needed because we need this processor to see
	 * the latest counter value.
	 *
	 * The state may be in 'needs checkin' both before and after
	 * this atomic or.
	 *
	 * Additionally, if this is the first processor to come out of idle,
	 * it may need to kickstart the algorithm, otherwise it would
	 * stay in 'needs commit' perpetually with no processor assigned to
	 * actually do the commit. To do that, the first processor only adds
	 * its expected bit.
	 */

	processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_JOINED;
	processor->cpu_quiesce_last_checkin = mach_absolute_time();

	checkin_mask_t old_mask, new_mask;
	os_atomic_rmw_loop(&cpu_quiescing_checkin_state, old_mask, new_mask, acquire, {
		if (old_mask == 0) {
			new_mask = old_mask | cpu_expected_bit(cpuid);
		} else {
			new_mask = old_mask | cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid);
		}
	});
}
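
/*
 * Illustrative kickstart example for a hypothetical CPU 0 that is the first
 * to come out of idle: old_mask == 0, so only cpu_expected_bit(0) is set and
 * the state becomes 0x2, which does not satisfy
 * cpu_quiescent_counter_needs_commit() (0x0 != 0x1). The commit is instead
 * triggered by CPU 0's own later checkin, which takes the state to 0x3.
 */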

/*
 * Called when a processor no longer wants to participate in the counter,
 * i.e. when a processor is on its way to idle or shutdown.
 *
 * Called with interrupts disabled.
 *
 * The processor needs to remove itself from the expected mask, to allow the
 * algorithm to continue ticking without its participation.
 * However, it needs to ensure that anything it has done since the last time
 * it checked in has been published before the next tick is allowed to commit.
 */
void
cpu_quiescent_counter_leave(uint64_t ctime)
{
	processor_t processor = current_processor();
	int cpuid = processor->cpu_id;

	assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_JOINED ||
	    processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

	/* We no longer need the cpu_quiescent_counter_ast callback to be armed */
	ast_off(AST_UNQUIESCE);

	if (processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_PENDING_JOIN) {
		/* We never actually joined, so we don't have to do the work to leave. */
		processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_LEFT;
		return;
	}

	/* Leaving can't be deferred, even if we're within the min interval */
	processor->cpu_quiesce_last_checkin = ctime;

	checkin_mask_t mask = cpu_checked_in_bit(cpuid) | cpu_expected_bit(cpuid);

	checkin_mask_t orig_state = os_atomic_and_orig(&cpu_quiescing_checkin_state,
	    ~mask, acq_rel);

	assert((orig_state & cpu_expected_bit(cpuid)));

	processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_LEFT;

	if (cpu_quiescent_counter_needs_commit(orig_state)) {
		/*
		 * the old state indicates someone else was already doing a commit
		 * but hadn't finished yet. We successfully inserted the acq_rel
		 * before they finished the commit by resetting the bitfield,
		 * so we're done here.
		 */
		return;
	}

	checkin_mask_t new_state = orig_state & ~mask;

	if (cpu_quiescent_counter_needs_commit(new_state)) {
		cpu_quiescent_counter_commit(ctime);
	}
}
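
/*
 * Illustrative example of the race handled above, for hypothetical CPUs 0
 * and 1 both joined and checked in (state 0xF): if CPU 1 leaves while CPU 0
 * is still inside cpu_quiescent_counter_commit(), orig_state (0xF) already
 * satisfies the needs-commit predicate, so CPU 1 only clears its own bits
 * and lets CPU 0's in-flight commit finish.
 */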

/*
 * Called when a processor wants to check in to the counter
 * If it hasn't yet fully joined, it doesn't need to check in.
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_checkin(uint64_t ctime)
{
	processor_t processor = current_processor();
	int cpuid = processor->cpu_id;

	assert(processor->cpu_quiesce_state != CPU_QUIESCE_COUNTER_NONE);

	/* If we're not joined yet, we don't need to check in */
	if (__probable(processor->cpu_quiesce_state != CPU_QUIESCE_COUNTER_JOINED))
		return;

	/* If we've checked in recently, we don't need to check in yet. */
	if (__probable((ctime - processor->cpu_quiesce_last_checkin) <= cpu_checkin_min_interval))
		return;

	processor->cpu_quiesce_last_checkin = ctime;

	checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);

	assert((state & cpu_expected_bit(cpuid)));

	if (__probable((state & cpu_checked_in_bit(cpuid)))) {
		/*
		 * Processor has already checked in for this round, no need to
		 * acquire the cacheline exclusive.
		 */
		return;
	}

	checkin_mask_t orig_state = os_atomic_or_orig(&cpu_quiescing_checkin_state,
	    cpu_checked_in_bit(cpuid), acq_rel);

	checkin_mask_t new_state = orig_state | cpu_checked_in_bit(cpuid);

	if (cpu_quiescent_counter_needs_commit(new_state)) {
		assertf(!cpu_quiescent_counter_needs_commit(orig_state),
		    "old: 0x%lx, new: 0x%lx", orig_state, new_state);
		cpu_quiescent_counter_commit(ctime);
	}
}
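
/*
 * Note on the fast paths above: with the default CPU_CHECKIN_MIN_INTERVAL_US
 * of 4000, a joined processor performs at most one exclusive RMW on the
 * shared state every 4ms, and only when its checked-in bit is not already
 * set for the current generation.
 */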

#if MACH_ASSERT
/*
 * Called on all AST exits to userspace to assert this processor actually joined
 *
 * Called with interrupts disabled after the AST should have been handled
 */
void
cpu_quiescent_counter_assert_ast(void)
{
	processor_t processor = current_processor();
	int cpuid = processor->cpu_id;

	assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_JOINED);

	checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);
	assert((state & cpu_expected_bit(cpuid)));
}
#endif /* MACH_ASSERT */