/*
 * Copyright (c) 2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#ifdef __x86_64__
#error This file is only needed on weakly-ordered systems!
#endif

#include <machine/atomic.h>
#include <machine/commpage.h>
#include <machine/machine_cpu.h>

#include <kern/sched_prim.h>
#include <kern/percpu.h>
#include <kern/ast.h>

#include <kern/cpu_quiesce.h>

/*
 * CPU quiescing generation counter implemented with a checkin mask
 *
 * A tri-state bitfield, with 2 bits for each processor:
 * 1) 'checkin' bit, saying this processor has 'checked in', i.e. executed the acqrel barrier
 * 2) 'expected' bit, saying this processor is expected to check in, i.e. not idle.
 *
 * When a processor causes the 'expected' bits to equal the 'checkin' bits, which
 * indicates that all processors have executed the barrier, it ticks the algorithm
 * and resets the state.
 *
 * Idle CPUs won't check in, because they don't run, so the algorithm won't tick.
 * However, they can't do anything in userspace while idle, so we don't need
 * them to execute barriers, so we have them 'leave' the counter so that
 * they don't delay the tick while idle.
 *
 * This bitfield currently limits MAX_CPUS to 32 on LP64.
 * In the future, we can use double-wide atomics and int128 if we need 64 CPUs.
 *
 * The mask only guarantees ordering to code running in userspace.
 * We defer joining the counter until we actually reach userspace, allowing
 * processors that come out of idle and only run kernel code to avoid the overhead
 * of participation.
 *
 * We additionally defer updating the counter for a minimum interval to
 * reduce the frequency of executing the exclusive atomic operations.
 *
 * The longest delay between two checkins, assuming that at least one processor
 * joins, is <checkin delay> + (<thread quantum> * 2).
 */
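
/*
 * For example, on a system with two participating CPUs the mask uses only
 * the low four bits:
 *
 *      bit 0: cpu 0 has checked in     bit 1: cpu 0 is expected to check in
 *      bit 2: cpu 1 has checked in     bit 3: cpu 1 is expected to check in
 *
 * so cpu_checked_in_bit(1) == 0x4, cpu_expected_bit(1) == 0x8, and a state in
 * which both CPUs participate and both have checked in reads 0xF.
 */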

typedef unsigned long checkin_mask_t;

static _Atomic checkin_mask_t cpu_quiescing_checkin_state;

static uint64_t cpu_checkin_last_commit;

struct cpu_quiesce {
	cpu_quiescent_state_t state;
	uint64_t last_checkin;
};

static struct cpu_quiesce PERCPU_DATA(cpu_quiesce);

#define CPU_CHECKIN_MIN_INTERVAL_US     4000 /* 4ms */
#define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC /* 1s */
static uint64_t cpu_checkin_min_interval;
static uint32_t cpu_checkin_min_interval_us;

#if __LP64__
static_assert(MAX_CPUS <= 32);
#define CPU_CHECKIN_MASK        0x5555555555555555UL
#define CPU_EXPECTED_MASK       (~CPU_CHECKIN_MASK)
#else
/* Avoid double-wide CAS on 32-bit platforms by using a 32-bit state and mask */
static_assert(MAX_CPUS <= 16);
#define CPU_CHECKIN_MASK        0x55555555UL
#define CPU_EXPECTED_MASK       (~CPU_CHECKIN_MASK)
#endif

static_assert(CPU_CHECKIN_MASK == CPU_EXPECTED_MASK >> 1);

static inline checkin_mask_t
cpu_checked_in_bit(int cpuid)
{
	return 1UL << (2 * cpuid);
}

static inline checkin_mask_t
cpu_expected_bit(int cpuid)
{
	return 1UL << (2 * cpuid + 1);
}

void
cpu_quiescent_counter_init(void)
{
	assert(CPU_CHECKIN_MASK & cpu_checked_in_bit(MAX_CPUS));
	assert(CPU_EXPECTED_MASK & cpu_expected_bit(MAX_CPUS));
	assert((CPU_CHECKIN_MASK & cpu_expected_bit(MAX_CPUS)) == 0);
	assert((CPU_EXPECTED_MASK & cpu_checked_in_bit(MAX_CPUS)) == 0);

	cpu_quiescent_counter_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US);
}

void
cpu_quiescent_counter_set_min_interval_us(uint32_t new_value_us)
{
	/* clamp to something vaguely sane */
	if (new_value_us > CPU_CHECKIN_MIN_INTERVAL_MAX_US) {
		new_value_us = CPU_CHECKIN_MIN_INTERVAL_MAX_US;
	}

	cpu_checkin_min_interval_us = new_value_us;

	uint64_t abstime = 0;
	clock_interval_to_absolutetime_interval(cpu_checkin_min_interval_us,
	    NSEC_PER_USEC, &abstime);
	cpu_checkin_min_interval = abstime;
}

uint32_t
cpu_quiescent_counter_get_min_interval_us(void)
{
	return cpu_checkin_min_interval_us;
}


/*
 * Called when all running CPUs have checked in.
 *
 * The commpage increment is protected by the 'lock' of having caused the tick,
 * and it is published by the state reset release barrier.
 */
static void
cpu_quiescent_counter_commit(uint64_t ctime)
{
	__kdebug_only uint64_t       old_gen;
	__kdebug_only checkin_mask_t old_state;

	old_gen = commpage_increment_cpu_quiescent_counter();

	cpu_checkin_last_commit = ctime;

	old_state = os_atomic_andnot(&cpu_quiescing_checkin_state, CPU_CHECKIN_MASK, release);

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER), old_gen, old_state, ctime, 0);
}

/*
 * Have all the expected CPUs checked in?
 */
static bool
cpu_quiescent_counter_needs_commit(checkin_mask_t state)
{
	return (state & CPU_CHECKIN_MASK) == ((state & CPU_EXPECTED_MASK) >> 1);
}
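
/*
 * For example, if cpus 0 and 1 are the only expected CPUs:
 *   state == 0xB (cpu 0 checked in, cpu 1 not yet):
 *       (state & CPU_CHECKIN_MASK) == 0x1, (state & CPU_EXPECTED_MASK) >> 1 == 0x5  -> no commit yet
 *   state == 0xF (both checked in):
 *       (state & CPU_CHECKIN_MASK) == 0x5, (state & CPU_EXPECTED_MASK) >> 1 == 0x5  -> time to commit
 */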

/*
 * Called when a processor wants to start participating in the counter, e.g.
 * 1) when context switching away from the idle thread
 * 2) when coming up for the first time
 * 3) when coming up after a shutdown
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_join(__unused uint64_t ctime)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	__assert_only int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_NONE ||
	    st->state == CPU_QUIESCE_COUNTER_LEFT);

	assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
	    (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

	st->state = CPU_QUIESCE_COUNTER_PENDING_JOIN;

	/*
	 * Mark the processor to call cpu_quiescent_counter_ast before it
	 * ever returns to userspace.
	 */
	ast_on(AST_UNQUIESCE);
}

/*
 * Called with interrupts disabled from the userspace boundary at the AST_UNQUIESCE callback.
 * It needs to acquire the counter so that this processor sees the data and the counter value
 * published by other CPUs.
 */
void
cpu_quiescent_counter_ast(void)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

	/* We had better not already be joined. */
	assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
	    (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

	/*
	 * No release barrier needed because we have no prior state to publish.
	 * Acquire barrier needed because we need this processor to see
	 * the latest counter value.
	 *
	 * The state may be in 'needs checkin' both before and after
	 * this atomic or.
	 *
	 * Additionally, if this is the first processor to come out of idle,
	 * it may need to kickstart the algorithm, otherwise it would
	 * stay in 'needs commit' perpetually with no processor assigned to
	 * actually do the commit.  To do that, the first processor only adds
	 * its expected bit.
	 */

	st->state = CPU_QUIESCE_COUNTER_JOINED;
	st->last_checkin = mach_absolute_time();

	checkin_mask_t old_mask, new_mask;
	os_atomic_rmw_loop(&cpu_quiescing_checkin_state, old_mask, new_mask, acquire, {
		if (old_mask == 0) {
			new_mask = old_mask | cpu_expected_bit(cpuid);
		} else {
			new_mask = old_mask | cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid);
		}
	});
}
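
/*
 * For example, if cpu 2 is the first processor to join while every other CPU
 * has left, old_mask is 0 and it publishes only cpu_expected_bit(2) (0x20).
 * Had it also set its checked-in bit, the state would immediately satisfy
 * cpu_quiescent_counter_needs_commit() with no processor obligated to notice
 * and perform the commit; instead, the commit happens on a later checkin.
 */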

/*
 * Called when a processor no longer wants to participate in the counter,
 * i.e. when a processor is on its way to idle or shutdown.
 *
 * Called with interrupts disabled.
 *
 * The processor needs to remove itself from the expected mask, to allow the
 * algorithm to continue ticking without its participation.
 * However, it needs to ensure that anything it has done since the last time
 * it checked in has been published before the next tick is allowed to commit.
 */
void
cpu_quiescent_counter_leave(uint64_t ctime)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_JOINED ||
	    st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

	/* We no longer need the cpu_quiescent_counter_ast callback to be armed */
	ast_off(AST_UNQUIESCE);

	if (st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN) {
		/* We never actually joined, so we don't have to do the work to leave. */
		st->state = CPU_QUIESCE_COUNTER_LEFT;
		return;
	}

	/* Leaving can't be deferred, even if we're within the min interval */
	st->last_checkin = ctime;

	checkin_mask_t mask = cpu_checked_in_bit(cpuid) | cpu_expected_bit(cpuid);

	checkin_mask_t orig_state = os_atomic_andnot_orig(&cpu_quiescing_checkin_state,
	    mask, acq_rel);

	assert((orig_state & cpu_expected_bit(cpuid)));

	st->state = CPU_QUIESCE_COUNTER_LEFT;

	if (cpu_quiescent_counter_needs_commit(orig_state)) {
		/*
		 * The old state indicates someone else was already doing a commit
		 * but hadn't finished yet.  We successfully inserted the acq_rel
		 * before they finished the commit by resetting the bitfield,
		 * so we're done here.
		 */
		return;
	}

	checkin_mask_t new_state = orig_state & ~mask;

	if (cpu_quiescent_counter_needs_commit(new_state)) {
		cpu_quiescent_counter_commit(ctime);
	}
}
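
/*
 * For example, with cpus 0 and 1 the only participants and both already
 * checked in (orig_state == 0xF), some other processor has caused the tick
 * and is on its way to cpu_quiescent_counter_commit() but has not yet reset
 * the checkin bits.  A leave() on cpu 0 that observes that orig_state simply
 * returns; its acq_rel andnot has already published this CPU's prior work
 * ahead of the pending state reset.
 */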

/*
 * Called when a processor wants to check in to the counter.
 * If it hasn't yet fully joined, it doesn't need to check in.
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_checkin(uint64_t ctime)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state != CPU_QUIESCE_COUNTER_NONE);

	/* If we're not joined yet, we don't need to check in */
	if (__probable(st->state != CPU_QUIESCE_COUNTER_JOINED)) {
		return;
	}

	/* If we've checked in recently, we don't need to check in yet. */
	if (__probable((ctime - st->last_checkin) <= cpu_checkin_min_interval)) {
		return;
	}

	st->last_checkin = ctime;

	checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);

	assert((state & cpu_expected_bit(cpuid)));

	if (__probable((state & cpu_checked_in_bit(cpuid)))) {
		/*
		 * Processor has already checked in for this round, no need to
		 * acquire the cacheline exclusive.
		 */
		return;
	}

	checkin_mask_t orig_state = os_atomic_or_orig(&cpu_quiescing_checkin_state,
	    cpu_checked_in_bit(cpuid), acq_rel);

	checkin_mask_t new_state = orig_state | cpu_checked_in_bit(cpuid);

	if (cpu_quiescent_counter_needs_commit(new_state)) {
		assertf(!cpu_quiescent_counter_needs_commit(orig_state),
		    "old: 0x%lx, new: 0x%lx", orig_state, new_state);
		cpu_quiescent_counter_commit(ctime);
	}
}

#if MACH_ASSERT
/*
 * Called on all AST exits to userspace to assert this processor actually joined.
 *
 * Called with interrupts disabled after the AST should have been handled.
 */
void
cpu_quiescent_counter_assert_ast(void)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_JOINED);

	checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);
	assert((state & cpu_expected_bit(cpuid)));
}
#endif /* MACH_ASSERT */