[apple/xnu.git] / osfmk / kern / timer_call.c (xnu-7195.101.1)
1c79356b 1/*
c910b4d9 2 * Copyright (c) 1993-2008 Apple Inc. All rights reserved.
1c79356b 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
0a7de745 5 *
2d21ac55 6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
0a7de745 14 *
2d21ac55 15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
0a7de745 17 *
2d21ac55 18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
0a7de745 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b 27 */
28/*
29 * Timer interrupt callout module.
1c79356b 30 */
31
32#include <mach/mach_types.h>
33
34#include <kern/clock.h>
3e170ce0 35#include <kern/smp.h>
9bccf70c 36#include <kern/processor.h>
1c79356b 37#include <kern/timer_call.h>
c910b4d9 38#include <kern/timer_queue.h>
39236c6e 39#include <kern/thread.h>
39037602 40#include <kern/policy_internal.h>
1c79356b 41
0c530ab8 42#include <sys/kdebug.h>
43
4b17d6b6 44#if CONFIG_DTRACE
2d21ac55 45#include <mach/sdt.h>
46#endif
1c79356b 47
1c79356b 48
6d2010ae 49#if DEBUG
0a7de745 50#define TIMER_ASSERT 1
6d2010ae 51#endif
52
53//#define TIMER_ASSERT 1
54//#define TIMER_DBG 1
55
56#if TIMER_DBG
57#define DBG(x...) kprintf("DBG: " x);
58#else
59#define DBG(x...)
60#endif
61
39236c6e 62#if TIMER_TRACE
0a7de745 63#define TIMER_KDEBUG_TRACE KERNEL_DEBUG_CONSTANT_IST
39236c6e 64#else
65#define TIMER_KDEBUG_TRACE(x...)
66#endif
67
f427ee49 68LCK_GRP_DECLARE(timer_call_lck_grp, "timer_call");
69LCK_GRP_DECLARE(timer_longterm_lck_grp, "timer_longterm");
39236c6e 70
3e170ce0 71/* Timer queue lock must be acquired with interrupts disabled (under splclock()) */
0a7de745 72#define timer_queue_lock_spin(queue) \
6d2010ae 73 lck_mtx_lock_spin_always(&queue->lock_data)
74
0a7de745 75#define timer_queue_unlock(queue) \
6d2010ae 76 lck_mtx_unlock_always(&queue->lock_data)
6d2010ae 77
39236c6e 78/*
79 * The longterm timer object is a global structure holding all timers
80 * beyond the short-term, local timer queue threshold. The boot processor
81 * is responsible for moving each timer to its local timer queue
82 * if and when that timer becomes due within the threshold.
83 */
5ba3f43e 84
85/* Sentinel for "no time set": */
0a7de745 86#define TIMER_LONGTERM_NONE EndOfAllTime
 87/* The default threshold is the delta above which a timer is "long-term" */
39236c6e 88#if defined(__x86_64__)
0a7de745 89#define TIMER_LONGTERM_THRESHOLD (1ULL * NSEC_PER_SEC) /* 1 sec */
39236c6e 90#else
0a7de745 91#define TIMER_LONGTERM_THRESHOLD TIMER_LONGTERM_NONE /* disabled */
39236c6e 92#endif
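/*
 * Editorial sketch (not in the original source): the classification that
 * timer_longterm_enqueue_unlocked() performs further below reduces to the
 * predicate sketched here.  All names are taken from this file; only the
 * helper itself is hypothetical, and the block is never compiled.
 */
#if 0
static boolean_t
deadline_is_longterm(uint64_t now, uint64_t deadline, uint32_t callout_flags)
{
	/* LOCAL timers and a disabled threshold always stay on a per-CPU queue */
	if ((callout_flags & TIMER_CALL_LOCAL) != 0 ||
	    timer_longterm.threshold.interval == TIMER_LONGTERM_NONE) {
		return FALSE;
	}
	/* only deadlines strictly beyond now + threshold go to the global queue */
	return deadline > now + timer_longterm.threshold.interval;
}
#endif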
93
5ba3f43e 94/*
a39ff7e2 95 * The scan_limit throttles processing of the longterm queue.
0a7de745 96 * If the scan time exceeds this limit, we terminate, unlock
a39ff7e2 97 * and defer for scan_interval. This prevents unbounded holding of
5ba3f43e 98 * timer queue locks with interrupts masked.
 99 */
0a7de745 100#define TIMER_LONGTERM_SCAN_LIMIT (100ULL * NSEC_PER_USEC) /* 100 us */
101#define TIMER_LONGTERM_SCAN_INTERVAL (100ULL * NSEC_PER_USEC) /* 100 us */
5ba3f43e 102/* Sentinel for "scan limit exceeded": */
0a7de745 103#define TIMER_LONGTERM_SCAN_AGAIN 0
5ba3f43e 104
39236c6e 105typedef struct {
0a7de745 106 uint64_t interval; /* longterm timer interval */
 107 uint64_t margin; /* fudge factor (10% of interval) */
108 uint64_t deadline; /* first/soonest longterm deadline */
109 uint64_t preempted; /* sooner timer has pre-empted */
110 timer_call_t call; /* first/soonest longterm timer call */
111 uint64_t deadline_set; /* next timer set */
112 timer_call_data_t timer; /* timer used by threshold management */
113 /* Stats: */
114 uint64_t scans; /* num threshold timer scans */
115 uint64_t preempts; /* num threshold reductions */
116 uint64_t latency; /* average threshold latency */
117 uint64_t latency_min; /* minimum threshold latency */
118 uint64_t latency_max; /* maximum threshold latency */
39236c6e 119} threshold_t;
120
121typedef struct {
0a7de745 122 mpqueue_head_t queue; /* longterm timer list */
123 uint64_t enqueues; /* num timers queued */
124 uint64_t dequeues; /* num timers dequeued */
125 uint64_t escalates; /* num timers becoming shortterm */
126 uint64_t scan_time; /* last time the list was scanned */
127 threshold_t threshold; /* longterm timer threshold */
128 uint64_t scan_limit; /* maximum scan time */
129 uint64_t scan_interval; /* interval between LT "escalation" scans */
130 uint64_t scan_pauses; /* num scans exceeding time limit */
39236c6e 131} timer_longterm_t;
132
0a7de745 133timer_longterm_t timer_longterm = {
134 .scan_limit = TIMER_LONGTERM_SCAN_LIMIT,
135 .scan_interval = TIMER_LONGTERM_SCAN_INTERVAL,
136};
137
138static mpqueue_head_t *timer_longterm_queue = NULL;
139
140static void timer_longterm_init(void);
141static void timer_longterm_callout(
142 timer_call_param_t p0,
143 timer_call_param_t p1);
144extern void timer_longterm_scan(
145 timer_longterm_t *tlp,
146 uint64_t now);
147static void timer_longterm_update(
148 timer_longterm_t *tlp);
149static void timer_longterm_update_locked(
150 timer_longterm_t *tlp);
151static mpqueue_head_t * timer_longterm_enqueue_unlocked(
152 timer_call_t call,
153 uint64_t now,
154 uint64_t deadline,
155 mpqueue_head_t ** old_queue,
156 uint64_t soft_deadline,
157 uint64_t ttd,
158 timer_call_param_t param1,
159 uint32_t callout_flags);
160static void timer_longterm_dequeued_locked(
161 timer_call_t call);
316670eb 162
163uint64_t past_deadline_timers;
164uint64_t past_deadline_deltas;
165uint64_t past_deadline_longest;
166uint64_t past_deadline_shortest = ~0ULL;
167enum {PAST_DEADLINE_TIMER_ADJUSTMENT_NS = 10 * 1000};
168
169uint64_t past_deadline_timer_adjustment;
170
39236c6e 171static boolean_t timer_call_enter_internal(timer_call_t call, timer_call_param_t param1, uint64_t deadline, uint64_t leeway, uint32_t flags, boolean_t ratelimited);
0a7de745 172boolean_t mach_timer_coalescing_enabled = TRUE;
6d2010ae 173
0a7de745 174mpqueue_head_t *timer_call_enqueue_deadline_unlocked(
175 timer_call_t call,
176 mpqueue_head_t *queue,
177 uint64_t deadline,
178 uint64_t soft_deadline,
179 uint64_t ttd,
180 timer_call_param_t param1,
181 uint32_t flags);
6d2010ae 182
0a7de745 183mpqueue_head_t *timer_call_dequeue_unlocked(
184 timer_call_t call);
6d2010ae 185
fe8ab488 186timer_coalescing_priority_params_t tcoal_prio_params;
187
188#if TCOAL_PRIO_STATS
189int32_t nc_tcl, rt_tcl, bg_tcl, kt_tcl, fp_tcl, ts_tcl, qos_tcl;
190#define TCOAL_PRIO_STAT(x) (x++)
191#else
192#define TCOAL_PRIO_STAT(x)
193#endif
194
195static void
196timer_call_init_abstime(void)
197{
198 int i;
199 uint64_t result;
200 timer_coalescing_priority_params_ns_t * tcoal_prio_params_init = timer_call_get_priority_params();
201 nanoseconds_to_absolutetime(PAST_DEADLINE_TIMER_ADJUSTMENT_NS, &past_deadline_timer_adjustment);
202 nanoseconds_to_absolutetime(tcoal_prio_params_init->idle_entry_timer_processing_hdeadline_threshold_ns, &result);
203 tcoal_prio_params.idle_entry_timer_processing_hdeadline_threshold_abstime = (uint32_t)result;
204 nanoseconds_to_absolutetime(tcoal_prio_params_init->interrupt_timer_coalescing_ilat_threshold_ns, &result);
205 tcoal_prio_params.interrupt_timer_coalescing_ilat_threshold_abstime = (uint32_t)result;
206 nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_resort_threshold_ns, &result);
207 tcoal_prio_params.timer_resort_threshold_abstime = (uint32_t)result;
208 tcoal_prio_params.timer_coalesce_rt_shift = tcoal_prio_params_init->timer_coalesce_rt_shift;
209 tcoal_prio_params.timer_coalesce_bg_shift = tcoal_prio_params_init->timer_coalesce_bg_shift;
210 tcoal_prio_params.timer_coalesce_kt_shift = tcoal_prio_params_init->timer_coalesce_kt_shift;
211 tcoal_prio_params.timer_coalesce_fp_shift = tcoal_prio_params_init->timer_coalesce_fp_shift;
212 tcoal_prio_params.timer_coalesce_ts_shift = tcoal_prio_params_init->timer_coalesce_ts_shift;
213
214 nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_rt_ns_max,
215 &tcoal_prio_params.timer_coalesce_rt_abstime_max);
216 nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_bg_ns_max,
217 &tcoal_prio_params.timer_coalesce_bg_abstime_max);
218 nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_kt_ns_max,
219 &tcoal_prio_params.timer_coalesce_kt_abstime_max);
220 nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_fp_ns_max,
221 &tcoal_prio_params.timer_coalesce_fp_abstime_max);
222 nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_ts_ns_max,
223 &tcoal_prio_params.timer_coalesce_ts_abstime_max);
224
225 for (i = 0; i < NUM_LATENCY_QOS_TIERS; i++) {
226 tcoal_prio_params.latency_qos_scale[i] = tcoal_prio_params_init->latency_qos_scale[i];
227 nanoseconds_to_absolutetime(tcoal_prio_params_init->latency_qos_ns_max[i],
228 &tcoal_prio_params.latency_qos_abstime_max[i]);
229 tcoal_prio_params.latency_tier_rate_limited[i] = tcoal_prio_params_init->latency_tier_rate_limited[i];
230 }
231}
232
1c79356b
A
233
234void
39236c6e 235timer_call_init(void)
1c79356b 236{
39236c6e 237 timer_longterm_init();
fe8ab488 238 timer_call_init_abstime();
1c79356b
A
239}
240
6d2010ae
A
241
242void
39236c6e 243timer_call_queue_init(mpqueue_head_t *queue)
6d2010ae 244{
39236c6e 245 DBG("timer_call_queue_init(%p)\n", queue);
f427ee49 246 mpqueue_init(queue, &timer_call_lck_grp, LCK_ATTR_NULL);
6d2010ae
A
247}
248
249
1c79356b
A
250void
251timer_call_setup(
0a7de745
A
252 timer_call_t call,
253 timer_call_func_t func,
254 timer_call_param_t param0)
1c79356b 255{
6d2010ae 256 DBG("timer_call_setup(%p,%p,%p)\n", call, func, param0);
f427ee49
A
257
258 *call = (struct timer_call) {
259 .tc_func = func,
260 .tc_param0 = param0,
261 .tc_async_dequeue = false,
262 };
263
264 simple_lock_init(&(call)->tc_lock, 0);
1c79356b 265}
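/*
 * Editorial usage sketch (not in the original source): a client embeds a
 * timer_call_data_t, initializes it once with timer_call_setup(), and arms
 * it with one of the timer_call_enter*() variants defined later in this
 * file.  "my_softc", "my_timeout" and "my_arm" are hypothetical names;
 * TIMER_CALL_SYS_NORMAL is an urgency flag from kern/timer_call.h.  The
 * block is never compiled.
 */
#if 0
struct my_softc {
	timer_call_data_t tcall;
};

static void
my_timeout(timer_call_param_t p0, timer_call_param_t p1)
{
	struct my_softc *sc = (struct my_softc *)p0;	/* param0 from setup */
	(void)sc;
	(void)p1;	/* param1 is NULL unless timer_call_enter1() was used */
}

static void
my_arm(struct my_softc *sc, uint64_t interval_abs)
{
	timer_call_setup(&sc->tcall, my_timeout, sc);
	(void) timer_call_enter(&sc->tcall,
	    mach_absolute_time() + interval_abs, TIMER_CALL_SYS_NORMAL);
}
#endif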
f427ee49 266
267static mpqueue_head_t*
268mpqueue_for_timer_call(timer_call_t entry)
269{
270 queue_t queue_entry_is_on = entry->tc_queue;
 271 /* 'cast' the queue back to the original mpqueue */
272 return __container_of(queue_entry_is_on, struct mpqueue_head, head);
273}
274
275
6d2010ae
A
276static __inline__ mpqueue_head_t *
277timer_call_entry_dequeue(
0a7de745 278 timer_call_t entry)
6d2010ae 279{
f427ee49 280 mpqueue_head_t *old_mpqueue = mpqueue_for_timer_call(entry);
6d2010ae 281
f427ee49
A
282 /* The entry was always on a queue */
283 assert(old_mpqueue != NULL);
284
285#if TIMER_ASSERT
286 if (!hw_lock_held((hw_lock_t)&entry->tc_lock)) {
6d2010ae 287 panic("_call_entry_dequeue() "
0a7de745
A
288 "entry %p is not locked\n", entry);
289 }
f427ee49 290
6d2010ae
A
291 /*
292 * XXX The queue lock is actually a mutex in spin mode
293 * but there's no way to test for it being held
294 * so we pretend it's a spinlock!
295 */
f427ee49 296 if (!hw_lock_held((hw_lock_t)&old_mpqueue->lock_data)) {
6d2010ae 297 panic("_call_entry_dequeue() "
f427ee49
A
298 "queue %p is not locked\n", old_mpqueue);
299 }
300#endif /* TIMER_ASSERT */
301
302 if (old_mpqueue != timer_longterm_queue) {
303 priority_queue_remove(&old_mpqueue->mpq_pqhead,
304 &entry->tc_pqlink);
0a7de745 305 }
6d2010ae 306
f427ee49 307 remqueue(&entry->tc_qlink);
c910b4d9 308
f427ee49
A
309 entry->tc_queue = NULL;
310
311 old_mpqueue->count--;
312
313 return old_mpqueue;
6d2010ae 314}
1c79356b 315
6d2010ae
A
316static __inline__ mpqueue_head_t *
317timer_call_entry_enqueue_deadline(
f427ee49
A
318 timer_call_t entry,
319 mpqueue_head_t *new_mpqueue,
320 uint64_t deadline)
6d2010ae 321{
f427ee49 322 mpqueue_head_t *old_mpqueue = mpqueue_for_timer_call(entry);
1c79356b 323
f427ee49
A
324#if TIMER_ASSERT
325 if (!hw_lock_held((hw_lock_t)&entry->tc_lock)) {
6d2010ae 326 panic("_call_entry_enqueue_deadline() "
0a7de745
A
327 "entry %p is not locked\n", entry);
328 }
f427ee49 329
6d2010ae 330 /* XXX More lock pretense: */
f427ee49 331 if (!hw_lock_held((hw_lock_t)&new_mpqueue->lock_data)) {
6d2010ae 332 panic("_call_entry_enqueue_deadline() "
f427ee49 333 "queue %p is not locked\n", new_mpqueue);
0a7de745 334 }
f427ee49
A
335
336 if (old_mpqueue != NULL && old_mpqueue != new_mpqueue) {
6d2010ae 337 panic("_call_entry_enqueue_deadline() "
f427ee49 338 "old_mpqueue %p != new_mpqueue", old_mpqueue);
0a7de745 339 }
f427ee49 340#endif /* TIMER_ASSERT */
1c79356b 341
f427ee49
A
342 /* no longterm queue involved */
343 assert(new_mpqueue != timer_longterm_queue);
344 assert(old_mpqueue != timer_longterm_queue);
39236c6e 345
f427ee49
A
346 if (old_mpqueue == new_mpqueue) {
347 /* optimize the same-queue case to avoid a full re-insert */
348 uint64_t old_deadline = entry->tc_pqlink.deadline;
349 entry->tc_pqlink.deadline = deadline;
39236c6e 350
f427ee49
A
351 if (old_deadline < deadline) {
352 priority_queue_entry_increased(&new_mpqueue->mpq_pqhead,
353 &entry->tc_pqlink);
354 } else {
355 priority_queue_entry_decreased(&new_mpqueue->mpq_pqhead,
356 &entry->tc_pqlink);
357 }
358 } else {
359 if (old_mpqueue != NULL) {
360 priority_queue_remove(&old_mpqueue->mpq_pqhead,
361 &entry->tc_pqlink);
39236c6e 362
f427ee49
A
363 re_queue_tail(&new_mpqueue->head, &entry->tc_qlink);
364 } else {
365 enqueue_tail(&new_mpqueue->head, &entry->tc_qlink);
366 }
39236c6e 367
f427ee49
A
368 entry->tc_queue = &new_mpqueue->head;
369 entry->tc_pqlink.deadline = deadline;
c910b4d9 370
f427ee49
A
371 priority_queue_insert(&new_mpqueue->mpq_pqhead, &entry->tc_pqlink);
372 }
39236c6e 373
39236c6e
A
374
375 /* For efficiency, track the earliest soft deadline on the queue,
376 * so that fuzzy decisions can be made without lock acquisitions.
377 */
fe8ab488 378
f427ee49
A
379 timer_call_t thead = priority_queue_min(&new_mpqueue->mpq_pqhead, struct timer_call, tc_pqlink);
380
381 new_mpqueue->earliest_soft_deadline = thead->tc_flags & TIMER_CALL_RATELIMITED ? thead->tc_pqlink.deadline : thead->tc_soft_deadline;
39236c6e 382
f427ee49
A
383 if (old_mpqueue) {
384 old_mpqueue->count--;
0a7de745 385 }
f427ee49 386 new_mpqueue->count++;
39236c6e 387
f427ee49 388 return old_mpqueue;
1c79356b
A
389}
390
39236c6e
A
391static __inline__ void
392timer_call_entry_enqueue_tail(
0a7de745
A
393 timer_call_t entry,
394 mpqueue_head_t *queue)
39236c6e 395{
f427ee49
A
396 /* entry is always dequeued before this call */
397 assert(entry->tc_queue == NULL);
398
399 /*
400 * this is only used for timer_longterm_queue, which is unordered
401 * and thus needs no priority queueing
402 */
403 assert(queue == timer_longterm_queue);
404
405 enqueue_tail(&queue->head, &entry->tc_qlink);
406
407 entry->tc_queue = &queue->head;
408
39236c6e
A
409 queue->count++;
410 return;
411}
412
413/*
414 * Remove timer entry from its queue but don't change the queue pointer
415 * and set the async_dequeue flag. This is locking case 2b.
416 */
417static __inline__ void
418timer_call_entry_dequeue_async(
0a7de745 419 timer_call_t entry)
39236c6e 420{
f427ee49
A
421 mpqueue_head_t *old_mpqueue = mpqueue_for_timer_call(entry);
422 if (old_mpqueue) {
423 old_mpqueue->count--;
424
425 if (old_mpqueue != timer_longterm_queue) {
426 priority_queue_remove(&old_mpqueue->mpq_pqhead,
427 &entry->tc_pqlink);
428 }
429
430 remqueue(&entry->tc_qlink);
431 entry->tc_async_dequeue = true;
39236c6e
A
432 }
433 return;
434}
435
6d2010ae
A
436#if TIMER_ASSERT
437unsigned timer_call_enqueue_deadline_unlocked_async1;
438unsigned timer_call_enqueue_deadline_unlocked_async2;
439#endif
440/*
441 * Assumes call_entry and queues unlocked, interrupts disabled.
442 */
443__inline__ mpqueue_head_t *
444timer_call_enqueue_deadline_unlocked(
0a7de745
A
445 timer_call_t call,
446 mpqueue_head_t *queue,
447 uint64_t deadline,
448 uint64_t soft_deadline,
449 uint64_t ttd,
450 timer_call_param_t param1,
451 uint32_t callout_flags)
1c79356b 452{
6d2010ae 453 DBG("timer_call_enqueue_deadline_unlocked(%p,%p,)\n", call, queue);
1c79356b 454
f427ee49 455 simple_lock(&call->tc_lock, LCK_GRP_NULL);
fe8ab488 456
f427ee49 457 mpqueue_head_t *old_queue = mpqueue_for_timer_call(call);
fe8ab488 458
6d2010ae 459 if (old_queue != NULL) {
39236c6e 460 timer_queue_lock_spin(old_queue);
f427ee49 461 if (call->tc_async_dequeue) {
39236c6e 462 /* collision (1c): timer already dequeued, clear flag */
6d2010ae 463#if TIMER_ASSERT
0a7de745
A
464 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
465 DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
466 VM_KERNEL_UNSLIDE_OR_PERM(call),
f427ee49
A
467 call->tc_async_dequeue,
468 VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
0a7de745 469 0x1c, 0);
6d2010ae
A
470 timer_call_enqueue_deadline_unlocked_async1++;
471#endif
f427ee49
A
472 call->tc_async_dequeue = false;
473 call->tc_queue = NULL;
39236c6e
A
474 } else if (old_queue != queue) {
475 timer_call_entry_dequeue(call);
6d2010ae
A
476#if TIMER_ASSERT
477 timer_call_enqueue_deadline_unlocked_async2++;
478#endif
479 }
0a7de745 480 if (old_queue == timer_longterm_queue) {
39236c6e 481 timer_longterm_dequeued_locked(call);
0a7de745 482 }
6d2010ae 483 if (old_queue != queue) {
39236c6e
A
484 timer_queue_unlock(old_queue);
485 timer_queue_lock_spin(queue);
6d2010ae
A
486 }
487 } else {
39236c6e 488 timer_queue_lock_spin(queue);
6d2010ae 489 }
1c79356b 490
f427ee49
A
491 call->tc_soft_deadline = soft_deadline;
492 call->tc_flags = callout_flags;
493 call->tc_param1 = param1;
494 call->tc_ttd = ttd;
fe8ab488 495
6d2010ae 496 timer_call_entry_enqueue_deadline(call, queue, deadline);
39236c6e 497 timer_queue_unlock(queue);
f427ee49 498 simple_unlock(&call->tc_lock);
1c79356b 499
0a7de745 500 return old_queue;
c910b4d9 501}
1c79356b 502
6d2010ae
A
503#if TIMER_ASSERT
504unsigned timer_call_dequeue_unlocked_async1;
505unsigned timer_call_dequeue_unlocked_async2;
506#endif
507mpqueue_head_t *
508timer_call_dequeue_unlocked(
0a7de745 509 timer_call_t call)
c910b4d9 510{
6d2010ae 511 DBG("timer_call_dequeue_unlocked(%p)\n", call);
1c79356b 512
f427ee49
A
513 simple_lock(&call->tc_lock, LCK_GRP_NULL);
514
515 mpqueue_head_t *old_queue = mpqueue_for_timer_call(call);
516
39236c6e 517#if TIMER_ASSERT
0a7de745
A
518 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
519 DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
520 VM_KERNEL_UNSLIDE_OR_PERM(call),
f427ee49
A
521 call->tc_async_dequeue,
522 VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
0a7de745 523 0, 0);
39236c6e 524#endif
6d2010ae 525 if (old_queue != NULL) {
39236c6e 526 timer_queue_lock_spin(old_queue);
f427ee49 527 if (call->tc_async_dequeue) {
39236c6e 528 /* collision (1c): timer already dequeued, clear flag */
6d2010ae 529#if TIMER_ASSERT
0a7de745
A
530 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
531 DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
4bd07ac2 532 VM_KERNEL_UNSLIDE_OR_PERM(call),
f427ee49
A
533 call->tc_async_dequeue,
534 VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
0a7de745 535 0x1c, 0);
6d2010ae
A
536 timer_call_dequeue_unlocked_async1++;
537#endif
f427ee49
A
538 call->tc_async_dequeue = false;
539 call->tc_queue = NULL;
6d2010ae 540 } else {
39236c6e 541 timer_call_entry_dequeue(call);
6d2010ae 542 }
0a7de745 543 if (old_queue == timer_longterm_queue) {
39236c6e 544 timer_longterm_dequeued_locked(call);
0a7de745 545 }
39236c6e 546 timer_queue_unlock(old_queue);
6d2010ae 547 }
f427ee49 548 simple_unlock(&call->tc_lock);
0a7de745 549 return old_queue;
1c79356b
A
550}
551
f427ee49
A
552uint64_t
553timer_call_past_deadline_timer_handle(uint64_t deadline, uint64_t ctime)
5ba3f43e 554{
0a7de745
A
555 uint64_t delta = (ctime - deadline);
556
557 past_deadline_timers++;
558 past_deadline_deltas += delta;
559 if (delta > past_deadline_longest) {
 560 past_deadline_longest = delta;
561 }
562 if (delta < past_deadline_shortest) {
563 past_deadline_shortest = delta;
564 }
565
566 return ctime + past_deadline_timer_adjustment;
5ba3f43e 567}
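/*
 * Editorial example (not in the original source): with
 * PAST_DEADLINE_TIMER_ADJUSTMENT_NS = 10 us, a timer armed with a deadline
 * that is already, say, 3 us in the past is counted in the past_deadline_*
 * statistics above and then re-targeted to roughly ctime + 10 us (converted
 * to absolute-time units), so it fires shortly after the current time
 * instead of carrying a stale deadline.
 */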
fe8ab488 568
569/*
570 * Timer call entry locking model
571 * ==============================
572 *
573 * Timer call entries are linked on per-cpu timer queues which are protected
574 * by the queue lock and the call entry lock. The locking protocol is:
575 *
576 * 0) The canonical locking order is timer call entry followed by queue.
577 *
578 * 1) With only the entry lock held, entry.queue is valid:
579 * 1a) NULL: the entry is not queued, or
580 * 1b) non-NULL: this queue must be locked before the entry is modified.
581 * After locking the queue, the call.async_dequeue flag must be checked:
582 * 1c) TRUE: the entry was removed from the queue by another thread
583 * and we must NULL the entry.queue and reset this flag, or
584 * 1d) FALSE: (ie. queued), the entry can be manipulated.
585 *
586 * 2) If a queue lock is obtained first, the queue is stable:
587 * 2a) If a try-lock of a queued entry succeeds, the call can be operated on
588 * and dequeued.
589 * 2b) If a try-lock fails, it indicates that another thread is attempting
590 * to change the entry and move it to a different position in this queue
 591 * or to a different queue. The entry can be dequeued but it should not be
592 * operated upon since it is being changed. Furthermore, we don't null
593 * the entry.queue pointer (protected by the entry lock we don't own).
594 * Instead, we set the async_dequeue flag -- see (1c).
595 * 2c) Same as 2b but occurring when a longterm timer is matured.
596 * 3) A callout's parameters (deadline, flags, parameters, soft deadline &c.)
597 * should be manipulated with the appropriate timer queue lock held,
598 * to prevent queue traversal observations from observing inconsistent
599 * updates to an in-flight callout.
600 */
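/*
 * Editorial sketch (not in the original source) of case (1) above: with the
 * entry lock taken first, the queue pointer is validated and async_dequeue
 * reconciled.  This mirrors timer_call_dequeue_unlocked() below and is shown
 * only to illustrate the protocol; the block is never compiled.
 */
#if 0
	simple_lock(&call->tc_lock, LCK_GRP_NULL);	/* rule (0): entry first */
	mpqueue_head_t *mpq = mpqueue_for_timer_call(call);
	if (mpq != NULL) {				/* case (1b): queued */
		timer_queue_lock_spin(mpq);
		if (call->tc_async_dequeue) {		/* case (1c): already removed */
			call->tc_async_dequeue = false;
			call->tc_queue = NULL;
		} else {				/* case (1d): safe to manipulate */
			timer_call_entry_dequeue(call);
		}
		timer_queue_unlock(mpq);
	}
	simple_unlock(&call->tc_lock);
#endif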
601
602/*
0a7de745 603 * In the debug case, we assert that the timer call locking protocol
fe8ab488
A
604 * is being obeyed.
605 */
606
0a7de745 607static boolean_t
6d2010ae 608timer_call_enter_internal(
0a7de745
A
609 timer_call_t call,
610 timer_call_param_t param1,
611 uint64_t deadline,
612 uint64_t leeway,
613 uint32_t flags,
614 boolean_t ratelimited)
1c79356b 615{
0a7de745
A
616 mpqueue_head_t *queue = NULL;
617 mpqueue_head_t *old_queue;
618 spl_t s;
619 uint64_t slop;
620 uint32_t urgency;
621 uint64_t sdeadline, ttd;
1c79356b 622
f427ee49 623 assert(call->tc_func != NULL);
1c79356b 624 s = splclock();
6d2010ae 625
fe8ab488 626 sdeadline = deadline;
39236c6e
A
627 uint64_t ctime = mach_absolute_time();
628
629 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
0a7de745 630 DECR_TIMER_ENTER | DBG_FUNC_START,
4bd07ac2 631 VM_KERNEL_UNSLIDE_OR_PERM(call),
5ba3f43e 632 VM_KERNEL_ADDRHIDE(param1), deadline, flags, 0);
39236c6e
A
633
634 urgency = (flags & TIMER_CALL_URGENCY_MASK);
635
636 boolean_t slop_ratelimited = FALSE;
637 slop = timer_call_slop(deadline, ctime, urgency, current_thread(), &slop_ratelimited);
638
0a7de745 639 if ((flags & TIMER_CALL_LEEWAY) != 0 && leeway > slop) {
39236c6e 640 slop = leeway;
0a7de745 641 }
39236c6e
A
642
643 if (UINT64_MAX - deadline <= slop) {
644 deadline = UINT64_MAX;
645 } else {
6d2010ae
A
646 deadline += slop;
647 }
1c79356b 648
316670eb 649 if (__improbable(deadline < ctime)) {
f427ee49 650 deadline = timer_call_past_deadline_timer_handle(deadline, ctime);
fe8ab488 651 sdeadline = deadline;
316670eb 652 }
39236c6e 653
39236c6e 654 if (ratelimited || slop_ratelimited) {
fe8ab488 655 flags |= TIMER_CALL_RATELIMITED;
39236c6e 656 } else {
fe8ab488 657 flags &= ~TIMER_CALL_RATELIMITED;
39236c6e
A
658 }
659
fe8ab488 660 ttd = sdeadline - ctime;
4b17d6b6 661#if CONFIG_DTRACE
f427ee49
A
662 DTRACE_TMR7(callout__create, timer_call_func_t, call->tc_func,
663 timer_call_param_t, call->tc_param0, uint32_t, flags,
fe8ab488
A
664 (deadline - sdeadline),
665 (ttd >> 32), (unsigned) (ttd & 0xFFFFFFFF), call);
4b17d6b6
A
666#endif
667
fe8ab488
A
668 /* Program timer callout parameters under the appropriate per-CPU or
669 * longterm queue lock. The callout may have been previously enqueued
670 * and in-flight on this or another timer queue.
671 */
39236c6e 672 if (!ratelimited && !slop_ratelimited) {
fe8ab488 673 queue = timer_longterm_enqueue_unlocked(call, ctime, deadline, &old_queue, sdeadline, ttd, param1, flags);
39236c6e 674 }
1c79356b 675
39236c6e
A
676 if (queue == NULL) {
677 queue = timer_queue_assign(deadline);
fe8ab488 678 old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline, sdeadline, ttd, param1, flags);
39236c6e 679 }
1c79356b 680
39236c6e 681#if TIMER_TRACE
f427ee49 682 call->tc_entry_time = ctime;
39236c6e
A
683#endif
684
685 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
0a7de745
A
686 DECR_TIMER_ENTER | DBG_FUNC_END,
687 VM_KERNEL_UNSLIDE_OR_PERM(call),
688 (old_queue != NULL), deadline, queue->count, 0);
1c79356b 689
1c79356b
A
690 splx(s);
691
0a7de745 692 return old_queue != NULL;
1c79356b
A
693}
694
39236c6e
A
695/*
696 * timer_call_*()
697 * return boolean indicating whether the call was previously queued.
698 */
6d2010ae
A
699boolean_t
700timer_call_enter(
0a7de745
A
701 timer_call_t call,
702 uint64_t deadline,
703 uint32_t flags)
6d2010ae 704{
39236c6e 705 return timer_call_enter_internal(call, NULL, deadline, 0, flags, FALSE);
6d2010ae
A
706}
707
1c79356b 708boolean_t
c910b4d9 709timer_call_enter1(
0a7de745
A
710 timer_call_t call,
711 timer_call_param_t param1,
712 uint64_t deadline,
713 uint32_t flags)
1c79356b 714{
39236c6e
A
715 return timer_call_enter_internal(call, param1, deadline, 0, flags, FALSE);
716}
717
718boolean_t
719timer_call_enter_with_leeway(
0a7de745
A
720 timer_call_t call,
721 timer_call_param_t param1,
722 uint64_t deadline,
723 uint64_t leeway,
724 uint32_t flags,
725 boolean_t ratelimited)
39236c6e
A
726{
727 return timer_call_enter_internal(call, param1, deadline, leeway, flags, ratelimited);
1c79356b 728}
729
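/*
 * Editorial usage sketch (not in the original source): arming a coalescable
 * timeout with an explicit leeway.  TIMER_CALL_LEEWAY asks
 * timer_call_enter_internal() to honor the caller's leeway when it exceeds
 * the computed slop; "my_call" is assumed to have been prepared with
 * timer_call_setup().  Never compiled.
 */
#if 0
	uint64_t deadline_abs, leeway_abs;

	nanoseconds_to_absolutetime(10 * NSEC_PER_MSEC, &deadline_abs);
	nanoseconds_to_absolutetime(2 * NSEC_PER_MSEC, &leeway_abs);

	(void) timer_call_enter_with_leeway(&my_call, NULL,
	    mach_absolute_time() + deadline_abs, leeway_abs,
	    TIMER_CALL_SYS_NORMAL | TIMER_CALL_LEEWAY,
	    FALSE /* not ratelimited */);
#endif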
730boolean_t
c910b4d9 731timer_call_cancel(
0a7de745 732 timer_call_t call)
1c79356b 733{
0a7de745
A
734 mpqueue_head_t *old_queue;
735 spl_t s;
1c79356b
A
736
737 s = splclock();
1c79356b 738
39236c6e 739 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
0a7de745
A
740 DECR_TIMER_CANCEL | DBG_FUNC_START,
741 VM_KERNEL_UNSLIDE_OR_PERM(call),
f427ee49 742 call->tc_pqlink.deadline, call->tc_soft_deadline, call->tc_flags, 0);
39236c6e 743
6d2010ae 744 old_queue = timer_call_dequeue_unlocked(call);
c910b4d9
A
745
746 if (old_queue != NULL) {
39236c6e 747 timer_queue_lock_spin(old_queue);
f427ee49
A
748
749 timer_call_t new_head = priority_queue_min(&old_queue->mpq_pqhead, struct timer_call, tc_pqlink);
750
751 if (new_head) {
752 timer_queue_cancel(old_queue, call->tc_pqlink.deadline, new_head->tc_pqlink.deadline);
753 old_queue->earliest_soft_deadline = new_head->tc_flags & TIMER_CALL_RATELIMITED ? new_head->tc_pqlink.deadline : new_head->tc_soft_deadline;
0a7de745 754 } else {
f427ee49 755 timer_queue_cancel(old_queue, call->tc_pqlink.deadline, UINT64_MAX);
39236c6e
A
756 old_queue->earliest_soft_deadline = UINT64_MAX;
757 }
f427ee49 758
39236c6e 759 timer_queue_unlock(old_queue);
1c79356b 760 }
39236c6e 761 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
0a7de745
A
762 DECR_TIMER_CANCEL | DBG_FUNC_END,
763 VM_KERNEL_UNSLIDE_OR_PERM(call),
764 VM_KERNEL_UNSLIDE_OR_PERM(old_queue),
f427ee49
A
765 call->tc_pqlink.deadline - mach_absolute_time(),
766 call->tc_pqlink.deadline - call->tc_entry_time, 0);
1c79356b
A
767 splx(s);
768
4b17d6b6 769#if CONFIG_DTRACE
f427ee49
A
770 DTRACE_TMR6(callout__cancel, timer_call_func_t, call->tc_func,
771 timer_call_param_t, call->tc_param0, uint32_t, call->tc_flags, 0,
772 (call->tc_ttd >> 32), (unsigned) (call->tc_ttd & 0xFFFFFFFF));
773#endif /* CONFIG_DTRACE */
4b17d6b6 774
0a7de745 775 return old_queue != NULL;
1c79356b 776}
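/*
 * Editorial usage sketch (not in the original source): the boolean result
 * distinguishes "dequeued before expiry" from "already fired or never
 * armed".  Note that cancellation does not synchronize with a callout that
 * is already executing on another CPU.  Never compiled.
 */
#if 0
	if (timer_call_cancel(&my_call)) {
		/* the timer was still pending; my_timeout() will not run */
	} else {
		/* the timer already expired (or was never armed) */
	}
#endif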
777
0a7de745 778static uint32_t timer_queue_shutdown_lock_skips;
fe8ab488
A
779static uint32_t timer_queue_shutdown_discarded;
780
9bccf70c 781void
c910b4d9 782timer_queue_shutdown(
0a7de745 783 mpqueue_head_t *queue)
9bccf70c 784{
0a7de745
A
785 timer_call_t call;
786 mpqueue_head_t *new_queue;
787 spl_t s;
9bccf70c 788
fe8ab488 789
6d2010ae
A
790 DBG("timer_queue_shutdown(%p)\n", queue);
791
c910b4d9 792 s = splclock();
9bccf70c 793
f427ee49
A
794 while (TRUE) {
795 timer_queue_lock_spin(queue);
796
797 call = qe_queue_first(&queue->head, struct timer_call, tc_qlink);
798
799 if (call == NULL) {
800 break;
801 }
fe8ab488 802
f427ee49 803 if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
6d2010ae
A
804 /*
805 * case (2b) lock order inversion, dequeue and skip
806 * Don't change the call_entry queue back-pointer
807 * but set the async_dequeue field.
808 */
809 timer_queue_shutdown_lock_skips++;
39236c6e
A
810 timer_call_entry_dequeue_async(call);
811#if TIMER_ASSERT
0a7de745
A
812 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
813 DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
814 VM_KERNEL_UNSLIDE_OR_PERM(call),
f427ee49
A
815 call->tc_async_dequeue,
816 VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
0a7de745 817 0x2b, 0);
39236c6e
A
818#endif
819 timer_queue_unlock(queue);
6d2010ae
A
820 continue;
821 }
9bccf70c 822
f427ee49 823 boolean_t call_local = ((call->tc_flags & TIMER_CALL_LOCAL) != 0);
fe8ab488 824
6d2010ae
A
825 /* remove entry from old queue */
826 timer_call_entry_dequeue(call);
39236c6e 827 timer_queue_unlock(queue);
9bccf70c 828
fe8ab488
A
829 if (call_local == FALSE) {
830 /* and queue it on new, discarding LOCAL timers */
f427ee49 831 new_queue = timer_queue_assign(call->tc_pqlink.deadline);
fe8ab488
A
832 timer_queue_lock_spin(new_queue);
833 timer_call_entry_enqueue_deadline(
f427ee49 834 call, new_queue, call->tc_pqlink.deadline);
fe8ab488
A
835 timer_queue_unlock(new_queue);
836 } else {
837 timer_queue_shutdown_discarded++;
838 }
839
5ba3f43e 840 assert(call_local == FALSE);
f427ee49 841 simple_unlock(&call->tc_lock);
9bccf70c
A
842 }
843
39236c6e 844 timer_queue_unlock(queue);
c910b4d9 845 splx(s);
9bccf70c
A
846}
847
5ba3f43e 848
0a7de745 849static uint32_t timer_queue_expire_lock_skips;
c910b4d9 850uint64_t
39236c6e 851timer_queue_expire_with_options(
0a7de745
A
852 mpqueue_head_t *queue,
853 uint64_t deadline,
854 boolean_t rescan)
1c79356b 855{
0a7de745 856 timer_call_t call = NULL;
39236c6e 857 uint32_t tc_iterations = 0;
6d2010ae
A
858 DBG("timer_queue_expire(%p,)\n", queue);
859
f427ee49 860 /* 'rescan' means look at every timer in the list, instead of
 861 * early-exiting when the head of the list expires in the future.
 862 * When 'rescan' is true, iterate by linked list instead of priority queue.
863 *
864 * TODO: if we keep a deadline ordered and soft-deadline ordered
865 * priority queue, then it's no longer necessary to do that
866 */
867
39236c6e
A
868 uint64_t cur_deadline = deadline;
869 timer_queue_lock_spin(queue);
1c79356b 870
6d2010ae 871 while (!queue_empty(&queue->head)) {
39236c6e
A
872 /* Upon processing one or more timer calls, refresh the
873 * deadline to account for time elapsed in the callout
874 */
0a7de745 875 if (++tc_iterations > 1) {
39236c6e 876 cur_deadline = mach_absolute_time();
0a7de745 877 }
39236c6e 878
0a7de745 879 if (call == NULL) {
f427ee49
A
880 if (rescan == FALSE) {
881 call = priority_queue_min(&queue->mpq_pqhead, struct timer_call, tc_pqlink);
882 } else {
883 call = qe_queue_first(&queue->head, struct timer_call, tc_qlink);
884 }
0a7de745 885 }
1c79356b 886
f427ee49 887 if (call->tc_soft_deadline <= cur_deadline) {
0a7de745
A
888 timer_call_func_t func;
889 timer_call_param_t param0, param1;
1c79356b 890
f427ee49 891 TCOAL_DEBUG(0xDDDD0000, queue->earliest_soft_deadline, call->tc_soft_deadline, 0, 0, 0);
0a7de745
A
892 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
893 DECR_TIMER_EXPIRE | DBG_FUNC_NONE,
894 VM_KERNEL_UNSLIDE_OR_PERM(call),
f427ee49
A
895 call->tc_soft_deadline,
896 call->tc_pqlink.deadline,
897 call->tc_entry_time, 0);
39236c6e 898
f427ee49
A
899 if ((call->tc_flags & TIMER_CALL_RATELIMITED) &&
900 (call->tc_pqlink.deadline > cur_deadline)) {
0a7de745 901 if (rescan == FALSE) {
39236c6e 902 break;
0a7de745 903 }
39236c6e
A
904 }
905
f427ee49 906 if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
6d2010ae
A
907 /* case (2b) lock inversion, dequeue and skip */
908 timer_queue_expire_lock_skips++;
39236c6e
A
909 timer_call_entry_dequeue_async(call);
910 call = NULL;
6d2010ae
A
911 continue;
912 }
913
914 timer_call_entry_dequeue(call);
1c79356b 915
f427ee49
A
916 func = call->tc_func;
917 param0 = call->tc_param0;
918 param1 = call->tc_param1;
1c79356b 919
f427ee49 920 simple_unlock(&call->tc_lock);
39236c6e 921 timer_queue_unlock(queue);
1c79356b 922
0a7de745
A
923 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
924 DECR_TIMER_CALLOUT | DBG_FUNC_START,
925 VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
926 VM_KERNEL_ADDRHIDE(param0),
927 VM_KERNEL_ADDRHIDE(param1),
928 0);
2d21ac55 929
4b17d6b6 930#if CONFIG_DTRACE
39236c6e 931 DTRACE_TMR7(callout__start, timer_call_func_t, func,
f427ee49
A
932 timer_call_param_t, param0, unsigned, call->tc_flags,
933 0, (call->tc_ttd >> 32),
934 (unsigned) (call->tc_ttd & 0xFFFFFFFF), call);
2d21ac55 935#endif
4b17d6b6
A
936 /* Maintain time-to-deadline in per-processor data
937 * structure for thread wakeup deadline statistics.
938 */
f427ee49
A
939 uint64_t *ttdp = &current_processor()->timer_call_ttd;
940 *ttdp = call->tc_ttd;
1c79356b 941 (*func)(param0, param1);
4b17d6b6 942 *ttdp = 0;
4b17d6b6 943#if CONFIG_DTRACE
39236c6e
A
944 DTRACE_TMR4(callout__end, timer_call_func_t, func,
945 param0, param1, call);
2d21ac55
A
946#endif
947
0a7de745
A
948 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
949 DECR_TIMER_CALLOUT | DBG_FUNC_END,
950 VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
951 VM_KERNEL_ADDRHIDE(param0),
952 VM_KERNEL_ADDRHIDE(param1),
953 0);
39236c6e
A
954 call = NULL;
955 timer_queue_lock_spin(queue);
956 } else {
957 if (__probable(rescan == FALSE)) {
958 break;
959 } else {
f427ee49
A
960 int64_t skew = call->tc_pqlink.deadline - call->tc_soft_deadline;
961 assert(call->tc_pqlink.deadline >= call->tc_soft_deadline);
39236c6e
A
962
963 /* DRK: On a latency quality-of-service level change,
964 * re-sort potentially rate-limited timers. The platform
965 * layer determines which timers require
966 * this. In the absence of the per-callout
967 * synchronization requirement, a global resort could
968 * be more efficient. The re-sort effectively
969 * annuls all timer adjustments, i.e. the "soft
970 * deadline" is the sort key.
971 */
0a7de745 972
39236c6e 973 if (timer_resort_threshold(skew)) {
f427ee49
A
974 if (__probable(simple_lock_try(&call->tc_lock, LCK_GRP_NULL))) {
975 /* TODO: don't need to dequeue before enqueue */
39236c6e 976 timer_call_entry_dequeue(call);
f427ee49
A
977 timer_call_entry_enqueue_deadline(call, queue, call->tc_soft_deadline);
978 simple_unlock(&call->tc_lock);
39236c6e
A
979 call = NULL;
980 }
981 }
982 if (call) {
f427ee49
A
983 call = qe_queue_next(&queue->head, call, struct timer_call, tc_qlink);
984
985 if (call == NULL) {
39236c6e 986 break;
0a7de745 987 }
39236c6e
A
988 }
989 }
c910b4d9 990 }
1c79356b
A
991 }
992
f427ee49
A
993 call = priority_queue_min(&queue->mpq_pqhead, struct timer_call, tc_pqlink);
994
995 if (call) {
996 cur_deadline = call->tc_pqlink.deadline;
997 queue->earliest_soft_deadline = (call->tc_flags & TIMER_CALL_RATELIMITED) ? call->tc_pqlink.deadline: call->tc_soft_deadline;
39236c6e
A
998 } else {
999 queue->earliest_soft_deadline = cur_deadline = UINT64_MAX;
1000 }
1c79356b 1001
39236c6e 1002 timer_queue_unlock(queue);
c910b4d9 1003
0a7de745 1004 return cur_deadline;
1c79356b 1005}
6d2010ae 1006
39236c6e
A
1007uint64_t
1008timer_queue_expire(
0a7de745
A
1009 mpqueue_head_t *queue,
1010 uint64_t deadline)
39236c6e
A
1011{
1012 return timer_queue_expire_with_options(queue, deadline, FALSE);
1013}
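/*
 * Editorial sketch (not in the original source) of how the platform timer
 * interrupt typically drives this module: expire what is due on the local
 * queue, then re-program the hardware with the returned next deadline.
 * "platform_set_hw_deadline" is a hypothetical stand-in for the
 * platform-specific deadline/decrementer programming.  Never compiled.
 */
#if 0
	mpqueue_head_t *queue = timer_queue_cpu(cpu_number());
	uint64_t next_deadline = timer_queue_expire(queue, mach_absolute_time());

	if (next_deadline != UINT64_MAX) {
		platform_set_hw_deadline(next_deadline);	/* hypothetical */
	}
#endif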
6d2010ae 1014
1015extern int serverperfmode;
0a7de745 1016static uint32_t timer_queue_migrate_lock_skips;
6d2010ae 1017/*
39236c6e 1018 * timer_queue_migrate() is called by timer_queue_migrate_cpu()
6d2010ae
A
1019 * to move timer requests from the local processor (queue_from)
 1020 * to a target processor's queue (queue_to).
1021 */
1022int
1023timer_queue_migrate(mpqueue_head_t *queue_from, mpqueue_head_t *queue_to)
1024{
0a7de745
A
1025 timer_call_t call;
1026 timer_call_t head_to;
1027 int timers_migrated = 0;
6d2010ae
A
1028
1029 DBG("timer_queue_migrate(%p,%p)\n", queue_from, queue_to);
1030
1031 assert(!ml_get_interrupts_enabled());
1032 assert(queue_from != queue_to);
1033
1034 if (serverperfmode) {
1035 /*
1036 * if we're running a high end server
1037 * avoid migrations... they add latency
1038 * and don't save us power under typical
1039 * server workloads
1040 */
1041 return -4;
1042 }
1043
1044 /*
1045 * Take both local (from) and target (to) timer queue locks while
1046 * moving the timers from the local queue to the target processor.
1047 * We assume that the target is always the boot processor.
1048 * But only move if all of the following is true:
1049 * - the target queue is non-empty
1050 * - the local queue is non-empty
1051 * - the local queue's first deadline is later than the target's
 1052 * - the local queue contains no non-migratable "local" call
1053 * so that we need not have the target resync.
1054 */
1055
0a7de745 1056 timer_queue_lock_spin(queue_to);
6d2010ae 1057
f427ee49
A
1058 head_to = priority_queue_min(&queue_to->mpq_pqhead, struct timer_call, tc_pqlink);
1059
1060 if (head_to == NULL) {
6d2010ae
A
1061 timers_migrated = -1;
1062 goto abort1;
1063 }
1064
0a7de745 1065 timer_queue_lock_spin(queue_from);
6d2010ae 1066
f427ee49
A
1067 call = priority_queue_min(&queue_from->mpq_pqhead, struct timer_call, tc_pqlink);
1068
1069 if (call == NULL) {
6d2010ae
A
1070 timers_migrated = -2;
1071 goto abort2;
1072 }
1073
f427ee49 1074 if (call->tc_pqlink.deadline < head_to->tc_pqlink.deadline) {
6d2010ae
A
1075 timers_migrated = 0;
1076 goto abort2;
1077 }
1078
1079 /* perform scan for non-migratable timers */
f427ee49
A
1080 qe_foreach_element(call, &queue_from->head, tc_qlink) {
1081 if (call->tc_flags & TIMER_CALL_LOCAL) {
6d2010ae
A
1082 timers_migrated = -3;
1083 goto abort2;
1084 }
f427ee49 1085 }
6d2010ae
A
1086
1087 /* migration loop itself -- both queues are locked */
f427ee49
A
1088 qe_foreach_element_safe(call, &queue_from->head, tc_qlink) {
1089 if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
6d2010ae 1090 /* case (2b) lock order inversion, dequeue only */
39236c6e 1091#ifdef TIMER_ASSERT
0a7de745
A
1092 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1093 DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
1094 VM_KERNEL_UNSLIDE_OR_PERM(call),
f427ee49
A
1095 VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
1096 0,
0a7de745 1097 0x2b, 0);
39236c6e 1098#endif
6d2010ae 1099 timer_queue_migrate_lock_skips++;
39236c6e 1100 timer_call_entry_dequeue_async(call);
6d2010ae
A
1101 continue;
1102 }
1103 timer_call_entry_dequeue(call);
1104 timer_call_entry_enqueue_deadline(
f427ee49 1105 call, queue_to, call->tc_pqlink.deadline);
6d2010ae 1106 timers_migrated++;
f427ee49 1107 simple_unlock(&call->tc_lock);
6d2010ae 1108 }
39236c6e 1109 queue_from->earliest_soft_deadline = UINT64_MAX;
6d2010ae 1110abort2:
0a7de745 1111 timer_queue_unlock(queue_from);
6d2010ae 1112abort1:
0a7de745 1113 timer_queue_unlock(queue_to);
6d2010ae
A
1114
1115 return timers_migrated;
1116}
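/*
 * Editorial note (not in the original source): callers treat the result as
 * "count migrated or a negative reason code", e.g.
 *
 *	n  > 0	timers moved to the target (boot) processor's queue
 *	n == 0	nothing eligible (local head deadline sooner than target's)
 *	n == -1	target queue empty; n == -2: local queue empty
 *	n == -3	a non-migratable TIMER_CALL_LOCAL timer is present
 *	n == -4	serverperfmode is set, so migration is skipped
 */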
39236c6e
A
1117
1118void
1119timer_queue_trace_cpu(int ncpu)
1120{
1121 timer_call_nosync_cpu(
1122 ncpu,
0a7de745 1123 (void (*)(void *))timer_queue_trace,
39236c6e
A
1124 (void*) timer_queue_cpu(ncpu));
1125}
1126
1127void
1128timer_queue_trace(
0a7de745 1129 mpqueue_head_t *queue)
39236c6e 1130{
0a7de745
A
1131 timer_call_t call;
1132 spl_t s;
39236c6e 1133
0a7de745 1134 if (!kdebug_enable) {
39236c6e 1135 return;
0a7de745 1136 }
39236c6e
A
1137
1138 s = splclock();
1139 timer_queue_lock_spin(queue);
1140
1141 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
0a7de745
A
1142 DECR_TIMER_QUEUE | DBG_FUNC_START,
1143 queue->count, mach_absolute_time(), 0, 0, 0);
39236c6e 1144
f427ee49
A
1145 qe_foreach_element(call, &queue->head, tc_qlink) {
1146 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1147 DECR_TIMER_QUEUE | DBG_FUNC_NONE,
1148 call->tc_soft_deadline,
1149 call->tc_pqlink.deadline,
1150 call->tc_entry_time,
1151 VM_KERNEL_UNSLIDE(call->tc_func),
1152 0);
39236c6e
A
1153 }
1154
1155 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
0a7de745
A
1156 DECR_TIMER_QUEUE | DBG_FUNC_END,
1157 queue->count, mach_absolute_time(), 0, 0, 0);
39236c6e
A
1158
1159 timer_queue_unlock(queue);
1160 splx(s);
1161}
1162
1163void
1164timer_longterm_dequeued_locked(timer_call_t call)
1165{
0a7de745 1166 timer_longterm_t *tlp = &timer_longterm;
39236c6e
A
1167
1168 tlp->dequeues++;
0a7de745 1169 if (call == tlp->threshold.call) {
39236c6e 1170 tlp->threshold.call = NULL;
0a7de745 1171 }
39236c6e
A
1172}
1173
1174/*
1175 * Place a timer call in the longterm list
1176 * and adjust the next timer callout deadline if the new timer is first.
1177 */
1178mpqueue_head_t *
0a7de745
A
1179timer_longterm_enqueue_unlocked(timer_call_t call,
1180 uint64_t now,
1181 uint64_t deadline,
1182 mpqueue_head_t **old_queue,
1183 uint64_t soft_deadline,
1184 uint64_t ttd,
1185 timer_call_param_t param1,
1186 uint32_t callout_flags)
39236c6e 1187{
0a7de745
A
1188 timer_longterm_t *tlp = &timer_longterm;
1189 boolean_t update_required = FALSE;
1190 uint64_t longterm_threshold;
39236c6e
A
1191
1192 longterm_threshold = now + tlp->threshold.interval;
1193
1194 /*
1195 * Return NULL without doing anything if:
1196 * - this timer is local, or
1197 * - the longterm mechanism is disabled, or
1198 * - this deadline is too short.
1199 */
fe8ab488 1200 if ((callout_flags & TIMER_CALL_LOCAL) != 0 ||
39236c6e 1201 (tlp->threshold.interval == TIMER_LONGTERM_NONE) ||
0a7de745 1202 (deadline <= longterm_threshold)) {
39236c6e 1203 return NULL;
0a7de745 1204 }
39236c6e
A
1205
1206 /*
0a7de745 1207 * Remove timer from its current queue, if any.
39236c6e
A
1208 */
1209 *old_queue = timer_call_dequeue_unlocked(call);
1210
1211 /*
1212 * Lock the longterm queue, queue timer and determine
1213 * whether an update is necessary.
1214 */
1215 assert(!ml_get_interrupts_enabled());
f427ee49 1216 simple_lock(&call->tc_lock, LCK_GRP_NULL);
39236c6e 1217 timer_queue_lock_spin(timer_longterm_queue);
f427ee49
A
1218 call->tc_pqlink.deadline = deadline;
1219 call->tc_param1 = param1;
1220 call->tc_ttd = ttd;
1221 call->tc_soft_deadline = soft_deadline;
1222 call->tc_flags = callout_flags;
39236c6e 1223 timer_call_entry_enqueue_tail(call, timer_longterm_queue);
0a7de745 1224
39236c6e
A
1225 tlp->enqueues++;
1226
1227 /*
1228 * We'll need to update the currently set threshold timer
1229 * if the new deadline is sooner and no sooner update is in flight.
0a7de745 1230 */
39236c6e
A
1231 if (deadline < tlp->threshold.deadline &&
1232 deadline < tlp->threshold.preempted) {
1233 tlp->threshold.preempted = deadline;
1234 tlp->threshold.call = call;
1235 update_required = TRUE;
1236 }
1237 timer_queue_unlock(timer_longterm_queue);
f427ee49 1238 simple_unlock(&call->tc_lock);
0a7de745 1239
39236c6e 1240 if (update_required) {
fe8ab488
A
1241 /*
1242 * Note: this call expects that calling the master cpu
1243 * alone does not involve locking the topo lock.
1244 */
39236c6e
A
1245 timer_call_nosync_cpu(
1246 master_cpu,
0a7de745 1247 (void (*)(void *))timer_longterm_update,
39236c6e
A
1248 (void *)tlp);
1249 }
1250
1251 return timer_longterm_queue;
1252}
1253
1254/*
1255 * Scan for timers below the longterm threshold.
1256 * Move these to the local timer queue (of the boot processor on which the
1257 * calling thread is running).
1258 * Both the local (boot) queue and the longterm queue are locked.
1259 * The scan is similar to the timer migrate sequence but is performed by
1260 * successively examining each timer on the longterm queue:
1261 * - if within the short-term threshold
0a7de745 1262 * - enter on the local queue (unless being deleted),
39236c6e
A
1263 * - otherwise:
1264 * - if sooner, deadline becomes the next threshold deadline.
5ba3f43e
A
1265 * The total scan time is limited to TIMER_LONGTERM_SCAN_LIMIT. Should this be
 1266 * exceeded, we abort and reschedule again so that we don't shut others out of
1267 * the timer queues. Longterm timers firing late is not critical.
39236c6e
A
1268 */
1269void
0a7de745
A
1270timer_longterm_scan(timer_longterm_t *tlp,
1271 uint64_t time_start)
39236c6e 1272{
0a7de745
A
1273 timer_call_t call;
1274 uint64_t threshold;
1275 uint64_t deadline;
1276 uint64_t time_limit = time_start + tlp->scan_limit;
1277 mpqueue_head_t *timer_master_queue;
39236c6e
A
1278
1279 assert(!ml_get_interrupts_enabled());
1280 assert(cpu_number() == master_cpu);
1281
0a7de745 1282 if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
5ba3f43e 1283 threshold = time_start + tlp->threshold.interval;
0a7de745 1284 }
39236c6e
A
1285
1286 tlp->threshold.deadline = TIMER_LONGTERM_NONE;
1287 tlp->threshold.call = NULL;
1288
0a7de745 1289 if (queue_empty(&timer_longterm_queue->head)) {
39236c6e 1290 return;
0a7de745 1291 }
39236c6e
A
1292
1293 timer_master_queue = timer_queue_cpu(master_cpu);
1294 timer_queue_lock_spin(timer_master_queue);
1295
f427ee49
A
1296 qe_foreach_element_safe(call, &timer_longterm_queue->head, tc_qlink) {
1297 deadline = call->tc_soft_deadline;
1298 if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
39236c6e
A
1299 /* case (2c) lock order inversion, dequeue only */
1300#ifdef TIMER_ASSERT
1301 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
0a7de745
A
1302 DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
1303 VM_KERNEL_UNSLIDE_OR_PERM(call),
f427ee49
A
1304 VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
1305 0,
0a7de745 1306 0x2c, 0);
39236c6e
A
1307#endif
1308 timer_call_entry_dequeue_async(call);
1309 continue;
1310 }
1311 if (deadline < threshold) {
1312 /*
1313 * This timer needs moving (escalating)
1314 * to the local (boot) processor's queue.
1315 */
1316#ifdef TIMER_ASSERT
0a7de745 1317 if (deadline < time_start) {
39236c6e 1318 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
0a7de745
A
1319 DECR_TIMER_OVERDUE | DBG_FUNC_NONE,
1320 VM_KERNEL_UNSLIDE_OR_PERM(call),
1321 deadline,
1322 time_start,
1323 threshold,
1324 0);
1325 }
39236c6e
A
1326#endif
1327 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
0a7de745
A
1328 DECR_TIMER_ESCALATE | DBG_FUNC_NONE,
1329 VM_KERNEL_UNSLIDE_OR_PERM(call),
f427ee49
A
1330 call->tc_pqlink.deadline,
1331 call->tc_entry_time,
1332 VM_KERNEL_UNSLIDE(call->tc_func),
0a7de745 1333 0);
39236c6e
A
1334 tlp->escalates++;
1335 timer_call_entry_dequeue(call);
1336 timer_call_entry_enqueue_deadline(
f427ee49 1337 call, timer_master_queue, call->tc_pqlink.deadline);
39236c6e
A
1338 /*
1339 * A side-effect of the following call is to update
1340 * the actual hardware deadline if required.
1341 */
1342 (void) timer_queue_assign(deadline);
1343 } else {
1344 if (deadline < tlp->threshold.deadline) {
1345 tlp->threshold.deadline = deadline;
1346 tlp->threshold.call = call;
1347 }
1348 }
f427ee49 1349 simple_unlock(&call->tc_lock);
5ba3f43e
A
1350
1351 /* Abort scan if we're taking too long. */
1352 if (mach_absolute_time() > time_limit) {
1353 tlp->threshold.deadline = TIMER_LONGTERM_SCAN_AGAIN;
1354 tlp->scan_pauses++;
1355 DBG("timer_longterm_scan() paused %llu, qlen: %llu\n",
0a7de745 1356 time_limit, tlp->queue.count);
5ba3f43e
A
1357 break;
1358 }
39236c6e
A
1359 }
1360
1361 timer_queue_unlock(timer_master_queue);
1362}
1363
1364void
1365timer_longterm_callout(timer_call_param_t p0, __unused timer_call_param_t p1)
1366{
0a7de745 1367 timer_longterm_t *tlp = (timer_longterm_t *) p0;
39236c6e
A
1368
1369 timer_longterm_update(tlp);
1370}
1371
1372void
1373timer_longterm_update_locked(timer_longterm_t *tlp)
1374{
0a7de745 1375 uint64_t latency;
39236c6e 1376
0a7de745
A
1377 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1378 DECR_TIMER_UPDATE | DBG_FUNC_START,
1379 VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
1380 tlp->threshold.deadline,
1381 tlp->threshold.preempted,
1382 tlp->queue.count, 0);
39236c6e
A
1383
1384 tlp->scan_time = mach_absolute_time();
1385 if (tlp->threshold.preempted != TIMER_LONGTERM_NONE) {
1386 tlp->threshold.preempts++;
1387 tlp->threshold.deadline = tlp->threshold.preempted;
1388 tlp->threshold.preempted = TIMER_LONGTERM_NONE;
1389 /*
1390 * Note: in the unlikely event that a pre-empted timer has
1391 * itself been cancelled, we'll simply re-scan later at the
1392 * time of the preempted/cancelled timer.
1393 */
1394 } else {
1395 tlp->threshold.scans++;
1396
1397 /*
1398 * Maintain a moving average of our wakeup latency.
1399 * Clamp latency to 0 and ignore above threshold interval.
1400 */
0a7de745 1401 if (tlp->scan_time > tlp->threshold.deadline_set) {
39236c6e 1402 latency = tlp->scan_time - tlp->threshold.deadline_set;
0a7de745 1403 } else {
39236c6e 1404 latency = 0;
0a7de745 1405 }
39236c6e
A
1406 if (latency < tlp->threshold.interval) {
1407 tlp->threshold.latency_min =
0a7de745 1408 MIN(tlp->threshold.latency_min, latency);
39236c6e 1409 tlp->threshold.latency_max =
0a7de745 1410 MAX(tlp->threshold.latency_max, latency);
39236c6e 1411 tlp->threshold.latency =
0a7de745 1412 (tlp->threshold.latency * 99 + latency) / 100;
39236c6e
A
1413 }
1414
0a7de745 1415 timer_longterm_scan(tlp, tlp->scan_time);
39236c6e
A
1416 }
1417
1418 tlp->threshold.deadline_set = tlp->threshold.deadline;
1419 /* The next deadline timer to be set is adjusted */
5ba3f43e
A
1420 if (tlp->threshold.deadline != TIMER_LONGTERM_NONE &&
1421 tlp->threshold.deadline != TIMER_LONGTERM_SCAN_AGAIN) {
39236c6e
A
1422 tlp->threshold.deadline_set -= tlp->threshold.margin;
1423 tlp->threshold.deadline_set -= tlp->threshold.latency;
1424 }
0a7de745 1425
5ba3f43e 1426 /* Throttle next scan time */
a39ff7e2 1427 uint64_t scan_clamp = mach_absolute_time() + tlp->scan_interval;
0a7de745 1428 if (tlp->threshold.deadline_set < scan_clamp) {
5ba3f43e 1429 tlp->threshold.deadline_set = scan_clamp;
0a7de745 1430 }
39236c6e 1431
0a7de745
A
1432 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1433 DECR_TIMER_UPDATE | DBG_FUNC_END,
1434 VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
1435 tlp->threshold.deadline,
1436 tlp->threshold.scans,
1437 tlp->queue.count, 0);
39236c6e
A
1438}
1439
1440void
1441timer_longterm_update(timer_longterm_t *tlp)
1442{
0a7de745 1443 spl_t s = splclock();
39236c6e
A
1444
1445 timer_queue_lock_spin(timer_longterm_queue);
1446
0a7de745 1447 if (cpu_number() != master_cpu) {
39236c6e 1448 panic("timer_longterm_update_master() on non-boot cpu");
0a7de745 1449 }
39236c6e
A
1450
1451 timer_longterm_update_locked(tlp);
1452
0a7de745 1453 if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
39236c6e
A
1454 timer_call_enter(
1455 &tlp->threshold.timer,
1456 tlp->threshold.deadline_set,
1457 TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
0a7de745
A
1458 }
1459
39236c6e
A
1460 timer_queue_unlock(timer_longterm_queue);
1461 splx(s);
1462}
1463
1464void
1465timer_longterm_init(void)
1466{
0a7de745
A
1467 uint32_t longterm;
1468 timer_longterm_t *tlp = &timer_longterm;
39236c6e
A
1469
1470 DBG("timer_longterm_init() tlp: %p, queue: %p\n", tlp, &tlp->queue);
1471
1472 /*
15129b1c 1473 * Set the longterm timer threshold. Defaults to TIMER_LONGTERM_THRESHOLD
 1474 * or TIMER_LONGTERM_NONE (disabled) for server;
0a7de745 1475 * overridden by the "longterm" boot-arg.
39236c6e 1476 */
15129b1c 1477 tlp->threshold.interval = serverperfmode ? TIMER_LONGTERM_NONE
0a7de745
A
1478 : TIMER_LONGTERM_THRESHOLD;
1479 if (PE_parse_boot_argn("longterm", &longterm, sizeof(longterm))) {
39236c6e 1480 tlp->threshold.interval = (longterm == 0) ?
0a7de745
A
1481 TIMER_LONGTERM_NONE :
1482 longterm * NSEC_PER_MSEC;
39236c6e
A
1483 }
1484 if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
1485 printf("Longterm timer threshold: %llu ms\n",
0a7de745 1486 tlp->threshold.interval / NSEC_PER_MSEC);
39236c6e 1487 kprintf("Longterm timer threshold: %llu ms\n",
0a7de745 1488 tlp->threshold.interval / NSEC_PER_MSEC);
39236c6e 1489 nanoseconds_to_absolutetime(tlp->threshold.interval,
0a7de745 1490 &tlp->threshold.interval);
39236c6e
A
1491 tlp->threshold.margin = tlp->threshold.interval / 10;
1492 tlp->threshold.latency_min = EndOfAllTime;
1493 tlp->threshold.latency_max = 0;
1494 }
1495
1496 tlp->threshold.preempted = TIMER_LONGTERM_NONE;
1497 tlp->threshold.deadline = TIMER_LONGTERM_NONE;
1498
f427ee49 1499 mpqueue_init(&tlp->queue, &timer_longterm_lck_grp, LCK_ATTR_NULL);
39236c6e
A
1500
1501 timer_call_setup(&tlp->threshold.timer,
0a7de745 1502 timer_longterm_callout, (timer_call_param_t) tlp);
39236c6e 1503
1504 timer_longterm_queue = &tlp->queue;
1505}
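/*
 * Editorial configuration example (not in the original source), matching
 * the PE_parse_boot_argn("longterm", ...) handling above:
 *
 *	nvram boot-args="longterm=5000"		# 5000 ms escalation threshold
 *	nvram boot-args="longterm=0"		# disable the longterm queue
 */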
1506
1507enum {
1508 THRESHOLD, QCOUNT,
1509 ENQUEUES, DEQUEUES, ESCALATES, SCANS, PREEMPTS,
a39ff7e2 1510 LATENCY, LATENCY_MIN, LATENCY_MAX, SCAN_LIMIT, SCAN_INTERVAL, PAUSES
39236c6e
A
1511};
1512uint64_t
1513timer_sysctl_get(int oid)
1514{
0a7de745 1515 timer_longterm_t *tlp = &timer_longterm;
39236c6e
A
1516
1517 switch (oid) {
1518 case THRESHOLD:
1519 return (tlp->threshold.interval == TIMER_LONGTERM_NONE) ?
0a7de745 1520 0 : tlp->threshold.interval / NSEC_PER_MSEC;
39236c6e
A
1521 case QCOUNT:
1522 return tlp->queue.count;
1523 case ENQUEUES:
1524 return tlp->enqueues;
1525 case DEQUEUES:
1526 return tlp->dequeues;
1527 case ESCALATES:
1528 return tlp->escalates;
1529 case SCANS:
1530 return tlp->threshold.scans;
1531 case PREEMPTS:
1532 return tlp->threshold.preempts;
1533 case LATENCY:
1534 return tlp->threshold.latency;
1535 case LATENCY_MIN:
1536 return tlp->threshold.latency_min;
1537 case LATENCY_MAX:
1538 return tlp->threshold.latency_max;
5ba3f43e
A
1539 case SCAN_LIMIT:
1540 return tlp->scan_limit;
a39ff7e2
A
1541 case SCAN_INTERVAL:
1542 return tlp->scan_interval;
5ba3f43e
A
1543 case PAUSES:
1544 return tlp->scan_pauses;
39236c6e 1545 default:
1546 return 0;
1547 }
1548}
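/*
 * Editorial sketch (not in the original source): these accessors are
 * presumably wrapped by sysctl glue elsewhere in the kernel; from kernel
 * code the statistics can simply be read directly.  Never compiled.
 */
#if 0
	uint64_t longterm_pending = timer_sysctl_get(QCOUNT);	/* queue depth */
	uint64_t threshold_ms = timer_sysctl_get(THRESHOLD);	/* 0 => disabled */
#endif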
1549
1550/*
1551 * timer_master_scan() is the inverse of timer_longterm_scan()
1552 * since it un-escalates timers to the longterm queue.
1553 */
1554static void
0a7de745
A
1555timer_master_scan(timer_longterm_t *tlp,
1556 uint64_t now)
39236c6e 1557{
0a7de745
A
1558 timer_call_t call;
1559 uint64_t threshold;
1560 uint64_t deadline;
1561 mpqueue_head_t *timer_master_queue;
39236c6e 1562
0a7de745 1563 if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
39236c6e 1564 threshold = now + tlp->threshold.interval;
0a7de745 1565 } else {
39236c6e 1566 threshold = TIMER_LONGTERM_NONE;
0a7de745 1567 }
39236c6e
A
1568
1569 timer_master_queue = timer_queue_cpu(master_cpu);
1570 timer_queue_lock_spin(timer_master_queue);
1571
f427ee49
A
1572 qe_foreach_element_safe(call, &timer_master_queue->head, tc_qlink) {
1573 deadline = call->tc_pqlink.deadline;
1574 if ((call->tc_flags & TIMER_CALL_LOCAL) != 0) {
39236c6e 1575 continue;
0a7de745 1576 }
f427ee49 1577 if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
39236c6e
A
1578 /* case (2c) lock order inversion, dequeue only */
1579 timer_call_entry_dequeue_async(call);
1580 continue;
1581 }
1582 if (deadline > threshold) {
1583 /* move from master to longterm */
1584 timer_call_entry_dequeue(call);
1585 timer_call_entry_enqueue_tail(call, timer_longterm_queue);
1586 if (deadline < tlp->threshold.deadline) {
1587 tlp->threshold.deadline = deadline;
1588 tlp->threshold.call = call;
1589 }
1590 }
f427ee49 1591 simple_unlock(&call->tc_lock);
39236c6e
A
1592 }
1593 timer_queue_unlock(timer_master_queue);
1594}
1595
1596static void
1597timer_sysctl_set_threshold(uint64_t value)
1598{
0a7de745
A
1599 timer_longterm_t *tlp = &timer_longterm;
1600 spl_t s = splclock();
1601 boolean_t threshold_increase;
39236c6e
A
1602
1603 timer_queue_lock_spin(timer_longterm_queue);
1604
1605 timer_call_cancel(&tlp->threshold.timer);
1606
1607 /*
 1608 * Set the new threshold and note whether it's increasing.
1609 */
1610 if (value == 0) {
1611 tlp->threshold.interval = TIMER_LONGTERM_NONE;
1612 threshold_increase = TRUE;
1613 timer_call_cancel(&tlp->threshold.timer);
1614 } else {
0a7de745 1615 uint64_t old_interval = tlp->threshold.interval;
39236c6e
A
1616 tlp->threshold.interval = value * NSEC_PER_MSEC;
1617 nanoseconds_to_absolutetime(tlp->threshold.interval,
0a7de745 1618 &tlp->threshold.interval);
39236c6e 1619 tlp->threshold.margin = tlp->threshold.interval / 10;
0a7de745 1620 if (old_interval == TIMER_LONGTERM_NONE) {
39236c6e 1621 threshold_increase = FALSE;
0a7de745 1622 } else {
39236c6e 1623 threshold_increase = (tlp->threshold.interval > old_interval);
0a7de745 1624 }
39236c6e
A
1625 }
1626
1627 if (threshold_increase /* or removal */) {
1628 /* Escalate timers from the longterm queue */
1629 timer_longterm_scan(tlp, mach_absolute_time());
0a7de745 1630 } else { /* decrease or addition */
39236c6e
A
1631 /*
1632	 * We scan the local/master queue for timers that are now longterm.
1633 * To be strictly correct, we should scan all processor queues
1634 * but timer migration results in most timers gravitating to the
1635 * master processor in any case.
1636 */
1637 timer_master_scan(tlp, mach_absolute_time());
1638 }
1639
1640 /* Set new timer accordingly */
1641 tlp->threshold.deadline_set = tlp->threshold.deadline;
1642 if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
1643 tlp->threshold.deadline_set -= tlp->threshold.margin;
1644 tlp->threshold.deadline_set -= tlp->threshold.latency;
1645 timer_call_enter(
1646 &tlp->threshold.timer,
1647 tlp->threshold.deadline_set,
1648 TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
1649 }
1650
1651 /* Reset stats */
1652 tlp->enqueues = 0;
1653 tlp->dequeues = 0;
1654 tlp->escalates = 0;
5ba3f43e 1655 tlp->scan_pauses = 0;
39236c6e
A
1656 tlp->threshold.scans = 0;
1657 tlp->threshold.preempts = 0;
1658 tlp->threshold.latency = 0;
1659 tlp->threshold.latency_min = EndOfAllTime;
1660 tlp->threshold.latency_max = 0;
1661
1662 timer_queue_unlock(timer_longterm_queue);
1663 splx(s);
1664}
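/*
 * Worked example of the threshold arithmetic above, using hypothetical
 * values and showing abstime as nanoseconds for clarity:
 *
 *     value        = 2000;                          // 2000 ms requested
 *     interval     = 2000 * NSEC_PER_MSEC;          // converted to abstime
 *     margin       = interval / 10;                 // 200 ms of slack
 *     deadline_set = deadline - margin - latency;   // re-arm slightly early
 *                                                   // to absorb scan latency
 */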
1665
1666int
1667timer_sysctl_set(int oid, uint64_t value)
1668{
1669 switch (oid) {
1670 case THRESHOLD:
1671 timer_call_cpu(
1672 master_cpu,
0a7de745 1673 (void (*)(void *))timer_sysctl_set_threshold,
39236c6e
A
1674 (void *) value);
1675 return KERN_SUCCESS;
5ba3f43e
A
1676 case SCAN_LIMIT:
1677 timer_longterm.scan_limit = value;
1678 return KERN_SUCCESS;
a39ff7e2
A
1679 case SCAN_INTERVAL:
1680 timer_longterm.scan_interval = value;
1681 return KERN_SUCCESS;
39236c6e
A
1682 default:
1683 return KERN_INVALID_ARGUMENT;
1684 }
1685}
fe8ab488
A
1686
1687
1688/* Select timer coalescing window based on per-task quality-of-service hints */
0a7de745
A
1689static boolean_t
1690tcoal_qos_adjust(thread_t t, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited)
1691{
fe8ab488
A
1692 uint32_t latency_qos;
1693 boolean_t adjusted = FALSE;
1694 task_t ctask = t->task;
1695
1696 if (ctask) {
1697 latency_qos = proc_get_effective_thread_policy(t, TASK_POLICY_LATENCY_QOS);
1698
1699 assert(latency_qos <= NUM_LATENCY_QOS_TIERS);
1700
1701 if (latency_qos) {
1702 *tshift = tcoal_prio_params.latency_qos_scale[latency_qos - 1];
1703 *tmax_abstime = tcoal_prio_params.latency_qos_abstime_max[latency_qos - 1];
1704 *pratelimited = tcoal_prio_params.latency_tier_rate_limited[latency_qos - 1];
1705 adjusted = TRUE;
1706 }
1707 }
1708 return adjusted;
1709}
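/*
 * Illustrative sketch: a thread whose effective latency QoS is tier N
 * (1 .. NUM_LATENCY_QOS_TIERS) picks up the per-tier parameters
 *
 *     *tshift       = tcoal_prio_params.latency_qos_scale[N - 1];
 *     *tmax_abstime = tcoal_prio_params.latency_qos_abstime_max[N - 1];
 *     *pratelimited = tcoal_prio_params.latency_tier_rate_limited[N - 1];
 *
 * whereas tier 0 (no latency QoS) leaves the caller's values untouched
 * and returns FALSE.
 */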
1710
1711
1712/* Adjust timer deadlines based on priority of the thread and the
1713 * urgency value provided at timeout establishment. With this mechanism,
1714 * timers are no longer necessarily sorted in order of soft deadline
1715 * on a given timer queue, i.e. they may be differentially skewed.
1716 * In the current scheme, this could lead to fewer pending timers
1717 * processed than is technically possible when the HW deadline arrives.
1718 */
1719static void
0a7de745
A
1720timer_compute_leeway(thread_t cthread, int32_t urgency, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited)
1721{
fe8ab488
A
1722 int16_t tpri = cthread->sched_pri;
1723 if ((urgency & TIMER_CALL_USER_MASK) != 0) {
1724 if (tpri >= BASEPRI_RTQUEUES ||
0a7de745 1725 urgency == TIMER_CALL_USER_CRITICAL) {
fe8ab488
A
1726 *tshift = tcoal_prio_params.timer_coalesce_rt_shift;
1727 *tmax_abstime = tcoal_prio_params.timer_coalesce_rt_abstime_max;
1728 TCOAL_PRIO_STAT(rt_tcl);
1729 } else if (proc_get_effective_thread_policy(cthread, TASK_POLICY_DARWIN_BG) ||
0a7de745 1730 (urgency == TIMER_CALL_USER_BACKGROUND)) {
fe8ab488
A
1731 /* Determine if timer should be subjected to a lower QoS */
1732 if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
1733 if (*tmax_abstime > tcoal_prio_params.timer_coalesce_bg_abstime_max) {
1734 return;
1735 } else {
1736 *pratelimited = FALSE;
1737 }
1738 }
1739 *tshift = tcoal_prio_params.timer_coalesce_bg_shift;
1740 *tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
1741 TCOAL_PRIO_STAT(bg_tcl);
1742 } else if (tpri >= MINPRI_KERNEL) {
1743 *tshift = tcoal_prio_params.timer_coalesce_kt_shift;
1744 *tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
1745 TCOAL_PRIO_STAT(kt_tcl);
1746 } else if (cthread->sched_mode == TH_MODE_FIXED) {
1747 *tshift = tcoal_prio_params.timer_coalesce_fp_shift;
1748 *tmax_abstime = tcoal_prio_params.timer_coalesce_fp_abstime_max;
1749 TCOAL_PRIO_STAT(fp_tcl);
1750 } else if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
1751 TCOAL_PRIO_STAT(qos_tcl);
1752 } else if (cthread->sched_mode == TH_MODE_TIMESHARE) {
1753 *tshift = tcoal_prio_params.timer_coalesce_ts_shift;
1754 *tmax_abstime = tcoal_prio_params.timer_coalesce_ts_abstime_max;
1755 TCOAL_PRIO_STAT(ts_tcl);
1756 } else {
1757 TCOAL_PRIO_STAT(nc_tcl);
1758 }
1759 } else if (urgency == TIMER_CALL_SYS_BACKGROUND) {
1760 *tshift = tcoal_prio_params.timer_coalesce_bg_shift;
1761 *tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
1762 TCOAL_PRIO_STAT(bg_tcl);
1763 } else {
1764 *tshift = tcoal_prio_params.timer_coalesce_kt_shift;
1765 *tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
1766 TCOAL_PRIO_STAT(kt_tcl);
1767 }
1768}
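/*
 * Summary sketch of the selection order above for a user-urgency timer:
 * realtime or critical -> rt parameters; background (or darwin-bg) -> bg
 * parameters, unless the thread's QoS tier already allows a wider window;
 * kernel priority -> kt; fixed-priority -> fp; QoS-adjusted -> per-tier;
 * timeshare -> ts; otherwise no adjustment. For example, a hypothetical
 * default-priority timeshare thread with no QoS hint ends up with
 *
 *     *tshift       = tcoal_prio_params.timer_coalesce_ts_shift;
 *     *tmax_abstime = tcoal_prio_params.timer_coalesce_ts_abstime_max;
 */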
1769
1770
1771int timer_user_idle_level;
1772
1773uint64_t
1774timer_call_slop(uint64_t deadline, uint64_t now, uint32_t flags, thread_t cthread, boolean_t *pratelimited)
1775{
1776 int32_t tcs_shift = 0;
1777 uint64_t tcs_max_abstime = 0;
1778 uint64_t adjval;
1779 uint32_t urgency = (flags & TIMER_CALL_URGENCY_MASK);
1780
0a7de745 1781 if (mach_timer_coalescing_enabled &&
fe8ab488
A
1782 (deadline > now) && (urgency != TIMER_CALL_SYS_CRITICAL)) {
1783 timer_compute_leeway(cthread, urgency, &tcs_shift, &tcs_max_abstime, pratelimited);
0a7de745
A
1784
1785 if (tcs_shift >= 0) {
fe8ab488 1786 adjval = MIN((deadline - now) >> tcs_shift, tcs_max_abstime);
0a7de745 1787 } else {
fe8ab488 1788 adjval = MIN((deadline - now) << (-tcs_shift), tcs_max_abstime);
0a7de745 1789 }
fe8ab488
A
1790 /* Apply adjustments derived from "user idle level" heuristic */
1791 adjval += (adjval * timer_user_idle_level) >> 7;
1792 return adjval;
0a7de745 1793 } else {
fe8ab488
A
1794 return 0;
1795 }
1796}
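/*
 * Worked example of the slop arithmetic above, with hypothetical inputs
 * and abstime treated as nanoseconds: given tcs_shift = 10 and a deadline
 * one second away,
 *
 *     adjval = MIN((1 * NSEC_PER_SEC) >> 10, tcs_max_abstime);  // ~0.98 ms
 *
 * and a timer_user_idle_level of 64 then grows the window by half:
 *
 *     adjval += (adjval * 64) >> 7;                             // ~1.46 ms
 *
 * A negative shift widens the base window by shifting left instead.
 */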
1797
1798int
0a7de745
A
1799timer_get_user_idle_level(void)
1800{
fe8ab488
A
1801 return timer_user_idle_level;
1802}
1803
0a7de745
A
1804kern_return_t
1805timer_set_user_idle_level(int ilevel)
1806{
fe8ab488
A
1807 boolean_t do_reeval = FALSE;
1808
0a7de745 1809 if ((ilevel < 0) || (ilevel > 128)) {
fe8ab488 1810 return KERN_INVALID_ARGUMENT;
0a7de745 1811 }
fe8ab488
A
1812
1813 if (ilevel < timer_user_idle_level) {
1814 do_reeval = TRUE;
1815 }
1816
1817 timer_user_idle_level = ilevel;
1818
0a7de745 1819 if (do_reeval) {
fe8ab488 1820 ml_timer_evaluate();
0a7de745 1821 }
fe8ab488
A
1822
1823 return KERN_SUCCESS;
1824}
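/*
 * Minimal usage sketch (hypothetical caller): the idle level is a 0..128
 * scale feeding the ">> 7" term in timer_call_slop(), so 128 doubles the
 * coalescing window and 0 leaves it at its base value. Lowering the level
 * re-evaluates armed timers so they are not left over-coalesced.
 *
 *     if (timer_set_user_idle_level(0) == KERN_SUCCESS) {
 *             // coalescing windows return to their base values
 *     }
 */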
f427ee49
A
1825
1826#pragma mark - running timers
1827
1828#define RUNNING_TIMER_FAKE_FLAGS (TIMER_CALL_SYS_CRITICAL | \
1829 TIMER_CALL_LOCAL)
1830
1831/*
1832 * timer_call_trace_* functions mimic the tracing behavior from the normal
1833 * timer_call subsystem, so tools continue to function.
1834 */
1835
1836static void
1837timer_call_trace_enter_before(struct timer_call *call, uint64_t deadline,
1838 uint32_t flags, uint64_t now)
1839{
1840#pragma unused(call, deadline, flags, now)
1841 TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_START,
1842 VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_ADDRHIDE(call->tc_param1),
1843 deadline, flags, 0);
1844#if CONFIG_DTRACE
1845 uint64_t ttd = deadline - now;
1846 DTRACE_TMR7(callout__create, timer_call_func_t, call->tc_func,
1847 timer_call_param_t, call->tc_param0, uint32_t, flags, 0,
1848 (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF), NULL);
1849#endif /* CONFIG_DTRACE */
1850 TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_END,
1851 VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline, 0, 0);
1852}
1853
1854static void
1855timer_call_trace_enter_after(struct timer_call *call, uint64_t deadline)
1856{
1857#pragma unused(call, deadline)
1858 TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_END,
1859 VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline, 0, 0);
1860}
1861
1862static void
1863timer_call_trace_cancel(struct timer_call *call)
1864{
1865#pragma unused(call)
1866 __unused uint64_t deadline = call->tc_pqlink.deadline;
1867 TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CANCEL | DBG_FUNC_START,
1868 VM_KERNEL_UNSLIDE_OR_PERM(call), deadline, 0,
1869 call->tc_flags, 0);
1870 TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CANCEL | DBG_FUNC_END,
1871 VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline - mach_absolute_time(),
1872 deadline - call->tc_entry_time, 0);
1873#if CONFIG_DTRACE
1874#if TIMER_TRACE
1875 uint64_t ttd = deadline - call->tc_entry_time;
1876#else
1877 uint64_t ttd = UINT64_MAX;
1878#endif /* TIMER_TRACE */
1879 DTRACE_TMR6(callout__cancel, timer_call_func_t, call->tc_func,
1880 timer_call_param_t, call->tc_param0, uint32_t, call->tc_flags, 0,
1881 (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF));
1882#endif /* CONFIG_DTRACE */
1883}
1884
1885static void
1886timer_call_trace_expire_entry(struct timer_call *call)
1887{
1888#pragma unused(call)
1889 TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CALLOUT | DBG_FUNC_START,
1890 VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(call->tc_func),
1891 VM_KERNEL_ADDRHIDE(call->tc_param0),
1892 VM_KERNEL_ADDRHIDE(call->tc_param1),
1893 0);
1894#if CONFIG_DTRACE
1895#if TIMER_TRACE
1896 uint64_t ttd = call->tc_pqlink.deadline - call->tc_entry_time;
1897#else /* TIMER_TRACE */
1898 uint64_t ttd = UINT64_MAX;
1899#endif /* TIMER_TRACE */
1900 DTRACE_TMR7(callout__start, timer_call_func_t, call->tc_func,
1901 timer_call_param_t, call->tc_param0, unsigned, call->tc_flags,
1902 0, (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF), NULL);
1903#endif /* CONFIG_DTRACE */
1904}
1905
1906static void
1907timer_call_trace_expire_return(struct timer_call *call)
1908{
1909#pragma unused(call)
1910#if CONFIG_DTRACE
1911 DTRACE_TMR4(callout__end, timer_call_func_t, call->tc_func,
1912 call->tc_param0, call->tc_param1, NULL);
1913#endif /* CONFIG_DTRACE */
1914 TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CALLOUT | DBG_FUNC_END,
1915 VM_KERNEL_UNSLIDE_OR_PERM(call),
1916 VM_KERNEL_UNSLIDE(call->tc_func),
1917 VM_KERNEL_ADDRHIDE(call->tc_param0),
1918 VM_KERNEL_ADDRHIDE(call->tc_param1),
1919 0);
1920}
1921
1922/*
1923 * Set a new deadline for a running timer on this processor.
1924 */
1925void
1926running_timer_setup(processor_t processor, enum running_timer timer,
1927 void *param, uint64_t deadline, uint64_t now)
1928{
1929 assert(timer < RUNNING_TIMER_MAX);
1930 assert(ml_get_interrupts_enabled() == FALSE);
1931
1932 struct timer_call *call = &processor->running_timers[timer];
1933
1934 timer_call_trace_enter_before(call, deadline, RUNNING_TIMER_FAKE_FLAGS,
1935 now);
1936
1937 if (__improbable(deadline < now)) {
1938 deadline = timer_call_past_deadline_timer_handle(deadline, now);
1939 }
1940
1941 call->tc_pqlink.deadline = deadline;
1942#if TIMER_TRACE
1943 call->tc_entry_time = now;
1944#endif /* TIMER_TRACE */
1945 call->tc_param1 = param;
1946
1947 timer_call_trace_enter_after(call, deadline);
1948}
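/*
 * Minimal usage sketch; RUNNING_TIMER_QUANTUM and quantum_abstime are
 * assumed names used only for illustration. running_timer_enter() below
 * pairs this setup with running_timers_sync() so the hardware deadline
 * is reprogrammed in one step:
 *
 *     uint64_t now = mach_absolute_time();
 *     running_timer_enter(processor, RUNNING_TIMER_QUANTUM, NULL,
 *         now + quantum_abstime, now);
 */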
1949
1950void
1951running_timers_sync(void)
1952{
1953 timer_resync_deadlines();
1954}
1955
1956void
1957running_timer_enter(processor_t processor, unsigned int timer,
1958 void *param, uint64_t deadline, uint64_t now)
1959{
1960 running_timer_setup(processor, timer, param, deadline, now);
1961 running_timers_sync();
1962}
1963
1964/*
1965 * Call the callback for any running timers that fired for this processor.
1966 * Returns true if any timers were past their deadline.
1967 */
1968bool
1969running_timers_expire(processor_t processor, uint64_t now)
1970{
1971 bool expired = false;
1972
1973 if (!processor->running_timers_active) {
1974 return expired;
1975 }
1976
1977 for (int i = 0; i < RUNNING_TIMER_MAX; i++) {
1978 struct timer_call *call = &processor->running_timers[i];
1979
1980 uint64_t deadline = call->tc_pqlink.deadline;
1981 if (deadline > now) {
1982 continue;
1983 }
1984
1985 expired = true;
1986 timer_call_trace_expire_entry(call);
1987 call->tc_func(call->tc_param0, call->tc_param1);
1988 timer_call_trace_expire_return(call);
1989 }
1990
1991 return expired;
1992}
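/*
 * Illustrative call pattern (hypothetical interrupt-path caller): running
 * timers are persistent per-processor slots, so expiry invokes tc_func in
 * place without dequeueing anything.
 *
 *     uint64_t now = mach_absolute_time();
 *     if (running_timers_expire(processor, now)) {
 *             // at least one running timer fired; the caller would then
 *             // recompute and program the next hardware deadline
 *     }
 */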
1993
1994void
1995running_timer_clear(processor_t processor, enum running_timer timer)
1996{
1997 struct timer_call *call = &processor->running_timers[timer];
1998 uint64_t deadline = call->tc_pqlink.deadline;
1999 if (deadline == EndOfAllTime) {
2000 return;
2001 }
2002
2003 call->tc_pqlink.deadline = EndOfAllTime;
2004#if TIMER_TRACE
2005 call->tc_entry_time = 0;
2006#endif /* TIMER_TRACE */
2007 timer_call_trace_cancel(call);
2008}
2009
2010void
2011running_timer_cancel(processor_t processor, unsigned int timer)
2012{
2013 running_timer_clear(processor, timer);
2014 running_timers_sync();
2015}
2016
2017uint64_t
2018running_timers_deadline(processor_t processor)
2019{
2020 if (!processor->running_timers_active) {
2021 return EndOfAllTime;
2022 }
2023
2024 uint64_t deadline = EndOfAllTime;
2025 for (int i = 0; i < RUNNING_TIMER_MAX; i++) {
2026 uint64_t candidate =
2027 processor->running_timers[i].tc_pqlink.deadline;
2028 if (candidate != 0 && candidate < deadline) {
2029 deadline = candidate;
2030 }
2031 }
2032
2033 return deadline;
2034}
2035
2036void
2037running_timers_activate(processor_t processor)
2038{
2039 processor->running_timers_active = true;
2040 running_timers_sync();
2041}
2042
2043void
2044running_timers_deactivate(processor_t processor)
2045{
2046 assert(processor->running_timers_active == true);
2047 processor->running_timers_active = false;
2048 running_timers_sync();
2049}