/*
 * Copyright (c) 1993-2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Timer interrupt callout module.
 */

#include <mach/mach_types.h>

#include <kern/clock.h>
#include <kern/smp.h>
#include <kern/processor.h>
#include <kern/timer_call.h>
#include <kern/timer_queue.h>
#include <kern/call_entry.h>
#include <kern/thread.h>
#include <kern/policy_internal.h>

#include <sys/kdebug.h>

#if CONFIG_DTRACE
#include <mach/sdt.h>
#endif


#if DEBUG
#define TIMER_ASSERT	1
#endif

//#define TIMER_ASSERT	1
//#define TIMER_DBG	1

#if TIMER_DBG
#define DBG(x...) kprintf("DBG: " x);
#else
#define DBG(x...)
#endif

#if TIMER_TRACE
#define TIMER_KDEBUG_TRACE	KERNEL_DEBUG_CONSTANT_IST
#else
#define TIMER_KDEBUG_TRACE(x...)
#endif


lck_grp_t	timer_call_lck_grp;
lck_attr_t	timer_call_lck_attr;
lck_grp_attr_t	timer_call_lck_grp_attr;

lck_grp_t	timer_longterm_lck_grp;
lck_attr_t	timer_longterm_lck_attr;
lck_grp_attr_t	timer_longterm_lck_grp_attr;

/* Timer queue lock must be acquired with interrupts disabled (under splclock()) */
#if __SMP__
#define timer_queue_lock_spin(queue)					\
	lck_mtx_lock_spin_always(&queue->lock_data)

#define timer_queue_unlock(queue)					\
	lck_mtx_unlock_always(&queue->lock_data)
#else
#define timer_queue_lock_spin(queue)	(void)1
#define timer_queue_unlock(queue)	(void)1
#endif

#define QUEUE(x)	((queue_t)(x))
#define MPQUEUE(x)	((mpqueue_head_t *)(x))
#define TIMER_CALL(x)	((timer_call_t)(x))
#define TCE(x)		(&(x->call_entry))
/*
 * The longterm timer object is a global structure holding all timers
 * beyond the short-term, local timer queue threshold. The boot processor
 * is responsible for moving each timer to its local timer queue
 * if and when that timer becomes due within the threshold.
 */
#define TIMER_LONGTERM_NONE		EndOfAllTime
#if defined(__x86_64__)
#define TIMER_LONGTERM_THRESHOLD	(1ULL * NSEC_PER_SEC)
#else
#define TIMER_LONGTERM_THRESHOLD	TIMER_LONGTERM_NONE
#endif
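/*
 * Worked example (illustrative; the threshold is tunable via the "longterm"
 * boot-arg parsed in timer_longterm_init() below): with the default 1 s
 * threshold on x86_64, a timer armed 5 s out is parked on the global
 * longterm queue, and the boot processor's threshold callout escalates it
 * onto a per-cpu (short-term) queue roughly 1 s before it becomes due.
 */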

typedef struct {
	uint64_t	interval;	/* longterm timer interval */
	uint64_t	margin;		/* fudge factor (10% of interval) */
	uint64_t	deadline;	/* first/soonest longterm deadline */
	uint64_t	preempted;	/* sooner timer has pre-empted */
	timer_call_t	call;		/* first/soonest longterm timer call */
	uint64_t	deadline_set;	/* next timer set */
	timer_call_data_t timer;	/* timer used by threshold management */
					/* Stats: */
	uint64_t	scans;		/* num threshold timer scans */
	uint64_t	preempts;	/* num threshold reductions */
	uint64_t	latency;	/* average threshold latency */
	uint64_t	latency_min;	/* minimum threshold latency */
	uint64_t	latency_max;	/* maximum threshold latency */
} threshold_t;

typedef struct {
	mpqueue_head_t	queue;		/* longterm timer list */
	uint64_t	enqueues;	/* num timers queued */
	uint64_t	dequeues;	/* num timers dequeued */
	uint64_t	escalates;	/* num timers becoming shortterm */
	uint64_t	scan_time;	/* last time the list was scanned */
	threshold_t	threshold;	/* longterm timer threshold */
} timer_longterm_t;

timer_longterm_t		timer_longterm;

static mpqueue_head_t		*timer_longterm_queue = NULL;

static void			timer_longterm_init(void);
static void			timer_longterm_callout(
					timer_call_param_t	p0,
					timer_call_param_t	p1);
extern void			timer_longterm_scan(
					timer_longterm_t	*tlp,
					uint64_t		now);
static void			timer_longterm_update(
					timer_longterm_t	*tlp);
static void			timer_longterm_update_locked(
					timer_longterm_t	*tlp);
static mpqueue_head_t *		timer_longterm_enqueue_unlocked(
					timer_call_t		call,
					uint64_t		now,
					uint64_t		deadline,
					mpqueue_head_t		**old_queue,
					uint64_t		soft_deadline,
					uint64_t		ttd,
					timer_call_param_t	param1,
					uint32_t		callout_flags);
static void			timer_longterm_dequeued_locked(
					timer_call_t		call);

uint64_t past_deadline_timers;
uint64_t past_deadline_deltas;
uint64_t past_deadline_longest;
uint64_t past_deadline_shortest = ~0ULL;
enum {PAST_DEADLINE_TIMER_ADJUSTMENT_NS = 10 * 1000};

uint64_t past_deadline_timer_adjustment;

static boolean_t timer_call_enter_internal(timer_call_t call, timer_call_param_t param1, uint64_t deadline, uint64_t leeway, uint32_t flags, boolean_t ratelimited);
boolean_t	mach_timer_coalescing_enabled = TRUE;

mpqueue_head_t	*timer_call_enqueue_deadline_unlocked(
			timer_call_t		call,
			mpqueue_head_t		*queue,
			uint64_t		deadline,
			uint64_t		soft_deadline,
			uint64_t		ttd,
			timer_call_param_t	param1,
			uint32_t		flags);

mpqueue_head_t	*timer_call_dequeue_unlocked(
			timer_call_t		call);

timer_coalescing_priority_params_t tcoal_prio_params;

#if TCOAL_PRIO_STATS
int32_t nc_tcl, rt_tcl, bg_tcl, kt_tcl, fp_tcl, ts_tcl, qos_tcl;
#define TCOAL_PRIO_STAT(x) (x++)
#else
#define TCOAL_PRIO_STAT(x)
#endif

static void
timer_call_init_abstime(void)
{
	int i;
	uint64_t result;
	timer_coalescing_priority_params_ns_t * tcoal_prio_params_init = timer_call_get_priority_params();
	nanoseconds_to_absolutetime(PAST_DEADLINE_TIMER_ADJUSTMENT_NS, &past_deadline_timer_adjustment);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->idle_entry_timer_processing_hdeadline_threshold_ns, &result);
	tcoal_prio_params.idle_entry_timer_processing_hdeadline_threshold_abstime = (uint32_t)result;
	nanoseconds_to_absolutetime(tcoal_prio_params_init->interrupt_timer_coalescing_ilat_threshold_ns, &result);
	tcoal_prio_params.interrupt_timer_coalescing_ilat_threshold_abstime = (uint32_t)result;
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_resort_threshold_ns, &result);
	tcoal_prio_params.timer_resort_threshold_abstime = (uint32_t)result;
	tcoal_prio_params.timer_coalesce_rt_shift = tcoal_prio_params_init->timer_coalesce_rt_shift;
	tcoal_prio_params.timer_coalesce_bg_shift = tcoal_prio_params_init->timer_coalesce_bg_shift;
	tcoal_prio_params.timer_coalesce_kt_shift = tcoal_prio_params_init->timer_coalesce_kt_shift;
	tcoal_prio_params.timer_coalesce_fp_shift = tcoal_prio_params_init->timer_coalesce_fp_shift;
	tcoal_prio_params.timer_coalesce_ts_shift = tcoal_prio_params_init->timer_coalesce_ts_shift;

	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_rt_ns_max,
	    &tcoal_prio_params.timer_coalesce_rt_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_bg_ns_max,
	    &tcoal_prio_params.timer_coalesce_bg_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_kt_ns_max,
	    &tcoal_prio_params.timer_coalesce_kt_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_fp_ns_max,
	    &tcoal_prio_params.timer_coalesce_fp_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_ts_ns_max,
	    &tcoal_prio_params.timer_coalesce_ts_abstime_max);

	for (i = 0; i < NUM_LATENCY_QOS_TIERS; i++) {
		tcoal_prio_params.latency_qos_scale[i] = tcoal_prio_params_init->latency_qos_scale[i];
		nanoseconds_to_absolutetime(tcoal_prio_params_init->latency_qos_ns_max[i],
		    &tcoal_prio_params.latency_qos_abstime_max[i]);
		tcoal_prio_params.latency_tier_rate_limited[i] = tcoal_prio_params_init->latency_tier_rate_limited[i];
	}
}


void
timer_call_init(void)
{
	lck_attr_setdefault(&timer_call_lck_attr);
	lck_grp_attr_setdefault(&timer_call_lck_grp_attr);
	lck_grp_init(&timer_call_lck_grp, "timer_call", &timer_call_lck_grp_attr);

	timer_longterm_init();
	timer_call_init_abstime();
}


void
timer_call_queue_init(mpqueue_head_t *queue)
{
	DBG("timer_call_queue_init(%p)\n", queue);
	mpqueue_init(queue, &timer_call_lck_grp, &timer_call_lck_attr);
}


void
timer_call_setup(
	timer_call_t		call,
	timer_call_func_t	func,
	timer_call_param_t	param0)
{
	DBG("timer_call_setup(%p,%p,%p)\n", call, func, param0);
	call_entry_setup(TCE(call), func, param0);
	simple_lock_init(&(call)->lock, 0);
	call->async_dequeue = FALSE;
}
#if TIMER_ASSERT
static __inline__ mpqueue_head_t *
timer_call_entry_dequeue(
	timer_call_t		entry)
{
	mpqueue_head_t	*old_queue = MPQUEUE(TCE(entry)->queue);

	if (!hw_lock_held((hw_lock_t)&entry->lock))
		panic("_call_entry_dequeue() "
			"entry %p is not locked\n", entry);
	/*
	 * XXX The queue lock is actually a mutex in spin mode
	 * but there's no way to test for it being held
	 * so we pretend it's a spinlock!
	 */
	if (!hw_lock_held((hw_lock_t)&old_queue->lock_data))
		panic("_call_entry_dequeue() "
			"queue %p is not locked\n", old_queue);

	call_entry_dequeue(TCE(entry));
	old_queue->count--;

	return (old_queue);
}

static __inline__ mpqueue_head_t *
timer_call_entry_enqueue_deadline(
	timer_call_t		entry,
	mpqueue_head_t		*queue,
	uint64_t		deadline)
{
	mpqueue_head_t	*old_queue = MPQUEUE(TCE(entry)->queue);

	if (!hw_lock_held((hw_lock_t)&entry->lock))
		panic("_call_entry_enqueue_deadline() "
			"entry %p is not locked\n", entry);
	/* XXX More lock pretense: */
	if (!hw_lock_held((hw_lock_t)&queue->lock_data))
		panic("_call_entry_enqueue_deadline() "
			"queue %p is not locked\n", queue);
	if (old_queue != NULL && old_queue != queue)
		panic("_call_entry_enqueue_deadline() "
			"old_queue %p != queue", old_queue);

	call_entry_enqueue_deadline(TCE(entry), QUEUE(queue), deadline);

	/* For efficiency, track the earliest soft deadline on the queue, so that
	 * fuzzy decisions can be made without lock acquisitions.
	 */
	timer_call_t thead = (timer_call_t)queue_first(&queue->head);

	queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline;

	if (old_queue)
		old_queue->count--;
	queue->count++;

	return (old_queue);
}

#else

static __inline__ mpqueue_head_t *
timer_call_entry_dequeue(
	timer_call_t		entry)
{
	mpqueue_head_t	*old_queue = MPQUEUE(TCE(entry)->queue);

	call_entry_dequeue(TCE(entry));
	old_queue->count--;

	return old_queue;
}

static __inline__ mpqueue_head_t *
timer_call_entry_enqueue_deadline(
	timer_call_t		entry,
	mpqueue_head_t		*queue,
	uint64_t		deadline)
{
	mpqueue_head_t	*old_queue = MPQUEUE(TCE(entry)->queue);

	call_entry_enqueue_deadline(TCE(entry), QUEUE(queue), deadline);

	/* For efficiency, track the earliest soft deadline on the queue,
	 * so that fuzzy decisions can be made without lock acquisitions.
	 */

	timer_call_t thead = (timer_call_t)queue_first(&queue->head);
	queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline;

	if (old_queue)
		old_queue->count--;
	queue->count++;

	return old_queue;
}

#endif

static __inline__ void
timer_call_entry_enqueue_tail(
	timer_call_t		entry,
	mpqueue_head_t		*queue)
{
	call_entry_enqueue_tail(TCE(entry), QUEUE(queue));
	queue->count++;
	return;
}

/*
 * Remove timer entry from its queue but don't change the queue pointer
 * and set the async_dequeue flag. This is locking case 2b.
 */
static __inline__ void
timer_call_entry_dequeue_async(
	timer_call_t		entry)
{
	mpqueue_head_t	*old_queue = MPQUEUE(TCE(entry)->queue);
	if (old_queue) {
		old_queue->count--;
		(void) remque(qe(entry));
		entry->async_dequeue = TRUE;
	}
	return;
}

#if TIMER_ASSERT
unsigned timer_call_enqueue_deadline_unlocked_async1;
unsigned timer_call_enqueue_deadline_unlocked_async2;
#endif
/*
 * Assumes call_entry and queues unlocked, interrupts disabled.
 */
__inline__ mpqueue_head_t *
timer_call_enqueue_deadline_unlocked(
	timer_call_t			call,
	mpqueue_head_t			*queue,
	uint64_t			deadline,
	uint64_t			soft_deadline,
	uint64_t			ttd,
	timer_call_param_t		param1,
	uint32_t			callout_flags)
{
	call_entry_t	entry = TCE(call);
	mpqueue_head_t	*old_queue;

	DBG("timer_call_enqueue_deadline_unlocked(%p,%p,)\n", call, queue);

	simple_lock(&call->lock);

	old_queue = MPQUEUE(entry->queue);

	if (old_queue != NULL) {
		timer_queue_lock_spin(old_queue);
		if (call->async_dequeue) {
			/* collision (1c): timer already dequeued, clear flag */
#if TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
				VM_KERNEL_UNSLIDE_OR_PERM(call),
				call->async_dequeue,
				VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
				0x1c, 0);
			timer_call_enqueue_deadline_unlocked_async1++;
#endif
			call->async_dequeue = FALSE;
			entry->queue = NULL;
		} else if (old_queue != queue) {
			timer_call_entry_dequeue(call);
#if TIMER_ASSERT
			timer_call_enqueue_deadline_unlocked_async2++;
#endif
		}
		if (old_queue == timer_longterm_queue)
			timer_longterm_dequeued_locked(call);
		if (old_queue != queue) {
			timer_queue_unlock(old_queue);
			timer_queue_lock_spin(queue);
		}
	} else {
		timer_queue_lock_spin(queue);
	}

	call->soft_deadline = soft_deadline;
	call->flags = callout_flags;
	TCE(call)->param1 = param1;
	call->ttd = ttd;

	timer_call_entry_enqueue_deadline(call, queue, deadline);
	timer_queue_unlock(queue);
	simple_unlock(&call->lock);

	return (old_queue);
}

#if TIMER_ASSERT
unsigned timer_call_dequeue_unlocked_async1;
unsigned timer_call_dequeue_unlocked_async2;
#endif
mpqueue_head_t *
timer_call_dequeue_unlocked(
	timer_call_t		call)
{
	call_entry_t	entry = TCE(call);
	mpqueue_head_t	*old_queue;

	DBG("timer_call_dequeue_unlocked(%p)\n", call);

	simple_lock(&call->lock);
	old_queue = MPQUEUE(entry->queue);
#if TIMER_ASSERT
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
		VM_KERNEL_UNSLIDE_OR_PERM(call),
		call->async_dequeue,
		VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
		0, 0);
#endif
	if (old_queue != NULL) {
		timer_queue_lock_spin(old_queue);
		if (call->async_dequeue) {
			/* collision (1c): timer already dequeued, clear flag */
#if TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
				VM_KERNEL_UNSLIDE_OR_PERM(call),
				call->async_dequeue,
				VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
				0x1c, 0);
			timer_call_dequeue_unlocked_async1++;
#endif
			call->async_dequeue = FALSE;
			entry->queue = NULL;
		} else {
			timer_call_entry_dequeue(call);
		}
		if (old_queue == timer_longterm_queue)
			timer_longterm_dequeued_locked(call);
		timer_queue_unlock(old_queue);
	}
	simple_unlock(&call->lock);
	return (old_queue);
}


/*
 * Timer call entry locking model
 * ==============================
 *
 * Timer call entries are linked on per-cpu timer queues which are protected
 * by the queue lock and the call entry lock. The locking protocol is:
 *
 *  0) The canonical locking order is timer call entry followed by queue.
 *
 *  1) With only the entry lock held, entry.queue is valid:
 *    1a) NULL: the entry is not queued, or
 *    1b) non-NULL: this queue must be locked before the entry is modified.
 *        After locking the queue, the call.async_dequeue flag must be checked:
 *    1c) TRUE: the entry was removed from the queue by another thread
 *        and we must NULL the entry.queue and reset this flag, or
 *    1d) FALSE: (i.e. queued), the entry can be manipulated.
 *
 *  2) If a queue lock is obtained first, the queue is stable:
 *    2a) If a try-lock of a queued entry succeeds, the call can be operated on
 *        and dequeued.
 *    2b) If a try-lock fails, it indicates that another thread is attempting
 *        to change the entry and move it to a different position in this queue
 *        or to a different queue. The entry can be dequeued but it should not
 *        be operated upon since it is being changed. Furthermore, we don't
 *        null the entry.queue pointer (protected by the entry lock we don't
 *        own). Instead, we set the async_dequeue flag -- see (1c).
 *    2c) Same as 2b but occurring when a longterm timer is matured.
 *  3) A callout's parameters (deadline, flags, parameters, soft deadline &c.)
 *     should be manipulated with the appropriate timer queue lock held,
 *     to prevent queue traversals from observing inconsistent
 *     updates to an in-flight callout.
 */
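/*
 * Condensed sketch of the protocol above (illustrative only; the real code
 * paths are timer_call_dequeue_unlocked() and the simple_lock_try() users
 * such as timer_queue_shutdown() and timer_queue_expire_with_options()):
 *
 *	Entry-side (case 1): take the entry lock, then the queue lock.
 *		simple_lock(&call->lock);
 *		queue = MPQUEUE(TCE(call)->queue);
 *		if (queue != NULL) {
 *			timer_queue_lock_spin(queue);
 *			if (call->async_dequeue) {
 *				call->async_dequeue = FALSE;	(case 1c)
 *				TCE(call)->queue = NULL;
 *			} else {
 *				timer_call_entry_dequeue(call);	(case 1d)
 *			}
 *			timer_queue_unlock(queue);
 *		}
 *		simple_unlock(&call->lock);
 *
 *	Queue-side (case 2): queue lock held, entry lock only try-locked.
 *		if (simple_lock_try(&call->lock)) {
 *			timer_call_entry_dequeue(call);		(case 2a)
 *			simple_unlock(&call->lock);
 *		} else {
 *			timer_call_entry_dequeue_async(call);	(case 2b)
 *		}
 */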

/*
 * Inlines timer_call_entry_dequeue() and timer_call_entry_enqueue_deadline()
 * cast between pointer types (mpqueue_head_t *) and (queue_t) so that
 * we can use the call_entry_dequeue() and call_entry_enqueue_deadline()
 * methods to operate on timer_call structs as if they are call_entry structs.
 * These structures are identical except for their queue head pointer fields.
 *
 * In the debug case, we assert that the timer call locking protocol
 * is being obeyed.
 */

static boolean_t
timer_call_enter_internal(
	timer_call_t		call,
	timer_call_param_t	param1,
	uint64_t		deadline,
	uint64_t		leeway,
	uint32_t		flags,
	boolean_t		ratelimited)
{
	mpqueue_head_t		*queue = NULL;
	mpqueue_head_t		*old_queue;
	spl_t			s;
	uint64_t		slop;
	uint32_t		urgency;
	uint64_t		sdeadline, ttd;

	assert(call->call_entry.func != NULL);
	s = splclock();

	sdeadline = deadline;
	uint64_t ctime = mach_absolute_time();

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		DECR_TIMER_ENTER | DBG_FUNC_START,
		VM_KERNEL_UNSLIDE_OR_PERM(call),
		VM_KERNEL_UNSLIDE_OR_PERM(param1), deadline, flags, 0);

	urgency = (flags & TIMER_CALL_URGENCY_MASK);

	boolean_t slop_ratelimited = FALSE;
	slop = timer_call_slop(deadline, ctime, urgency, current_thread(), &slop_ratelimited);

	if ((flags & TIMER_CALL_LEEWAY) != 0 && leeway > slop)
		slop = leeway;

	if (UINT64_MAX - deadline <= slop) {
		deadline = UINT64_MAX;
	} else {
		deadline += slop;
	}

	if (__improbable(deadline < ctime)) {
		uint64_t delta = (ctime - deadline);

		past_deadline_timers++;
		past_deadline_deltas += delta;
		if (delta > past_deadline_longest)
			past_deadline_longest = delta;
		if (delta < past_deadline_shortest)
			past_deadline_shortest = delta;

		deadline = ctime + past_deadline_timer_adjustment;
		sdeadline = deadline;
	}

	if (ratelimited || slop_ratelimited) {
		flags |= TIMER_CALL_RATELIMITED;
	} else {
		flags &= ~TIMER_CALL_RATELIMITED;
	}

	ttd = sdeadline - ctime;
#if CONFIG_DTRACE
	DTRACE_TMR7(callout__create, timer_call_func_t, TCE(call)->func,
	    timer_call_param_t, TCE(call)->param0, uint32_t, flags,
	    (deadline - sdeadline),
	    (ttd >> 32), (unsigned) (ttd & 0xFFFFFFFF), call);
#endif

	/* Program timer callout parameters under the appropriate per-CPU or
	 * longterm queue lock. The callout may have been previously enqueued
	 * and in-flight on this or another timer queue.
	 */
	if (!ratelimited && !slop_ratelimited) {
		queue = timer_longterm_enqueue_unlocked(call, ctime, deadline, &old_queue, sdeadline, ttd, param1, flags);
	}

	if (queue == NULL) {
		queue = timer_queue_assign(deadline);
		old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline, sdeadline, ttd, param1, flags);
	}

#if TIMER_TRACE
	TCE(call)->entry_time = ctime;
#endif

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		DECR_TIMER_ENTER | DBG_FUNC_END,
		VM_KERNEL_UNSLIDE_OR_PERM(call),
		(old_queue != NULL), deadline, queue->count, 0);

	splx(s);

	return (old_queue != NULL);
}

/*
 * timer_call_*()
 *	return boolean indicating whether the call was previously queued.
 */
boolean_t
timer_call_enter(
	timer_call_t		call,
	uint64_t		deadline,
	uint32_t		flags)
{
	return timer_call_enter_internal(call, NULL, deadline, 0, flags, FALSE);
}

boolean_t
timer_call_enter1(
	timer_call_t		call,
	timer_call_param_t	param1,
	uint64_t		deadline,
	uint32_t		flags)
{
	return timer_call_enter_internal(call, param1, deadline, 0, flags, FALSE);
}

boolean_t
timer_call_enter_with_leeway(
	timer_call_t		call,
	timer_call_param_t	param1,
	uint64_t		deadline,
	uint64_t		leeway,
	uint32_t		flags,
	boolean_t		ratelimited)
{
	return timer_call_enter_internal(call, param1, deadline, leeway, flags, ratelimited);
}
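
/*
 * Usage sketch for the interfaces above (illustrative; "my_func", "my_obj"
 * and the 100 ms interval are hypothetical, not part of this module):
 *
 *	static timer_call_data_t my_timer;
 *
 *	static void
 *	my_func(timer_call_param_t p0, timer_call_param_t p1)
 *	{
 *		... runs in interrupt context when the deadline fires ...
 *	}
 *
 *	void
 *	my_init(void *my_obj)
 *	{
 *		timer_call_setup(&my_timer, my_func, my_obj);
 *	}
 *
 *	void
 *	my_arm(void)
 *	{
 *		uint64_t interval, deadline;
 *
 *		nanoseconds_to_absolutetime(100 * NSEC_PER_MSEC, &interval);
 *		clock_absolutetime_interval_to_deadline(interval, &deadline);
 *		(void) timer_call_enter1(&my_timer, NULL, deadline,
 *		    TIMER_CALL_SYS_NORMAL);
 *	}
 *
 *	void
 *	my_disarm(void)
 *	{
 *		(void) timer_call_cancel(&my_timer);
 *	}
 *
 * timer_call_enter*() and timer_call_cancel() return TRUE if the call was
 * already/still queued at the time of the request.
 */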

boolean_t
timer_call_cancel(
	timer_call_t		call)
{
	mpqueue_head_t		*old_queue;
	spl_t			s;

	s = splclock();

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		DECR_TIMER_CANCEL | DBG_FUNC_START,
		VM_KERNEL_UNSLIDE_OR_PERM(call),
		TCE(call)->deadline, call->soft_deadline, call->flags, 0);

	old_queue = timer_call_dequeue_unlocked(call);

	if (old_queue != NULL) {
		timer_queue_lock_spin(old_queue);
		if (!queue_empty(&old_queue->head)) {
			timer_queue_cancel(old_queue, TCE(call)->deadline, CE(queue_first(&old_queue->head))->deadline);
			timer_call_t thead = (timer_call_t)queue_first(&old_queue->head);
			old_queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline;
		}
		else {
			timer_queue_cancel(old_queue, TCE(call)->deadline, UINT64_MAX);
			old_queue->earliest_soft_deadline = UINT64_MAX;
		}
		timer_queue_unlock(old_queue);
	}
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		DECR_TIMER_CANCEL | DBG_FUNC_END,
		VM_KERNEL_UNSLIDE_OR_PERM(call),
		VM_KERNEL_UNSLIDE_OR_PERM(old_queue),
		TCE(call)->deadline - mach_absolute_time(),
		TCE(call)->deadline - TCE(call)->entry_time, 0);
	splx(s);

#if CONFIG_DTRACE
	DTRACE_TMR6(callout__cancel, timer_call_func_t, TCE(call)->func,
	    timer_call_param_t, TCE(call)->param0, uint32_t, call->flags, 0,
	    (call->ttd >> 32), (unsigned) (call->ttd & 0xFFFFFFFF));
#endif

	return (old_queue != NULL);
}

static uint32_t	timer_queue_shutdown_lock_skips;
static uint32_t timer_queue_shutdown_discarded;

void
timer_queue_shutdown(
	mpqueue_head_t		*queue)
{
	timer_call_t		call;
	mpqueue_head_t		*new_queue;
	spl_t			s;


	DBG("timer_queue_shutdown(%p)\n", queue);

	s = splclock();

	/* Note comma operator in while expression re-locking each iteration */
	while ((void)timer_queue_lock_spin(queue), !queue_empty(&queue->head)) {
		call = TIMER_CALL(queue_first(&queue->head));

		if (!simple_lock_try(&call->lock)) {
			/*
			 * case (2b) lock order inversion, dequeue and skip
			 * Don't change the call_entry queue back-pointer
			 * but set the async_dequeue field.
			 */
			timer_queue_shutdown_lock_skips++;
			timer_call_entry_dequeue_async(call);
#if TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
				VM_KERNEL_UNSLIDE_OR_PERM(call),
				call->async_dequeue,
				VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
				0x2b, 0);
#endif
			timer_queue_unlock(queue);
			continue;
		}

		boolean_t call_local = ((call->flags & TIMER_CALL_LOCAL) != 0);

		/* remove entry from old queue */
		timer_call_entry_dequeue(call);
		timer_queue_unlock(queue);

		if (call_local == FALSE) {
			/* and queue it on new, discarding LOCAL timers */
			new_queue = timer_queue_assign(TCE(call)->deadline);
			timer_queue_lock_spin(new_queue);
			timer_call_entry_enqueue_deadline(
				call, new_queue, TCE(call)->deadline);
			timer_queue_unlock(new_queue);
		} else {
			timer_queue_shutdown_discarded++;
		}

		/* The only lingering LOCAL timer should be this thread's
		 * quantum expiration timer.
		 */
		assert((call_local == FALSE) ||
		    (TCE(call)->func == thread_quantum_expire));

		simple_unlock(&call->lock);
	}

	timer_queue_unlock(queue);
	splx(s);
}

static uint32_t	timer_queue_expire_lock_skips;
uint64_t
timer_queue_expire_with_options(
	mpqueue_head_t		*queue,
	uint64_t		deadline,
	boolean_t		rescan)
{
	timer_call_t	call = NULL;
	uint32_t tc_iterations = 0;
	DBG("timer_queue_expire(%p,)\n", queue);

	uint64_t cur_deadline = deadline;
	timer_queue_lock_spin(queue);

	while (!queue_empty(&queue->head)) {
		/* Upon processing one or more timer calls, refresh the
		 * deadline to account for time elapsed in the callout
		 */
		if (++tc_iterations > 1)
			cur_deadline = mach_absolute_time();

		if (call == NULL)
			call = TIMER_CALL(queue_first(&queue->head));

		if (call->soft_deadline <= cur_deadline) {
			timer_call_func_t		func;
			timer_call_param_t		param0, param1;

			TCOAL_DEBUG(0xDDDD0000, queue->earliest_soft_deadline, call->soft_deadline, 0, 0, 0);
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_EXPIRE | DBG_FUNC_NONE,
				VM_KERNEL_UNSLIDE_OR_PERM(call),
				call->soft_deadline,
				TCE(call)->deadline,
				TCE(call)->entry_time, 0);

			if ((call->flags & TIMER_CALL_RATELIMITED) &&
			    (TCE(call)->deadline > cur_deadline)) {
				if (rescan == FALSE)
					break;
			}

			if (!simple_lock_try(&call->lock)) {
				/* case (2b) lock inversion, dequeue and skip */
				timer_queue_expire_lock_skips++;
				timer_call_entry_dequeue_async(call);
				call = NULL;
				continue;
			}

			timer_call_entry_dequeue(call);

			func = TCE(call)->func;
			param0 = TCE(call)->param0;
			param1 = TCE(call)->param1;

			simple_unlock(&call->lock);
			timer_queue_unlock(queue);

			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_CALLOUT | DBG_FUNC_START,
				VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
				VM_KERNEL_UNSLIDE_OR_PERM(param0),
				VM_KERNEL_UNSLIDE_OR_PERM(param1),
				0);

#if CONFIG_DTRACE
			DTRACE_TMR7(callout__start, timer_call_func_t, func,
			    timer_call_param_t, param0, unsigned, call->flags,
			    0, (call->ttd >> 32),
			    (unsigned) (call->ttd & 0xFFFFFFFF), call);
#endif
			/* Maintain time-to-deadline in per-processor data
			 * structure for thread wakeup deadline statistics.
			 */
			uint64_t *ttdp = &(PROCESSOR_DATA(current_processor(), timer_call_ttd));
			*ttdp = call->ttd;
			(*func)(param0, param1);
			*ttdp = 0;
#if CONFIG_DTRACE
			DTRACE_TMR4(callout__end, timer_call_func_t, func,
			    param0, param1, call);
#endif

			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_CALLOUT | DBG_FUNC_END,
				VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
				VM_KERNEL_UNSLIDE_OR_PERM(param0),
				VM_KERNEL_UNSLIDE_OR_PERM(param1),
				0);
			call = NULL;
			timer_queue_lock_spin(queue);
		} else {
			if (__probable(rescan == FALSE)) {
				break;
			} else {
				int64_t skew = TCE(call)->deadline - call->soft_deadline;
				assert(TCE(call)->deadline >= call->soft_deadline);

				/* DRK: On a latency quality-of-service level change,
				 * re-sort potentially rate-limited timers. The platform
				 * layer determines which timers require
				 * this. In the absence of the per-callout
				 * synchronization requirement, a global resort could
				 * be more efficient. The re-sort effectively
				 * annuls all timer adjustments, i.e. the "soft
				 * deadline" is the sort key.
				 */

				if (timer_resort_threshold(skew)) {
					if (__probable(simple_lock_try(&call->lock))) {
						timer_call_entry_dequeue(call);
						timer_call_entry_enqueue_deadline(call, queue, call->soft_deadline);
						simple_unlock(&call->lock);
						call = NULL;
					}
				}
				if (call) {
					call = TIMER_CALL(queue_next(qe(call)));
					if (queue_end(&queue->head, qe(call)))
						break;
				}
			}
		}
	}

	if (!queue_empty(&queue->head)) {
		call = TIMER_CALL(queue_first(&queue->head));
		cur_deadline = TCE(call)->deadline;
		queue->earliest_soft_deadline = (call->flags & TIMER_CALL_RATELIMITED) ? TCE(call)->deadline: call->soft_deadline;
	} else {
		queue->earliest_soft_deadline = cur_deadline = UINT64_MAX;
	}

	timer_queue_unlock(queue);

	return (cur_deadline);
}

uint64_t
timer_queue_expire(
	mpqueue_head_t		*queue,
	uint64_t		deadline)
{
	return timer_queue_expire_with_options(queue, deadline, FALSE);
}
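
/*
 * Note on expected use (a sketch of the caller's side, which lives in the
 * platform layer rather than in this file): the timer-interrupt path
 * typically expires the local queue with the current time and uses the
 * returned value -- the earliest remaining hard deadline, or UINT64_MAX if
 * the queue drained -- to decide when the hardware timer should next fire:
 *
 *	uint64_t next = timer_queue_expire(queue, mach_absolute_time());
 *	if (next != UINT64_MAX)
 *		... reprogram the hardware decrementer for "next" ...
 */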

extern int serverperfmode;
static uint32_t	timer_queue_migrate_lock_skips;
/*
 * timer_queue_migrate() is called by timer_queue_migrate_cpu()
 * to move timer requests from the local processor (queue_from)
 * to a target processor's queue (queue_to). It returns the number of
 * timers moved, zero if the local queue's first deadline is sooner than
 * the target's, or a negative code if migration is not attempted.
 */
int
timer_queue_migrate(mpqueue_head_t *queue_from, mpqueue_head_t *queue_to)
{
	timer_call_t	call;
	timer_call_t	head_to;
	int		timers_migrated = 0;

	DBG("timer_queue_migrate(%p,%p)\n", queue_from, queue_to);

	assert(!ml_get_interrupts_enabled());
	assert(queue_from != queue_to);

	if (serverperfmode) {
		/*
		 * if we're running a high end server
		 * avoid migrations... they add latency
		 * and don't save us power under typical
		 * server workloads
		 */
		return -4;
	}

	/*
	 * Take both local (from) and target (to) timer queue locks while
	 * moving the timers from the local queue to the target processor.
	 * We assume that the target is always the boot processor.
	 * But only move if all of the following is true:
	 *  - the target queue is non-empty
	 *  - the local queue is non-empty
	 *  - the local queue's first deadline is later than the target's
	 *  - the local queue contains no non-migrateable "local" call
	 * so that we need not have the target resync.
	 */

	timer_queue_lock_spin(queue_to);

	head_to = TIMER_CALL(queue_first(&queue_to->head));
	if (queue_empty(&queue_to->head)) {
		timers_migrated = -1;
		goto abort1;
	}

	timer_queue_lock_spin(queue_from);

	if (queue_empty(&queue_from->head)) {
		timers_migrated = -2;
		goto abort2;
	}

	call = TIMER_CALL(queue_first(&queue_from->head));
	if (TCE(call)->deadline < TCE(head_to)->deadline) {
		timers_migrated = 0;
		goto abort2;
	}

	/* perform scan for non-migratable timers */
	do {
		if (call->flags & TIMER_CALL_LOCAL) {
			timers_migrated = -3;
			goto abort2;
		}
		call = TIMER_CALL(queue_next(qe(call)));
	} while (!queue_end(&queue_from->head, qe(call)));

	/* migration loop itself -- both queues are locked */
	while (!queue_empty(&queue_from->head)) {
		call = TIMER_CALL(queue_first(&queue_from->head));
		if (!simple_lock_try(&call->lock)) {
			/* case (2b) lock order inversion, dequeue only */
#ifdef TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
				VM_KERNEL_UNSLIDE_OR_PERM(call),
				VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
				VM_KERNEL_UNSLIDE_OR_PERM(call->lock.interlock.lock_data),
				0x2b, 0);
#endif
			timer_queue_migrate_lock_skips++;
			timer_call_entry_dequeue_async(call);
			continue;
		}
		timer_call_entry_dequeue(call);
		timer_call_entry_enqueue_deadline(
			call, queue_to, TCE(call)->deadline);
		timers_migrated++;
		simple_unlock(&call->lock);
	}
	queue_from->earliest_soft_deadline = UINT64_MAX;
abort2:
	timer_queue_unlock(queue_from);
abort1:
	timer_queue_unlock(queue_to);

	return timers_migrated;
}

void
timer_queue_trace_cpu(int ncpu)
{
	timer_call_nosync_cpu(
		ncpu,
		(void(*)())timer_queue_trace,
		(void*) timer_queue_cpu(ncpu));
}

void
timer_queue_trace(
	mpqueue_head_t			*queue)
{
	timer_call_t	call;
	spl_t		s;

	if (!kdebug_enable)
		return;

	s = splclock();
	timer_queue_lock_spin(queue);

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		DECR_TIMER_QUEUE | DBG_FUNC_START,
		queue->count, mach_absolute_time(), 0, 0, 0);

	if (!queue_empty(&queue->head)) {
		call = TIMER_CALL(queue_first(&queue->head));
		do {
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_QUEUE | DBG_FUNC_NONE,
				call->soft_deadline,
				TCE(call)->deadline,
				TCE(call)->entry_time,
				VM_KERNEL_UNSLIDE(TCE(call)->func),
				0);
			call = TIMER_CALL(queue_next(qe(call)));
		} while (!queue_end(&queue->head, qe(call)));
	}

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		DECR_TIMER_QUEUE | DBG_FUNC_END,
		queue->count, mach_absolute_time(), 0, 0, 0);

	timer_queue_unlock(queue);
	splx(s);
}

void
timer_longterm_dequeued_locked(timer_call_t call)
{
	timer_longterm_t	*tlp = &timer_longterm;

	tlp->dequeues++;
	if (call == tlp->threshold.call)
		tlp->threshold.call = NULL;
}

/*
 * Place a timer call in the longterm list
 * and adjust the next timer callout deadline if the new timer is first.
 */
mpqueue_head_t *
timer_longterm_enqueue_unlocked(timer_call_t	call,
				uint64_t	now,
				uint64_t	deadline,
				mpqueue_head_t	**old_queue,
				uint64_t	soft_deadline,
				uint64_t	ttd,
				timer_call_param_t	param1,
				uint32_t	callout_flags)
{
	timer_longterm_t	*tlp = &timer_longterm;
	boolean_t		update_required = FALSE;
	uint64_t		longterm_threshold;

	longterm_threshold = now + tlp->threshold.interval;

	/*
	 * Return NULL without doing anything if:
	 *  - this timer is local, or
	 *  - the longterm mechanism is disabled, or
	 *  - this deadline is too short.
	 */
	if ((callout_flags & TIMER_CALL_LOCAL) != 0 ||
	    (tlp->threshold.interval == TIMER_LONGTERM_NONE) ||
	    (deadline <= longterm_threshold))
		return NULL;

	/*
	 * Remove timer from its current queue, if any.
	 */
	*old_queue = timer_call_dequeue_unlocked(call);

	/*
	 * Lock the longterm queue, queue timer and determine
	 * whether an update is necessary.
	 */
	assert(!ml_get_interrupts_enabled());
	simple_lock(&call->lock);
	timer_queue_lock_spin(timer_longterm_queue);
	TCE(call)->deadline = deadline;
	TCE(call)->param1 = param1;
	call->ttd = ttd;
	call->soft_deadline = soft_deadline;
	call->flags = callout_flags;
	timer_call_entry_enqueue_tail(call, timer_longterm_queue);

	tlp->enqueues++;

	/*
	 * We'll need to update the currently set threshold timer
	 * if the new deadline is sooner and no sooner update is in flight.
	 */
	if (deadline < tlp->threshold.deadline &&
	    deadline < tlp->threshold.preempted) {
		tlp->threshold.preempted = deadline;
		tlp->threshold.call = call;
		update_required = TRUE;
	}
	timer_queue_unlock(timer_longterm_queue);
	simple_unlock(&call->lock);

	if (update_required) {
		/*
		 * Note: this call expects that calling the master cpu
		 * alone does not involve locking the topo lock.
		 */
		timer_call_nosync_cpu(
			master_cpu,
			(void (*)(void *)) timer_longterm_update,
			(void *)tlp);
	}

	return timer_longterm_queue;
}

/*
 * Scan for timers below the longterm threshold.
 * Move these to the local timer queue (of the boot processor on which the
 * calling thread is running).
 * Both the local (boot) queue and the longterm queue are locked.
 * The scan is similar to the timer migrate sequence but is performed by
 * successively examining each timer on the longterm queue:
 *  - if within the short-term threshold
 *    - enter on the local queue (unless being deleted),
 *  - otherwise:
 *    - if sooner, deadline becomes the next threshold deadline.
 */
void
timer_longterm_scan(timer_longterm_t	*tlp,
		    uint64_t		now)
{
	queue_entry_t	qe;
	timer_call_t	call;
	uint64_t	threshold;
	uint64_t	deadline;
	mpqueue_head_t	*timer_master_queue;

	assert(!ml_get_interrupts_enabled());
	assert(cpu_number() == master_cpu);

	if (tlp->threshold.interval != TIMER_LONGTERM_NONE)
		threshold = now + tlp->threshold.interval;
	else
		threshold = TIMER_LONGTERM_NONE;

	tlp->threshold.deadline = TIMER_LONGTERM_NONE;
	tlp->threshold.call = NULL;

	if (queue_empty(&timer_longterm_queue->head))
		return;

	timer_master_queue = timer_queue_cpu(master_cpu);
	timer_queue_lock_spin(timer_master_queue);

	qe = queue_first(&timer_longterm_queue->head);
	while (!queue_end(&timer_longterm_queue->head, qe)) {
		call = TIMER_CALL(qe);
		deadline = call->soft_deadline;
		qe = queue_next(qe);
		if (!simple_lock_try(&call->lock)) {
			/* case (2c) lock order inversion, dequeue only */
#ifdef TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
				VM_KERNEL_UNSLIDE_OR_PERM(call),
				VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
				VM_KERNEL_UNSLIDE_OR_PERM(call->lock.interlock.lock_data),
				0x2c, 0);
#endif
			timer_call_entry_dequeue_async(call);
			continue;
		}
		if (deadline < threshold) {
			/*
			 * This timer needs moving (escalating)
			 * to the local (boot) processor's queue.
			 */
#ifdef TIMER_ASSERT
			if (deadline < now)
				TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
					DECR_TIMER_OVERDUE | DBG_FUNC_NONE,
					VM_KERNEL_UNSLIDE_OR_PERM(call),
					deadline,
					now,
					threshold,
					0);
#endif
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_ESCALATE | DBG_FUNC_NONE,
				VM_KERNEL_UNSLIDE_OR_PERM(call),
				TCE(call)->deadline,
				TCE(call)->entry_time,
				VM_KERNEL_UNSLIDE(TCE(call)->func),
				0);
			tlp->escalates++;
			timer_call_entry_dequeue(call);
			timer_call_entry_enqueue_deadline(
				call, timer_master_queue, TCE(call)->deadline);
			/*
			 * A side-effect of the following call is to update
			 * the actual hardware deadline if required.
			 */
			(void) timer_queue_assign(deadline);
		} else {
			if (deadline < tlp->threshold.deadline) {
				tlp->threshold.deadline = deadline;
				tlp->threshold.call = call;
			}
		}
		simple_unlock(&call->lock);
	}

	timer_queue_unlock(timer_master_queue);
}

void
timer_longterm_callout(timer_call_param_t p0, __unused timer_call_param_t p1)
{
	timer_longterm_t	*tlp = (timer_longterm_t *) p0;

	timer_longterm_update(tlp);
}

void
timer_longterm_update_locked(timer_longterm_t *tlp)
{
	uint64_t	latency;

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		DECR_TIMER_UPDATE | DBG_FUNC_START,
		VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
		tlp->threshold.deadline,
		tlp->threshold.preempted,
		tlp->queue.count, 0);

	tlp->scan_time = mach_absolute_time();
	if (tlp->threshold.preempted != TIMER_LONGTERM_NONE) {
		tlp->threshold.preempts++;
		tlp->threshold.deadline = tlp->threshold.preempted;
		tlp->threshold.preempted = TIMER_LONGTERM_NONE;
		/*
		 * Note: in the unlikely event that a pre-empted timer has
		 * itself been cancelled, we'll simply re-scan later at the
		 * time of the preempted/cancelled timer.
		 */
	} else {
		tlp->threshold.scans++;

		/*
		 * Maintain a moving average of our wakeup latency.
		 * Clamp latency to 0 and ignore above threshold interval.
		 */
		if (tlp->scan_time > tlp->threshold.deadline_set)
			latency = tlp->scan_time - tlp->threshold.deadline_set;
		else
			latency = 0;
		if (latency < tlp->threshold.interval) {
			tlp->threshold.latency_min =
				MIN(tlp->threshold.latency_min, latency);
			tlp->threshold.latency_max =
				MAX(tlp->threshold.latency_max, latency);
			tlp->threshold.latency =
				(tlp->threshold.latency*99 + latency) / 100;
		}

		timer_longterm_scan(tlp, tlp->scan_time);
	}

	tlp->threshold.deadline_set = tlp->threshold.deadline;
	/* The next deadline timer to be set is adjusted */
	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
		tlp->threshold.deadline_set -= tlp->threshold.margin;
		tlp->threshold.deadline_set -= tlp->threshold.latency;
	}

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		DECR_TIMER_UPDATE | DBG_FUNC_END,
		VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
		tlp->threshold.deadline,
		tlp->threshold.scans,
		tlp->queue.count, 0);
}

void
timer_longterm_update(timer_longterm_t *tlp)
{
	spl_t	s = splclock();

	timer_queue_lock_spin(timer_longterm_queue);

	if (cpu_number() != master_cpu)
		panic("timer_longterm_update() on non-boot cpu");

	timer_longterm_update_locked(tlp);

	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE)
		timer_call_enter(
			&tlp->threshold.timer,
			tlp->threshold.deadline_set,
			TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);

	timer_queue_unlock(timer_longterm_queue);
	splx(s);
}

void
timer_longterm_init(void)
{
	uint32_t		longterm;
	timer_longterm_t	*tlp = &timer_longterm;

	DBG("timer_longterm_init() tlp: %p, queue: %p\n", tlp, &tlp->queue);

	/*
	 * Set the longterm timer threshold. Defaults to TIMER_LONGTERM_THRESHOLD
	 * or TIMER_LONGTERM_NONE (disabled) for server;
	 * overridden by the "longterm" boot-arg.
	 */
	tlp->threshold.interval = serverperfmode ? TIMER_LONGTERM_NONE
						 : TIMER_LONGTERM_THRESHOLD;
	if (PE_parse_boot_argn("longterm", &longterm, sizeof (longterm))) {
		tlp->threshold.interval = (longterm == 0) ?
						TIMER_LONGTERM_NONE :
						longterm * NSEC_PER_MSEC;
	}
	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
		printf("Longterm timer threshold: %llu ms\n",
			tlp->threshold.interval / NSEC_PER_MSEC);
		kprintf("Longterm timer threshold: %llu ms\n",
			tlp->threshold.interval / NSEC_PER_MSEC);
		nanoseconds_to_absolutetime(tlp->threshold.interval,
					    &tlp->threshold.interval);
		tlp->threshold.margin = tlp->threshold.interval / 10;
		tlp->threshold.latency_min = EndOfAllTime;
		tlp->threshold.latency_max = 0;
	}

	tlp->threshold.preempted = TIMER_LONGTERM_NONE;
	tlp->threshold.deadline = TIMER_LONGTERM_NONE;

	lck_attr_setdefault(&timer_longterm_lck_attr);
	lck_grp_attr_setdefault(&timer_longterm_lck_grp_attr);
	lck_grp_init(&timer_longterm_lck_grp,
		     "timer_longterm", &timer_longterm_lck_grp_attr);
	mpqueue_init(&tlp->queue,
		     &timer_longterm_lck_grp, &timer_longterm_lck_attr);

	timer_call_setup(&tlp->threshold.timer,
			 timer_longterm_callout, (timer_call_param_t) tlp);

	timer_longterm_queue = &tlp->queue;
}
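
/*
 * Boot-arg examples for the parsing above (values are milliseconds):
 *	longterm=0	disable the longterm queue entirely
 *	longterm=5000	park timers more than ~5 s out on the longterm queue
 * With no boot-arg, the threshold defaults to TIMER_LONGTERM_THRESHOLD
 * (1 s on x86_64) or is disabled when serverperfmode is set.
 */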

enum {
	THRESHOLD, QCOUNT,
	ENQUEUES, DEQUEUES, ESCALATES, SCANS, PREEMPTS,
	LATENCY, LATENCY_MIN, LATENCY_MAX
};
uint64_t
timer_sysctl_get(int oid)
{
	timer_longterm_t	*tlp = &timer_longterm;

	switch (oid) {
	case THRESHOLD:
		return (tlp->threshold.interval == TIMER_LONGTERM_NONE) ?
			0 : tlp->threshold.interval / NSEC_PER_MSEC;
	case QCOUNT:
		return tlp->queue.count;
	case ENQUEUES:
		return tlp->enqueues;
	case DEQUEUES:
		return tlp->dequeues;
	case ESCALATES:
		return tlp->escalates;
	case SCANS:
		return tlp->threshold.scans;
	case PREEMPTS:
		return tlp->threshold.preempts;
	case LATENCY:
		return tlp->threshold.latency;
	case LATENCY_MIN:
		return tlp->threshold.latency_min;
	case LATENCY_MAX:
		return tlp->threshold.latency_max;
	default:
		return 0;
	}
}

/*
 * timer_master_scan() is the inverse of timer_longterm_scan()
 * since it un-escalates timers to the longterm queue.
 */
static void
timer_master_scan(timer_longterm_t	*tlp,
		  uint64_t		now)
{
	queue_entry_t	qe;
	timer_call_t	call;
	uint64_t	threshold;
	uint64_t	deadline;
	mpqueue_head_t	*timer_master_queue;

	if (tlp->threshold.interval != TIMER_LONGTERM_NONE)
		threshold = now + tlp->threshold.interval;
	else
		threshold = TIMER_LONGTERM_NONE;

	timer_master_queue = timer_queue_cpu(master_cpu);
	timer_queue_lock_spin(timer_master_queue);

	qe = queue_first(&timer_master_queue->head);
	while (!queue_end(&timer_master_queue->head, qe)) {
		call = TIMER_CALL(qe);
		deadline = TCE(call)->deadline;
		qe = queue_next(qe);
		if ((call->flags & TIMER_CALL_LOCAL) != 0)
			continue;
		if (!simple_lock_try(&call->lock)) {
			/* case (2c) lock order inversion, dequeue only */
			timer_call_entry_dequeue_async(call);
			continue;
		}
		if (deadline > threshold) {
			/* move from master to longterm */
			timer_call_entry_dequeue(call);
			timer_call_entry_enqueue_tail(call, timer_longterm_queue);
			if (deadline < tlp->threshold.deadline) {
				tlp->threshold.deadline = deadline;
				tlp->threshold.call = call;
			}
		}
		simple_unlock(&call->lock);
	}
	timer_queue_unlock(timer_master_queue);
}

static void
timer_sysctl_set_threshold(uint64_t value)
{
	timer_longterm_t	*tlp = &timer_longterm;
	spl_t			s = splclock();
	boolean_t		threshold_increase;

	timer_queue_lock_spin(timer_longterm_queue);

	timer_call_cancel(&tlp->threshold.timer);

	/*
	 * Set the new threshold and note whether it's increasing.
	 */
	if (value == 0) {
		tlp->threshold.interval = TIMER_LONGTERM_NONE;
		threshold_increase = TRUE;
		timer_call_cancel(&tlp->threshold.timer);
	} else {
		uint64_t	old_interval = tlp->threshold.interval;
		tlp->threshold.interval = value * NSEC_PER_MSEC;
		nanoseconds_to_absolutetime(tlp->threshold.interval,
					    &tlp->threshold.interval);
		tlp->threshold.margin = tlp->threshold.interval / 10;
		if (old_interval == TIMER_LONGTERM_NONE)
			threshold_increase = FALSE;
		else
			threshold_increase = (tlp->threshold.interval > old_interval);
	}

	if (threshold_increase /* or removal */) {
		/* Escalate timers from the longterm queue */
		timer_longterm_scan(tlp, mach_absolute_time());
	} else /* decrease or addition */ {
		/*
		 * We scan the local/master queue for timers now longterm.
		 * To be strictly correct, we should scan all processor queues
		 * but timer migration results in most timers gravitating to the
		 * master processor in any case.
		 */
		timer_master_scan(tlp, mach_absolute_time());
	}

	/* Set new timer accordingly */
	tlp->threshold.deadline_set = tlp->threshold.deadline;
	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
		tlp->threshold.deadline_set -= tlp->threshold.margin;
		tlp->threshold.deadline_set -= tlp->threshold.latency;
		timer_call_enter(
			&tlp->threshold.timer,
			tlp->threshold.deadline_set,
			TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
	}

	/* Reset stats */
	tlp->enqueues = 0;
	tlp->dequeues = 0;
	tlp->escalates = 0;
	tlp->threshold.scans = 0;
	tlp->threshold.preempts = 0;
	tlp->threshold.latency = 0;
	tlp->threshold.latency_min = EndOfAllTime;
	tlp->threshold.latency_max = 0;

	timer_queue_unlock(timer_longterm_queue);
	splx(s);
}

int
timer_sysctl_set(int oid, uint64_t value)
{
	switch (oid) {
	case THRESHOLD:
		timer_call_cpu(
			master_cpu,
			(void (*)(void *)) timer_sysctl_set_threshold,
			(void *) value);
		return KERN_SUCCESS;
	default:
		return KERN_INVALID_ARGUMENT;
	}
}
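
/*
 * Sketch of the control interface above (oids are the enum values preceding
 * timer_sysctl_get(); THRESHOLD is in milliseconds, 0 meaning disabled, and
 * writes are forwarded to the master cpu; the remaining oids are read-only
 * statistics):
 *
 *	timer_sysctl_set(THRESHOLD, 2000);	set a 2 s longterm threshold
 *	timer_sysctl_get(QCOUNT);		timers now on the longterm queue
 */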


/* Select timer coalescing window based on per-task quality-of-service hints */
static boolean_t tcoal_qos_adjust(thread_t t, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited) {
	uint32_t latency_qos;
	boolean_t adjusted = FALSE;
	task_t ctask = t->task;

	if (ctask) {
		latency_qos = proc_get_effective_thread_policy(t, TASK_POLICY_LATENCY_QOS);

		assert(latency_qos <= NUM_LATENCY_QOS_TIERS);

		if (latency_qos) {
			*tshift = tcoal_prio_params.latency_qos_scale[latency_qos - 1];
			*tmax_abstime = tcoal_prio_params.latency_qos_abstime_max[latency_qos - 1];
			*pratelimited = tcoal_prio_params.latency_tier_rate_limited[latency_qos - 1];
			adjusted = TRUE;
		}
	}
	return adjusted;
}


/* Adjust timer deadlines based on priority of the thread and the
 * urgency value provided at timeout establishment. With this mechanism,
 * timers are no longer necessarily sorted in order of soft deadline
 * on a given timer queue, i.e. they may be differentially skewed.
 * In the current scheme, this could lead to fewer pending timers
 * processed than is technically possible when the HW deadline arrives.
 */
static void
timer_compute_leeway(thread_t cthread, int32_t urgency, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited) {
	int16_t tpri = cthread->sched_pri;
	if ((urgency & TIMER_CALL_USER_MASK) != 0) {
		if (tpri >= BASEPRI_RTQUEUES ||
		    urgency == TIMER_CALL_USER_CRITICAL) {
			*tshift = tcoal_prio_params.timer_coalesce_rt_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_rt_abstime_max;
			TCOAL_PRIO_STAT(rt_tcl);
		} else if (proc_get_effective_thread_policy(cthread, TASK_POLICY_DARWIN_BG) ||
		    (urgency == TIMER_CALL_USER_BACKGROUND)) {
			/* Determine if timer should be subjected to a lower QoS */
			if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
				if (*tmax_abstime > tcoal_prio_params.timer_coalesce_bg_abstime_max) {
					return;
				} else {
					*pratelimited = FALSE;
				}
			}
			*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
			TCOAL_PRIO_STAT(bg_tcl);
		} else if (tpri >= MINPRI_KERNEL) {
			*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
			TCOAL_PRIO_STAT(kt_tcl);
		} else if (cthread->sched_mode == TH_MODE_FIXED) {
			*tshift = tcoal_prio_params.timer_coalesce_fp_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_fp_abstime_max;
			TCOAL_PRIO_STAT(fp_tcl);
		} else if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
			TCOAL_PRIO_STAT(qos_tcl);
		} else if (cthread->sched_mode == TH_MODE_TIMESHARE) {
			*tshift = tcoal_prio_params.timer_coalesce_ts_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_ts_abstime_max;
			TCOAL_PRIO_STAT(ts_tcl);
		} else {
			TCOAL_PRIO_STAT(nc_tcl);
		}
	} else if (urgency == TIMER_CALL_SYS_BACKGROUND) {
		*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
		*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
		TCOAL_PRIO_STAT(bg_tcl);
	} else {
		*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
		*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
		TCOAL_PRIO_STAT(kt_tcl);
	}
}


int timer_user_idle_level;

uint64_t
timer_call_slop(uint64_t deadline, uint64_t now, uint32_t flags, thread_t cthread, boolean_t *pratelimited)
{
	int32_t tcs_shift = 0;
	uint64_t tcs_max_abstime = 0;
	uint64_t adjval;
	uint32_t urgency = (flags & TIMER_CALL_URGENCY_MASK);

	if (mach_timer_coalescing_enabled &&
	    (deadline > now) && (urgency != TIMER_CALL_SYS_CRITICAL)) {
		timer_compute_leeway(cthread, urgency, &tcs_shift, &tcs_max_abstime, pratelimited);

		if (tcs_shift >= 0)
			adjval = MIN((deadline - now) >> tcs_shift, tcs_max_abstime);
		else
			adjval = MIN((deadline - now) << (-tcs_shift), tcs_max_abstime);
		/* Apply adjustments derived from "user idle level" heuristic */
		adjval += (adjval * timer_user_idle_level) >> 7;
		return adjval;
	} else {
		return 0;
	}
}
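
/*
 * Worked example of the computation above (illustrative numbers only; the
 * shift and cap actually come from the platform's coalescing parameters):
 * for a timeshare thread with timer_coalesce_ts_shift == 3 and a deadline
 * 80 ms away, the base window is (80 ms >> 3) == 10 ms, clamped to
 * timer_coalesce_ts_abstime_max. With timer_user_idle_level == 64, the idle
 * heuristic adds (10 ms * 64) >> 7 == 5 ms, so 15 ms of slop is added to
 * the hard deadline while the soft deadline stays at the requested time.
 */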

int
timer_get_user_idle_level(void) {
	return timer_user_idle_level;
}

kern_return_t timer_set_user_idle_level(int ilevel) {
	boolean_t do_reeval = FALSE;

	if ((ilevel < 0) || (ilevel > 128))
		return KERN_INVALID_ARGUMENT;

	if (ilevel < timer_user_idle_level) {
		do_reeval = TRUE;
	}

	timer_user_idle_level = ilevel;

	if (do_reeval)
		ml_timer_evaluate();

	return KERN_SUCCESS;
}