/*
 * Copyright (c) 1993-2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Timer interrupt callout module.
 */
#include <mach/mach_types.h>

#include <kern/clock.h>
#include <kern/processor.h>
#include <kern/timer_call.h>
#include <kern/timer_queue.h>
#include <kern/thread.h>
#include <kern/policy_internal.h>

#include <sys/kdebug.h>
#define TIMER_ASSERT	1

//#define TIMER_ASSERT 1

#define DBG(x...) kprintf("DBG: " x);

#if TIMER_TRACE
#define TIMER_KDEBUG_TRACE	KERNEL_DEBUG_CONSTANT_IST
#else
#define TIMER_KDEBUG_TRACE(x...)
#endif
LCK_GRP_DECLARE(timer_call_lck_grp, "timer_call");
LCK_GRP_DECLARE(timer_longterm_lck_grp, "timer_longterm");
/* Timer queue lock must be acquired with interrupts disabled (under splclock()) */
#define timer_queue_lock_spin(queue)	\
	lck_mtx_lock_spin_always(&queue->lock_data)

#define timer_queue_unlock(queue)	\
	lck_mtx_unlock_always(&queue->lock_data)
/*
 * The longterm timer object is a global structure holding all timers
 * beyond the short-term, local timer queue threshold. The boot processor
 * is responsible for moving each timer to its local timer queue
 * if and when that timer becomes due within the threshold.
 */
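
/*
 * Illustrative sketch (not compiled): a caller that arms a callout with a
 * deadline beyond TIMER_LONGTERM_THRESHOLD ends up on this global longterm
 * queue rather than on a per-CPU queue; the boot processor later escalates
 * it when it comes due. The callback and variable names below are
 * hypothetical examples, not part of this file.
 */
#if 0
static timer_call_data_t example_call;

static void
example_callout(timer_call_param_t p0, timer_call_param_t p1)
{
	/* runs once the deadline has passed */
}

static void
example_arm_longterm_timer(void)
{
	uint64_t five_seconds, deadline;

	timer_call_setup(&example_call, example_callout, NULL);
	nanoseconds_to_absolutetime(5 * NSEC_PER_SEC, &five_seconds);
	deadline = mach_absolute_time() + five_seconds;
	/* 5s exceeds the 1s threshold on x86_64, so this lands on the longterm queue */
	timer_call_enter(&example_call, deadline, TIMER_CALL_SYS_NORMAL);
}
#endif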
/* Sentinel for "no time set": */
#define TIMER_LONGTERM_NONE		EndOfAllTime

/* The default threshold is the delta above which a timer is "long-term" */
#if defined(__x86_64__)
#define TIMER_LONGTERM_THRESHOLD	(1ULL * NSEC_PER_SEC)	/* 1 sec */
#else
#define TIMER_LONGTERM_THRESHOLD	TIMER_LONGTERM_NONE	/* disabled */
#endif
/*
 * The scan_limit throttles processing of the longterm queue.
 * If the scan time exceeds this limit, we terminate, unlock
 * and defer for scan_interval. This prevents unbounded holding of
 * timer queue locks with interrupts masked.
 */
#define TIMER_LONGTERM_SCAN_LIMIT	(100ULL * NSEC_PER_USEC)	/* 100 us */
#define TIMER_LONGTERM_SCAN_INTERVAL	(100ULL * NSEC_PER_USEC)	/* 100 us */

/* Sentinel for "scan limit exceeded": */
#define TIMER_LONGTERM_SCAN_AGAIN	0
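
/*
 * Illustrative sketch (not compiled) of the pattern used by the longterm
 * scan below: bound the time spent under the queue lock and, if the budget
 * is exhausted, stop and retry one scan_interval later.
 */
#if 0
static void
example_bounded_scan(uint64_t scan_start)
{
	uint64_t limit = scan_start + TIMER_LONGTERM_SCAN_LIMIT;

	/* for each timer on the longterm queue ... */
	if (mach_absolute_time() > limit) {
		/* stop here and reschedule the scan TIMER_LONGTERM_SCAN_INTERVAL later */
	}
}
#endif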
typedef struct {
	uint64_t	interval;	/* longterm timer interval */
	uint64_t	margin;		/* fudge factor (10% of interval) */
	uint64_t	deadline;	/* first/soonest longterm deadline */
	uint64_t	preempted;	/* sooner timer has pre-empted */
	timer_call_t	call;		/* first/soonest longterm timer call */
	uint64_t	deadline_set;	/* next timer set */
	timer_call_data_t timer;	/* timer used by threshold management */

	uint64_t	scans;		/* num threshold timer scans */
	uint64_t	preempts;	/* num threshold reductions */
	uint64_t	latency;	/* average threshold latency */
	uint64_t	latency_min;	/* minimum threshold latency */
	uint64_t	latency_max;	/* maximum threshold latency */
} threshold_t;

typedef struct {
	mpqueue_head_t	queue;		/* longterm timer list */
	uint64_t	enqueues;	/* num timers queued */
	uint64_t	dequeues;	/* num timers dequeued */
	uint64_t	escalates;	/* num timers becoming shortterm */
	uint64_t	scan_time;	/* last time the list was scanned */
	threshold_t	threshold;	/* longterm timer threshold */
	uint64_t	scan_limit;	/* maximum scan time */
	uint64_t	scan_interval;	/* interval between LT "escalation" scans */
	uint64_t	scan_pauses;	/* num scans exceeding time limit */
} timer_longterm_t;
timer_longterm_t timer_longterm = {
	.scan_limit = TIMER_LONGTERM_SCAN_LIMIT,
	.scan_interval = TIMER_LONGTERM_SCAN_INTERVAL,
};

static mpqueue_head_t *timer_longterm_queue = NULL;
static void timer_longterm_init(void);
static void timer_longterm_callout(
	timer_call_param_t	p0,
	timer_call_param_t	p1);
extern void timer_longterm_scan(
	timer_longterm_t	*tlp,
	uint64_t		now);
static void timer_longterm_update(
	timer_longterm_t	*tlp);
static void timer_longterm_update_locked(
	timer_longterm_t	*tlp);
static mpqueue_head_t * timer_longterm_enqueue_unlocked(
	timer_call_t		call,
	uint64_t		now,
	uint64_t		deadline,
	mpqueue_head_t		**old_queue,
	uint64_t		soft_deadline,
	uint64_t		ttd,
	timer_call_param_t	param1,
	uint32_t		callout_flags);
static void timer_longterm_dequeued_locked(
	timer_call_t		call);

uint64_t past_deadline_timers;
uint64_t past_deadline_deltas;
uint64_t past_deadline_longest;
uint64_t past_deadline_shortest = ~0ULL;

enum {PAST_DEADLINE_TIMER_ADJUSTMENT_NS = 10 * 1000};

uint64_t past_deadline_timer_adjustment;

static boolean_t timer_call_enter_internal(timer_call_t call,
    timer_call_param_t param1, uint64_t deadline, uint64_t leeway,
    uint32_t flags, boolean_t ratelimited);

boolean_t mach_timer_coalescing_enabled = TRUE;

mpqueue_head_t *timer_call_enqueue_deadline_unlocked(
	timer_call_t		call,
	mpqueue_head_t		*queue,
	uint64_t		deadline,
	uint64_t		soft_deadline,
	uint64_t		ttd,
	timer_call_param_t	param1,
	uint32_t		callout_flags);

mpqueue_head_t *timer_call_dequeue_unlocked(
	timer_call_t		call);

timer_coalescing_priority_params_t tcoal_prio_params;

#if TCOAL_PRIO_STATS
int32_t nc_tcl, rt_tcl, bg_tcl, kt_tcl, fp_tcl, ts_tcl, qos_tcl;
#define TCOAL_PRIO_STAT(x) (x++)
#else
#define TCOAL_PRIO_STAT(x)
#endif
void
timer_call_init_abstime(void)
{
	int i;
	uint64_t result;
	timer_coalescing_priority_params_ns_t *tcoal_prio_params_init =
	    timer_call_get_priority_params();

	nanoseconds_to_absolutetime(PAST_DEADLINE_TIMER_ADJUSTMENT_NS,
	    &past_deadline_timer_adjustment);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->idle_entry_timer_processing_hdeadline_threshold_ns,
	    &result);
	tcoal_prio_params.idle_entry_timer_processing_hdeadline_threshold_abstime = (uint32_t)result;
	nanoseconds_to_absolutetime(tcoal_prio_params_init->interrupt_timer_coalescing_ilat_threshold_ns,
	    &result);
	tcoal_prio_params.interrupt_timer_coalescing_ilat_threshold_abstime = (uint32_t)result;
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_resort_threshold_ns,
	    &result);
	tcoal_prio_params.timer_resort_threshold_abstime = (uint32_t)result;
	tcoal_prio_params.timer_coalesce_rt_shift = tcoal_prio_params_init->timer_coalesce_rt_shift;
	tcoal_prio_params.timer_coalesce_bg_shift = tcoal_prio_params_init->timer_coalesce_bg_shift;
	tcoal_prio_params.timer_coalesce_kt_shift = tcoal_prio_params_init->timer_coalesce_kt_shift;
	tcoal_prio_params.timer_coalesce_fp_shift = tcoal_prio_params_init->timer_coalesce_fp_shift;
	tcoal_prio_params.timer_coalesce_ts_shift = tcoal_prio_params_init->timer_coalesce_ts_shift;

	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_rt_ns_max,
	    &tcoal_prio_params.timer_coalesce_rt_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_bg_ns_max,
	    &tcoal_prio_params.timer_coalesce_bg_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_kt_ns_max,
	    &tcoal_prio_params.timer_coalesce_kt_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_fp_ns_max,
	    &tcoal_prio_params.timer_coalesce_fp_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_ts_ns_max,
	    &tcoal_prio_params.timer_coalesce_ts_abstime_max);

	for (i = 0; i < NUM_LATENCY_QOS_TIERS; i++) {
		tcoal_prio_params.latency_qos_scale[i] = tcoal_prio_params_init->latency_qos_scale[i];
		nanoseconds_to_absolutetime(tcoal_prio_params_init->latency_qos_ns_max[i],
		    &tcoal_prio_params.latency_qos_abstime_max[i]);
		tcoal_prio_params.latency_tier_rate_limited[i] = tcoal_prio_params_init->latency_tier_rate_limited[i];
	}
}
void
timer_call_init(void)
{
	timer_longterm_init();
	timer_call_init_abstime();
}
void
timer_call_queue_init(mpqueue_head_t *queue)
{
	DBG("timer_call_queue_init(%p)\n", queue);
	mpqueue_init(queue, &timer_call_lck_grp, LCK_ATTR_NULL);
}
void
timer_call_setup(
	timer_call_t		call,
	timer_call_func_t	func,
	timer_call_param_t	param0)
{
	DBG("timer_call_setup(%p,%p,%p)\n", call, func, param0);

	*call = (struct timer_call) {
		.tc_func = func,
		.tc_param0 = param0,
		.tc_async_dequeue = false,
	};

	simple_lock_init(&(call)->tc_lock, 0);
}
static mpqueue_head_t *
mpqueue_for_timer_call(timer_call_t entry)
{
	queue_t queue_entry_is_on = entry->tc_queue;
	/* 'cast' the queue back to the original mpqueue */
	return __container_of(queue_entry_is_on, struct mpqueue_head, head);
}
static __inline__ mpqueue_head_t *
timer_call_entry_dequeue(
	timer_call_t		entry)
{
	mpqueue_head_t	*old_mpqueue = mpqueue_for_timer_call(entry);

	/* The entry was always on a queue */
	assert(old_mpqueue != NULL);

#if TIMER_ASSERT
	if (!hw_lock_held((hw_lock_t)&entry->tc_lock)) {
		panic("_call_entry_dequeue() "
		    "entry %p is not locked\n", entry);
	}

	/*
	 * XXX The queue lock is actually a mutex in spin mode
	 * but there's no way to test for it being held
	 * so we pretend it's a spinlock!
	 */
	if (!hw_lock_held((hw_lock_t)&old_mpqueue->lock_data)) {
		panic("_call_entry_dequeue() "
		    "queue %p is not locked\n", old_mpqueue);
	}
#endif /* TIMER_ASSERT */

	if (old_mpqueue != timer_longterm_queue) {
		priority_queue_remove(&old_mpqueue->mpq_pqhead,
		    &entry->tc_pqlink);
	}

	remqueue(&entry->tc_qlink);

	entry->tc_queue = NULL;

	old_mpqueue->count--;

	return old_mpqueue;
}
static __inline__ mpqueue_head_t *
timer_call_entry_enqueue_deadline(
	timer_call_t		entry,
	mpqueue_head_t		*new_mpqueue,
	uint64_t		deadline)
{
	mpqueue_head_t	*old_mpqueue = mpqueue_for_timer_call(entry);

#if TIMER_ASSERT
	if (!hw_lock_held((hw_lock_t)&entry->tc_lock)) {
		panic("_call_entry_enqueue_deadline() "
		    "entry %p is not locked\n", entry);
	}

	/* XXX More lock pretense: */
	if (!hw_lock_held((hw_lock_t)&new_mpqueue->lock_data)) {
		panic("_call_entry_enqueue_deadline() "
		    "queue %p is not locked\n", new_mpqueue);
	}

	if (old_mpqueue != NULL && old_mpqueue != new_mpqueue) {
		panic("_call_entry_enqueue_deadline() "
		    "old_mpqueue %p != new_mpqueue", old_mpqueue);
	}
#endif /* TIMER_ASSERT */

	/* no longterm queue involved */
	assert(new_mpqueue != timer_longterm_queue);
	assert(old_mpqueue != timer_longterm_queue);

	if (old_mpqueue == new_mpqueue) {
		/* optimize the same-queue case to avoid a full re-insert */
		uint64_t old_deadline = entry->tc_pqlink.deadline;
		entry->tc_pqlink.deadline = deadline;

		if (old_deadline < deadline) {
			priority_queue_entry_increased(&new_mpqueue->mpq_pqhead,
			    &entry->tc_pqlink);
		} else {
			priority_queue_entry_decreased(&new_mpqueue->mpq_pqhead,
			    &entry->tc_pqlink);
		}
	} else {
		if (old_mpqueue != NULL) {
			priority_queue_remove(&old_mpqueue->mpq_pqhead,
			    &entry->tc_pqlink);

			re_queue_tail(&new_mpqueue->head, &entry->tc_qlink);
		} else {
			enqueue_tail(&new_mpqueue->head, &entry->tc_qlink);
		}

		entry->tc_queue = &new_mpqueue->head;
		entry->tc_pqlink.deadline = deadline;

		priority_queue_insert(&new_mpqueue->mpq_pqhead, &entry->tc_pqlink);
	}

	/* For efficiency, track the earliest soft deadline on the queue,
	 * so that fuzzy decisions can be made without lock acquisitions.
	 */

	timer_call_t thead = priority_queue_min(&new_mpqueue->mpq_pqhead, struct timer_call, tc_pqlink);

	new_mpqueue->earliest_soft_deadline = thead->tc_flags & TIMER_CALL_RATELIMITED ?
	    thead->tc_pqlink.deadline : thead->tc_soft_deadline;

	if (old_mpqueue) {
		old_mpqueue->count--;
	}
	new_mpqueue->count++;

	return new_mpqueue;
}
static __inline__ void
timer_call_entry_enqueue_tail(
	timer_call_t		entry,
	mpqueue_head_t		*queue)
{
	/* entry is always dequeued before this call */
	assert(entry->tc_queue == NULL);

	/*
	 * this is only used for timer_longterm_queue, which is unordered
	 * and thus needs no priority queueing
	 */
	assert(queue == timer_longterm_queue);

	enqueue_tail(&queue->head, &entry->tc_qlink);

	entry->tc_queue = &queue->head;

	queue->count++;
}
/*
 * Remove timer entry from its queue but don't change the queue pointer
 * and set the async_dequeue flag. This is locking case 2b.
 */
static __inline__ void
timer_call_entry_dequeue_async(
	timer_call_t		entry)
{
	mpqueue_head_t	*old_mpqueue = mpqueue_for_timer_call(entry);
	if (old_mpqueue) {
		old_mpqueue->count--;

		if (old_mpqueue != timer_longterm_queue) {
			priority_queue_remove(&old_mpqueue->mpq_pqhead,
			    &entry->tc_pqlink);
		}

		remqueue(&entry->tc_qlink);
		entry->tc_async_dequeue = true;
	}
}
unsigned timer_call_enqueue_deadline_unlocked_async1;
unsigned timer_call_enqueue_deadline_unlocked_async2;
/*
 * Assumes call_entry and queues unlocked, interrupts disabled.
 */
__inline__ mpqueue_head_t *
timer_call_enqueue_deadline_unlocked(
	timer_call_t		call,
	mpqueue_head_t		*queue,
	uint64_t		deadline,
	uint64_t		soft_deadline,
	uint64_t		ttd,
	timer_call_param_t	param1,
	uint32_t		callout_flags)
{
	DBG("timer_call_enqueue_deadline_unlocked(%p,%p,)\n", call, queue);

	simple_lock(&call->tc_lock, LCK_GRP_NULL);

	mpqueue_head_t	*old_queue = mpqueue_for_timer_call(call);

	if (old_queue != NULL) {
		timer_queue_lock_spin(old_queue);
		if (call->tc_async_dequeue) {
			/* collision (1c): timer already dequeued, clear flag */
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_async_dequeue,
			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
			    0x1c, 0);
			timer_call_enqueue_deadline_unlocked_async1++;

			call->tc_async_dequeue = false;
			call->tc_queue = NULL;
		} else if (old_queue != queue) {
			timer_call_entry_dequeue(call);
			timer_call_enqueue_deadline_unlocked_async2++;
		}
		if (old_queue == timer_longterm_queue) {
			timer_longterm_dequeued_locked(call);
		}
		if (old_queue != queue) {
			timer_queue_unlock(old_queue);
			timer_queue_lock_spin(queue);
		}
	} else {
		timer_queue_lock_spin(queue);
	}

	call->tc_soft_deadline = soft_deadline;
	call->tc_flags = callout_flags;
	call->tc_param1 = param1;
	call->tc_ttd = ttd;

	timer_call_entry_enqueue_deadline(call, queue, deadline);
	timer_queue_unlock(queue);
	simple_unlock(&call->tc_lock);

	return old_queue;
}
unsigned timer_call_dequeue_unlocked_async1;
unsigned timer_call_dequeue_unlocked_async2;
mpqueue_head_t *
timer_call_dequeue_unlocked(
	timer_call_t		call)
{
	DBG("timer_call_dequeue_unlocked(%p)\n", call);

	simple_lock(&call->tc_lock, LCK_GRP_NULL);

	mpqueue_head_t	*old_queue = mpqueue_for_timer_call(call);

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    call->tc_async_dequeue,
	    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
	    0, 0);

	if (old_queue != NULL) {
		timer_queue_lock_spin(old_queue);
		if (call->tc_async_dequeue) {
			/* collision (1c): timer already dequeued, clear flag */
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_async_dequeue,
			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
			    0x1c, 0);
			timer_call_dequeue_unlocked_async1++;

			call->tc_async_dequeue = false;
			call->tc_queue = NULL;
		} else {
			timer_call_entry_dequeue(call);
		}
		if (old_queue == timer_longterm_queue) {
			timer_longterm_dequeued_locked(call);
		}
		timer_queue_unlock(old_queue);
	}
	simple_unlock(&call->tc_lock);

	return old_queue;
}
static uint64_t
timer_call_past_deadline_timer_handle(uint64_t deadline, uint64_t ctime)
{
	uint64_t delta = (ctime - deadline);

	past_deadline_timers++;
	past_deadline_deltas += delta;
	if (delta > past_deadline_longest) {
		past_deadline_longest = delta;
	}
	if (delta < past_deadline_shortest) {
		past_deadline_shortest = delta;
	}

	return ctime + past_deadline_timer_adjustment;
}
/*
 * Timer call entry locking model
 * ==============================
 *
 * Timer call entries are linked on per-cpu timer queues which are protected
 * by the queue lock and the call entry lock. The locking protocol is:
 *
 * 0) The canonical locking order is timer call entry followed by queue.
 *
 * 1) With only the entry lock held, entry.queue is valid:
 *    1a) NULL: the entry is not queued, or
 *    1b) non-NULL: this queue must be locked before the entry is modified.
 *        After locking the queue, the call.async_dequeue flag must be checked:
 *    1c) TRUE: the entry was removed from the queue by another thread
 *        and we must NULL the entry.queue and reset this flag, or
 *    1d) FALSE: (i.e. queued), the entry can be manipulated.
 *
 * 2) If a queue lock is obtained first, the queue is stable:
 *    2a) If a try-lock of a queued entry succeeds, the call can be operated on
 *        and dequeued.
 *    2b) If a try-lock fails, it indicates that another thread is attempting
 *        to change the entry and move it to a different position in this queue
 *        or to a different queue. The entry can be dequeued but it should not be
 *        operated upon since it is being changed. Furthermore, we don't null
 *        the entry.queue pointer (protected by the entry lock we don't own).
 *        Instead, we set the async_dequeue flag -- see (1c).
 *    2c) Same as 2b but occurring when a longterm timer is matured.
 * 3) A callout's parameters (deadline, flags, parameters, soft deadline &c.)
 *    should be manipulated with the appropriate timer queue lock held,
 *    to prevent queue traversals from observing inconsistent
 *    updates to an in-flight callout.
 */
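
/*
 * Illustrative sketch (not compiled) of locking case 1 above: with the entry
 * lock held, the queue pointer may be consulted, the queue locked, and the
 * async_dequeue flag then resolved. This mirrors what
 * timer_call_enqueue_deadline_unlocked() and timer_call_dequeue_unlocked()
 * do; the function name is a hypothetical example.
 */
#if 0
static void
example_lock_protocol(timer_call_t call)
{
	simple_lock(&call->tc_lock, LCK_GRP_NULL);	/* entry lock first (rule 0) */

	mpqueue_head_t *queue = mpqueue_for_timer_call(call);
	if (queue != NULL) {				/* case 1b */
		timer_queue_lock_spin(queue);
		if (call->tc_async_dequeue) {		/* case 1c */
			call->tc_async_dequeue = false;
			call->tc_queue = NULL;
		} else {				/* case 1d: safe to manipulate */
			/* ... dequeue or re-position the entry ... */
		}
		timer_queue_unlock(queue);
	}

	simple_unlock(&call->tc_lock);
}
#endif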
/*
 * In the debug case, we assert that the timer call locking protocol
 * is being honored.
 */
static boolean_t
timer_call_enter_internal(
	timer_call_t		call,
	timer_call_param_t	param1,
	uint64_t		deadline,
	uint64_t		leeway,
	uint32_t		flags,
	boolean_t		ratelimited)
{
	mpqueue_head_t	*queue = NULL;
	mpqueue_head_t	*old_queue;
	spl_t		s;
	uint64_t	slop;
	uint32_t	urgency;
	uint64_t	sdeadline, ttd;

	assert(call->tc_func != NULL);
	s = splclock();

	sdeadline = deadline;
	uint64_t ctime = mach_absolute_time();

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_ENTER | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    VM_KERNEL_ADDRHIDE(param1), deadline, flags, 0);

	urgency = (flags & TIMER_CALL_URGENCY_MASK);

	boolean_t slop_ratelimited = FALSE;
	slop = timer_call_slop(deadline, ctime, urgency, current_thread(), &slop_ratelimited);

	if ((flags & TIMER_CALL_LEEWAY) != 0 && leeway > slop) {
		slop = leeway;
	}

	if (UINT64_MAX - deadline <= slop) {
		deadline = UINT64_MAX;
	} else {
		deadline += slop;
	}

	if (__improbable(deadline < ctime)) {
		deadline = timer_call_past_deadline_timer_handle(deadline, ctime);
		sdeadline = deadline;
	}

	if (ratelimited || slop_ratelimited) {
		flags |= TIMER_CALL_RATELIMITED;
	} else {
		flags &= ~TIMER_CALL_RATELIMITED;
	}

	ttd = sdeadline - ctime;

	DTRACE_TMR7(callout__create, timer_call_func_t, call->tc_func,
	    timer_call_param_t, call->tc_param0, uint32_t, flags,
	    (deadline - sdeadline),
	    (ttd >> 32), (unsigned) (ttd & 0xFFFFFFFF), call);

	/* Program timer callout parameters under the appropriate per-CPU or
	 * longterm queue lock. The callout may have been previously enqueued
	 * and in-flight on this or another timer queue.
	 */
	if (!ratelimited && !slop_ratelimited) {
		queue = timer_longterm_enqueue_unlocked(call, ctime, deadline, &old_queue, sdeadline, ttd, param1, flags);
	}

	if (queue == NULL) {
		queue = timer_queue_assign(deadline);
		old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline, sdeadline, ttd, param1, flags);
	}

	call->tc_entry_time = ctime;

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_ENTER | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    (old_queue != NULL), deadline, queue->count, 0);

	splx(s);

	return old_queue != NULL;
}
/*
 * return boolean indicating whether the call was previously queued.
 */
boolean_t
timer_call_enter(
	timer_call_t		call,
	uint64_t		deadline,
	uint32_t		flags)
{
	return timer_call_enter_internal(call, NULL, deadline, 0, flags, FALSE);
}
boolean_t
timer_call_enter1(
	timer_call_t		call,
	timer_call_param_t	param1,
	uint64_t		deadline,
	uint32_t		flags)
{
	return timer_call_enter_internal(call, param1, deadline, 0, flags, FALSE);
}
boolean_t
timer_call_enter_with_leeway(
	timer_call_t		call,
	timer_call_param_t	param1,
	uint64_t		deadline,
	uint64_t		leeway,
	uint32_t		flags,
	boolean_t		ratelimited)
{
	return timer_call_enter_internal(call, param1, deadline, leeway, flags, ratelimited);
}
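
/*
 * Illustrative sketch (not compiled): typical use of the enter API above.
 * The callout, variable, and function names are hypothetical examples.
 */
#if 0
static timer_call_data_t example_timeout;

static void
example_timeout_fired(timer_call_param_t p0, timer_call_param_t p1)
{
	/* invoked once the (soft) deadline has passed */
}

static void
example_schedule(void *context)
{
	uint64_t interval, leeway, deadline;

	timer_call_setup(&example_timeout, example_timeout_fired, context);

	nanoseconds_to_absolutetime(10 * NSEC_PER_MSEC, &interval);
	nanoseconds_to_absolutetime(1 * NSEC_PER_MSEC, &leeway);
	deadline = mach_absolute_time() + interval;

	/* re-entering an already-armed call simply reprograms its deadline */
	timer_call_enter_with_leeway(&example_timeout, NULL, deadline, leeway,
	    TIMER_CALL_SYS_NORMAL | TIMER_CALL_LEEWAY, FALSE);
}
#endif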
boolean_t
timer_call_cancel(
	timer_call_t		call)
{
	mpqueue_head_t	*old_queue;
	spl_t s = splclock();

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_CANCEL | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    call->tc_pqlink.deadline, call->tc_soft_deadline, call->tc_flags, 0);

	old_queue = timer_call_dequeue_unlocked(call);

	if (old_queue != NULL) {
		timer_queue_lock_spin(old_queue);

		timer_call_t new_head = priority_queue_min(&old_queue->mpq_pqhead, struct timer_call, tc_pqlink);

		if (new_head) {
			timer_queue_cancel(old_queue, call->tc_pqlink.deadline, new_head->tc_pqlink.deadline);
			old_queue->earliest_soft_deadline = new_head->tc_flags & TIMER_CALL_RATELIMITED ?
			    new_head->tc_pqlink.deadline : new_head->tc_soft_deadline;
		} else {
			timer_queue_cancel(old_queue, call->tc_pqlink.deadline, UINT64_MAX);
			old_queue->earliest_soft_deadline = UINT64_MAX;
		}

		timer_queue_unlock(old_queue);
	}

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_CANCEL | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    VM_KERNEL_UNSLIDE_OR_PERM(old_queue),
	    call->tc_pqlink.deadline - mach_absolute_time(),
	    call->tc_pqlink.deadline - call->tc_entry_time, 0);

	splx(s);

#if CONFIG_DTRACE
	DTRACE_TMR6(callout__cancel, timer_call_func_t, call->tc_func,
	    timer_call_param_t, call->tc_param0, uint32_t, call->tc_flags, 0,
	    (call->tc_ttd >> 32), (unsigned) (call->tc_ttd & 0xFFFFFFFF));
#endif /* CONFIG_DTRACE */

	return old_queue != NULL;
}
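
/*
 * Illustrative sketch (not compiled): timer_call_cancel() only reports
 * whether the call was still queued. A FALSE return can mean the callout
 * already expired and may have run (or be running); callers that need
 * "definitely not running" semantics must synchronize with their own state.
 * The function name below is a hypothetical example.
 */
#if 0
static boolean_t
example_disarm(timer_call_t call)
{
	boolean_t was_queued = timer_call_cancel(call);
	if (!was_queued) {
		/* callout may have fired or be in flight; reconcile separately */
	}
	return was_queued;
}
#endif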
static uint32_t timer_queue_shutdown_lock_skips;
static uint32_t timer_queue_shutdown_discarded;
void
timer_queue_shutdown(
	mpqueue_head_t		*queue)
{
	timer_call_t	call;
	mpqueue_head_t	*new_queue;
	spl_t		s;

	DBG("timer_queue_shutdown(%p)\n", queue);

	s = splclock();

	while (TRUE) {
		timer_queue_lock_spin(queue);

		call = qe_queue_first(&queue->head, struct timer_call, tc_qlink);

		if (call == NULL) {
			break;
		}

		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
			/*
			 * case (2b) lock order inversion, dequeue and skip
			 * Don't change the call_entry queue back-pointer
			 * but set the async_dequeue field.
			 */
			timer_queue_shutdown_lock_skips++;
			timer_call_entry_dequeue_async(call);

			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_async_dequeue,
			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
			    0x2b, 0);

			timer_queue_unlock(queue);
			continue;
		}

		boolean_t call_local = ((call->tc_flags & TIMER_CALL_LOCAL) != 0);

		/* remove entry from old queue */
		timer_call_entry_dequeue(call);
		timer_queue_unlock(queue);

		if (call_local == FALSE) {
			/* and queue it on new, discarding LOCAL timers */
			new_queue = timer_queue_assign(call->tc_pqlink.deadline);
			timer_queue_lock_spin(new_queue);
			timer_call_entry_enqueue_deadline(
				call, new_queue, call->tc_pqlink.deadline);
			timer_queue_unlock(new_queue);
		} else {
			timer_queue_shutdown_discarded++;
		}

		assert(call_local == FALSE);
		simple_unlock(&call->tc_lock);
	}

	timer_queue_unlock(queue);
	splx(s);
}
static uint32_t timer_queue_expire_lock_skips;
uint64_t
timer_queue_expire_with_options(
	mpqueue_head_t		*queue,
	uint64_t		deadline,
	boolean_t		rescan)
{
	timer_call_t	call = NULL;
	uint32_t	tc_iterations = 0;
	DBG("timer_queue_expire(%p,)\n", queue);

	/* 'rescan' means look at every timer in the list, instead of
	 * early-exiting when the head of the list expires in the future.
	 * when 'rescan' is true, iterate by linked list instead of priority queue.
	 *
	 * TODO: if we keep a deadline ordered and soft-deadline ordered
	 * priority queue, then it's no longer necessary to do that
	 */

	uint64_t cur_deadline = deadline;
	timer_queue_lock_spin(queue);

	while (!queue_empty(&queue->head)) {
		/* Upon processing one or more timer calls, refresh the
		 * deadline to account for time elapsed in the callout
		 */
		if (++tc_iterations > 1) {
			cur_deadline = mach_absolute_time();
		}

		if (rescan == FALSE) {
			call = priority_queue_min(&queue->mpq_pqhead, struct timer_call, tc_pqlink);
		} else {
			call = qe_queue_first(&queue->head, struct timer_call, tc_qlink);
		}

		if (call->tc_soft_deadline <= cur_deadline) {
			timer_call_func_t	func;
			timer_call_param_t	param0, param1;

			TCOAL_DEBUG(0xDDDD0000, queue->earliest_soft_deadline, call->tc_soft_deadline, 0, 0, 0);
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_EXPIRE | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_soft_deadline,
			    call->tc_pqlink.deadline,
			    call->tc_entry_time, 0);

			if ((call->tc_flags & TIMER_CALL_RATELIMITED) &&
			    (call->tc_pqlink.deadline > cur_deadline)) {
				if (rescan == FALSE) {
					break;
				}
			}

			if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
				/* case (2b) lock inversion, dequeue and skip */
				timer_queue_expire_lock_skips++;
				timer_call_entry_dequeue_async(call);
				call = NULL;
				continue;
			}

			timer_call_entry_dequeue(call);

			func = call->tc_func;
			param0 = call->tc_param0;
			param1 = call->tc_param1;

			simple_unlock(&call->tc_lock);
			timer_queue_unlock(queue);

			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_CALLOUT | DBG_FUNC_START,
			    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
			    VM_KERNEL_ADDRHIDE(param0),
			    VM_KERNEL_ADDRHIDE(param1),
			    0);

			DTRACE_TMR7(callout__start, timer_call_func_t, func,
			    timer_call_param_t, param0, unsigned, call->tc_flags,
			    0, (call->tc_ttd >> 32),
			    (unsigned) (call->tc_ttd & 0xFFFFFFFF), call);

			/* Maintain time-to-deadline in per-processor data
			 * structure for thread wakeup deadline statistics.
			 */
			uint64_t *ttdp = &current_processor()->timer_call_ttd;
			*ttdp = call->tc_ttd;

			(*func)(param0, param1);

			DTRACE_TMR4(callout__end, timer_call_func_t, func,
			    param0, param1, call);

			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_CALLOUT | DBG_FUNC_END,
			    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
			    VM_KERNEL_ADDRHIDE(param0),
			    VM_KERNEL_ADDRHIDE(param1),
			    0);

			timer_queue_lock_spin(queue);
		} else {
			if (__probable(rescan == FALSE)) {
				break;
			} else {
				int64_t skew = call->tc_pqlink.deadline - call->tc_soft_deadline;
				assert(call->tc_pqlink.deadline >= call->tc_soft_deadline);

				/* DRK: On a latency quality-of-service level change,
				 * re-sort potentially rate-limited timers. The platform
				 * layer determines which timers require
				 * this. In the absence of the per-callout
				 * synchronization requirement, a global resort could
				 * be more efficient. The re-sort effectively
				 * annuls all timer adjustments, i.e. the "soft
				 * deadline" is the sort key.
				 */

				if (timer_resort_threshold(skew)) {
					if (__probable(simple_lock_try(&call->tc_lock, LCK_GRP_NULL))) {
						/* TODO: don't need to dequeue before enqueue */
						timer_call_entry_dequeue(call);
						timer_call_entry_enqueue_deadline(call, queue, call->tc_soft_deadline);
						simple_unlock(&call->tc_lock);
						call = NULL;
					}
				}
				if (call) {
					call = qe_queue_next(&queue->head, call, struct timer_call, tc_qlink);

					if (call == NULL) {
						break;
					}
				}
			}
		}
	}

	if (!queue_empty(&queue->head)) {
		call = priority_queue_min(&queue->mpq_pqhead, struct timer_call, tc_pqlink);

		cur_deadline = call->tc_pqlink.deadline;
		queue->earliest_soft_deadline = (call->tc_flags & TIMER_CALL_RATELIMITED) ? call->tc_pqlink.deadline : call->tc_soft_deadline;
	} else {
		queue->earliest_soft_deadline = cur_deadline = UINT64_MAX;
	}

	timer_queue_unlock(queue);

	return cur_deadline;
}
uint64_t
timer_queue_expire(
	mpqueue_head_t		*queue,
	uint64_t		deadline)
{
	return timer_queue_expire_with_options(queue, deadline, FALSE);
}
extern int serverperfmode;
static uint32_t timer_queue_migrate_lock_skips;
/*
 * timer_queue_migrate() is called by timer_queue_migrate_cpu()
 * to move timer requests from the local processor (queue_from)
 * to a target processor's (queue_to).
 */
int
timer_queue_migrate(mpqueue_head_t *queue_from, mpqueue_head_t *queue_to)
{
	timer_call_t	call;
	timer_call_t	head_to;
	int		timers_migrated = 0;

	DBG("timer_queue_migrate(%p,%p)\n", queue_from, queue_to);

	assert(!ml_get_interrupts_enabled());
	assert(queue_from != queue_to);

	if (serverperfmode) {
		/*
		 * if we're running a high end server
		 * avoid migrations... they add latency
		 * and don't save us power under typical
		 * server loads
		 */
		return -4;
	}

	/*
	 * Take both local (from) and target (to) timer queue locks while
	 * moving the timers from the local queue to the target processor.
	 * We assume that the target is always the boot processor.
	 * But only move if all of the following is true:
	 *  - the target queue is non-empty
	 *  - the local queue is non-empty
	 *  - the local queue's first deadline is later than the target's
	 *  - the local queue contains no non-migrateable "local" call
	 * so that we need not have the target resync.
	 */

	timer_queue_lock_spin(queue_to);

	head_to = priority_queue_min(&queue_to->mpq_pqhead, struct timer_call, tc_pqlink);

	if (head_to == NULL) {
		timers_migrated = -1;
		goto abort1;
	}

	timer_queue_lock_spin(queue_from);

	call = priority_queue_min(&queue_from->mpq_pqhead, struct timer_call, tc_pqlink);

	if (call == NULL) {
		timers_migrated = -2;
		goto abort2;
	}

	if (call->tc_pqlink.deadline < head_to->tc_pqlink.deadline) {
		timers_migrated = 0;
		goto abort2;
	}

	/* perform scan for non-migratable timers */
	qe_foreach_element(call, &queue_from->head, tc_qlink) {
		if (call->tc_flags & TIMER_CALL_LOCAL) {
			timers_migrated = -3;
			goto abort2;
		}
	}

	/* migration loop itself -- both queues are locked */
	qe_foreach_element_safe(call, &queue_from->head, tc_qlink) {
		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
			/* case (2b) lock order inversion, dequeue only */
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
			    0, 0x2b, 0);
			timer_queue_migrate_lock_skips++;
			timer_call_entry_dequeue_async(call);
			continue;
		}
		timer_call_entry_dequeue(call);
		timer_call_entry_enqueue_deadline(
			call, queue_to, call->tc_pqlink.deadline);
		timers_migrated++;
		simple_unlock(&call->tc_lock);
	}
	queue_from->earliest_soft_deadline = UINT64_MAX;
abort2:
	timer_queue_unlock(queue_from);
abort1:
	timer_queue_unlock(queue_to);

	return timers_migrated;
}
void
timer_queue_trace_cpu(int ncpu)
{
	timer_call_nosync_cpu(
		ncpu,
		(void (*)(void *))timer_queue_trace,
		(void*) timer_queue_cpu(ncpu));
}
void
timer_queue_trace(
	mpqueue_head_t		*queue)
{
	timer_call_t	call;
	spl_t		s;

	if (!kdebug_enable) {
		return;
	}

	s = splclock();
	timer_queue_lock_spin(queue);

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_QUEUE | DBG_FUNC_START,
	    queue->count, mach_absolute_time(), 0, 0, 0);

	qe_foreach_element(call, &queue->head, tc_qlink) {
		TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		    DECR_TIMER_QUEUE | DBG_FUNC_NONE,
		    call->tc_soft_deadline,
		    call->tc_pqlink.deadline,
		    call->tc_entry_time,
		    VM_KERNEL_UNSLIDE(call->tc_func),
		    0);
	}

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_QUEUE | DBG_FUNC_END,
	    queue->count, mach_absolute_time(), 0, 0, 0);

	timer_queue_unlock(queue);
	splx(s);
}
static void
timer_longterm_dequeued_locked(timer_call_t call)
{
	timer_longterm_t	*tlp = &timer_longterm;

	tlp->dequeues++;
	if (call == tlp->threshold.call) {
		tlp->threshold.call = NULL;
	}
}
/*
 * Place a timer call in the longterm list
 * and adjust the next timer callout deadline if the new timer is first.
 */
static mpqueue_head_t *
timer_longterm_enqueue_unlocked(timer_call_t call,
    uint64_t		now,
    uint64_t		deadline,
    mpqueue_head_t	**old_queue,
    uint64_t		soft_deadline,
    uint64_t		ttd,
    timer_call_param_t	param1,
    uint32_t		callout_flags)
{
	timer_longterm_t	*tlp = &timer_longterm;
	boolean_t		update_required = FALSE;
	uint64_t		longterm_threshold;

	longterm_threshold = now + tlp->threshold.interval;

	/*
	 * Return NULL without doing anything if:
	 *  - this timer is local, or
	 *  - the longterm mechanism is disabled, or
	 *  - this deadline is too short.
	 */
	if ((callout_flags & TIMER_CALL_LOCAL) != 0 ||
	    (tlp->threshold.interval == TIMER_LONGTERM_NONE) ||
	    (deadline <= longterm_threshold)) {
		return NULL;
	}

	/*
	 * Remove timer from its current queue, if any.
	 */
	*old_queue = timer_call_dequeue_unlocked(call);

	/*
	 * Lock the longterm queue, queue timer and determine
	 * whether an update is necessary.
	 */
	assert(!ml_get_interrupts_enabled());
	simple_lock(&call->tc_lock, LCK_GRP_NULL);
	timer_queue_lock_spin(timer_longterm_queue);
	call->tc_pqlink.deadline = deadline;
	call->tc_param1 = param1;
	call->tc_ttd = ttd;
	call->tc_soft_deadline = soft_deadline;
	call->tc_flags = callout_flags;
	timer_call_entry_enqueue_tail(call, timer_longterm_queue);

	tlp->enqueues++;

	/*
	 * We'll need to update the currently set threshold timer
	 * if the new deadline is sooner and no sooner update is in flight.
	 */
	if (deadline < tlp->threshold.deadline &&
	    deadline < tlp->threshold.preempted) {
		tlp->threshold.preempted = deadline;
		tlp->threshold.call = call;
		update_required = TRUE;
	}
	timer_queue_unlock(timer_longterm_queue);
	simple_unlock(&call->tc_lock);

	if (update_required) {
		/*
		 * Note: this call expects that calling the master cpu
		 * alone does not involve locking the topo lock.
		 */
		timer_call_nosync_cpu(
			master_cpu,
			(void (*)(void *))timer_longterm_update,
			(void *)tlp);
	}

	return timer_longterm_queue;
}
/*
 * Scan for timers below the longterm threshold.
 * Move these to the local timer queue (of the boot processor on which the
 * calling thread is running).
 * Both the local (boot) queue and the longterm queue are locked.
 * The scan is similar to the timer migrate sequence but is performed by
 * successively examining each timer on the longterm queue:
 *  - if within the short-term threshold
 *    - enter on the local queue (unless being deleted),
 *  - otherwise:
 *    - if sooner, deadline becomes the next threshold deadline.
 * The total scan time is limited to TIMER_LONGTERM_SCAN_LIMIT. Should this be
 * exceeded, we abort and reschedule again so that we don't shut others from
 * the timer queues. Longterm timers firing late is not critical.
 */
void
timer_longterm_scan(timer_longterm_t *tlp,
    uint64_t time_start)
{
	timer_call_t	call;
	uint64_t	threshold = TIMER_LONGTERM_NONE;
	uint64_t	deadline;
	uint64_t	time_limit = time_start + tlp->scan_limit;
	mpqueue_head_t	*timer_master_queue;

	assert(!ml_get_interrupts_enabled());
	assert(cpu_number() == master_cpu);

	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
		threshold = time_start + tlp->threshold.interval;
	}

	tlp->threshold.deadline = TIMER_LONGTERM_NONE;
	tlp->threshold.call = NULL;

	if (queue_empty(&timer_longterm_queue->head)) {
		return;
	}

	timer_master_queue = timer_queue_cpu(master_cpu);
	timer_queue_lock_spin(timer_master_queue);

	qe_foreach_element_safe(call, &timer_longterm_queue->head, tc_qlink) {
		deadline = call->tc_soft_deadline;
		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
			/* case (2c) lock order inversion, dequeue only */
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
			    0, 0x2c, 0);
			timer_call_entry_dequeue_async(call);
			continue;
		}
		if (deadline < threshold) {
			/*
			 * This timer needs moving (escalating)
			 * to the local (boot) processor's queue.
			 */
			if (deadline < time_start) {
				TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				    DECR_TIMER_OVERDUE | DBG_FUNC_NONE,
				    VM_KERNEL_UNSLIDE_OR_PERM(call),
				    deadline, time_start, threshold, 0);
			}

			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ESCALATE | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_pqlink.deadline,
			    call->tc_entry_time,
			    VM_KERNEL_UNSLIDE(call->tc_func),
			    0);
			tlp->escalates++;
			timer_call_entry_dequeue(call);
			timer_call_entry_enqueue_deadline(
				call, timer_master_queue, call->tc_pqlink.deadline);
			/*
			 * A side-effect of the following call is to update
			 * the actual hardware deadline if required.
			 */
			(void) timer_queue_assign(deadline);
		} else {
			if (deadline < tlp->threshold.deadline) {
				tlp->threshold.deadline = deadline;
				tlp->threshold.call = call;
			}
		}
		simple_unlock(&call->tc_lock);

		/* Abort scan if we're taking too long. */
		if (mach_absolute_time() > time_limit) {
			tlp->threshold.deadline = TIMER_LONGTERM_SCAN_AGAIN;
			tlp->scan_pauses++;
			DBG("timer_longterm_scan() paused %llu, qlen: %llu\n",
			    time_limit, tlp->queue.count);
			break;
		}
	}

	timer_queue_unlock(timer_master_queue);
}
static void
timer_longterm_callout(timer_call_param_t p0, __unused timer_call_param_t p1)
{
	timer_longterm_t	*tlp = (timer_longterm_t *) p0;

	timer_longterm_update(tlp);
}
static void
timer_longterm_update_locked(timer_longterm_t *tlp)
{
	uint64_t	latency;

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_UPDATE | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
	    tlp->threshold.deadline,
	    tlp->threshold.preempted,
	    tlp->queue.count, 0);

	tlp->scan_time = mach_absolute_time();
	if (tlp->threshold.preempted != TIMER_LONGTERM_NONE) {
		tlp->threshold.preempts++;
		tlp->threshold.deadline = tlp->threshold.preempted;
		tlp->threshold.preempted = TIMER_LONGTERM_NONE;
		/*
		 * Note: in the unlikely event that a pre-empted timer has
		 * itself been cancelled, we'll simply re-scan later at the
		 * time of the preempted/cancelled timer.
		 */
	} else {
		tlp->threshold.scans++;

		/*
		 * Maintain a moving average of our wakeup latency.
		 * Clamp latency to 0 and ignore above threshold interval.
		 */
		if (tlp->scan_time > tlp->threshold.deadline_set) {
			latency = tlp->scan_time - tlp->threshold.deadline_set;
		} else {
			latency = 0;
		}
		if (latency < tlp->threshold.interval) {
			tlp->threshold.latency_min =
			    MIN(tlp->threshold.latency_min, latency);
			tlp->threshold.latency_max =
			    MAX(tlp->threshold.latency_max, latency);
			tlp->threshold.latency =
			    (tlp->threshold.latency * 99 + latency) / 100;
		}

		timer_longterm_scan(tlp, tlp->scan_time);
	}

	tlp->threshold.deadline_set = tlp->threshold.deadline;
	/* The next deadline timer to be set is adjusted */
	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE &&
	    tlp->threshold.deadline != TIMER_LONGTERM_SCAN_AGAIN) {
		tlp->threshold.deadline_set -= tlp->threshold.margin;
		tlp->threshold.deadline_set -= tlp->threshold.latency;
	}

	/* Throttle next scan time */
	uint64_t scan_clamp = mach_absolute_time() + tlp->scan_interval;
	if (tlp->threshold.deadline_set < scan_clamp) {
		tlp->threshold.deadline_set = scan_clamp;
	}

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_UPDATE | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
	    tlp->threshold.deadline,
	    tlp->threshold.scans,
	    tlp->queue.count, 0);
}
static void
timer_longterm_update(timer_longterm_t *tlp)
{
	spl_t s = splclock();

	timer_queue_lock_spin(timer_longterm_queue);

	if (cpu_number() != master_cpu) {
		panic("timer_longterm_update_master() on non-boot cpu");
	}

	timer_longterm_update_locked(tlp);

	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
		timer_call_enter(
			&tlp->threshold.timer,
			tlp->threshold.deadline_set,
			TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
	}

	timer_queue_unlock(timer_longterm_queue);
	splx(s);
}
static void
timer_longterm_init(void)
{
	uint32_t		longterm;
	timer_longterm_t	*tlp = &timer_longterm;

	DBG("timer_longterm_init() tlp: %p, queue: %p\n", tlp, &tlp->queue);

	/*
	 * Set the longterm timer threshold. Defaults to TIMER_LONGTERM_THRESHOLD
	 * or TIMER_LONGTERM_NONE (disabled) for server;
	 * overridden by the "longterm" boot-arg.
	 */
	tlp->threshold.interval = serverperfmode ? TIMER_LONGTERM_NONE
	    : TIMER_LONGTERM_THRESHOLD;
	if (PE_parse_boot_argn("longterm", &longterm, sizeof(longterm))) {
		tlp->threshold.interval = (longterm == 0) ?
		    TIMER_LONGTERM_NONE :
		    longterm * NSEC_PER_MSEC;
	}
	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
		printf("Longterm timer threshold: %llu ms\n",
		    tlp->threshold.interval / NSEC_PER_MSEC);
		kprintf("Longterm timer threshold: %llu ms\n",
		    tlp->threshold.interval / NSEC_PER_MSEC);
		nanoseconds_to_absolutetime(tlp->threshold.interval,
		    &tlp->threshold.interval);
		tlp->threshold.margin = tlp->threshold.interval / 10;
		tlp->threshold.latency_min = EndOfAllTime;
		tlp->threshold.latency_max = 0;
	}

	tlp->threshold.preempted = TIMER_LONGTERM_NONE;
	tlp->threshold.deadline = TIMER_LONGTERM_NONE;

	mpqueue_init(&tlp->queue, &timer_longterm_lck_grp, LCK_ATTR_NULL);

	timer_call_setup(&tlp->threshold.timer,
	    timer_longterm_callout, (timer_call_param_t) tlp);

	timer_longterm_queue = &tlp->queue;
}
enum {
	THRESHOLD, QCOUNT,
	ENQUEUES, DEQUEUES, ESCALATES, SCANS, PREEMPTS,
	LATENCY, LATENCY_MIN, LATENCY_MAX, SCAN_LIMIT, SCAN_INTERVAL, PAUSES
};

uint64_t
timer_sysctl_get(int oid)
{
	timer_longterm_t	*tlp = &timer_longterm;

	switch (oid) {
	case THRESHOLD:
		return (tlp->threshold.interval == TIMER_LONGTERM_NONE) ?
		       0 : tlp->threshold.interval / NSEC_PER_MSEC;
	case QCOUNT:
		return tlp->queue.count;
	case ENQUEUES:
		return tlp->enqueues;
	case DEQUEUES:
		return tlp->dequeues;
	case ESCALATES:
		return tlp->escalates;
	case SCANS:
		return tlp->threshold.scans;
	case PREEMPTS:
		return tlp->threshold.preempts;
	case LATENCY:
		return tlp->threshold.latency;
	case LATENCY_MIN:
		return tlp->threshold.latency_min;
	case LATENCY_MAX:
		return tlp->threshold.latency_max;
	case SCAN_LIMIT:
		return tlp->scan_limit;
	case SCAN_INTERVAL:
		return tlp->scan_interval;
	case PAUSES:
		return tlp->scan_pauses;
	default:
		return 0;
	}
}
/*
 * timer_master_scan() is the inverse of timer_longterm_scan()
 * since it un-escalates timers to the longterm queue.
 */
static void
timer_master_scan(timer_longterm_t *tlp,
    uint64_t now)
{
	timer_call_t	call;
	uint64_t	threshold;
	uint64_t	deadline;
	mpqueue_head_t	*timer_master_queue;

	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
		threshold = now + tlp->threshold.interval;
	} else {
		threshold = TIMER_LONGTERM_NONE;
	}

	timer_master_queue = timer_queue_cpu(master_cpu);
	timer_queue_lock_spin(timer_master_queue);

	qe_foreach_element_safe(call, &timer_master_queue->head, tc_qlink) {
		deadline = call->tc_pqlink.deadline;
		if ((call->tc_flags & TIMER_CALL_LOCAL) != 0) {
			continue;
		}
		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
			/* case (2c) lock order inversion, dequeue only */
			timer_call_entry_dequeue_async(call);
			continue;
		}
		if (deadline > threshold) {
			/* move from master to longterm */
			timer_call_entry_dequeue(call);
			timer_call_entry_enqueue_tail(call, timer_longterm_queue);
			if (deadline < tlp->threshold.deadline) {
				tlp->threshold.deadline = deadline;
				tlp->threshold.call = call;
			}
		}
		simple_unlock(&call->tc_lock);
	}
	timer_queue_unlock(timer_master_queue);
}
static void
timer_sysctl_set_threshold(uint64_t value)
{
	timer_longterm_t	*tlp = &timer_longterm;
	spl_t			s = splclock();
	boolean_t		threshold_increase;

	timer_queue_lock_spin(timer_longterm_queue);

	timer_call_cancel(&tlp->threshold.timer);

	/*
	 * Set the new threshold and note whether it's increasing.
	 */
	if (value == 0) {
		tlp->threshold.interval = TIMER_LONGTERM_NONE;
		threshold_increase = TRUE;
		timer_call_cancel(&tlp->threshold.timer);
	} else {
		uint64_t old_interval = tlp->threshold.interval;
		tlp->threshold.interval = value * NSEC_PER_MSEC;
		nanoseconds_to_absolutetime(tlp->threshold.interval,
		    &tlp->threshold.interval);
		tlp->threshold.margin = tlp->threshold.interval / 10;
		if (old_interval == TIMER_LONGTERM_NONE) {
			threshold_increase = FALSE;
		} else {
			threshold_increase = (tlp->threshold.interval > old_interval);
		}
	}

	if (threshold_increase /* or removal */) {
		/* Escalate timers from the longterm queue */
		timer_longterm_scan(tlp, mach_absolute_time());
	} else { /* decrease or addition */
		/*
		 * We scan the local/master queue for timers now longterm.
		 * To be strictly correct, we should scan all processor queues
		 * but timer migration results in most timers gravitating to the
		 * master processor in any case.
		 */
		timer_master_scan(tlp, mach_absolute_time());
	}

	/* Set new timer accordingly */
	tlp->threshold.deadline_set = tlp->threshold.deadline;
	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
		tlp->threshold.deadline_set -= tlp->threshold.margin;
		tlp->threshold.deadline_set -= tlp->threshold.latency;
		timer_call_enter(
			&tlp->threshold.timer,
			tlp->threshold.deadline_set,
			TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
	}

	tlp->scan_pauses = 0;
	tlp->threshold.scans = 0;
	tlp->threshold.preempts = 0;
	tlp->threshold.latency = 0;
	tlp->threshold.latency_min = EndOfAllTime;
	tlp->threshold.latency_max = 0;

	timer_queue_unlock(timer_longterm_queue);
	splx(s);
}
kern_return_t
timer_sysctl_set(int oid, uint64_t value)
{
	switch (oid) {
	case THRESHOLD:
		timer_call_cpu(
			master_cpu,
			(void (*)(void *))timer_sysctl_set_threshold,
			(void *) value);
		return KERN_SUCCESS;
	case SCAN_LIMIT:
		timer_longterm.scan_limit = value;
		return KERN_SUCCESS;
	case SCAN_INTERVAL:
		timer_longterm.scan_interval = value;
		return KERN_SUCCESS;
	default:
		return KERN_INVALID_ARGUMENT;
	}
}
/* Select timer coalescing window based on per-task quality-of-service hints */
static boolean_t
tcoal_qos_adjust(thread_t t, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited)
{
	uint32_t latency_qos;
	boolean_t adjusted = FALSE;
	task_t ctask = t->task;

	if (ctask) {
		latency_qos = proc_get_effective_thread_policy(t, TASK_POLICY_LATENCY_QOS);

		assert(latency_qos <= NUM_LATENCY_QOS_TIERS);

		if (latency_qos) {
			*tshift = tcoal_prio_params.latency_qos_scale[latency_qos - 1];
			*tmax_abstime = tcoal_prio_params.latency_qos_abstime_max[latency_qos - 1];
			*pratelimited = tcoal_prio_params.latency_tier_rate_limited[latency_qos - 1];
			adjusted = TRUE;
		}
	}
	return adjusted;
}
/* Adjust timer deadlines based on priority of the thread and the
 * urgency value provided at timeout establishment. With this mechanism,
 * timers are no longer necessarily sorted in order of soft deadline
 * on a given timer queue, i.e. they may be differentially skewed.
 * In the current scheme, this could lead to fewer pending timers
 * processed than is technically possible when the HW deadline arrives.
 */
static void
timer_compute_leeway(thread_t cthread, int32_t urgency, int32_t *tshift,
    uint64_t *tmax_abstime, boolean_t *pratelimited)
{
	int16_t tpri = cthread->sched_pri;

	if ((urgency & TIMER_CALL_USER_MASK) != 0) {
		if (tpri >= BASEPRI_RTQUEUES ||
		    urgency == TIMER_CALL_USER_CRITICAL) {
			*tshift = tcoal_prio_params.timer_coalesce_rt_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_rt_abstime_max;
			TCOAL_PRIO_STAT(rt_tcl);
		} else if (proc_get_effective_thread_policy(cthread, TASK_POLICY_DARWIN_BG) ||
		    (urgency == TIMER_CALL_USER_BACKGROUND)) {
			/* Determine if timer should be subjected to a lower QoS */
			if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
				if (*tmax_abstime > tcoal_prio_params.timer_coalesce_bg_abstime_max) {
					return;
				} else {
					*pratelimited = FALSE;
				}
			}
			*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
			TCOAL_PRIO_STAT(bg_tcl);
		} else if (tpri >= MINPRI_KERNEL) {
			*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
			TCOAL_PRIO_STAT(kt_tcl);
		} else if (cthread->sched_mode == TH_MODE_FIXED) {
			*tshift = tcoal_prio_params.timer_coalesce_fp_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_fp_abstime_max;
			TCOAL_PRIO_STAT(fp_tcl);
		} else if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
			TCOAL_PRIO_STAT(qos_tcl);
		} else if (cthread->sched_mode == TH_MODE_TIMESHARE) {
			*tshift = tcoal_prio_params.timer_coalesce_ts_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_ts_abstime_max;
			TCOAL_PRIO_STAT(ts_tcl);
		} else {
			TCOAL_PRIO_STAT(nc_tcl);
		}
	} else if (urgency == TIMER_CALL_SYS_BACKGROUND) {
		*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
		*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
		TCOAL_PRIO_STAT(bg_tcl);
	} else {
		*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
		*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
		TCOAL_PRIO_STAT(kt_tcl);
	}
}
int timer_user_idle_level;
uint64_t
timer_call_slop(uint64_t deadline, uint64_t now, uint32_t flags, thread_t cthread, boolean_t *pratelimited)
{
	int32_t tcs_shift = 0;
	uint64_t tcs_max_abstime = 0;
	uint64_t adjval = 0;
	uint32_t urgency = (flags & TIMER_CALL_URGENCY_MASK);

	if (mach_timer_coalescing_enabled &&
	    (deadline > now) && (urgency != TIMER_CALL_SYS_CRITICAL)) {
		timer_compute_leeway(cthread, urgency, &tcs_shift, &tcs_max_abstime, pratelimited);

		if (tcs_shift >= 0) {
			adjval = MIN((deadline - now) >> tcs_shift, tcs_max_abstime);
		} else {
			adjval = MIN((deadline - now) << (-tcs_shift), tcs_max_abstime);
		}
		/*
		 * Apply adjustments derived from the "user idle level" heuristic:
		 * the window grows by idle_level/128, i.e. at the maximum level
		 * of 128 the coalescing window roughly doubles.
		 */
		adjval += (adjval * timer_user_idle_level) >> 7;
	}

	return adjval;
}

int
timer_get_user_idle_level(void)
{
	return timer_user_idle_level;
}

kern_return_t
timer_set_user_idle_level(int ilevel)
{
	boolean_t do_reeval = FALSE;

	if ((ilevel < 0) || (ilevel > 128)) {
		return KERN_INVALID_ARGUMENT;
	}

	if (ilevel < timer_user_idle_level) {
		do_reeval = TRUE;
	}

	timer_user_idle_level = ilevel;

	if (do_reeval) {
		ml_timer_evaluate();
	}

	return KERN_SUCCESS;
}
#pragma mark - running timers

#define RUNNING_TIMER_FAKE_FLAGS (TIMER_CALL_SYS_CRITICAL | \
    TIMER_CALL_LOCAL)

/*
 * timer_call_trace_* functions mimic the tracing behavior from the normal
 * timer_call subsystem, so tools continue to function.
 */

static void
timer_call_trace_enter_before(struct timer_call *call, uint64_t deadline,
    uint32_t flags, uint64_t now)
{
#pragma unused(call, deadline, flags, now)
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_ADDRHIDE(call->tc_param1),
	    deadline, flags, 0);
#if CONFIG_DTRACE
	uint64_t ttd = deadline - now;
	DTRACE_TMR7(callout__create, timer_call_func_t, call->tc_func,
	    timer_call_param_t, call->tc_param0, uint32_t, flags, 0,
	    (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF), NULL);
#endif /* CONFIG_DTRACE */
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline, 0, 0);
}
static void
timer_call_trace_enter_after(struct timer_call *call, uint64_t deadline)
{
#pragma unused(call, deadline)
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline, 0, 0);
}

static void
timer_call_trace_cancel(struct timer_call *call)
{
#pragma unused(call)
	__unused uint64_t deadline = call->tc_pqlink.deadline;
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CANCEL | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), deadline, 0,
	    call->tc_flags, 0);
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CANCEL | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline - mach_absolute_time(),
	    deadline - call->tc_entry_time, 0);

#if CONFIG_DTRACE
#if TIMER_TRACE
	uint64_t ttd = deadline - call->tc_entry_time;
#else
	uint64_t ttd = UINT64_MAX;
#endif /* TIMER_TRACE */
	DTRACE_TMR6(callout__cancel, timer_call_func_t, call->tc_func,
	    timer_call_param_t, call->tc_param0, uint32_t, call->tc_flags, 0,
	    (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF));
#endif /* CONFIG_DTRACE */
}

static void
timer_call_trace_expire_entry(struct timer_call *call)
{
#pragma unused(call)
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CALLOUT | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(call->tc_func),
	    VM_KERNEL_ADDRHIDE(call->tc_param0),
	    VM_KERNEL_ADDRHIDE(call->tc_param1),
	    0);
#if CONFIG_DTRACE
#if TIMER_TRACE
	uint64_t ttd = call->tc_pqlink.deadline - call->tc_entry_time;
#else /* TIMER_TRACE */
	uint64_t ttd = UINT64_MAX;
#endif /* TIMER_TRACE */
	DTRACE_TMR7(callout__start, timer_call_func_t, call->tc_func,
	    timer_call_param_t, call->tc_param0, unsigned, call->tc_flags,
	    0, (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF), NULL);
#endif /* CONFIG_DTRACE */
}

static void
timer_call_trace_expire_return(struct timer_call *call)
{
#pragma unused(call)
#if CONFIG_DTRACE
	DTRACE_TMR4(callout__end, timer_call_func_t, call->tc_func,
	    call->tc_param0, call->tc_param1, NULL);
#endif /* CONFIG_DTRACE */
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CALLOUT | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    VM_KERNEL_UNSLIDE(call->tc_func),
	    VM_KERNEL_ADDRHIDE(call->tc_param0),
	    VM_KERNEL_ADDRHIDE(call->tc_param1),
	    0);
}
/*
 * Set a new deadline for a running timer on this processor.
 */
void
running_timer_setup(processor_t processor, enum running_timer timer,
    void *param, uint64_t deadline, uint64_t now)
{
	assert(timer < RUNNING_TIMER_MAX);
	assert(ml_get_interrupts_enabled() == FALSE);

	struct timer_call *call = &processor->running_timers[timer];

	timer_call_trace_enter_before(call, deadline, RUNNING_TIMER_FAKE_FLAGS,
	    now);

	if (__improbable(deadline < now)) {
		deadline = timer_call_past_deadline_timer_handle(deadline, now);
	}

	call->tc_pqlink.deadline = deadline;
#if TIMER_TRACE
	call->tc_entry_time = now;
#endif /* TIMER_TRACE */
	call->tc_param1 = param;

	timer_call_trace_enter_after(call, deadline);
}

void
running_timers_sync(void)
{
	timer_resync_deadlines();
}

void
running_timer_enter(processor_t processor, unsigned int timer,
    void *param, uint64_t deadline, uint64_t now)
{
	running_timer_setup(processor, timer, param, deadline, now);
	running_timers_sync();
}

/*
 * Call the callback for any running timers that fired for this processor.
 * Returns true if any timers were past their deadline.
 */
bool
running_timers_expire(processor_t processor, uint64_t now)
{
	bool expired = false;

	if (!processor->running_timers_active) {
		return expired;
	}

	for (int i = 0; i < RUNNING_TIMER_MAX; i++) {
		struct timer_call *call = &processor->running_timers[i];

		uint64_t deadline = call->tc_pqlink.deadline;
		if (deadline > now) {
			continue;
		}

		expired = true;
		timer_call_trace_expire_entry(call);
		call->tc_func(call->tc_param0, call->tc_param1);
		timer_call_trace_expire_return(call);
	}

	return expired;
}

void
running_timer_clear(processor_t processor, enum running_timer timer)
{
	struct timer_call *call = &processor->running_timers[timer];
	uint64_t deadline = call->tc_pqlink.deadline;
	if (deadline == EndOfAllTime) {
		return;
	}

	call->tc_pqlink.deadline = EndOfAllTime;
#if TIMER_TRACE
	call->tc_entry_time = 0;
#endif /* TIMER_TRACE */
	timer_call_trace_cancel(call);
}

void
running_timer_cancel(processor_t processor, unsigned int timer)
{
	running_timer_clear(processor, timer);
	running_timers_sync();
}

uint64_t
running_timers_deadline(processor_t processor)
{
	if (!processor->running_timers_active) {
		return EndOfAllTime;
	}

	uint64_t deadline = EndOfAllTime;
	for (int i = 0; i < RUNNING_TIMER_MAX; i++) {
		uint64_t candidate =
		    processor->running_timers[i].tc_pqlink.deadline;
		if (candidate != 0 && candidate < deadline) {
			deadline = candidate;
		}
	}

	return deadline;
}

void
running_timers_activate(processor_t processor)
{
	processor->running_timers_active = true;
	running_timers_sync();
}

void
running_timers_deactivate(processor_t processor)
{
	assert(processor->running_timers_active == true);
	processor->running_timers_active = false;
	running_timers_sync();
}