osfmk/kern/timer_call.c

   1 /*
   2  * Copyright (c) 1993-2008 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Timer interrupt callout module.
  30  */
  31
  32 #include <mach/mach_types.h>
  33
  34 #include <kern/clock.h>
  35 #include <kern/smp.h>
  36 #include <kern/processor.h>
  37 #include <kern/timer_call.h>
  38 #include <kern/timer_queue.h>
  39 #include <kern/call_entry.h>
  40 #include <kern/thread.h>
  41 #include <kern/policy_internal.h>
  42
  43 #include <sys/kdebug.h>
  44
  45 #if CONFIG_DTRACE
  46 #include <mach/sdt.h>
  47 #endif
  48
  49
  50 #if DEBUG
  51 #define TIMER_ASSERT    1
  52 #endif
  53
  54 //#define TIMER_ASSERT  1
  55 //#define TIMER_DBG     1
  56
  57 #if TIMER_DBG
  58 #define DBG(x...) kprintf("DBG: " x);
  59 #else
  60 #define DBG(x...)
  61 #endif
  62
  63 #if TIMER_TRACE
  64 #define TIMER_KDEBUG_TRACE      KERNEL_DEBUG_CONSTANT_IST
  65 #else
  66 #define TIMER_KDEBUG_TRACE(x...)
  67 #endif
  68
  69
  70 lck_grp_t               timer_call_lck_grp;
  71 lck_attr_t              timer_call_lck_attr;
  72 lck_grp_attr_t          timer_call_lck_grp_attr;
  73
  74 lck_grp_t               timer_longterm_lck_grp;
  75 lck_attr_t              timer_longterm_lck_attr;
  76 lck_grp_attr_t          timer_longterm_lck_grp_attr;
  77
  78 /* Timer queue lock must be acquired with interrupts disabled (under splclock()) */
  79 #if __SMP__
  80 #define timer_queue_lock_spin(queue)                                    \
  81         lck_mtx_lock_spin_always(&queue->lock_data)
  82
  83 #define timer_queue_unlock(queue)               \
  84         lck_mtx_unlock_always(&queue->lock_data)
  85 #else
  86 #define timer_queue_lock_spin(queue)    (void)1
  87 #define timer_queue_unlock(queue)               (void)1
  88 #endif
  89
  90 #define QUEUE(x)        ((queue_t)(x))
  91 #define MPQUEUE(x)      ((mpqueue_head_t *)(x))
  92 #define TIMER_CALL(x)   ((timer_call_t)(x))
  93 #define TCE(x)          (&(x->call_entry))
  94 /*
  95  * The longterm timer object is a global structure holding all timers
  96  * beyond the short-term, local timer queue threshold. The boot processor
  97  * is responsible for moving each timer to its local timer queue
  98  * if and when that timer becomes due within the threshold.
  99  */
 100
 101 /* Sentinel for "no time set": */
 102 #define TIMER_LONGTERM_NONE             EndOfAllTime
 103 /* The default threadhold is the delta above which a timer is "long-term" */
 104 #if defined(__x86_64__)
 105 #define TIMER_LONGTERM_THRESHOLD        (1ULL * NSEC_PER_SEC)   /* 1 sec */
 106 #else
 107 #define TIMER_LONGTERM_THRESHOLD        TIMER_LONGTERM_NONE     /* disabled */
 108 #endif
 109
 110 /*
 111  * The scan_limit throttles processing of the longterm queue.
 112  * If the scan time exceeds this limit, we terminate, unlock
 113  * and defer for scan_interval. This prevents unbounded holding of
 114  * timer queue locks with interrupts masked.
 115  */
 116 #define TIMER_LONGTERM_SCAN_LIMIT       (100ULL * NSEC_PER_USEC)        /* 100 us */
 117 #define TIMER_LONGTERM_SCAN_INTERVAL    (100ULL * NSEC_PER_USEC)        /* 100 us */
 118 /* Sentinel for "scan limit exceeded": */
 119 #define TIMER_LONGTERM_SCAN_AGAIN       0
 120
 121 typedef struct {
 122         uint64_t        interval;       /* longterm timer interval */
 123         uint64_t        margin;         /* fudge factor (10% of interval */
 124         uint64_t        deadline;       /* first/soonest longterm deadline */
 125         uint64_t        preempted;      /* sooner timer has pre-empted */
 126         timer_call_t    call;           /* first/soonest longterm timer call */
 127         uint64_t        deadline_set;   /* next timer set */
 128         timer_call_data_t timer;        /* timer used by threshold management */
 129                                         /* Stats: */
 130         uint64_t        scans;          /*   num threshold timer scans */
 131         uint64_t        preempts;       /*   num threshold reductions */
 132         uint64_t        latency;        /*   average threshold latency */
 133         uint64_t        latency_min;    /*   minimum threshold latency */
 134         uint64_t        latency_max;    /*   maximum threshold latency */
 135 } threshold_t;
 136
 137 typedef struct {
 138         mpqueue_head_t  queue;          /* longterm timer list */
 139         uint64_t        enqueues;       /* num timers queued */
 140         uint64_t        dequeues;       /* num timers dequeued */
 141         uint64_t        escalates;      /* num timers becoming shortterm */
 142         uint64_t        scan_time;      /* last time the list was scanned */
 143         threshold_t     threshold;      /* longterm timer threshold */
 144         uint64_t        scan_limit;     /* maximum scan time */
 145         uint64_t        scan_interval;  /* interval between LT "escalation" scans */
 146         uint64_t        scan_pauses;    /* num scans exceeding time limit */
 147 } timer_longterm_t;
 148
 149 timer_longterm_t                timer_longterm = {
 150         .scan_limit = TIMER_LONGTERM_SCAN_LIMIT,
 151         .scan_interval = TIMER_LONGTERM_SCAN_INTERVAL,
 152 };
 153
 154 static mpqueue_head_t           *timer_longterm_queue = NULL;
 155
 156 static void                     timer_longterm_init(void);
 157 static void                     timer_longterm_callout(
 158         timer_call_param_t      p0,
 159         timer_call_param_t      p1);
 160 extern void                     timer_longterm_scan(
 161         timer_longterm_t        *tlp,
 162         uint64_t                now);
 163 static void                     timer_longterm_update(
 164         timer_longterm_t *tlp);
 165 static void                     timer_longterm_update_locked(
 166         timer_longterm_t *tlp);
 167 static mpqueue_head_t *         timer_longterm_enqueue_unlocked(
 168         timer_call_t            call,
 169         uint64_t                now,
 170         uint64_t                deadline,
 171         mpqueue_head_t **       old_queue,
 172         uint64_t                soft_deadline,
 173         uint64_t                ttd,
 174         timer_call_param_t      param1,
 175         uint32_t                callout_flags);
 176 static void                     timer_longterm_dequeued_locked(
 177         timer_call_t            call);
 178
 179 uint64_t past_deadline_timers;
 180 uint64_t past_deadline_deltas;
 181 uint64_t past_deadline_longest;
 182 uint64_t past_deadline_shortest = ~0ULL;
 183 enum {PAST_DEADLINE_TIMER_ADJUSTMENT_NS = 10 * 1000};
 184
 185 uint64_t past_deadline_timer_adjustment;
 186
 187 static boolean_t timer_call_enter_internal(timer_call_t call, timer_call_param_t param1, uint64_t deadline, uint64_t leeway, uint32_t flags, boolean_t ratelimited);
 188 boolean_t       mach_timer_coalescing_enabled = TRUE;
 189
 190 mpqueue_head_t  *timer_call_enqueue_deadline_unlocked(
 191         timer_call_t            call,
 192         mpqueue_head_t          *queue,
 193         uint64_t                deadline,
 194         uint64_t                soft_deadline,
 195         uint64_t                ttd,
 196         timer_call_param_t      param1,
 197         uint32_t                flags);
 198
 199 mpqueue_head_t  *timer_call_dequeue_unlocked(
 200         timer_call_t            call);
 201
 202 timer_coalescing_priority_params_t tcoal_prio_params;
 203
 204 #if TCOAL_PRIO_STATS
 205 int32_t nc_tcl, rt_tcl, bg_tcl, kt_tcl, fp_tcl, ts_tcl, qos_tcl;
 206 #define TCOAL_PRIO_STAT(x) (x++)
 207 #else
 208 #define TCOAL_PRIO_STAT(x)
 209 #endif
 210
 211 static void
 212 timer_call_init_abstime(void)
 213 {
 214         int i;
 215         uint64_t result;
 216         timer_coalescing_priority_params_ns_t * tcoal_prio_params_init = timer_call_get_priority_params();
 217         nanoseconds_to_absolutetime(PAST_DEADLINE_TIMER_ADJUSTMENT_NS, &past_deadline_timer_adjustment);
 218         nanoseconds_to_absolutetime(tcoal_prio_params_init->idle_entry_timer_processing_hdeadline_threshold_ns, &result);
 219         tcoal_prio_params.idle_entry_timer_processing_hdeadline_threshold_abstime = (uint32_t)result;
 220         nanoseconds_to_absolutetime(tcoal_prio_params_init->interrupt_timer_coalescing_ilat_threshold_ns, &result);
 221         tcoal_prio_params.interrupt_timer_coalescing_ilat_threshold_abstime = (uint32_t)result;
 222         nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_resort_threshold_ns, &result);
 223         tcoal_prio_params.timer_resort_threshold_abstime = (uint32_t)result;
 224         tcoal_prio_params.timer_coalesce_rt_shift = tcoal_prio_params_init->timer_coalesce_rt_shift;
 225         tcoal_prio_params.timer_coalesce_bg_shift = tcoal_prio_params_init->timer_coalesce_bg_shift;
 226         tcoal_prio_params.timer_coalesce_kt_shift = tcoal_prio_params_init->timer_coalesce_kt_shift;
 227         tcoal_prio_params.timer_coalesce_fp_shift = tcoal_prio_params_init->timer_coalesce_fp_shift;
 228         tcoal_prio_params.timer_coalesce_ts_shift = tcoal_prio_params_init->timer_coalesce_ts_shift;
 229
 230         nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_rt_ns_max,
 231             &tcoal_prio_params.timer_coalesce_rt_abstime_max);
 232         nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_bg_ns_max,
 233             &tcoal_prio_params.timer_coalesce_bg_abstime_max);
 234         nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_kt_ns_max,
 235             &tcoal_prio_params.timer_coalesce_kt_abstime_max);
 236         nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_fp_ns_max,
 237             &tcoal_prio_params.timer_coalesce_fp_abstime_max);
 238         nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_ts_ns_max,
 239             &tcoal_prio_params.timer_coalesce_ts_abstime_max);
 240
 241         for (i = 0; i < NUM_LATENCY_QOS_TIERS; i++) {
 242                 tcoal_prio_params.latency_qos_scale[i] = tcoal_prio_params_init->latency_qos_scale[i];
 243                 nanoseconds_to_absolutetime(tcoal_prio_params_init->latency_qos_ns_max[i],
 244                     &tcoal_prio_params.latency_qos_abstime_max[i]);
 245                 tcoal_prio_params.latency_tier_rate_limited[i] = tcoal_prio_params_init->latency_tier_rate_limited[i];
 246         }
 247 }
 248
 249
 250 void
 251 timer_call_init(void)
 252 {
 253         lck_attr_setdefault(&timer_call_lck_attr);
 254         lck_grp_attr_setdefault(&timer_call_lck_grp_attr);
 255         lck_grp_init(&timer_call_lck_grp, "timer_call", &timer_call_lck_grp_attr);
 256
 257         timer_longterm_init();
 258         timer_call_init_abstime();
 259 }
 260
 261
 262 void
 263 timer_call_queue_init(mpqueue_head_t *queue)
 264 {
 265         DBG("timer_call_queue_init(%p)\n", queue);
 266         mpqueue_init(queue, &timer_call_lck_grp, &timer_call_lck_attr);
 267 }
 268
 269
 270 void
 271 timer_call_setup(
 272         timer_call_t                    call,
 273         timer_call_func_t               func,
 274         timer_call_param_t              param0)
 275 {
 276         DBG("timer_call_setup(%p,%p,%p)\n", call, func, param0);
 277         call_entry_setup(TCE(call), func, param0);
 278         simple_lock_init(&(call)->lock, 0);
 279         call->async_dequeue = FALSE;
 280 }
 281 #if TIMER_ASSERT
 282 static __inline__ mpqueue_head_t *
 283 timer_call_entry_dequeue(
 284         timer_call_t            entry)
 285 {
 286         mpqueue_head_t  *old_queue = MPQUEUE(TCE(entry)->queue);
 287
 288         if (!hw_lock_held((hw_lock_t)&entry->lock)) {
 289                 panic("_call_entry_dequeue() "
 290                     "entry %p is not locked\n", entry);
 291         }
 292         /*
 293          * XXX The queue lock is actually a mutex in spin mode
 294          *     but there's no way to test for it being held
 295          *     so we pretend it's a spinlock!
 296          */
 297         if (!hw_lock_held((hw_lock_t)&old_queue->lock_data)) {
 298                 panic("_call_entry_dequeue() "
 299                     "queue %p is not locked\n", old_queue);
 300         }
 301
 302         call_entry_dequeue(TCE(entry));
 303         old_queue->count--;
 304
 305         return old_queue;
 306 }
 307
 308 static __inline__ mpqueue_head_t *
 309 timer_call_entry_enqueue_deadline(
 310         timer_call_t            entry,
 311         mpqueue_head_t          *queue,
 312         uint64_t                deadline)
 313 {
 314         mpqueue_head_t  *old_queue = MPQUEUE(TCE(entry)->queue);
 315
 316         if (!hw_lock_held((hw_lock_t)&entry->lock)) {
 317                 panic("_call_entry_enqueue_deadline() "
 318                     "entry %p is not locked\n", entry);
 319         }
 320         /* XXX More lock pretense:  */
 321         if (!hw_lock_held((hw_lock_t)&queue->lock_data)) {
 322                 panic("_call_entry_enqueue_deadline() "
 323                     "queue %p is not locked\n", queue);
 324         }
 325         if (old_queue != NULL && old_queue != queue) {
 326                 panic("_call_entry_enqueue_deadline() "
 327                     "old_queue %p != queue", old_queue);
 328         }
 329
 330         call_entry_enqueue_deadline(TCE(entry), QUEUE(queue), deadline);
 331
 332 /* For efficiency, track the earliest soft deadline on the queue, so that
 333  * fuzzy decisions can be made without lock acquisitions.
 334  */
 335         timer_call_t thead = (timer_call_t)queue_first(&queue->head);
 336
 337         queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline;
 338
 339         if (old_queue) {
 340                 old_queue->count--;
 341         }
 342         queue->count++;
 343
 344         return old_queue;
 345 }
 346
 347 #else
 348
 349 static __inline__ mpqueue_head_t *
 350 timer_call_entry_dequeue(
 351         timer_call_t            entry)
 352 {
 353         mpqueue_head_t  *old_queue = MPQUEUE(TCE(entry)->queue);
 354
 355         call_entry_dequeue(TCE(entry));
 356         old_queue->count--;
 357
 358         return old_queue;
 359 }
 360
 361 static __inline__ mpqueue_head_t *
 362 timer_call_entry_enqueue_deadline(
 363         timer_call_t                    entry,
 364         mpqueue_head_t                  *queue,
 365         uint64_t                        deadline)
 366 {
 367         mpqueue_head_t  *old_queue = MPQUEUE(TCE(entry)->queue);
 368
 369         call_entry_enqueue_deadline(TCE(entry), QUEUE(queue), deadline);
 370
 371         /* For efficiency, track the earliest soft deadline on the queue,
 372          * so that fuzzy decisions can be made without lock acquisitions.
 373          */
 374
 375         timer_call_t thead = (timer_call_t)queue_first(&queue->head);
 376         queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline;
 377
 378         if (old_queue) {
 379                 old_queue->count--;
 380         }
 381         queue->count++;
 382
 383         return old_queue;
 384 }
 385
 386 #endif
 387
 388 static __inline__ void
 389 timer_call_entry_enqueue_tail(
 390         timer_call_t                    entry,
 391         mpqueue_head_t                  *queue)
 392 {
 393         call_entry_enqueue_tail(TCE(entry), QUEUE(queue));
 394         queue->count++;
 395         return;
 396 }
 397
 398 /*
 399  * Remove timer entry from its queue but don't change the queue pointer
 400  * and set the async_dequeue flag. This is locking case 2b.
 401  */
 402 static __inline__ void
 403 timer_call_entry_dequeue_async(
 404         timer_call_t            entry)
 405 {
 406         mpqueue_head_t  *old_queue = MPQUEUE(TCE(entry)->queue);
 407         if (old_queue) {
 408                 old_queue->count--;
 409                 (void) remque(qe(entry));
 410                 entry->async_dequeue = TRUE;
 411         }
 412         return;
 413 }
 414
 415 #if TIMER_ASSERT
 416 unsigned timer_call_enqueue_deadline_unlocked_async1;
 417 unsigned timer_call_enqueue_deadline_unlocked_async2;
 418 #endif
 419 /*
 420  * Assumes call_entry and queues unlocked, interrupts disabled.
 421  */
 422 __inline__ mpqueue_head_t *
 423 timer_call_enqueue_deadline_unlocked(
 424         timer_call_t                    call,
 425         mpqueue_head_t                  *queue,
 426         uint64_t                        deadline,
 427         uint64_t                        soft_deadline,
 428         uint64_t                        ttd,
 429         timer_call_param_t              param1,
 430         uint32_t                        callout_flags)
 431 {
 432         call_entry_t    entry = TCE(call);
 433         mpqueue_head_t  *old_queue;
 434
 435         DBG("timer_call_enqueue_deadline_unlocked(%p,%p,)\n", call, queue);
 436
 437         simple_lock(&call->lock, LCK_GRP_NULL);
 438
 439         old_queue = MPQUEUE(entry->queue);
 440
 441         if (old_queue != NULL) {
 442                 timer_queue_lock_spin(old_queue);
 443                 if (call->async_dequeue) {
 444                         /* collision (1c): timer already dequeued, clear flag */
 445 #if TIMER_ASSERT
 446                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 447                             DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
 448                             VM_KERNEL_UNSLIDE_OR_PERM(call),
 449                             call->async_dequeue,
 450                             VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
 451                             0x1c, 0);
 452                         timer_call_enqueue_deadline_unlocked_async1++;
 453 #endif
 454                         call->async_dequeue = FALSE;
 455                         entry->queue = NULL;
 456                 } else if (old_queue != queue) {
 457                         timer_call_entry_dequeue(call);
 458 #if TIMER_ASSERT
 459                         timer_call_enqueue_deadline_unlocked_async2++;
 460 #endif
 461                 }
 462                 if (old_queue == timer_longterm_queue) {
 463                         timer_longterm_dequeued_locked(call);
 464                 }
 465                 if (old_queue != queue) {
 466                         timer_queue_unlock(old_queue);
 467                         timer_queue_lock_spin(queue);
 468                 }
 469         } else {
 470                 timer_queue_lock_spin(queue);
 471         }
 472
 473         call->soft_deadline = soft_deadline;
 474         call->flags = callout_flags;
 475         TCE(call)->param1 = param1;
 476         call->ttd = ttd;
 477
 478         timer_call_entry_enqueue_deadline(call, queue, deadline);
 479         timer_queue_unlock(queue);
 480         simple_unlock(&call->lock);
 481
 482         return old_queue;
 483 }
 484
 485 #if TIMER_ASSERT
 486 unsigned timer_call_dequeue_unlocked_async1;
 487 unsigned timer_call_dequeue_unlocked_async2;
 488 #endif
 489 mpqueue_head_t *
 490 timer_call_dequeue_unlocked(
 491         timer_call_t            call)
 492 {
 493         call_entry_t    entry = TCE(call);
 494         mpqueue_head_t  *old_queue;
 495
 496         DBG("timer_call_dequeue_unlocked(%p)\n", call);
 497
 498         simple_lock(&call->lock, LCK_GRP_NULL);
 499         old_queue = MPQUEUE(entry->queue);
 500 #if TIMER_ASSERT
 501         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 502             DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
 503             VM_KERNEL_UNSLIDE_OR_PERM(call),
 504             call->async_dequeue,
 505             VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
 506             0, 0);
 507 #endif
 508         if (old_queue != NULL) {
 509                 timer_queue_lock_spin(old_queue);
 510                 if (call->async_dequeue) {
 511                         /* collision (1c): timer already dequeued, clear flag */
 512 #if TIMER_ASSERT
 513                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 514                             DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
 515                             VM_KERNEL_UNSLIDE_OR_PERM(call),
 516                             call->async_dequeue,
 517                             VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
 518                             0x1c, 0);
 519                         timer_call_dequeue_unlocked_async1++;
 520 #endif
 521                         call->async_dequeue = FALSE;
 522                         entry->queue = NULL;
 523                 } else {
 524                         timer_call_entry_dequeue(call);
 525                 }
 526                 if (old_queue == timer_longterm_queue) {
 527                         timer_longterm_dequeued_locked(call);
 528                 }
 529                 timer_queue_unlock(old_queue);
 530         }
 531         simple_unlock(&call->lock);
 532         return old_queue;
 533 }
 534
 535 static uint64_t
 536 past_deadline_timer_handle(uint64_t deadline, uint64_t ctime)
 537 {
 538         uint64_t delta = (ctime - deadline);
 539
 540         past_deadline_timers++;
 541         past_deadline_deltas += delta;
 542         if (delta > past_deadline_longest) {
 543                 past_deadline_longest = deadline;
 544         }
 545         if (delta < past_deadline_shortest) {
 546                 past_deadline_shortest = delta;
 547         }
 548
 549         return ctime + past_deadline_timer_adjustment;
 550 }
 551
 552 /*
 553  * Timer call entry locking model
 554  * ==============================
 555  *
 556  * Timer call entries are linked on per-cpu timer queues which are protected
 557  * by the queue lock and the call entry lock. The locking protocol is:
 558  *
 559  *  0) The canonical locking order is timer call entry followed by queue.
 560  *
 561  *  1) With only the entry lock held, entry.queue is valid:
 562  *    1a) NULL: the entry is not queued, or
 563  *    1b) non-NULL: this queue must be locked before the entry is modified.
 564  *        After locking the queue, the call.async_dequeue flag must be checked:
 565  *    1c) TRUE: the entry was removed from the queue by another thread
 566  *              and we must NULL the entry.queue and reset this flag, or
 567  *    1d) FALSE: (ie. queued), the entry can be manipulated.
 568  *
 569  *  2) If a queue lock is obtained first, the queue is stable:
 570  *    2a) If a try-lock of a queued entry succeeds, the call can be operated on
 571  *        and dequeued.
 572  *    2b) If a try-lock fails, it indicates that another thread is attempting
 573  *        to change the entry and move it to a different position in this queue
 574  *        or to different queue. The entry can be dequeued but it should not be
 575  *        operated upon since it is being changed. Furthermore, we don't null
 576  *        the entry.queue pointer (protected by the entry lock we don't own).
 577  *        Instead, we set the async_dequeue flag -- see (1c).
 578  *    2c) Same as 2b but occurring when a longterm timer is matured.
 579  *  3) A callout's parameters (deadline, flags, parameters, soft deadline &c.)
 580  *     should be manipulated with the appropriate timer queue lock held,
 581  *     to prevent queue traversal observations from observing inconsistent
 582  *     updates to an in-flight callout.
 583  */
 584
 585 /*
 586  * Inlines timer_call_entry_dequeue() and timer_call_entry_enqueue_deadline()
 587  * cast between pointer types (mpqueue_head_t *) and (queue_t) so that
 588  * we can use the call_entry_dequeue() and call_entry_enqueue_deadline()
 589  * methods to operate on timer_call structs as if they are call_entry structs.
 590  * These structures are identical except for their queue head pointer fields.
 591  *
 592  * In the debug case, we assert that the timer call locking protocol
 593  * is being obeyed.
 594  */
 595
 596 static boolean_t
 597 timer_call_enter_internal(
 598         timer_call_t            call,
 599         timer_call_param_t      param1,
 600         uint64_t                deadline,
 601         uint64_t                leeway,
 602         uint32_t                flags,
 603         boolean_t               ratelimited)
 604 {
 605         mpqueue_head_t          *queue = NULL;
 606         mpqueue_head_t          *old_queue;
 607         spl_t                   s;
 608         uint64_t                slop;
 609         uint32_t                urgency;
 610         uint64_t                sdeadline, ttd;
 611
 612         assert(call->call_entry.func != NULL);
 613         s = splclock();
 614
 615         sdeadline = deadline;
 616         uint64_t ctime = mach_absolute_time();
 617
 618         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 619             DECR_TIMER_ENTER | DBG_FUNC_START,
 620             VM_KERNEL_UNSLIDE_OR_PERM(call),
 621             VM_KERNEL_ADDRHIDE(param1), deadline, flags, 0);
 622
 623         urgency = (flags & TIMER_CALL_URGENCY_MASK);
 624
 625         boolean_t slop_ratelimited = FALSE;
 626         slop = timer_call_slop(deadline, ctime, urgency, current_thread(), &slop_ratelimited);
 627
 628         if ((flags & TIMER_CALL_LEEWAY) != 0 && leeway > slop) {
 629                 slop = leeway;
 630         }
 631
 632         if (UINT64_MAX - deadline <= slop) {
 633                 deadline = UINT64_MAX;
 634         } else {
 635                 deadline += slop;
 636         }
 637
 638         if (__improbable(deadline < ctime)) {
 639                 deadline = past_deadline_timer_handle(deadline, ctime);
 640                 sdeadline = deadline;
 641         }
 642
 643         if (ratelimited || slop_ratelimited) {
 644                 flags |= TIMER_CALL_RATELIMITED;
 645         } else {
 646                 flags &= ~TIMER_CALL_RATELIMITED;
 647         }
 648
 649         ttd =  sdeadline - ctime;
 650 #if CONFIG_DTRACE
 651         DTRACE_TMR7(callout__create, timer_call_func_t, TCE(call)->func,
 652             timer_call_param_t, TCE(call)->param0, uint32_t, flags,
 653             (deadline - sdeadline),
 654             (ttd >> 32), (unsigned) (ttd & 0xFFFFFFFF), call);
 655 #endif
 656
 657         /* Program timer callout parameters under the appropriate per-CPU or
 658          * longterm queue lock. The callout may have been previously enqueued
 659          * and in-flight on this or another timer queue.
 660          */
 661         if (!ratelimited && !slop_ratelimited) {
 662                 queue = timer_longterm_enqueue_unlocked(call, ctime, deadline, &old_queue, sdeadline, ttd, param1, flags);
 663         }
 664
 665         if (queue == NULL) {
 666                 queue = timer_queue_assign(deadline);
 667                 old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline, sdeadline, ttd, param1, flags);
 668         }
 669
 670 #if TIMER_TRACE
 671         TCE(call)->entry_time = ctime;
 672 #endif
 673
 674         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 675             DECR_TIMER_ENTER | DBG_FUNC_END,
 676             VM_KERNEL_UNSLIDE_OR_PERM(call),
 677             (old_queue != NULL), deadline, queue->count, 0);
 678
 679         splx(s);
 680
 681         return old_queue != NULL;
 682 }
 683
 684 /*
 685  * timer_call_*()
 686  *      return boolean indicating whether the call was previously queued.
 687  */
 688 boolean_t
 689 timer_call_enter(
 690         timer_call_t            call,
 691         uint64_t                deadline,
 692         uint32_t                flags)
 693 {
 694         return timer_call_enter_internal(call, NULL, deadline, 0, flags, FALSE);
 695 }
 696
 697 boolean_t
 698 timer_call_enter1(
 699         timer_call_t            call,
 700         timer_call_param_t      param1,
 701         uint64_t                deadline,
 702         uint32_t                flags)
 703 {
 704         return timer_call_enter_internal(call, param1, deadline, 0, flags, FALSE);
 705 }
 706
 707 boolean_t
 708 timer_call_enter_with_leeway(
 709         timer_call_t            call,
 710         timer_call_param_t      param1,
 711         uint64_t                deadline,
 712         uint64_t                leeway,
 713         uint32_t                flags,
 714         boolean_t               ratelimited)
 715 {
 716         return timer_call_enter_internal(call, param1, deadline, leeway, flags, ratelimited);
 717 }
 718
 719 boolean_t
 720 timer_call_quantum_timer_enter(
 721         timer_call_t            call,
 722         timer_call_param_t      param1,
 723         uint64_t                deadline,
 724         uint64_t                ctime)
 725 {
 726         assert(call->call_entry.func != NULL);
 727         assert(ml_get_interrupts_enabled() == FALSE);
 728
 729         uint32_t flags = TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL;
 730
 731         TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_START,
 732             VM_KERNEL_UNSLIDE_OR_PERM(call),
 733             VM_KERNEL_ADDRHIDE(param1), deadline,
 734             flags, 0);
 735
 736         if (__improbable(deadline < ctime)) {
 737                 deadline = past_deadline_timer_handle(deadline, ctime);
 738         }
 739
 740         uint64_t ttd = deadline - ctime;
 741 #if CONFIG_DTRACE
 742         DTRACE_TMR7(callout__create, timer_call_func_t, TCE(call)->func,
 743             timer_call_param_t, TCE(call)->param0, uint32_t, flags, 0,
 744             (ttd >> 32), (unsigned) (ttd & 0xFFFFFFFF), call);
 745 #endif
 746
 747         quantum_timer_set_deadline(deadline);
 748         TCE(call)->deadline = deadline;
 749         TCE(call)->param1 = param1;
 750         call->ttd = ttd;
 751         call->flags = flags;
 752
 753 #if TIMER_TRACE
 754         TCE(call)->entry_time = ctime;
 755 #endif
 756
 757         TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_END,
 758             VM_KERNEL_UNSLIDE_OR_PERM(call),
 759             1, deadline, 0, 0);
 760
 761         return true;
 762 }
 763
 764
 765 boolean_t
 766 timer_call_quantum_timer_cancel(
 767         timer_call_t            call)
 768 {
 769         assert(ml_get_interrupts_enabled() == FALSE);
 770
 771         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 772             DECR_TIMER_CANCEL | DBG_FUNC_START,
 773             VM_KERNEL_UNSLIDE_OR_PERM(call), TCE(call)->deadline,
 774             0, call->flags, 0);
 775
 776         TCE(call)->deadline = 0;
 777         quantum_timer_set_deadline(0);
 778
 779         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 780             DECR_TIMER_CANCEL | DBG_FUNC_END,
 781             VM_KERNEL_UNSLIDE_OR_PERM(call), 0,
 782             TCE(call)->deadline - mach_absolute_time(),
 783             TCE(call)->deadline - TCE(call)->entry_time, 0);
 784
 785 #if CONFIG_DTRACE
 786         DTRACE_TMR6(callout__cancel, timer_call_func_t, TCE(call)->func,
 787             timer_call_param_t, TCE(call)->param0, uint32_t, call->flags, 0,
 788             (call->ttd >> 32), (unsigned) (call->ttd & 0xFFFFFFFF));
 789 #endif
 790
 791         return true;
 792 }
 793
 794 boolean_t
 795 timer_call_cancel(
 796         timer_call_t            call)
 797 {
 798         mpqueue_head_t          *old_queue;
 799         spl_t                   s;
 800
 801         s = splclock();
 802
 803         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 804             DECR_TIMER_CANCEL | DBG_FUNC_START,
 805             VM_KERNEL_UNSLIDE_OR_PERM(call),
 806             TCE(call)->deadline, call->soft_deadline, call->flags, 0);
 807
 808         old_queue = timer_call_dequeue_unlocked(call);
 809
 810         if (old_queue != NULL) {
 811                 timer_queue_lock_spin(old_queue);
 812                 if (!queue_empty(&old_queue->head)) {
 813                         timer_queue_cancel(old_queue, TCE(call)->deadline, CE(queue_first(&old_queue->head))->deadline);
 814                         timer_call_t thead = (timer_call_t)queue_first(&old_queue->head);
 815                         old_queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline;
 816                 } else {
 817                         timer_queue_cancel(old_queue, TCE(call)->deadline, UINT64_MAX);
 818                         old_queue->earliest_soft_deadline = UINT64_MAX;
 819                 }
 820                 timer_queue_unlock(old_queue);
 821         }
 822         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 823             DECR_TIMER_CANCEL | DBG_FUNC_END,
 824             VM_KERNEL_UNSLIDE_OR_PERM(call),
 825             VM_KERNEL_UNSLIDE_OR_PERM(old_queue),
 826             TCE(call)->deadline - mach_absolute_time(),
 827             TCE(call)->deadline - TCE(call)->entry_time, 0);
 828         splx(s);
 829
 830 #if CONFIG_DTRACE
 831         DTRACE_TMR6(callout__cancel, timer_call_func_t, TCE(call)->func,
 832             timer_call_param_t, TCE(call)->param0, uint32_t, call->flags, 0,
 833             (call->ttd >> 32), (unsigned) (call->ttd & 0xFFFFFFFF));
 834 #endif
 835
 836         return old_queue != NULL;
 837 }
 838
 839 static uint32_t timer_queue_shutdown_lock_skips;
 840 static uint32_t timer_queue_shutdown_discarded;
 841
 842 void
 843 timer_queue_shutdown(
 844         mpqueue_head_t          *queue)
 845 {
 846         timer_call_t            call;
 847         mpqueue_head_t          *new_queue;
 848         spl_t                   s;
 849
 850
 851         DBG("timer_queue_shutdown(%p)\n", queue);
 852
 853         s = splclock();
 854
 855         /* Note comma operator in while expression re-locking each iteration */
 856         while ((void)timer_queue_lock_spin(queue), !queue_empty(&queue->head)) {
 857                 call = TIMER_CALL(queue_first(&queue->head));
 858
 859                 if (!simple_lock_try(&call->lock, LCK_GRP_NULL)) {
 860                         /*
 861                          * case (2b) lock order inversion, dequeue and skip
 862                          * Don't change the call_entry queue back-pointer
 863                          * but set the async_dequeue field.
 864                          */
 865                         timer_queue_shutdown_lock_skips++;
 866                         timer_call_entry_dequeue_async(call);
 867 #if TIMER_ASSERT
 868                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 869                             DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
 870                             VM_KERNEL_UNSLIDE_OR_PERM(call),
 871                             call->async_dequeue,
 872                             VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
 873                             0x2b, 0);
 874 #endif
 875                         timer_queue_unlock(queue);
 876                         continue;
 877                 }
 878
 879                 boolean_t call_local = ((call->flags & TIMER_CALL_LOCAL) != 0);
 880
 881                 /* remove entry from old queue */
 882                 timer_call_entry_dequeue(call);
 883                 timer_queue_unlock(queue);
 884
 885                 if (call_local == FALSE) {
 886                         /* and queue it on new, discarding LOCAL timers */
 887                         new_queue = timer_queue_assign(TCE(call)->deadline);
 888                         timer_queue_lock_spin(new_queue);
 889                         timer_call_entry_enqueue_deadline(
 890                                 call, new_queue, TCE(call)->deadline);
 891                         timer_queue_unlock(new_queue);
 892                 } else {
 893                         timer_queue_shutdown_discarded++;
 894                 }
 895
 896                 assert(call_local == FALSE);
 897                 simple_unlock(&call->lock);
 898         }
 899
 900         timer_queue_unlock(queue);
 901         splx(s);
 902 }
 903
 904
 905 void
 906 quantum_timer_expire(
 907         uint64_t                deadline)
 908 {
 909         processor_t processor = current_processor();
 910         timer_call_t call = TIMER_CALL(&(processor->quantum_timer));
 911
 912         if (__improbable(TCE(call)->deadline > deadline)) {
 913                 panic("CPU quantum timer deadlin out of sync with timer call deadline");
 914         }
 915
 916         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 917             DECR_TIMER_EXPIRE | DBG_FUNC_NONE,
 918             VM_KERNEL_UNSLIDE_OR_PERM(call),
 919             TCE(call)->deadline,
 920             TCE(call)->deadline,
 921             TCE(call)->entry_time, 0);
 922
 923         timer_call_func_t func = TCE(call)->func;
 924         timer_call_param_t param0 = TCE(call)->param0;
 925         timer_call_param_t param1 = TCE(call)->param1;
 926
 927         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 928             DECR_TIMER_CALLOUT | DBG_FUNC_START,
 929             VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
 930             VM_KERNEL_ADDRHIDE(param0),
 931             VM_KERNEL_ADDRHIDE(param1),
 932             0);
 933
 934 #if CONFIG_DTRACE
 935         DTRACE_TMR7(callout__start, timer_call_func_t, func,
 936             timer_call_param_t, param0, unsigned, call->flags,
 937             0, (call->ttd >> 32),
 938             (unsigned) (call->ttd & 0xFFFFFFFF), call);
 939 #endif
 940         (*func)(param0, param1);
 941
 942         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 943             DECR_TIMER_CALLOUT | DBG_FUNC_END,
 944             VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
 945             VM_KERNEL_ADDRHIDE(param0),
 946             VM_KERNEL_ADDRHIDE(param1),
 947             0);
 948 }
 949
 950 static uint32_t timer_queue_expire_lock_skips;
 951 uint64_t
 952 timer_queue_expire_with_options(
 953         mpqueue_head_t          *queue,
 954         uint64_t                deadline,
 955         boolean_t               rescan)
 956 {
 957         timer_call_t    call = NULL;
 958         uint32_t tc_iterations = 0;
 959         DBG("timer_queue_expire(%p,)\n", queue);
 960
 961         uint64_t cur_deadline = deadline;
 962         timer_queue_lock_spin(queue);
 963
 964         while (!queue_empty(&queue->head)) {
 965                 /* Upon processing one or more timer calls, refresh the
 966                  * deadline to account for time elapsed in the callout
 967                  */
 968                 if (++tc_iterations > 1) {
 969                         cur_deadline = mach_absolute_time();
 970                 }
 971
 972                 if (call == NULL) {
 973                         call = TIMER_CALL(queue_first(&queue->head));
 974                 }
 975
 976                 if (call->soft_deadline <= cur_deadline) {
 977                         timer_call_func_t               func;
 978                         timer_call_param_t              param0, param1;
 979
 980                         TCOAL_DEBUG(0xDDDD0000, queue->earliest_soft_deadline, call->soft_deadline, 0, 0, 0);
 981                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 982                             DECR_TIMER_EXPIRE | DBG_FUNC_NONE,
 983                             VM_KERNEL_UNSLIDE_OR_PERM(call),
 984                             call->soft_deadline,
 985                             TCE(call)->deadline,
 986                             TCE(call)->entry_time, 0);
 987
 988                         if ((call->flags & TIMER_CALL_RATELIMITED) &&
 989                             (TCE(call)->deadline > cur_deadline)) {
 990                                 if (rescan == FALSE) {
 991                                         break;
 992                                 }
 993                         }
 994
 995                         if (!simple_lock_try(&call->lock, LCK_GRP_NULL)) {
 996                                 /* case (2b) lock inversion, dequeue and skip */
 997                                 timer_queue_expire_lock_skips++;
 998                                 timer_call_entry_dequeue_async(call);
 999                                 call = NULL;
1000                                 continue;
1001                         }
1002
1003                         timer_call_entry_dequeue(call);
1004
1005                         func = TCE(call)->func;
1006                         param0 = TCE(call)->param0;
1007                         param1 = TCE(call)->param1;
1008
1009                         simple_unlock(&call->lock);
1010                         timer_queue_unlock(queue);
1011
1012                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1013                             DECR_TIMER_CALLOUT | DBG_FUNC_START,
1014                             VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
1015                             VM_KERNEL_ADDRHIDE(param0),
1016                             VM_KERNEL_ADDRHIDE(param1),
1017                             0);
1018
1019 #if CONFIG_DTRACE
1020                         DTRACE_TMR7(callout__start, timer_call_func_t, func,
1021                             timer_call_param_t, param0, unsigned, call->flags,
1022                             0, (call->ttd >> 32),
1023                             (unsigned) (call->ttd & 0xFFFFFFFF), call);
1024 #endif
1025                         /* Maintain time-to-deadline in per-processor data
1026                          * structure for thread wakeup deadline statistics.
1027                          */
1028                         uint64_t *ttdp = &(PROCESSOR_DATA(current_processor(), timer_call_ttd));
1029                         *ttdp = call->ttd;
1030                         (*func)(param0, param1);
1031                         *ttdp = 0;
1032 #if CONFIG_DTRACE
1033                         DTRACE_TMR4(callout__end, timer_call_func_t, func,
1034                             param0, param1, call);
1035 #endif
1036
1037                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1038                             DECR_TIMER_CALLOUT | DBG_FUNC_END,
1039                             VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
1040                             VM_KERNEL_ADDRHIDE(param0),
1041                             VM_KERNEL_ADDRHIDE(param1),
1042                             0);
1043                         call = NULL;
1044                         timer_queue_lock_spin(queue);
1045                 } else {
1046                         if (__probable(rescan == FALSE)) {
1047                                 break;
1048                         } else {
1049                                 int64_t skew = TCE(call)->deadline - call->soft_deadline;
1050                                 assert(TCE(call)->deadline >= call->soft_deadline);
1051
1052                                 /* DRK: On a latency quality-of-service level change,
1053                                  * re-sort potentially rate-limited timers. The platform
1054                                  * layer determines which timers require
1055                                  * this. In the absence of the per-callout
1056                                  * synchronization requirement, a global resort could
1057                                  * be more efficient. The re-sort effectively
1058                                  * annuls all timer adjustments, i.e. the "soft
1059                                  * deadline" is the sort key.
1060                                  */
1061
1062                                 if (timer_resort_threshold(skew)) {
1063                                         if (__probable(simple_lock_try(&call->lock, LCK_GRP_NULL))) {
1064                                                 timer_call_entry_dequeue(call);
1065                                                 timer_call_entry_enqueue_deadline(call, queue, call->soft_deadline);
1066                                                 simple_unlock(&call->lock);
1067                                                 call = NULL;
1068                                         }
1069                                 }
1070                                 if (call) {
1071                                         call = TIMER_CALL(queue_next(qe(call)));
1072                                         if (queue_end(&queue->head, qe(call))) {
1073                                                 break;
1074                                         }
1075                                 }
1076                         }
1077                 }
1078         }
1079
1080         if (!queue_empty(&queue->head)) {
1081                 call = TIMER_CALL(queue_first(&queue->head));
1082                 cur_deadline = TCE(call)->deadline;
1083                 queue->earliest_soft_deadline = (call->flags & TIMER_CALL_RATELIMITED) ? TCE(call)->deadline: call->soft_deadline;
1084         } else {
1085                 queue->earliest_soft_deadline = cur_deadline = UINT64_MAX;
1086         }
1087
1088         timer_queue_unlock(queue);
1089
1090         return cur_deadline;
1091 }
1092
1093 uint64_t
1094 timer_queue_expire(
1095         mpqueue_head_t          *queue,
1096         uint64_t                deadline)
1097 {
1098         return timer_queue_expire_with_options(queue, deadline, FALSE);
1099 }
1100
1101 extern int serverperfmode;
1102 static uint32_t timer_queue_migrate_lock_skips;
1103 /*
1104  * timer_queue_migrate() is called by timer_queue_migrate_cpu()
1105  * to move timer requests from the local processor (queue_from)
1106  * to a target processor's (queue_to).
1107  */
1108 int
1109 timer_queue_migrate(mpqueue_head_t *queue_from, mpqueue_head_t *queue_to)
1110 {
1111         timer_call_t    call;
1112         timer_call_t    head_to;
1113         int             timers_migrated = 0;
1114
1115         DBG("timer_queue_migrate(%p,%p)\n", queue_from, queue_to);
1116
1117         assert(!ml_get_interrupts_enabled());
1118         assert(queue_from != queue_to);
1119
1120         if (serverperfmode) {
1121                 /*
1122                  * if we're running a high end server
1123                  * avoid migrations... they add latency
1124                  * and don't save us power under typical
1125                  * server workloads
1126                  */
1127                 return -4;
1128         }
1129
1130         /*
1131          * Take both local (from) and target (to) timer queue locks while
1132          * moving the timers from the local queue to the target processor.
1133          * We assume that the target is always the boot processor.
1134          * But only move if all of the following is true:
1135          *  - the target queue is non-empty
1136          *  - the local queue is non-empty
1137          *  - the local queue's first deadline is later than the target's
1138          *  - the local queue contains no non-migrateable "local" call
1139          * so that we need not have the target resync.
1140          */
1141
1142         timer_queue_lock_spin(queue_to);
1143
1144         head_to = TIMER_CALL(queue_first(&queue_to->head));
1145         if (queue_empty(&queue_to->head)) {
1146                 timers_migrated = -1;
1147                 goto abort1;
1148         }
1149
1150         timer_queue_lock_spin(queue_from);
1151
1152         if (queue_empty(&queue_from->head)) {
1153                 timers_migrated = -2;
1154                 goto abort2;
1155         }
1156
1157         call = TIMER_CALL(queue_first(&queue_from->head));
1158         if (TCE(call)->deadline < TCE(head_to)->deadline) {
1159                 timers_migrated = 0;
1160                 goto abort2;
1161         }
1162
1163         /* perform scan for non-migratable timers */
1164         do {
1165                 if (call->flags & TIMER_CALL_LOCAL) {
1166                         timers_migrated = -3;
1167                         goto abort2;
1168                 }
1169                 call = TIMER_CALL(queue_next(qe(call)));
1170         } while (!queue_end(&queue_from->head, qe(call)));
1171
1172         /* migration loop itself -- both queues are locked */
1173         while (!queue_empty(&queue_from->head)) {
1174                 call = TIMER_CALL(queue_first(&queue_from->head));
1175                 if (!simple_lock_try(&call->lock, LCK_GRP_NULL)) {
1176                         /* case (2b) lock order inversion, dequeue only */
1177 #ifdef TIMER_ASSERT
1178                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1179                             DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
1180                             VM_KERNEL_UNSLIDE_OR_PERM(call),
1181                             VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
1182                             VM_KERNEL_UNSLIDE_OR_PERM(call->lock.interlock.lock_data),
1183                             0x2b, 0);
1184 #endif
1185                         timer_queue_migrate_lock_skips++;
1186                         timer_call_entry_dequeue_async(call);
1187                         continue;
1188                 }
1189                 timer_call_entry_dequeue(call);
1190                 timer_call_entry_enqueue_deadline(
1191                         call, queue_to, TCE(call)->deadline);
1192                 timers_migrated++;
1193                 simple_unlock(&call->lock);
1194         }
1195         queue_from->earliest_soft_deadline = UINT64_MAX;
1196 abort2:
1197         timer_queue_unlock(queue_from);
1198 abort1:
1199         timer_queue_unlock(queue_to);
1200
1201         return timers_migrated;
1202 }
1203
1204 void
1205 timer_queue_trace_cpu(int ncpu)
1206 {
1207         timer_call_nosync_cpu(
1208                 ncpu,
1209                 (void (*)(void *))timer_queue_trace,
1210                 (void*) timer_queue_cpu(ncpu));
1211 }
1212
1213 void
1214 timer_queue_trace(
1215         mpqueue_head_t                  *queue)
1216 {
1217         timer_call_t    call;
1218         spl_t           s;
1219
1220         if (!kdebug_enable) {
1221                 return;
1222         }
1223
1224         s = splclock();
1225         timer_queue_lock_spin(queue);
1226
1227         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1228             DECR_TIMER_QUEUE | DBG_FUNC_START,
1229             queue->count, mach_absolute_time(), 0, 0, 0);
1230
1231         if (!queue_empty(&queue->head)) {
1232                 call = TIMER_CALL(queue_first(&queue->head));
1233                 do {
1234                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1235                             DECR_TIMER_QUEUE | DBG_FUNC_NONE,
1236                             call->soft_deadline,
1237                             TCE(call)->deadline,
1238                             TCE(call)->entry_time,
1239                             VM_KERNEL_UNSLIDE(TCE(call)->func),
1240                             0);
1241                         call = TIMER_CALL(queue_next(qe(call)));
1242                 } while (!queue_end(&queue->head, qe(call)));
1243         }
1244
1245         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1246             DECR_TIMER_QUEUE | DBG_FUNC_END,
1247             queue->count, mach_absolute_time(), 0, 0, 0);
1248
1249         timer_queue_unlock(queue);
1250         splx(s);
1251 }
1252
1253 void
1254 timer_longterm_dequeued_locked(timer_call_t call)
1255 {
1256         timer_longterm_t        *tlp = &timer_longterm;
1257
1258         tlp->dequeues++;
1259         if (call == tlp->threshold.call) {
1260                 tlp->threshold.call = NULL;
1261         }
1262 }
1263
1264 /*
1265  * Place a timer call in the longterm list
1266  * and adjust the next timer callout deadline if the new timer is first.
1267  */
1268 mpqueue_head_t *
1269 timer_longterm_enqueue_unlocked(timer_call_t    call,
1270     uint64_t        now,
1271     uint64_t        deadline,
1272     mpqueue_head_t  **old_queue,
1273     uint64_t        soft_deadline,
1274     uint64_t        ttd,
1275     timer_call_param_t      param1,
1276     uint32_t        callout_flags)
1277 {
1278         timer_longterm_t        *tlp = &timer_longterm;
1279         boolean_t               update_required = FALSE;
1280         uint64_t                longterm_threshold;
1281
1282         longterm_threshold = now + tlp->threshold.interval;
1283
1284         /*
1285          * Return NULL without doing anything if:
1286          *  - this timer is local, or
1287          *  - the longterm mechanism is disabled, or
1288          *  - this deadline is too short.
1289          */
1290         if ((callout_flags & TIMER_CALL_LOCAL) != 0 ||
1291             (tlp->threshold.interval == TIMER_LONGTERM_NONE) ||
1292             (deadline <= longterm_threshold)) {
1293                 return NULL;
1294         }
1295
1296         /*
1297          * Remove timer from its current queue, if any.
1298          */
1299         *old_queue = timer_call_dequeue_unlocked(call);
1300
1301         /*
1302          * Lock the longterm queue, queue timer and determine
1303          * whether an update is necessary.
1304          */
1305         assert(!ml_get_interrupts_enabled());
1306         simple_lock(&call->lock, LCK_GRP_NULL);
1307         timer_queue_lock_spin(timer_longterm_queue);
1308         TCE(call)->deadline = deadline;
1309         TCE(call)->param1 = param1;
1310         call->ttd = ttd;
1311         call->soft_deadline = soft_deadline;
1312         call->flags = callout_flags;
1313         timer_call_entry_enqueue_tail(call, timer_longterm_queue);
1314
1315         tlp->enqueues++;
1316
1317         /*
1318          * We'll need to update the currently set threshold timer
1319          * if the new deadline is sooner and no sooner update is in flight.
1320          */
1321         if (deadline < tlp->threshold.deadline &&
1322             deadline < tlp->threshold.preempted) {
1323                 tlp->threshold.preempted = deadline;
1324                 tlp->threshold.call = call;
1325                 update_required = TRUE;
1326         }
1327         timer_queue_unlock(timer_longterm_queue);
1328         simple_unlock(&call->lock);
1329
1330         if (update_required) {
1331                 /*
1332                  * Note: this call expects that calling the master cpu
1333                  * alone does not involve locking the topo lock.
1334                  */
1335                 timer_call_nosync_cpu(
1336                         master_cpu,
1337                         (void (*)(void *))timer_longterm_update,
1338                         (void *)tlp);
1339         }
1340
1341         return timer_longterm_queue;
1342 }
1343
1344 /*
1345  * Scan for timers below the longterm threshold.
1346  * Move these to the local timer queue (of the boot processor on which the
1347  * calling thread is running).
1348  * Both the local (boot) queue and the longterm queue are locked.
1349  * The scan is similar to the timer migrate sequence but is performed by
1350  * successively examining each timer on the longterm queue:
1351  *  - if within the short-term threshold
1352  *    - enter on the local queue (unless being deleted),
1353  *  - otherwise:
1354  *    - if sooner, deadline becomes the next threshold deadline.
1355  * The total scan time is limited to TIMER_LONGTERM_SCAN_LIMIT. Should this be
1356  * exceeded, we abort and reschedule again so that we don't shut others from
1357  * the timer queues. Longterm timers firing late is not critical.
1358  */
1359 void
1360 timer_longterm_scan(timer_longterm_t    *tlp,
1361     uint64_t            time_start)
1362 {
1363         queue_entry_t   qe;
1364         timer_call_t    call;
1365         uint64_t        threshold;
1366         uint64_t        deadline;
1367         uint64_t        time_limit = time_start + tlp->scan_limit;
1368         mpqueue_head_t  *timer_master_queue;
1369
1370         assert(!ml_get_interrupts_enabled());
1371         assert(cpu_number() == master_cpu);
1372
1373         if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
1374                 threshold = time_start + tlp->threshold.interval;
1375         }
1376
1377         tlp->threshold.deadline = TIMER_LONGTERM_NONE;
1378         tlp->threshold.call = NULL;
1379
1380         if (queue_empty(&timer_longterm_queue->head)) {
1381                 return;
1382         }
1383
1384         timer_master_queue = timer_queue_cpu(master_cpu);
1385         timer_queue_lock_spin(timer_master_queue);
1386
1387         qe = queue_first(&timer_longterm_queue->head);
1388         while (!queue_end(&timer_longterm_queue->head, qe)) {
1389                 call = TIMER_CALL(qe);
1390                 deadline = call->soft_deadline;
1391                 qe = queue_next(qe);
1392                 if (!simple_lock_try(&call->lock, LCK_GRP_NULL)) {
1393                         /* case (2c) lock order inversion, dequeue only */
1394 #ifdef TIMER_ASSERT
1395                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1396                             DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
1397                             VM_KERNEL_UNSLIDE_OR_PERM(call),
1398                             VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
1399                             VM_KERNEL_UNSLIDE_OR_PERM(call->lock.interlock.lock_data),
1400                             0x2c, 0);
1401 #endif
1402                         timer_call_entry_dequeue_async(call);
1403                         continue;
1404                 }
1405                 if (deadline < threshold) {
1406                         /*
1407                          * This timer needs moving (escalating)
1408                          * to the local (boot) processor's queue.
1409                          */
1410 #ifdef TIMER_ASSERT
1411                         if (deadline < time_start) {
1412                                 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1413                                     DECR_TIMER_OVERDUE | DBG_FUNC_NONE,
1414                                     VM_KERNEL_UNSLIDE_OR_PERM(call),
1415                                     deadline,
1416                                     time_start,
1417                                     threshold,
1418                                     0);
1419                         }
1420 #endif
1421                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1422                             DECR_TIMER_ESCALATE | DBG_FUNC_NONE,
1423                             VM_KERNEL_UNSLIDE_OR_PERM(call),
1424                             TCE(call)->deadline,
1425                             TCE(call)->entry_time,
1426                             VM_KERNEL_UNSLIDE(TCE(call)->func),
1427                             0);
1428                         tlp->escalates++;
1429                         timer_call_entry_dequeue(call);
1430                         timer_call_entry_enqueue_deadline(
1431                                 call, timer_master_queue, TCE(call)->deadline);
1432                         /*
1433                          * A side-effect of the following call is to update
1434                          * the actual hardware deadline if required.
1435                          */
1436                         (void) timer_queue_assign(deadline);
1437                 } else {
1438                         if (deadline < tlp->threshold.deadline) {
1439                                 tlp->threshold.deadline = deadline;
1440                                 tlp->threshold.call = call;
1441                         }
1442                 }
1443                 simple_unlock(&call->lock);
1444
1445                 /* Abort scan if we're taking too long. */
1446                 if (mach_absolute_time() > time_limit) {
1447                         tlp->threshold.deadline = TIMER_LONGTERM_SCAN_AGAIN;
1448                         tlp->scan_pauses++;
1449                         DBG("timer_longterm_scan() paused %llu, qlen: %llu\n",
1450                             time_limit, tlp->queue.count);
1451                         break;
1452                 }
1453         }
1454
1455         timer_queue_unlock(timer_master_queue);
1456 }
1457
1458 void
1459 timer_longterm_callout(timer_call_param_t p0, __unused timer_call_param_t p1)
1460 {
1461         timer_longterm_t        *tlp = (timer_longterm_t *) p0;
1462
1463         timer_longterm_update(tlp);
1464 }
1465
1466 void
1467 timer_longterm_update_locked(timer_longterm_t *tlp)
1468 {
1469         uint64_t        latency;
1470
1471         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1472             DECR_TIMER_UPDATE | DBG_FUNC_START,
1473             VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
1474             tlp->threshold.deadline,
1475             tlp->threshold.preempted,
1476             tlp->queue.count, 0);
1477
1478         tlp->scan_time = mach_absolute_time();
1479         if (tlp->threshold.preempted != TIMER_LONGTERM_NONE) {
1480                 tlp->threshold.preempts++;
1481                 tlp->threshold.deadline = tlp->threshold.preempted;
1482                 tlp->threshold.preempted = TIMER_LONGTERM_NONE;
1483                 /*
1484                  * Note: in the unlikely event that a pre-empted timer has
1485                  * itself been cancelled, we'll simply re-scan later at the
1486                  * time of the preempted/cancelled timer.
1487                  */
1488         } else {
1489                 tlp->threshold.scans++;
1490
1491                 /*
1492                  * Maintain a moving average of our wakeup latency.
1493                  * Clamp latency to 0 and ignore above threshold interval.
1494                  */
1495                 if (tlp->scan_time > tlp->threshold.deadline_set) {
1496                         latency = tlp->scan_time - tlp->threshold.deadline_set;
1497                 } else {
1498                         latency = 0;
1499                 }
1500                 if (latency < tlp->threshold.interval) {
1501                         tlp->threshold.latency_min =
1502                             MIN(tlp->threshold.latency_min, latency);
1503                         tlp->threshold.latency_max =
1504                             MAX(tlp->threshold.latency_max, latency);
1505                         tlp->threshold.latency =
1506                             (tlp->threshold.latency * 99 + latency) / 100;
1507                 }
1508
1509                 timer_longterm_scan(tlp, tlp->scan_time);
1510         }
1511
1512         tlp->threshold.deadline_set = tlp->threshold.deadline;
1513         /* The next deadline timer to be set is adjusted */
1514         if (tlp->threshold.deadline != TIMER_LONGTERM_NONE &&
1515             tlp->threshold.deadline != TIMER_LONGTERM_SCAN_AGAIN) {
1516                 tlp->threshold.deadline_set -= tlp->threshold.margin;
1517                 tlp->threshold.deadline_set -= tlp->threshold.latency;
1518         }
1519
1520         /* Throttle next scan time */
1521         uint64_t scan_clamp = mach_absolute_time() + tlp->scan_interval;
1522         if (tlp->threshold.deadline_set < scan_clamp) {
1523                 tlp->threshold.deadline_set = scan_clamp;
1524         }
1525
1526         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1527             DECR_TIMER_UPDATE | DBG_FUNC_END,
1528             VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
1529             tlp->threshold.deadline,
1530             tlp->threshold.scans,
1531             tlp->queue.count, 0);
1532 }
1533
1534 void
1535 timer_longterm_update(timer_longterm_t *tlp)
1536 {
1537         spl_t   s = splclock();
1538
1539         timer_queue_lock_spin(timer_longterm_queue);
1540
1541         if (cpu_number() != master_cpu) {
1542                 panic("timer_longterm_update_master() on non-boot cpu");
1543         }
1544
1545         timer_longterm_update_locked(tlp);
1546
1547         if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
1548                 timer_call_enter(
1549                         &tlp->threshold.timer,
1550                         tlp->threshold.deadline_set,
1551                         TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
1552         }
1553
1554         timer_queue_unlock(timer_longterm_queue);
1555         splx(s);
1556 }
1557
1558 void
1559 timer_longterm_init(void)
1560 {
1561         uint32_t                longterm;
1562         timer_longterm_t        *tlp = &timer_longterm;
1563
1564         DBG("timer_longterm_init() tlp: %p, queue: %p\n", tlp, &tlp->queue);
1565
1566         /*
1567          * Set the longterm timer threshold. Defaults to TIMER_LONGTERM_THRESHOLD
1568          * or TIMER_LONGTERM_NONE (disabled) for server;
1569          * overridden longterm boot-arg
1570          */
1571         tlp->threshold.interval = serverperfmode ? TIMER_LONGTERM_NONE
1572             : TIMER_LONGTERM_THRESHOLD;
1573         if (PE_parse_boot_argn("longterm", &longterm, sizeof(longterm))) {
1574                 tlp->threshold.interval = (longterm == 0) ?
1575                     TIMER_LONGTERM_NONE :
1576                     longterm * NSEC_PER_MSEC;
1577         }
1578         if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
1579                 printf("Longterm timer threshold: %llu ms\n",
1580                     tlp->threshold.interval / NSEC_PER_MSEC);
1581                 kprintf("Longterm timer threshold: %llu ms\n",
1582                     tlp->threshold.interval / NSEC_PER_MSEC);
1583                 nanoseconds_to_absolutetime(tlp->threshold.interval,
1584                     &tlp->threshold.interval);
1585                 tlp->threshold.margin = tlp->threshold.interval / 10;
1586                 tlp->threshold.latency_min = EndOfAllTime;
1587                 tlp->threshold.latency_max = 0;
1588         }
1589
1590         tlp->threshold.preempted = TIMER_LONGTERM_NONE;
1591         tlp->threshold.deadline = TIMER_LONGTERM_NONE;
1592
1593         lck_attr_setdefault(&timer_longterm_lck_attr);
1594         lck_grp_attr_setdefault(&timer_longterm_lck_grp_attr);
1595         lck_grp_init(&timer_longterm_lck_grp,
1596             "timer_longterm", &timer_longterm_lck_grp_attr);
1597         mpqueue_init(&tlp->queue,
1598             &timer_longterm_lck_grp, &timer_longterm_lck_attr);
1599
1600         timer_call_setup(&tlp->threshold.timer,
1601             timer_longterm_callout, (timer_call_param_t) tlp);
1602
1603         timer_longterm_queue = &tlp->queue;
1604 }
1605
1606 enum {
1607         THRESHOLD, QCOUNT,
1608         ENQUEUES, DEQUEUES, ESCALATES, SCANS, PREEMPTS,
1609         LATENCY, LATENCY_MIN, LATENCY_MAX, SCAN_LIMIT, SCAN_INTERVAL, PAUSES
1610 };
1611 uint64_t
1612 timer_sysctl_get(int oid)
1613 {
1614         timer_longterm_t        *tlp = &timer_longterm;
1615
1616         switch (oid) {
1617         case THRESHOLD:
1618                 return (tlp->threshold.interval == TIMER_LONGTERM_NONE) ?
1619                        0 : tlp->threshold.interval / NSEC_PER_MSEC;
1620         case QCOUNT:
1621                 return tlp->queue.count;
1622         case ENQUEUES:
1623                 return tlp->enqueues;
1624         case DEQUEUES:
1625                 return tlp->dequeues;
1626         case ESCALATES:
1627                 return tlp->escalates;
1628         case SCANS:
1629                 return tlp->threshold.scans;
1630         case PREEMPTS:
1631                 return tlp->threshold.preempts;
1632         case LATENCY:
1633                 return tlp->threshold.latency;
1634         case LATENCY_MIN:
1635                 return tlp->threshold.latency_min;
1636         case LATENCY_MAX:
1637                 return tlp->threshold.latency_max;
1638         case SCAN_LIMIT:
1639                 return tlp->scan_limit;
1640         case SCAN_INTERVAL:
1641                 return tlp->scan_interval;
1642         case PAUSES:
1643                 return tlp->scan_pauses;
1644         default:
1645                 return 0;
1646         }
1647 }
1648
1649 /*
1650  * timer_master_scan() is the inverse of timer_longterm_scan()
1651  * since it un-escalates timers to the longterm queue.
1652  */
1653 static void
1654 timer_master_scan(timer_longterm_t      *tlp,
1655     uint64_t              now)
1656 {
1657         queue_entry_t   qe;
1658         timer_call_t    call;
1659         uint64_t        threshold;
1660         uint64_t        deadline;
1661         mpqueue_head_t  *timer_master_queue;
1662
1663         if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
1664                 threshold = now + tlp->threshold.interval;
1665         } else {
1666                 threshold = TIMER_LONGTERM_NONE;
1667         }
1668
1669         timer_master_queue = timer_queue_cpu(master_cpu);
1670         timer_queue_lock_spin(timer_master_queue);
1671
1672         qe = queue_first(&timer_master_queue->head);
1673         while (!queue_end(&timer_master_queue->head, qe)) {
1674                 call = TIMER_CALL(qe);
1675                 deadline = TCE(call)->deadline;
1676                 qe = queue_next(qe);
1677                 if ((call->flags & TIMER_CALL_LOCAL) != 0) {
1678                         continue;
1679                 }
1680                 if (!simple_lock_try(&call->lock, LCK_GRP_NULL)) {
1681                         /* case (2c) lock order inversion, dequeue only */
1682                         timer_call_entry_dequeue_async(call);
1683                         continue;
1684                 }
1685                 if (deadline > threshold) {
1686                         /* move from master to longterm */
1687                         timer_call_entry_dequeue(call);
1688                         timer_call_entry_enqueue_tail(call, timer_longterm_queue);
1689                         if (deadline < tlp->threshold.deadline) {
1690                                 tlp->threshold.deadline = deadline;
1691                                 tlp->threshold.call = call;
1692                         }
1693                 }
1694                 simple_unlock(&call->lock);
1695         }
1696         timer_queue_unlock(timer_master_queue);
1697 }
1698
1699 static void
1700 timer_sysctl_set_threshold(uint64_t value)
1701 {
1702         timer_longterm_t        *tlp = &timer_longterm;
1703         spl_t                   s = splclock();
1704         boolean_t               threshold_increase;
1705
1706         timer_queue_lock_spin(timer_longterm_queue);
1707
1708         timer_call_cancel(&tlp->threshold.timer);
1709
1710         /*
1711          * Set the new threshold and note whther it's increasing.
1712          */
1713         if (value == 0) {
1714                 tlp->threshold.interval = TIMER_LONGTERM_NONE;
1715                 threshold_increase = TRUE;
1716                 timer_call_cancel(&tlp->threshold.timer);
1717         } else {
1718                 uint64_t        old_interval = tlp->threshold.interval;
1719                 tlp->threshold.interval = value * NSEC_PER_MSEC;
1720                 nanoseconds_to_absolutetime(tlp->threshold.interval,
1721                     &tlp->threshold.interval);
1722                 tlp->threshold.margin = tlp->threshold.interval / 10;
1723                 if (old_interval == TIMER_LONGTERM_NONE) {
1724                         threshold_increase = FALSE;
1725                 } else {
1726                         threshold_increase = (tlp->threshold.interval > old_interval);
1727                 }
1728         }
1729
1730         if (threshold_increase /* or removal */) {
1731                 /* Escalate timers from the longterm queue */
1732                 timer_longterm_scan(tlp, mach_absolute_time());
1733         } else { /* decrease or addition  */
1734                 /*
1735                  * We scan the local/master queue for timers now longterm.
1736                  * To be strictly correct, we should scan all processor queues
1737                  * but timer migration results in most timers gravitating to the
1738                  * master processor in any case.
1739                  */
1740                 timer_master_scan(tlp, mach_absolute_time());
1741         }
1742
1743         /* Set new timer accordingly */
1744         tlp->threshold.deadline_set = tlp->threshold.deadline;
1745         if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
1746                 tlp->threshold.deadline_set -= tlp->threshold.margin;
1747                 tlp->threshold.deadline_set -= tlp->threshold.latency;
1748                 timer_call_enter(
1749                         &tlp->threshold.timer,
1750                         tlp->threshold.deadline_set,
1751                         TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
1752         }
1753
1754         /* Reset stats */
1755         tlp->enqueues = 0;
1756         tlp->dequeues = 0;
1757         tlp->escalates = 0;
1758         tlp->scan_pauses = 0;
1759         tlp->threshold.scans = 0;
1760         tlp->threshold.preempts = 0;
1761         tlp->threshold.latency = 0;
1762         tlp->threshold.latency_min = EndOfAllTime;
1763         tlp->threshold.latency_max = 0;
1764
1765         timer_queue_unlock(timer_longterm_queue);
1766         splx(s);
1767 }
1768
1769 int
1770 timer_sysctl_set(int oid, uint64_t value)
1771 {
1772         switch (oid) {
1773         case THRESHOLD:
1774                 timer_call_cpu(
1775                         master_cpu,
1776                         (void (*)(void *))timer_sysctl_set_threshold,
1777                         (void *) value);
1778                 return KERN_SUCCESS;
1779         case SCAN_LIMIT:
1780                 timer_longterm.scan_limit = value;
1781                 return KERN_SUCCESS;
1782         case SCAN_INTERVAL:
1783                 timer_longterm.scan_interval = value;
1784                 return KERN_SUCCESS;
1785         default:
1786                 return KERN_INVALID_ARGUMENT;
1787         }
1788 }
1789
1790
1791 /* Select timer coalescing window based on per-task quality-of-service hints */
1792 static boolean_t
1793 tcoal_qos_adjust(thread_t t, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited)
1794 {
1795         uint32_t latency_qos;
1796         boolean_t adjusted = FALSE;
1797         task_t ctask = t->task;
1798
1799         if (ctask) {
1800                 latency_qos = proc_get_effective_thread_policy(t, TASK_POLICY_LATENCY_QOS);
1801
1802                 assert(latency_qos <= NUM_LATENCY_QOS_TIERS);
1803
1804                 if (latency_qos) {
1805                         *tshift = tcoal_prio_params.latency_qos_scale[latency_qos - 1];
1806                         *tmax_abstime = tcoal_prio_params.latency_qos_abstime_max[latency_qos - 1];
1807                         *pratelimited = tcoal_prio_params.latency_tier_rate_limited[latency_qos - 1];
1808                         adjusted = TRUE;
1809                 }
1810         }
1811         return adjusted;
1812 }
1813
1814
1815 /* Adjust timer deadlines based on priority of the thread and the
1816  * urgency value provided at timeout establishment. With this mechanism,
1817  * timers are no longer necessarily sorted in order of soft deadline
1818  * on a given timer queue, i.e. they may be differentially skewed.
1819  * In the current scheme, this could lead to fewer pending timers
1820  * processed than is technically possible when the HW deadline arrives.
1821  */
1822 static void
1823 timer_compute_leeway(thread_t cthread, int32_t urgency, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited)
1824 {
1825         int16_t tpri = cthread->sched_pri;
1826         if ((urgency & TIMER_CALL_USER_MASK) != 0) {
1827                 if (tpri >= BASEPRI_RTQUEUES ||
1828                     urgency == TIMER_CALL_USER_CRITICAL) {
1829                         *tshift = tcoal_prio_params.timer_coalesce_rt_shift;
1830                         *tmax_abstime = tcoal_prio_params.timer_coalesce_rt_abstime_max;
1831                         TCOAL_PRIO_STAT(rt_tcl);
1832                 } else if (proc_get_effective_thread_policy(cthread, TASK_POLICY_DARWIN_BG) ||
1833                     (urgency == TIMER_CALL_USER_BACKGROUND)) {
1834                         /* Determine if timer should be subjected to a lower QoS */
1835                         if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
1836                                 if (*tmax_abstime > tcoal_prio_params.timer_coalesce_bg_abstime_max) {
1837                                         return;
1838                                 } else {
1839                                         *pratelimited = FALSE;
1840                                 }
1841                         }
1842                         *tshift = tcoal_prio_params.timer_coalesce_bg_shift;
1843                         *tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
1844                         TCOAL_PRIO_STAT(bg_tcl);
1845                 } else if (tpri >= MINPRI_KERNEL) {
1846                         *tshift = tcoal_prio_params.timer_coalesce_kt_shift;
1847                         *tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
1848                         TCOAL_PRIO_STAT(kt_tcl);
1849                 } else if (cthread->sched_mode == TH_MODE_FIXED) {
1850                         *tshift = tcoal_prio_params.timer_coalesce_fp_shift;
1851                         *tmax_abstime = tcoal_prio_params.timer_coalesce_fp_abstime_max;
1852                         TCOAL_PRIO_STAT(fp_tcl);
1853                 } else if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
1854                         TCOAL_PRIO_STAT(qos_tcl);
1855                 } else if (cthread->sched_mode == TH_MODE_TIMESHARE) {
1856                         *tshift = tcoal_prio_params.timer_coalesce_ts_shift;
1857                         *tmax_abstime = tcoal_prio_params.timer_coalesce_ts_abstime_max;
1858                         TCOAL_PRIO_STAT(ts_tcl);
1859                 } else {
1860                         TCOAL_PRIO_STAT(nc_tcl);
1861                 }
1862         } else if (urgency == TIMER_CALL_SYS_BACKGROUND) {
1863                 *tshift = tcoal_prio_params.timer_coalesce_bg_shift;
1864                 *tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
1865                 TCOAL_PRIO_STAT(bg_tcl);
1866         } else {
1867                 *tshift = tcoal_prio_params.timer_coalesce_kt_shift;
1868                 *tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
1869                 TCOAL_PRIO_STAT(kt_tcl);
1870         }
1871 }
1872
1873
1874 int timer_user_idle_level;
1875
1876 uint64_t
1877 timer_call_slop(uint64_t deadline, uint64_t now, uint32_t flags, thread_t cthread, boolean_t *pratelimited)
1878 {
1879         int32_t tcs_shift = 0;
1880         uint64_t tcs_max_abstime = 0;
1881         uint64_t adjval;
1882         uint32_t urgency = (flags & TIMER_CALL_URGENCY_MASK);
1883
1884         if (mach_timer_coalescing_enabled &&
1885             (deadline > now) && (urgency != TIMER_CALL_SYS_CRITICAL)) {
1886                 timer_compute_leeway(cthread, urgency, &tcs_shift, &tcs_max_abstime, pratelimited);
1887
1888                 if (tcs_shift >= 0) {
1889                         adjval =  MIN((deadline - now) >> tcs_shift, tcs_max_abstime);
1890                 } else {
1891                         adjval =  MIN((deadline - now) << (-tcs_shift), tcs_max_abstime);
1892                 }
1893                 /* Apply adjustments derived from "user idle level" heuristic */
1894                 adjval += (adjval * timer_user_idle_level) >> 7;
1895                 return adjval;
1896         } else {
1897                 return 0;
1898         }
1899 }
1900
1901 int
1902 timer_get_user_idle_level(void)
1903 {
1904         return timer_user_idle_level;
1905 }
1906
1907 kern_return_t
1908 timer_set_user_idle_level(int ilevel)
1909 {
1910         boolean_t do_reeval = FALSE;
1911
1912         if ((ilevel < 0) || (ilevel > 128)) {
1913                 return KERN_INVALID_ARGUMENT;
1914         }
1915
1916         if (ilevel < timer_user_idle_level) {
1917                 do_reeval = TRUE;
1918         }
1919
1920         timer_user_idle_level = ilevel;
1921
1922         if (do_reeval) {
1923                 ml_timer_evaluate();
1924         }
1925
1926         return KERN_SUCCESS;
1927 }