osfmk/kern/timer_call.c

   1 /*
   2  * Copyright (c) 1993-2008 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Timer interrupt callout module.
  30  */
  31
  32 #include <mach/mach_types.h>
  33
  34 #include <kern/clock.h>
  35 #include <kern/smp.h>
  36 #include <kern/processor.h>
  37 #include <kern/timer_call.h>
  38 #include <kern/timer_queue.h>
  39 #include <kern/call_entry.h>
  40 #include <kern/thread.h>
  41
  42 #include <sys/kdebug.h>
  43
  44 #if CONFIG_DTRACE
  45 #include <mach/sdt.h>
  46 #endif
  47
  48
  49 #if DEBUG
  50 #define TIMER_ASSERT    1
  51 #endif
  52
  53 //#define TIMER_ASSERT  1
  54 //#define TIMER_DBG     1
  55
  56 #if TIMER_DBG
  57 #define DBG(x...) kprintf("DBG: " x);
  58 #else
  59 #define DBG(x...)
  60 #endif
  61
  62 #if TIMER_TRACE
  63 #define TIMER_KDEBUG_TRACE      KERNEL_DEBUG_CONSTANT_IST
  64 #else
  65 #define TIMER_KDEBUG_TRACE(x...)
  66 #endif
  67
  68
  69 lck_grp_t               timer_call_lck_grp;
  70 lck_attr_t              timer_call_lck_attr;
  71 lck_grp_attr_t          timer_call_lck_grp_attr;
  72
  73 lck_grp_t               timer_longterm_lck_grp;
  74 lck_attr_t              timer_longterm_lck_attr;
  75 lck_grp_attr_t          timer_longterm_lck_grp_attr;
  76
  77 /* Timer queue lock must be acquired with interrupts disabled (under splclock()) */
  78 #if __SMP__
  79 #define timer_queue_lock_spin(queue)                                    \
  80         lck_mtx_lock_spin_always(&queue->lock_data)
  81
  82 #define timer_queue_unlock(queue)               \
  83         lck_mtx_unlock_always(&queue->lock_data)
  84 #else
  85 #define timer_queue_lock_spin(queue)    (void)1
  86 #define timer_queue_unlock(queue)               (void)1
  87 #endif
  88
  89 #define QUEUE(x)        ((queue_t)(x))
  90 #define MPQUEUE(x)      ((mpqueue_head_t *)(x))
  91 #define TIMER_CALL(x)   ((timer_call_t)(x))
  92 #define TCE(x)          (&(x->call_entry))
  93 /*
  94  * The longterm timer object is a global structure holding all timers
  95  * beyond the short-term, local timer queue threshold. The boot processor
  96  * is responsible for moving each timer to its local timer queue
  97  * if and when that timer becomes due within the threshold.
  98  */
  99 #define TIMER_LONGTERM_NONE             EndOfAllTime
 100 #if defined(__x86_64__)
 101 #define TIMER_LONGTERM_THRESHOLD        (1ULL * NSEC_PER_SEC)
 102 #else
 103 #define TIMER_LONGTERM_THRESHOLD        TIMER_LONGTERM_NONE
 104 #endif
 105
     /* Bookkeeping for the longterm timer threshold, plus its statistics. */
 106 typedef struct {
 107         uint64_t        interval;       /* longterm timer interval */
 108         uint64_t        margin;         /* fudge factor (10% of interval) */
 109         uint64_t        deadline;       /* first/soonest longterm deadline */
 110         uint64_t        preempted;      /* sooner timer has pre-empted */
 111         timer_call_t    call;           /* first/soonest longterm timer call */
 112         uint64_t        deadline_set;   /* next timer set */
 113         timer_call_data_t timer;        /* timer used by threshold management */
 114                                         /* Stats: */
 115         uint64_t        scans;          /*   num threshold timer scans */
 116         uint64_t        preempts;       /*   num threshold reductions */
 117         uint64_t        latency;        /*   average threshold latency */
 118         uint64_t        latency_min;    /*   minimum threshold latency */
 119         uint64_t        latency_max;    /*   maximum threshold latency */
 120 } threshold_t;
 121
     /* Longterm timer state: the timer list, its counters and the threshold data. */
 122 typedef struct {
 123         mpqueue_head_t  queue;          /* longterm timer list */
 124         uint64_t        enqueues;       /* num timers queued */
 125         uint64_t        dequeues;       /* num timers dequeued */
 126         uint64_t        escalates;      /* num timers becoming shortterm */
 127         uint64_t        scan_time;      /* last time the list was scanned */
 128         threshold_t     threshold;      /* longterm timer threshold */
 129 } timer_longterm_t;
 130
 131 timer_longterm_t                timer_longterm;
 132
 133 static mpqueue_head_t           *timer_longterm_queue = NULL;
 134
 135 static void                     timer_longterm_init(void);
 136 static void                     timer_longterm_callout(
 137                                         timer_call_param_t      p0,
 138                                         timer_call_param_t      p1);
 139 extern void                     timer_longterm_scan(
 140                                         timer_longterm_t        *tlp,
 141                                         uint64_t                now);
 142 static void                     timer_longterm_update(
 143                                         timer_longterm_t *tlp);
 144 static void                     timer_longterm_update_locked(
 145                                         timer_longterm_t *tlp);
 146 static mpqueue_head_t *         timer_longterm_enqueue_unlocked(
 147                                         timer_call_t            call,
 148                                         uint64_t                now,
 149                                         uint64_t                deadline,
 150                                         mpqueue_head_t **       old_queue,
 151                                         uint64_t                soft_deadline,
 152                                         uint64_t                ttd,
 153                                         timer_call_param_t      param1,
 154                                         uint32_t                callout_flags);
 155 static void                     timer_longterm_dequeued_locked(
 156                                         timer_call_t            call);
 157
 158 uint64_t past_deadline_timers;
 159 uint64_t past_deadline_deltas;
 160 uint64_t past_deadline_longest;
 161 uint64_t past_deadline_shortest = ~0ULL;
 162 enum {PAST_DEADLINE_TIMER_ADJUSTMENT_NS = 10 * 1000};
 163
 164 uint64_t past_deadline_timer_adjustment;
 165
 166 static boolean_t timer_call_enter_internal(timer_call_t call, timer_call_param_t param1, uint64_t deadline, uint64_t leeway, uint32_t flags, boolean_t ratelimited);
 167 boolean_t       mach_timer_coalescing_enabled = TRUE;
 168
 169 mpqueue_head_t  *timer_call_enqueue_deadline_unlocked(
 170                         timer_call_t            call,
 171                         mpqueue_head_t          *queue,
 172                         uint64_t                deadline,
 173                         uint64_t                soft_deadline,
 174                         uint64_t                ttd,
 175                         timer_call_param_t      param1,
 176                         uint32_t                flags);
 177
 178 mpqueue_head_t  *timer_call_dequeue_unlocked(
 179                         timer_call_t            call);
 180
 181 timer_coalescing_priority_params_t tcoal_prio_params;
 182
 183 #if TCOAL_PRIO_STATS
 184 int32_t nc_tcl, rt_tcl, bg_tcl, kt_tcl, fp_tcl, ts_tcl, qos_tcl;
 185 #define TCOAL_PRIO_STAT(x) (x++)
 186 #else
 187 #define TCOAL_PRIO_STAT(x)
 188 #endif
 189
 190 static void
 191 timer_call_init_abstime(void)
 192 {
 193         int i;
 194         uint64_t result;
 195         timer_coalescing_priority_params_ns_t * tcoal_prio_params_init = timer_call_get_priority_params();
 196         nanoseconds_to_absolutetime(PAST_DEADLINE_TIMER_ADJUSTMENT_NS, &past_deadline_timer_adjustment);
 197         nanoseconds_to_absolutetime(tcoal_prio_params_init->idle_entry_timer_processing_hdeadline_threshold_ns, &result);
 198         tcoal_prio_params.idle_entry_timer_processing_hdeadline_threshold_abstime = (uint32_t)result;
 199         nanoseconds_to_absolutetime(tcoal_prio_params_init->interrupt_timer_coalescing_ilat_threshold_ns, &result);
 200         tcoal_prio_params.interrupt_timer_coalescing_ilat_threshold_abstime = (uint32_t)result;
 201         nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_resort_threshold_ns, &result);
 202         tcoal_prio_params.timer_resort_threshold_abstime = (uint32_t)result;
 203         tcoal_prio_params.timer_coalesce_rt_shift = tcoal_prio_params_init->timer_coalesce_rt_shift;
 204         tcoal_prio_params.timer_coalesce_bg_shift = tcoal_prio_params_init->timer_coalesce_bg_shift;
 205         tcoal_prio_params.timer_coalesce_kt_shift = tcoal_prio_params_init->timer_coalesce_kt_shift;
 206         tcoal_prio_params.timer_coalesce_fp_shift = tcoal_prio_params_init->timer_coalesce_fp_shift;
 207         tcoal_prio_params.timer_coalesce_ts_shift = tcoal_prio_params_init->timer_coalesce_ts_shift;
 208
 209         nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_rt_ns_max,
 210             &tcoal_prio_params.timer_coalesce_rt_abstime_max);
 211         nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_bg_ns_max,
 212             &tcoal_prio_params.timer_coalesce_bg_abstime_max);
 213         nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_kt_ns_max,
 214             &tcoal_prio_params.timer_coalesce_kt_abstime_max);
 215         nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_fp_ns_max,
 216             &tcoal_prio_params.timer_coalesce_fp_abstime_max);
 217         nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_ts_ns_max,
 218             &tcoal_prio_params.timer_coalesce_ts_abstime_max);
 219
 220         for (i = 0; i < NUM_LATENCY_QOS_TIERS; i++) {
 221                 tcoal_prio_params.latency_qos_scale[i] = tcoal_prio_params_init->latency_qos_scale[i];
 222                 nanoseconds_to_absolutetime(tcoal_prio_params_init->latency_qos_ns_max[i],
 223                     &tcoal_prio_params.latency_qos_abstime_max[i]);
 224                 tcoal_prio_params.latency_tier_rate_limited[i] = tcoal_prio_params_init->latency_tier_rate_limited[i];
 225         }
 226 }
 227
 228
 229 void
 230 timer_call_init(void)
 231 {
 232         lck_attr_setdefault(&timer_call_lck_attr);
 233         lck_grp_attr_setdefault(&timer_call_lck_grp_attr);
 234         lck_grp_init(&timer_call_lck_grp, "timer_call", &timer_call_lck_grp_attr);
 235
 236         timer_longterm_init();
 237         timer_call_init_abstime();
 238 }
 239
 240
 241 void
 242 timer_call_queue_init(mpqueue_head_t *queue)
 243 {
 244         DBG("timer_call_queue_init(%p)\n", queue);
 245         mpqueue_init(queue, &timer_call_lck_grp, &timer_call_lck_attr);
 246 }
 247
 248
 249 void
 250 timer_call_setup(
 251         timer_call_t                    call,
 252         timer_call_func_t               func,
 253         timer_call_param_t              param0)
 254 {
 255         DBG("timer_call_setup(%p,%p,%p)\n", call, func, param0);
 256         call_entry_setup(TCE(call), func, param0);
 257         simple_lock_init(&(call)->lock, 0);
 258         call->async_dequeue = FALSE;
 259 }
 260 #if TIMER_ASSERT
 261 static __inline__ mpqueue_head_t *
 262 timer_call_entry_dequeue(
 263         timer_call_t            entry)
 264 {
 265         mpqueue_head_t  *old_queue = MPQUEUE(TCE(entry)->queue);
 266
 267         if (!hw_lock_held((hw_lock_t)&entry->lock))
 268                 panic("_call_entry_dequeue() "
 269                         "entry %p is not locked\n", entry);
 270         /*
 271          * XXX The queue lock is actually a mutex in spin mode
 272          *     but there's no way to test for it being held
 273          *     so we pretend it's a spinlock!
 274          */
 275         if (!hw_lock_held((hw_lock_t)&old_queue->lock_data))
 276                 panic("_call_entry_dequeue() "
 277                         "queue %p is not locked\n", old_queue);
 278
 279         call_entry_dequeue(TCE(entry));
 280         old_queue->count--;
 281
 282         return (old_queue);
 283 }
 284
 285 static __inline__ mpqueue_head_t *
 286 timer_call_entry_enqueue_deadline(
 287         timer_call_t            entry,
 288         mpqueue_head_t          *queue,
 289         uint64_t                deadline)
 290 {
 291         mpqueue_head_t  *old_queue = MPQUEUE(TCE(entry)->queue);
 292
 293         if (!hw_lock_held((hw_lock_t)&entry->lock))
 294                 panic("_call_entry_enqueue_deadline() "
 295                         "entry %p is not locked\n", entry);
 296         /* XXX More lock pretense:  */
 297         if (!hw_lock_held((hw_lock_t)&queue->lock_data))
 298                 panic("_call_entry_enqueue_deadline() "
 299                         "queue %p is not locked\n", queue);
 300         if (old_queue != NULL && old_queue != queue)
 301                 panic("_call_entry_enqueue_deadline() "
 302                         "old_queue %p != queue", old_queue);
 303
 304         call_entry_enqueue_deadline(TCE(entry), QUEUE(queue), deadline);
 305
 306 /* For efficiency, track the earliest soft deadline on the queue, so that
 307  * fuzzy decisions can be made without lock acquisitions.
 308  */
 309         timer_call_t thead = (timer_call_t)queue_first(&queue->head);
 310
 311         queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline;
 312
 313         if (old_queue)
 314                 old_queue->count--;
 315         queue->count++;
 316
 317         return (old_queue);
 318 }
 319
 320 #else
 321
 322 static __inline__ mpqueue_head_t *
 323 timer_call_entry_dequeue(
 324         timer_call_t            entry)
 325 {
 326         mpqueue_head_t  *old_queue = MPQUEUE(TCE(entry)->queue);
 327
 328         call_entry_dequeue(TCE(entry));
 329         old_queue->count--;
 330
 331         return old_queue;
 332 }
 333
 334 static __inline__ mpqueue_head_t *
 335 timer_call_entry_enqueue_deadline(
 336         timer_call_t                    entry,
 337         mpqueue_head_t                  *queue,
 338         uint64_t                        deadline)
 339 {
 340         mpqueue_head_t  *old_queue = MPQUEUE(TCE(entry)->queue);
 341
 342         call_entry_enqueue_deadline(TCE(entry), QUEUE(queue), deadline);
 343
 344         /* For efficiency, track the earliest soft deadline on the queue,
 345          * so that fuzzy decisions can be made without lock acquisitions.
 346          */
 347
 348         timer_call_t thead = (timer_call_t)queue_first(&queue->head);
 349         queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline;
 350
 351         if (old_queue)
 352                 old_queue->count--;
 353         queue->count++;
 354
 355         return old_queue;
 356 }
 357
 358 #endif
 359
 360 static __inline__ void
 361 timer_call_entry_enqueue_tail(
 362         timer_call_t                    entry,
 363         mpqueue_head_t                  *queue)
 364 {
 365         call_entry_enqueue_tail(TCE(entry), QUEUE(queue));
 366         queue->count++;
 367         return;
 368 }
 369
 370 /*
 371  * Remove timer entry from its queue but don't change the queue pointer
 372  * and set the async_dequeue flag. This is locking case 2b.
 373  */
 374 static __inline__ void
 375 timer_call_entry_dequeue_async(
 376         timer_call_t            entry)
 377 {
 378         mpqueue_head_t  *old_queue = MPQUEUE(TCE(entry)->queue);
 379         if (old_queue) {
 380                 old_queue->count--;
 381                 (void) remque(qe(entry));
 382                 entry->async_dequeue = TRUE;
 383         }
 384         return;
 385 }
 386
 387 #if TIMER_ASSERT
 388 unsigned timer_call_enqueue_deadline_unlocked_async1;
 389 unsigned timer_call_enqueue_deadline_unlocked_async2;
 390 #endif
 391 /*
 392  * Assumes call_entry and queues unlocked, interrupts disabled.
 393  */
 394 __inline__ mpqueue_head_t *
 395 timer_call_enqueue_deadline_unlocked(
 396         timer_call_t                    call,
 397         mpqueue_head_t                  *queue,
 398         uint64_t                        deadline,
 399         uint64_t                        soft_deadline,
 400         uint64_t                        ttd,
 401         timer_call_param_t              param1,
 402         uint32_t                        callout_flags)
 403 {
 404         call_entry_t    entry = TCE(call);
 405         mpqueue_head_t  *old_queue;
 406
 407         DBG("timer_call_enqueue_deadline_unlocked(%p,%p,)\n", call, queue);
 408
 409         simple_lock(&call->lock);
 410
 411         old_queue = MPQUEUE(entry->queue);
 412
 413         if (old_queue != NULL) {
 414                 timer_queue_lock_spin(old_queue);
 415                 if (call->async_dequeue) {
 416                         /* collision (1c): timer already dequeued, clear flag */
 417 #if TIMER_ASSERT
 418                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 419                                 DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
 420                                 call,
 421                                 call->async_dequeue,
 422                                 TCE(call)->queue,
 423                                 0x1c, 0);
 424                         timer_call_enqueue_deadline_unlocked_async1++;
 425 #endif
 426                         call->async_dequeue = FALSE;
 427                         entry->queue = NULL;
 428                 } else if (old_queue != queue) {
 429                         timer_call_entry_dequeue(call);
 430 #if TIMER_ASSERT
 431                         timer_call_enqueue_deadline_unlocked_async2++;
 432 #endif
 433                 }
 434                 if (old_queue == timer_longterm_queue)
 435                         timer_longterm_dequeued_locked(call);
 436                 if (old_queue != queue) {
 437                         timer_queue_unlock(old_queue);
 438                         timer_queue_lock_spin(queue);
 439                 }
 440         } else {
 441                 timer_queue_lock_spin(queue);
 442         }
 443
 444         call->soft_deadline = soft_deadline;
 445         call->flags = callout_flags;
 446         TCE(call)->param1 = param1;
 447         call->ttd = ttd;
 448
 449         timer_call_entry_enqueue_deadline(call, queue, deadline);
 450         timer_queue_unlock(queue);
 451         simple_unlock(&call->lock);
 452
 453         return (old_queue);
 454 }
 455
 456 #if TIMER_ASSERT
 457 unsigned timer_call_dequeue_unlocked_async1;
 458 unsigned timer_call_dequeue_unlocked_async2;
 459 #endif
 460 mpqueue_head_t *
 461 timer_call_dequeue_unlocked(
 462         timer_call_t            call)
 463 {
 464         call_entry_t    entry = TCE(call);
 465         mpqueue_head_t  *old_queue;
 466
 467         DBG("timer_call_dequeue_unlocked(%p)\n", call);
 468
 469         simple_lock(&call->lock);
 470         old_queue = MPQUEUE(entry->queue);
 471 #if TIMER_ASSERT
 472         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 473                 DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
 474                 call,
 475                 call->async_dequeue,
 476                 TCE(call)->queue,
 477                 0, 0);
 478 #endif
 479         if (old_queue != NULL) {
 480                 timer_queue_lock_spin(old_queue);
 481                 if (call->async_dequeue) {
 482                         /* collision (1c): timer already dequeued, clear flag */
 483 #if TIMER_ASSERT
 484                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 485                                 DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
 486                                 call,
 487                                 call->async_dequeue,
 488                                 TCE(call)->queue,
 489                                 0x1c, 0);
 490                         timer_call_dequeue_unlocked_async1++;
 491 #endif
 492                         call->async_dequeue = FALSE;
 493                         entry->queue = NULL;
 494                 } else {
 495                         timer_call_entry_dequeue(call);
 496                 }
 497                 if (old_queue == timer_longterm_queue)
 498                         timer_longterm_dequeued_locked(call);
 499                 timer_queue_unlock(old_queue);
 500         }
 501         simple_unlock(&call->lock);
 502         return (old_queue);
 503 }
 504
 505
 506 /*
 507  * Timer call entry locking model
 508  * ==============================
 509  *
 510  * Timer call entries are linked on per-cpu timer queues which are protected
 511  * by the queue lock and the call entry lock. The locking protocol is:
 512  *
 513  *  0) The canonical locking order is timer call entry followed by queue.
 514  *
 515  *  1) With only the entry lock held, entry.queue is valid:
 516  *    1a) NULL: the entry is not queued, or
 517  *    1b) non-NULL: this queue must be locked before the entry is modified.
 518  *        After locking the queue, the call.async_dequeue flag must be checked:
 519  *    1c) TRUE: the entry was removed from the queue by another thread
 520  *              and we must NULL the entry.queue and reset this flag, or
 521  *    1d) FALSE: (ie. queued), the entry can be manipulated.
 522  *
 523  *  2) If a queue lock is obtained first, the queue is stable:
 524  *    2a) If a try-lock of a queued entry succeeds, the call can be operated on
 525  *        and dequeued.
 526  *    2b) If a try-lock fails, it indicates that another thread is attempting
 527  *        to change the entry and move it to a different position in this queue
 528  *        or to a different queue. The entry can be dequeued but it should not be
 529  *        operated upon since it is being changed. Furthermore, we don't null
 530  *        the entry.queue pointer (protected by the entry lock we don't own).
 531  *        Instead, we set the async_dequeue flag -- see (1c).
 532  *    2c) Same as 2b but occurring when a longterm timer is matured.
 533  *  3) A callout's parameters (deadline, flags, parameters, soft deadline &c.)
 534  *     should be manipulated with the appropriate timer queue lock held,
 535  *     to prevent queue traversal observations from observing inconsistent
 536  *     updates to an in-flight callout.
 537  */
 538
 539 /*
 540  * Inlines timer_call_entry_dequeue() and timer_call_entry_enqueue_deadline()
 541  * cast between pointer types (mpqueue_head_t *) and (queue_t) so that
 542  * we can use the call_entry_dequeue() and call_entry_enqueue_deadline()
 543  * methods to operate on timer_call structs as if they are call_entry structs.
 544  * These structures are identical except for their queue head pointer fields.
 545  *
 546  * In the debug case, we assert that the timer call locking protocol
 547  * is being obeyed.
 548  */
 549
 550 static boolean_t
 551 timer_call_enter_internal(
 552         timer_call_t            call,
 553         timer_call_param_t      param1,
 554         uint64_t                deadline,
 555         uint64_t                leeway,
 556         uint32_t                flags,
 557         boolean_t               ratelimited)
 558 {
 559         mpqueue_head_t          *queue = NULL;
 560         mpqueue_head_t          *old_queue;
 561         spl_t                   s;
 562         uint64_t                slop;
 563         uint32_t                urgency;
 564         uint64_t                sdeadline, ttd;
 565
 566         s = splclock();
 567
 568         sdeadline = deadline;
 569         uint64_t ctime = mach_absolute_time();
 570
 571         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 572                 DECR_TIMER_ENTER | DBG_FUNC_START,
 573                 call,
 574                 param1, deadline, flags, 0);
 575
 576         urgency = (flags & TIMER_CALL_URGENCY_MASK);
 577
 578         boolean_t slop_ratelimited = FALSE;
 579         slop = timer_call_slop(deadline, ctime, urgency, current_thread(), &slop_ratelimited);
 580
 581         if ((flags & TIMER_CALL_LEEWAY) != 0 && leeway > slop)
 582                 slop = leeway;
 583
 584         if (UINT64_MAX - deadline <= slop) {
 585                 deadline = UINT64_MAX;
 586         } else {
 587                 deadline += slop;
 588         }
 589
 590         if (__improbable(deadline < ctime)) {
 591                 uint64_t delta = (ctime - deadline);
 592
 593                 past_deadline_timers++;
 594                 past_deadline_deltas += delta;
 595                 if (delta > past_deadline_longest)
 596                         past_deadline_longest = deadline;
 597                 if (delta < past_deadline_shortest)
 598                         past_deadline_shortest = delta;
 599
 600                 deadline = ctime + past_deadline_timer_adjustment;
 601                 sdeadline = deadline;
 602         }
 603
 604         if (ratelimited || slop_ratelimited) {
 605                 flags |= TIMER_CALL_RATELIMITED;
 606         } else {
 607                 flags &= ~TIMER_CALL_RATELIMITED;
 608         }
 609
 610         ttd =  sdeadline - ctime;
 611 #if CONFIG_DTRACE
 612         DTRACE_TMR7(callout__create, timer_call_func_t, TCE(call)->func,
 613         timer_call_param_t, TCE(call)->param0, uint32_t, flags,
 614             (deadline - sdeadline),
 615             (ttd >> 32), (unsigned) (ttd & 0xFFFFFFFF), call);
 616 #endif
 617
 618         /* Program timer callout parameters under the appropriate per-CPU or
 619          * longterm queue lock. The callout may have been previously enqueued
 620          * and in-flight on this or another timer queue.
 621          */
 622         if (!ratelimited && !slop_ratelimited) {
 623                 queue = timer_longterm_enqueue_unlocked(call, ctime, deadline, &old_queue, sdeadline, ttd, param1, flags);
 624         }
 625
 626         if (queue == NULL) {
 627                 queue = timer_queue_assign(deadline);
 628                 old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline, sdeadline, ttd, param1, flags);
 629         }
 630
 631 #if TIMER_TRACE
 632         TCE(call)->entry_time = ctime;
 633 #endif
 634
 635         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 636                 DECR_TIMER_ENTER | DBG_FUNC_END,
 637                 call,
 638                 (old_queue != NULL), deadline, queue->count, 0);
 639
 640         splx(s);
 641
 642         return (old_queue != NULL);
 643 }
 644
 645 /*
 646  * timer_call_*()
 647  *      return boolean indicating whether the call was previously queued.
 648  */
 649 boolean_t
 650 timer_call_enter(
 651         timer_call_t            call,
 652         uint64_t                deadline,
 653         uint32_t                flags)
 654 {
 655         return timer_call_enter_internal(call, NULL, deadline, 0, flags, FALSE);
 656 }
 657
 658 boolean_t
 659 timer_call_enter1(
 660         timer_call_t            call,
 661         timer_call_param_t      param1,
 662         uint64_t                deadline,
 663         uint32_t                flags)
 664 {
 665         return timer_call_enter_internal(call, param1, deadline, 0, flags, FALSE);
 666 }
 667
 668 boolean_t
 669 timer_call_enter_with_leeway(
 670         timer_call_t            call,
 671         timer_call_param_t      param1,
 672         uint64_t                deadline,
 673         uint64_t                leeway,
 674         uint32_t                flags,
 675         boolean_t               ratelimited)
 676 {
 677         return timer_call_enter_internal(call, param1, deadline, leeway, flags, ratelimited);
 678 }
 679
 680 boolean_t
 681 timer_call_cancel(
 682         timer_call_t            call)
 683 {
 684         mpqueue_head_t          *old_queue;
 685         spl_t                   s;
 686
 687         s = splclock();
 688
 689         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 690                 DECR_TIMER_CANCEL | DBG_FUNC_START,
 691                 call,
 692                 TCE(call)->deadline, call->soft_deadline, call->flags, 0);
 693
 694         old_queue = timer_call_dequeue_unlocked(call);
 695
 696         if (old_queue != NULL) {
 697                 timer_queue_lock_spin(old_queue);
 698                 if (!queue_empty(&old_queue->head)) {
 699                         timer_queue_cancel(old_queue, TCE(call)->deadline, CE(queue_first(&old_queue->head))->deadline);
 700                         timer_call_t thead = (timer_call_t)queue_first(&old_queue->head);
 701                         old_queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline;
 702                 }
 703                 else {
 704                         timer_queue_cancel(old_queue, TCE(call)->deadline, UINT64_MAX);
 705                         old_queue->earliest_soft_deadline = UINT64_MAX;
 706                 }
 707                 timer_queue_unlock(old_queue);
 708         }
 709         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 710                 DECR_TIMER_CANCEL | DBG_FUNC_END,
 711                 call,
 712                 old_queue,
 713                 TCE(call)->deadline - mach_absolute_time(),
 714                 TCE(call)->deadline - TCE(call)->entry_time, 0);
 715         splx(s);
 716
 717 #if CONFIG_DTRACE
 718         DTRACE_TMR6(callout__cancel, timer_call_func_t, TCE(call)->func,
 719             timer_call_param_t, TCE(call)->param0, uint32_t, call->flags, 0,
 720             (call->ttd >> 32), (unsigned) (call->ttd & 0xFFFFFFFF));
 721 #endif
 722
 723         return (old_queue != NULL);
 724 }
 725
 726 static uint32_t timer_queue_shutdown_lock_skips;
 727 static uint32_t timer_queue_shutdown_discarded;
 728
     /*
      * timer_queue_shutdown() drains every timer call from `queue`.
      *
      * Each entry is removed while the queue lock is held.  Entries without
      * TIMER_CALL_LOCAL are re-queued, at their existing deadline, on the
      * queue returned by timer_queue_assign(); entries with TIMER_CALL_LOCAL
      * are dropped and counted in timer_queue_shutdown_discarded.
      *
      * When an entry's own lock cannot be taken immediately, the entry is
      * removed with timer_call_entry_dequeue_async() instead, counted in
      * timer_queue_shutdown_lock_skips, and not re-queued here.
      *
      * The whole loop runs between splclock() and splx().
      */
 729 void
 730 timer_queue_shutdown(
 731         mpqueue_head_t          *queue)
 732 {
 733         timer_call_t            call;
 734         mpqueue_head_t          *new_queue;
 735         spl_t                   s;
 736
 737
 738         DBG("timer_queue_shutdown(%p)\n", queue);
 739
 740         s = splclock();
 741
 742         /* Note comma operator in while expression re-locking each iteration */
 743         while (timer_queue_lock_spin(queue), !queue_empty(&queue->head)) {
 744                 call = TIMER_CALL(queue_first(&queue->head));
 745
 746                 if (!simple_lock_try(&call->lock)) {
 747                         /*
 748                          * case (2b) lock order inversion, dequeue and skip
 749                          * Don't change the call_entry queue back-pointer
 750                          * but set the async_dequeue field.
 751                          */
 752                         timer_queue_shutdown_lock_skips++;
 753                         timer_call_entry_dequeue_async(call);
 754 #if TIMER_ASSERT
 755                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 756                                 DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
 757                                 call,
 758                                 call->async_dequeue,
 759                                 TCE(call)->queue,
 760                                 0x2b, 0);
 761 #endif
 762                         timer_queue_unlock(queue);
 763                         continue;
 764                 }
 765
                     /*
                      * Sample the LOCAL flag now, while the call lock is held;
                      * it is tested after the queue lock has been dropped.
                      */
 766                 boolean_t call_local = ((call->flags & TIMER_CALL_LOCAL) != 0);
 767
 768                 /* remove entry from old queue */
 769                 timer_call_entry_dequeue(call);
                     /* the old queue is unlocked before any other queue is locked */
 770                 timer_queue_unlock(queue);
 771
 772                 if (call_local == FALSE) {
 773                         /* and queue it on new, discarding LOCAL timers */
 774                         new_queue = timer_queue_assign(TCE(call)->deadline);
 775                         timer_queue_lock_spin(new_queue);
 776                         timer_call_entry_enqueue_deadline(
 777                                 call, new_queue, TCE(call)->deadline);
 778                         timer_queue_unlock(new_queue);
 779                 } else {
 780                         timer_queue_shutdown_discarded++;
 781                 }
 782
 783                 /* The only lingering LOCAL timer should be this thread's
 784                  * quantum expiration timer.
 785                  */
 786                 assert((call_local == FALSE) ||
 787                     (TCE(call)->func == thread_quantum_expire));
 788
 789                 simple_unlock(&call->lock);
 790         }
 791
             /* the while expression took the lock before finding the queue empty */
 792         timer_queue_unlock(queue);
 793         splx(s);
 794 }
 795
     /*
      * timer_queue_expire_with_options() runs the callouts queued on `queue`
      * whose soft deadline has been reached.
      *
      *   queue     timer queue to service; locked and unlocked here.
      *   deadline  time the first entry's soft deadline is compared with.
      *             On every later pass of the loop the comparison uses
      *             mach_absolute_time() instead.
      *   rescan    FALSE: stop at the first entry that is not yet due.
      *             TRUE:  keep walking the queue, re-sorting by soft deadline
      *             every entry for which timer_resort_threshold() accepts
      *             the (hard - soft) deadline skew.
      *
      * Returns the hard deadline of the entry left at the head of the queue,
      * or UINT64_MAX when the queue is empty.  queue->earliest_soft_deadline
      * is updated before returning.
      *
      * timer_queue_expire_lock_skips counts entries that were dequeued
      * asynchronously because their per-call lock was busy.
      */
 796 static uint32_t timer_queue_expire_lock_skips;
 797 uint64_t
 798 timer_queue_expire_with_options(
 799         mpqueue_head_t          *queue,
 800         uint64_t                deadline,
 801         boolean_t               rescan)
 802 {
 803         timer_call_t    call = NULL;
 804         uint32_t tc_iterations = 0;
 805         DBG("timer_queue_expire(%p,)\n", queue);
 806
 807         uint64_t cur_deadline = deadline;
 808         timer_queue_lock_spin(queue);
 809
 810         while (!queue_empty(&queue->head)) {
 811                 /* Upon processing one or more timer calls, refresh the
 812                  * deadline to account for time elapsed in the callout
 813                  */
 814                 if (++tc_iterations > 1)
 815                         cur_deadline = mach_absolute_time();
 816
                     /* call == NULL means "restart from the head of the queue" */
 817                 if (call == NULL)
 818                         call = TIMER_CALL(queue_first(&queue->head));
 819
 820                 if (call->soft_deadline <= cur_deadline) {
 821                         timer_call_func_t               func;
 822                         timer_call_param_t              param0, param1;
 823
 824                         TCOAL_DEBUG(0xDDDD0000, queue->earliest_soft_deadline, call->soft_deadline, 0, 0, 0);
 825                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 826                                 DECR_TIMER_EXPIRE | DBG_FUNC_NONE,
 827                                 call,
 828                                 call->soft_deadline,
 829                                 TCE(call)->deadline,
 830                                 TCE(call)->entry_time, 0);
 831
                             /*
                              * A rate-limited call whose hard deadline is still
                              * in the future ends the scan unless rescan is set;
                              * with rescan set it falls through and is run.
                              */
 832                         if ((call->flags & TIMER_CALL_RATELIMITED) &&
 833                             (TCE(call)->deadline > cur_deadline)) {
 834                                 if (rescan == FALSE)
 835                                         break;
 836                         }
 837
 838                         if (!simple_lock_try(&call->lock)) {
 839                                 /* case (2b) lock inversion, dequeue and skip */
 840                                 timer_queue_expire_lock_skips++;
 841                                 timer_call_entry_dequeue_async(call);
 842                                 call = NULL;
 843                                 continue;
 844                         }
 845
 846                         timer_call_entry_dequeue(call);
 847
                             /* copy the callout and its arguments while the call is locked */
 848                         func = TCE(call)->func;
 849                         param0 = TCE(call)->param0;
 850                         param1 = TCE(call)->param1;
 851
                             /* both locks are released for the duration of the callout */
 852                         simple_unlock(&call->lock);
 853                         timer_queue_unlock(queue);
 854
 855                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 856                                 DECR_TIMER_CALLOUT | DBG_FUNC_START,
 857                                 call, VM_KERNEL_UNSLIDE(func), param0, param1, 0);
 858
                             /*
                              * NOTE(review): call->flags and call->ttd are read
                              * below after both locks were dropped; presumably
                              * the timer_call storage is guaranteed to outlive
                              * this window -- confirm.
                              */
 859 #if CONFIG_DTRACE
 860                         DTRACE_TMR7(callout__start, timer_call_func_t, func,
 861                             timer_call_param_t, param0, unsigned, call->flags,
 862                             0, (call->ttd >> 32),
 863                             (unsigned) (call->ttd & 0xFFFFFFFF), call);
 864 #endif
 865                         /* Maintain time-to-deadline in per-processor data
 866                          * structure for thread wakeup deadline statistics.
 867                          */
 868                         uint64_t *ttdp = &(PROCESSOR_DATA(current_processor(), timer_call_ttd));
 869                         *ttdp = call->ttd;
 870                         (*func)(param0, param1);
 871                         *ttdp = 0;
 872 #if CONFIG_DTRACE
 873                         DTRACE_TMR4(callout__end, timer_call_func_t, func,
 874                             param0, param1, call);
 875 #endif
 876
 877                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 878                                 DECR_TIMER_CALLOUT | DBG_FUNC_END,
 879                                 call, VM_KERNEL_UNSLIDE(func), param0, param1, 0);
                             /* restart from the head once the queue is locked again */
 880                         call = NULL;
 881                         timer_queue_lock_spin(queue);
 882                 } else {
 883                         if (__probable(rescan == FALSE)) {
 884                                 break;
 885                         } else {
 886                                 int64_t skew = TCE(call)->deadline - call->soft_deadline;
 887                                 assert(TCE(call)->deadline >= call->soft_deadline);
 888
 889                                 /* DRK: On a latency quality-of-service level change,
 890                                  * re-sort potentially rate-limited timers. The platform
 891                                  * layer determines which timers require
 892                                  * this. In the absence of the per-callout
 893                                  * synchronization requirement, a global resort could
 894                                  * be more efficient. The re-sort effectively
 895                                  * annuls all timer adjustments, i.e. the "soft
 896                                  * deadline" is the sort key.
 897                                  */
 898
 899                                 if (timer_resort_threshold(skew)) {
 900                                         if (__probable(simple_lock_try(&call->lock))) {
 901                                                 timer_call_entry_dequeue(call);
 902                                                 timer_call_entry_enqueue_deadline(call, queue, call->soft_deadline);
 903                                                 simple_unlock(&call->lock);
 904                                                 call = NULL;
 905                                         }
 906                                 }
                                     /*
                                      * call is still set when no re-sort happened:
                                      * step to the next entry and stop at the end
                                      * of the queue.
                                      */
 907                                 if (call) {
 908                                         call = TIMER_CALL(queue_next(qe(call)));
 909                                         if (queue_end(&queue->head, qe(call)))
 910                                                 break;
 911                                 }
 912                         }
 913                 }
 914         }
 915
             /*
              * Report the head entry's hard deadline.  earliest_soft_deadline
              * takes the hard deadline for a rate-limited head entry and the
              * soft deadline otherwise.
              */
 916         if (!queue_empty(&queue->head)) {
 917                 call = TIMER_CALL(queue_first(&queue->head));
 918                 cur_deadline = TCE(call)->deadline;
 919                 queue->earliest_soft_deadline = (call->flags & TIMER_CALL_RATELIMITED) ? TCE(call)->deadline: call->soft_deadline;
 920         } else {
 921                 queue->earliest_soft_deadline = cur_deadline = UINT64_MAX;
 922         }
 923
 924         timer_queue_unlock(queue);
 925
 926         return (cur_deadline);
 927 }
 928
     /*
      * timer_queue_expire() is timer_queue_expire_with_options() with the
      * rescan option switched off; that call's result is returned unchanged.
      */
 929 uint64_t
 930 timer_queue_expire(mpqueue_head_t *queue, uint64_t deadline)
 931 {
 932         const boolean_t         rescan_off = FALSE;
 933
 934         return (timer_queue_expire_with_options(queue, deadline, rescan_off));
 935 }
 936
     /*
      * serverperfmode is only declared here.  When it is non-zero,
      * timer_queue_migrate() refuses to migrate and timer_longterm_init()
      * starts with the longterm threshold disabled.
      *
      * timer_queue_migrate_lock_skips counts entries that
      * timer_queue_migrate() dequeued asynchronously because the per-call
      * lock was busy.
      */
 937 extern int serverperfmode;
 938 static uint32_t timer_queue_migrate_lock_skips;
 939 /*
 940  * timer_queue_migrate() is called by timer_queue_migrate_cpu()
 941  * to move timer requests from the local processor (queue_from)
 942  * to a target processor's (queue_to).
      *
      * Interrupts must be disabled and the two queues must differ; both
      * conditions are asserted.
      *
      * Returns the number of timers re-queued on queue_to, or:
      *    0  queue_from's first deadline is earlier than queue_to's
      *       (also the count when every entry was skipped for a busy lock)
      *   -1  queue_to is empty
      *   -2  queue_from is empty
      *   -3  queue_from holds a TIMER_CALL_LOCAL timer
      *   -4  serverperfmode is set
      * Entries that had to be dequeued asynchronously are not counted.
 943  */
 944 int
 945 timer_queue_migrate(mpqueue_head_t *queue_from, mpqueue_head_t *queue_to)
 946 {
 947         timer_call_t    call;
 948         timer_call_t    head_to;
 949         int             timers_migrated = 0;
 950
 951         DBG("timer_queue_migrate(%p,%p)\n", queue_from, queue_to);
 952
 953         assert(!ml_get_interrupts_enabled());
 954         assert(queue_from != queue_to);
 955
 956         if (serverperfmode) {
 957                 /*
 958                  * if we're running a high end server
 959                  * avoid migrations... they add latency
 960                  * and don't save us power under typical
 961                  * server workloads
 962                  */
 963                 return -4;
 964         }
 965
 966         /*
 967          * Take both local (from) and target (to) timer queue locks while
 968          * moving the timers from the local queue to the target processor.
 969          * We assume that the target is always the boot processor.
 970          * But only move if all of the following is true:
 971          *  - the target queue is non-empty
 972          *  - the local queue is non-empty
 973          *  - the local queue's first deadline is not earlier than the target's
 974          *  - the local queue contains no non-migratable "local" call
 975          * so that we need not have the target resync.
 976          */
 977
 978         timer_queue_lock_spin(queue_to);
 979
             /* head_to is only dereferenced after the emptiness check below */
 980         head_to = TIMER_CALL(queue_first(&queue_to->head));
 981         if (queue_empty(&queue_to->head)) {
 982                 timers_migrated = -1;
 983                 goto abort1;
 984         }
 985
 986         timer_queue_lock_spin(queue_from);
 987
 988         if (queue_empty(&queue_from->head)) {
 989                 timers_migrated = -2;
 990                 goto abort2;
 991         }
 992
 993         call = TIMER_CALL(queue_first(&queue_from->head));
 994         if (TCE(call)->deadline < TCE(head_to)->deadline) {
 995                 timers_migrated = 0;
 996                 goto abort2;
 997         }
 998
 999         /* perform scan for non-migratable timers */
1000         do {
1001                 if (call->flags & TIMER_CALL_LOCAL) {
1002                         timers_migrated = -3;
1003                         goto abort2;
1004                 }
1005                 call = TIMER_CALL(queue_next(qe(call)));
1006         } while (!queue_end(&queue_from->head, qe(call)));
1007
1008         /* migration loop itself -- both queues are locked */
1009         while (!queue_empty(&queue_from->head)) {
1010                 call = TIMER_CALL(queue_first(&queue_from->head));
1011                 if (!simple_lock_try(&call->lock)) {
1012                         /* case (2b) lock order inversion, dequeue only */
1013 #ifdef TIMER_ASSERT
1014                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1015                                 DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
1016                                 call,
1017                                 TCE(call)->queue,
1018                                 call->lock.interlock.lock_data,
1019                                 0x2b, 0);
1020 #endif
1021                         timer_queue_migrate_lock_skips++;
1022                         timer_call_entry_dequeue_async(call);
1023                         continue;
1024                 }
1025                 timer_call_entry_dequeue(call);
1026                 timer_call_entry_enqueue_deadline(
1027                         call, queue_to, TCE(call)->deadline);
1028                 timers_migrated++;
1029                 simple_unlock(&call->lock);
1030         }
             /* queue_from is now empty, so it has no earliest soft deadline */
1031         queue_from->earliest_soft_deadline = UINT64_MAX;
     /* unwind in the reverse of the locking order: queue_from, then queue_to */
1032 abort2:
1033         timer_queue_unlock(queue_from);
1034 abort1:
1035         timer_queue_unlock(queue_to);
1036
1037         return timers_migrated;
1038 }
1039
     /*
      * timer_queue_trace_cpu() hands timer_queue_trace() to
      * timer_call_nosync_cpu() together with cpu number `ncpu` and that
      * cpu's timer queue, timer_queue_cpu(ncpu), as the callback argument.
      *
      * The callback is cast to the prototyped type void (*)(void *), the
      * same form used for timer_longterm_update elsewhere in this file,
      * rather than to an unprototyped void (*)().
      */
1040 void
1041 timer_queue_trace_cpu(int ncpu)
1042 {
1043         timer_call_nosync_cpu(
1044                 ncpu,
1045                 (void (*)(void *)) timer_queue_trace,
1046                 (void *) timer_queue_cpu(ncpu));
1047 }
1048
     /*
      * timer_queue_trace() emits kdebug trace records describing `queue`.
      *
      * Does nothing when kdebug_enable is zero.  Otherwise, between
      * splclock()/splx() and with the queue locked, it emits:
      *  - a DECR_TIMER_QUEUE START record with queue->count and the
      *    current mach_absolute_time();
      *  - one DECR_TIMER_QUEUE NONE record per queued call, carrying its
      *    soft deadline, hard deadline, entry time and function;
      *  - a DECR_TIMER_QUEUE END record with the count and time again.
      */
1049 void
1050 timer_queue_trace(
1051         mpqueue_head_t                  *queue)
1052 {
1053         timer_call_t    call;
1054         spl_t           s;
1055
1056         if (!kdebug_enable)
1057                 return;
1058
1059         s = splclock();
1060         timer_queue_lock_spin(queue);
1061
1062         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1063                 DECR_TIMER_QUEUE | DBG_FUNC_START,
1064                 queue->count, mach_absolute_time(), 0, 0, 0);
1065
             /* the do/while below needs at least one entry, hence the guard */
1066         if (!queue_empty(&queue->head)) {
1067                 call = TIMER_CALL(queue_first(&queue->head));
1068                 do {
1069                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1070                                 DECR_TIMER_QUEUE | DBG_FUNC_NONE,
1071                                 call->soft_deadline,
1072                                 TCE(call)->deadline,
1073                                 TCE(call)->entry_time,
1074                                 TCE(call)->func,
1075                                 0);
1076                         call = TIMER_CALL(queue_next(qe(call)));
1077                 } while (!queue_end(&queue->head, qe(call)));
1078         }
1079
1080         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1081                 DECR_TIMER_QUEUE | DBG_FUNC_END,
1082                 queue->count, mach_absolute_time(), 0, 0, 0);
1083
1084         timer_queue_unlock(queue);
1085         splx(s);
1086 }
1087
     /*
      * timer_longterm_dequeued_locked() updates the global timer_longterm
      * state for a dequeue of `call`: if `call` is the one recorded as the
      * threshold call that record is cleared, and the dequeue counter is
      * incremented.  No lock is taken here.
      */
1088 void
1089 timer_longterm_dequeued_locked(timer_call_t call)
1090 {
1091         timer_longterm_t        *longterm = &timer_longterm;
1092
1093         if (longterm->threshold.call == call)
1094                 longterm->threshold.call = NULL;
1095         longterm->dequeues += 1;
1096 }
1097
1098 /*
1099  * Place a timer call in the longterm list
1100  * and adjust the next timer callout deadline if the new timer is first.
      *
      *   call           timer call to queue; its lock is taken and released here
      *   now            current time; now + threshold.interval is the cut-off
      *   deadline       hard deadline stored in the call entry
      *   old_queue      out: queue the call was removed from, as returned by
      *                  timer_call_dequeue_unlocked(); written only when the
      *                  call is actually placed on the longterm queue
      *   soft_deadline, ttd, param1, callout_flags
      *                  copied into the call before it is queued
      *
      * Returns timer_longterm_queue when the call was queued there, or NULL
      * (nothing touched) when the call does not qualify.
      *
      * Interrupts must already be disabled (asserted).
1101  */
1102 mpqueue_head_t *
1103 timer_longterm_enqueue_unlocked(timer_call_t    call,
1104                                 uint64_t        now,
1105                                 uint64_t        deadline,
1106                                 mpqueue_head_t  **old_queue,
1107                                 uint64_t        soft_deadline,
1108                                 uint64_t        ttd,
1109                                 timer_call_param_t      param1,
1110                                 uint32_t        callout_flags)
1111 {
1112         timer_longterm_t        *tlp = &timer_longterm;
1113         boolean_t               update_required = FALSE;
1114         uint64_t                longterm_threshold;
1115
             /*
              * Computed unconditionally; when the interval is
              * TIMER_LONGTERM_NONE the test below returns before this value
              * is compared with anything.
              */
1116         longterm_threshold = now + tlp->threshold.interval;
1117
1118         /*
1119          * Return NULL without doing anything if:
1120          *  - this timer is local, or
1121          *  - the longterm mechanism is disabled, or
1122          *  - this deadline is too short.
1123          */
1124         if ((callout_flags & TIMER_CALL_LOCAL) != 0 ||
1125             (tlp->threshold.interval == TIMER_LONGTERM_NONE) ||
1126                 (deadline <= longterm_threshold))
1127                 return NULL;
1128
1129         /*
1130          * Remove timer from its current queue, if any.
1131          */
1132         *old_queue = timer_call_dequeue_unlocked(call);
1133
1134         /*
1135          * Lock the longterm queue, queue timer and determine
1136          * whether an update is necessary.
1137          */
1138         assert(!ml_get_interrupts_enabled());
             /* lock order here: call lock first, then the longterm queue lock */
1139         simple_lock(&call->lock);
1140         timer_queue_lock_spin(timer_longterm_queue);
1141         TCE(call)->deadline = deadline;
1142         TCE(call)->param1 = param1;
1143         call->ttd = ttd;
1144         call->soft_deadline = soft_deadline;
1145         call->flags = callout_flags;
1146         timer_call_entry_enqueue_tail(call, timer_longterm_queue);
1147
1148         tlp->enqueues++;
1149
1150         /*
1151          * We'll need to update the currently set threshold timer
1152          * if the new deadline is sooner and no sooner update is in flight.
1153          */
1154         if (deadline < tlp->threshold.deadline &&
1155             deadline < tlp->threshold.preempted) {
1156                 tlp->threshold.preempted = deadline;
1157                 tlp->threshold.call = call;
1158                 update_required = TRUE;
1159         }
1160         timer_queue_unlock(timer_longterm_queue);
1161         simple_unlock(&call->lock);
1162
             /* the update request is issued only after both locks are released */
1163         if (update_required) {
1164                 /*
1165                  * Note: this call expects that calling the master cpu
1166                  * alone does not involve locking the topo lock.
1167                  */
1168                 timer_call_nosync_cpu(
1169                         master_cpu,
1170                         (void (*)(void *)) timer_longterm_update,
1171                         (void *)tlp);
1172         }
1173
1174         return timer_longterm_queue;
1175 }
1176
1177 /*
1178  * Scan for timers below the longterm threshold.
1179  * Move these to the local timer queue (of the boot processor on which the
1180  * calling thread is running).
1181  * Both the local (boot) queue and the longterm queue are locked.
1182  * The scan is similar to the timer migrate sequence but is performed by
1183  * successively examining each timer on the longterm queue:
1184  *  - if within the short-term threshold
1185  *    - enter on the local queue (unless being deleted),
1186  *  - otherwise:
1187  *    - if sooner, deadline becomes the next threshold deadline.
      *
      *   tlp  longterm state; threshold.deadline and threshold.call are
      *        reset and then recomputed, escalates is incremented per move
      *   now  current time; the cut-off is now + threshold.interval, or
      *        TIMER_LONGTERM_NONE when the longterm mechanism is disabled
      *
      * Only the master queue lock is taken here; the callers visible in this
      * file take the longterm queue lock before calling.  Must run on
      * master_cpu with interrupts disabled (both asserted).
      *
      * Each timer is compared with the cut-off by its soft deadline, but is
      * entered on the master queue at its hard deadline.
1188  */
1189 void
1190 timer_longterm_scan(timer_longterm_t    *tlp,
1191                     uint64_t            now)
1192 {
1193         queue_entry_t   qe;
1194         timer_call_t    call;
1195         uint64_t        threshold;
1196         uint64_t        deadline;
1197         mpqueue_head_t  *timer_master_queue;
1198
1199         assert(!ml_get_interrupts_enabled());
1200         assert(cpu_number() == master_cpu);
1201
1202         if (tlp->threshold.interval != TIMER_LONGTERM_NONE)
1203                 threshold = now + tlp->threshold.interval;
1204         else
1205                 threshold = TIMER_LONGTERM_NONE;
1206
             /* start from "no threshold"; the loop below finds the new minimum */
1207         tlp->threshold.deadline = TIMER_LONGTERM_NONE;
1208         tlp->threshold.call = NULL;
1209
1210         if (queue_empty(&timer_longterm_queue->head))
1211                 return;
1212
1213         timer_master_queue = timer_queue_cpu(master_cpu);
1214         timer_queue_lock_spin(timer_master_queue);
1215
1216         qe = queue_first(&timer_longterm_queue->head);
1217         while (!queue_end(&timer_longterm_queue->head, qe)) {
1218                 call = TIMER_CALL(qe);
1219                 deadline = call->soft_deadline;
                     /* advance first: the entry may be unlinked further down */
1220                 qe = queue_next(qe);
1221                 if (!simple_lock_try(&call->lock)) {
1222                         /* case (2c) lock order inversion, dequeue only */
1223 #ifdef TIMER_ASSERT
1224                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1225                                 DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
1226                                 call,
1227                                 TCE(call)->queue,
1228                                 call->lock.interlock.lock_data,
1229                                 0x2c, 0);
1230 #endif
1231                         timer_call_entry_dequeue_async(call);
1232                         continue;
1233                 }
1234                 if (deadline < threshold) {
1235                         /*
1236                          * This timer needs moving (escalating)
1237                          * to the local (boot) processor's queue.
1238                          */
1239 #ifdef TIMER_ASSERT
1240                         if (deadline < now)
1241                                 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1242                                         DECR_TIMER_OVERDUE | DBG_FUNC_NONE,
1243                                         call,
1244                                         deadline,
1245                                         now,
1246                                         threshold,
1247                                         0);
1248 #endif
1249                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1250                                 DECR_TIMER_ESCALATE | DBG_FUNC_NONE,
1251                                 call,
1252                                 TCE(call)->deadline,
1253                                 TCE(call)->entry_time,
1254                                 TCE(call)->func,
1255                                 0);
1256                         tlp->escalates++;
1257                         timer_call_entry_dequeue(call);
1258                         timer_call_entry_enqueue_deadline(
1259                                 call, timer_master_queue, TCE(call)->deadline);
1260                         /*
1261                          * A side-effect of the following call is to update
1262                          * the actual hardware deadline if required.
1263                          */
1264                         (void) timer_queue_assign(deadline);
1265                 } else {
                             /* stays longterm: track the earliest soft deadline left */
1266                         if (deadline < tlp->threshold.deadline) {
1267                                 tlp->threshold.deadline = deadline;
1268                                 tlp->threshold.call = call;
1269                         }
1270                 }
1271                 simple_unlock(&call->lock);
1272         }
1273
1274         timer_queue_unlock(timer_master_queue);
1275 }
1276
     /*
      * timer_longterm_callout() is the callout installed on the longterm
      * threshold timer by timer_longterm_init().  p0 carries the
      * timer_longterm_t to update; p1 is unused.
      */
1277 void
1278 timer_longterm_callout(
1279         timer_call_param_t              p0,
1280         __unused timer_call_param_t     p1)
1281 {
1282         timer_longterm_update((timer_longterm_t *) p0);
1283 }
1284
     /*
      * timer_longterm_update_locked() recomputes the longterm threshold
      * deadline in `tlp`.
      *
      * scan_time is set to the current time, then one of two things happens:
      *  - a preemption is pending (threshold.preempted is not
      *    TIMER_LONGTERM_NONE): it becomes threshold.deadline, preempted is
      *    cleared, preempts is incremented and no scan is done;
      *  - otherwise scans is incremented, the wake-up latency statistics are
      *    updated and timer_longterm_scan() recomputes threshold.deadline.
      *
      * Finally threshold.deadline_set is derived from threshold.deadline,
      * brought forward by the margin and the average latency unless the
      * deadline is TIMER_LONGTERM_NONE.
      *
      * No lock is taken here; timer_longterm_update() calls this with the
      * longterm queue lock held.
      */
1285 void
1286 timer_longterm_update_locked(timer_longterm_t *tlp)
1287 {
1288         uint64_t        latency;
1289
1290         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1291                 DECR_TIMER_UPDATE | DBG_FUNC_START,
1292                 &tlp->queue,
1293                 tlp->threshold.deadline,
1294                 tlp->threshold.preempted,
1295                 tlp->queue.count, 0);
1296
1297         tlp->scan_time = mach_absolute_time();
1298         if (tlp->threshold.preempted != TIMER_LONGTERM_NONE) {
1299                 tlp->threshold.preempts++;
1300                 tlp->threshold.deadline = tlp->threshold.preempted;
1301                 tlp->threshold.preempted = TIMER_LONGTERM_NONE;
1302                 /*
1303                  * Note: in the unlikely event that a pre-empted timer has
1304                  * itself been cancelled, we'll simply re-scan later at the
1305                  * time of the preempted/cancelled timer.
1306                  */
1307         } else {
1308                 tlp->threshold.scans++;
1309
1310                 /*
1311                  * Maintain a moving average of our wakeup latency.
1312                  * Clamp latency to 0 and ignore above threshold interval.
1313                  */
1314                 if (tlp->scan_time > tlp->threshold.deadline_set)
1315                         latency = tlp->scan_time - tlp->threshold.deadline_set;
1316                 else
1317                         latency = 0;
1318                 if (latency < tlp->threshold.interval) {
1319                         tlp->threshold.latency_min =
1320                                 MIN(tlp->threshold.latency_min, latency);
1321                         tlp->threshold.latency_max =
1322                                 MAX(tlp->threshold.latency_max, latency);
                             /* the new sample carries a weight of 1/100 in the average */
1323                         tlp->threshold.latency =
1324                                 (tlp->threshold.latency*99 + latency) / 100;
1325                 }
1326
1327                 timer_longterm_scan(tlp, tlp->scan_time);
1328         }
1329
1330         tlp->threshold.deadline_set = tlp->threshold.deadline;
1331         /* The next deadline timer to be set is adjusted */
1332         if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
1333                 tlp->threshold.deadline_set -= tlp->threshold.margin;
1334                 tlp->threshold.deadline_set -= tlp->threshold.latency;
1335         }
1336
1337         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1338                 DECR_TIMER_UPDATE | DBG_FUNC_END,
1339                 &tlp->queue,
1340                 tlp->threshold.deadline,
1341                 tlp->threshold.scans,
1342                 tlp->queue.count, 0);
1343 }
1344
     /*
      * timer_longterm_update() recomputes the longterm threshold and re-arms
      * the threshold timer.
      *
      * Between splclock()/splx() and with the longterm queue locked it calls
      * timer_longterm_update_locked(); if the resulting threshold.deadline
      * is not TIMER_LONGTERM_NONE, threshold.timer is entered at
      * threshold.deadline_set as a TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL
      * timer.  Panics when run on any cpu other than master_cpu.
      *
      * NOTE(review): the panic text names timer_longterm_update_master(),
      * which is not this function's name -- confirm whether that is
      * intentional before relying on it when triaging a panic.
      */
1345 void
1346 timer_longterm_update(timer_longterm_t *tlp)
1347 {
1348         spl_t   s = splclock();
1349
1350         timer_queue_lock_spin(timer_longterm_queue);
1351
1352         if (cpu_number() != master_cpu)
1353                 panic("timer_longterm_update_master() on non-boot cpu");
1354
1355         timer_longterm_update_locked(tlp);
1356
             /* the threshold timer is armed while the longterm queue lock is held */
1357         if (tlp->threshold.deadline != TIMER_LONGTERM_NONE)
1358                 timer_call_enter(
1359                         &tlp->threshold.timer,
1360                         tlp->threshold.deadline_set,
1361                         TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
1362
1363         timer_queue_unlock(timer_longterm_queue);
1364         splx(s);
1365 }
1366
     /*
      * timer_longterm_init() initialises the global longterm timer state:
      * the threshold interval (and the margin and latency bounds derived
      * from it), the "no deadline" markers, the lock group and attributes,
      * the longterm queue itself and the threshold timer call, whose
      * callout is timer_longterm_callout() with the state as its first
      * parameter.  timer_longterm_queue is published last.
      */
1367 void
1368 timer_longterm_init(void)
1369 {
1370         uint32_t                longterm;
1371         timer_longterm_t        *tlp = &timer_longterm;
1372
1373         DBG("timer_longterm_init() tlp: %p, queue: %p\n", tlp, &tlp->queue);
1374
1375         /*
1376          * Set the longterm timer threshold. Defaults to TIMER_LONGTERM_THRESHOLD
1377          * or TIMER_LONGTERM_NONE (disabled) for server;
1378          * overridden by the "longterm" boot-arg (milliseconds; 0 disables)
1379          */
1380         tlp->threshold.interval = serverperfmode ? TIMER_LONGTERM_NONE
1381                                                  : TIMER_LONGTERM_THRESHOLD;
1382         if (PE_parse_boot_argn("longterm", &longterm, sizeof (longterm))) {
1383                 tlp->threshold.interval = (longterm == 0) ?
1384                                                 TIMER_LONGTERM_NONE :
1385                                                 longterm * NSEC_PER_MSEC;
1386         }
1387         if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
                     /*
                      * The interval is divided by NSEC_PER_MSEC for printing
                      * before nanoseconds_to_absolutetime() rewrites it in
                      * place; from then on it holds absolute-time units.
                      */
1388                 printf("Longterm timer threshold: %llu ms\n",
1389                         tlp->threshold.interval / NSEC_PER_MSEC);
1390                 kprintf("Longterm timer threshold: %llu ms\n",
1391                         tlp->threshold.interval / NSEC_PER_MSEC);
1392                 nanoseconds_to_absolutetime(tlp->threshold.interval,
1393                                             &tlp->threshold.interval);
                     /* margin is one tenth of the (converted) interval */
1394                 tlp->threshold.margin = tlp->threshold.interval / 10;
1395                 tlp->threshold.latency_min = EndOfAllTime;
1396                 tlp->threshold.latency_max = 0;
1397         }
1398
             /* no preemption pending and no threshold deadline yet */
1399         tlp->threshold.preempted = TIMER_LONGTERM_NONE;
1400         tlp->threshold.deadline = TIMER_LONGTERM_NONE;
1401
1402         lck_attr_setdefault(&timer_longterm_lck_attr);
1403         lck_grp_attr_setdefault(&timer_longterm_lck_grp_attr);
1404         lck_grp_init(&timer_longterm_lck_grp,
1405                      "timer_longterm", &timer_longterm_lck_grp_attr);
1406         mpqueue_init(&tlp->queue,
1407                      &timer_longterm_lck_grp, &timer_longterm_lck_attr);
1408
1409         timer_call_setup(&tlp->threshold.timer,
1410                          timer_longterm_callout, (timer_call_param_t) tlp);
1411
1412         timer_longterm_queue = &tlp->queue;
1413 }
1414
     /*
      * Selector values accepted by timer_sysctl_get(); each one names a
      * field of the global timer_longterm state.
      */
1415 enum {
1416         THRESHOLD, QCOUNT,
1417         ENQUEUES, DEQUEUES, ESCALATES, SCANS, PREEMPTS,
1418         LATENCY, LATENCY_MIN, LATENCY_MAX
1419 };

     /*
      * timer_sysctl_get() returns the longterm timer value selected by `oid`.
      *
      *   THRESHOLD  the longterm threshold interval in milliseconds, or 0
      *              when the longterm mechanism is disabled
      *   others     the corresponding counter or latency field, returned
      *              exactly as stored
      *
      * An unrecognised oid yields 0.
      */
1420 uint64_t
1421 timer_sysctl_get(int oid)
1422 {
1423         timer_longterm_t        *tlp = &timer_longterm;
             uint64_t                interval_ns;
1424
1425         switch (oid) {
1426         case THRESHOLD:
                     /*
                      * threshold.interval is stored in absolute-time units
                      * (every writer converts it with
                      * nanoseconds_to_absolutetime()), so it has to be
                      * converted back before it is scaled to milliseconds.
                      */
1427                 if (tlp->threshold.interval == TIMER_LONGTERM_NONE)
1428                         return 0;
                     absolutetime_to_nanoseconds(tlp->threshold.interval,
                                                 &interval_ns);
                     return interval_ns / NSEC_PER_MSEC;
1429         case QCOUNT:
1430                 return tlp->queue.count;
1431         case ENQUEUES:
1432                 return tlp->enqueues;
1433         case DEQUEUES:
1434                 return tlp->dequeues;
1435         case ESCALATES:
1436                 return tlp->escalates;
1437         case SCANS:
1438                 return tlp->threshold.scans;
1439         case PREEMPTS:
1440                 return tlp->threshold.preempts;
1441         case LATENCY:
1442                 return tlp->threshold.latency;
1443         case LATENCY_MIN:
1444                 return tlp->threshold.latency_min;
1445         case LATENCY_MAX:
1446                 return tlp->threshold.latency_max;
1447         default:
1448                 return 0;
1449         }
1450 }
1451
1452 /*
1453  * timer_master_scan() is the inverse of timer_longterm_scan()
1454  * since it un-escalates timers to the longterm queue.
      *
      *   tlp  longterm state; threshold.deadline and threshold.call are
      *        lowered when a moved timer is earlier than the current value
      *   now  current time; the cut-off is now + threshold.interval, or
      *        TIMER_LONGTERM_NONE when the longterm mechanism is disabled
      *
      * Walks the master cpu's queue under that queue's lock.  Timers marked
      * TIMER_CALL_LOCAL are left alone; any other timer whose hard deadline
      * lies beyond the cut-off is moved to the tail of the longterm queue.
      *
      * NOTE(review): the longterm queue lock is not taken here; presumably
      * the caller already holds it -- confirm against the call site.
1455  */
1456 static void
1457 timer_master_scan(timer_longterm_t      *tlp,
1458                   uint64_t              now)
1459 {
1460         queue_entry_t   qe;
1461         timer_call_t    call;
1462         uint64_t        threshold;
1463         uint64_t        deadline;
1464         mpqueue_head_t  *timer_master_queue;
1465
1466         if (tlp->threshold.interval != TIMER_LONGTERM_NONE)
1467                 threshold = now + tlp->threshold.interval;
1468         else
1469                 threshold = TIMER_LONGTERM_NONE;
1470
1471         timer_master_queue = timer_queue_cpu(master_cpu);
1472         timer_queue_lock_spin(timer_master_queue);
1473
1474         qe = queue_first(&timer_master_queue->head);
1475         while (!queue_end(&timer_master_queue->head, qe)) {
1476                 call = TIMER_CALL(qe);
1477                 deadline = TCE(call)->deadline;
                     /* advance first: the entry may be unlinked further down */
1478                 qe = queue_next(qe);
1479                 if ((call->flags & TIMER_CALL_LOCAL) != 0)
1480                         continue;
1481                 if (!simple_lock_try(&call->lock)) {
1482                         /* case (2c) lock order inversion, dequeue only */
1483                         timer_call_entry_dequeue_async(call);
1484                         continue;
1485                 }
1486                 if (deadline > threshold) {
1487                         /* move from master to longterm */
1488                         timer_call_entry_dequeue(call);
1489                         timer_call_entry_enqueue_tail(call, timer_longterm_queue);
1490                         if (deadline < tlp->threshold.deadline) {
1491                                 tlp->threshold.deadline = deadline;
1492                                 tlp->threshold.call = call;
1493                         }
1494                 }
1495                 simple_unlock(&call->lock);
1496         }
1497         timer_queue_unlock(timer_master_queue);
1498 }
1499
1500 static void
1501 timer_sysctl_set_threshold(uint64_t value)
1502 {
1503         timer_longterm_t        *tlp = &timer_longterm;
1504         spl_t                   s = splclock();
1505         boolean_t               threshold_increase;
1506
1507         timer_queue_lock_spin(timer_longterm_queue);
1508
1509         timer_call_cancel(&tlp->threshold.timer);
1510
1511         /*
1512          * Set the new threshold and note whther it's increasing.
1513          */
1514         if (value == 0) {
1515                 tlp->threshold.interval = TIMER_LONGTERM_NONE;
1516                 threshold_increase = TRUE;
1517                 timer_call_cancel(&tlp->threshold.timer);
1518         } else {
1519                 uint64_t        old_interval = tlp->threshold.interval;
1520                 tlp->threshold.interval = value * NSEC_PER_MSEC;
1521                 nanoseconds_to_absolutetime(tlp->threshold.interval,
1522                                             &tlp->threshold.interval);
1523                 tlp->threshold.margin = tlp->threshold.interval / 10;
1524                 if  (old_interval == TIMER_LONGTERM_NONE)
1525                         threshold_increase = FALSE;
1526                 else
1527                         threshold_increase = (tlp->threshold.interval > old_interval);
1528         }
1529
1530         if (threshold_increase /* or removal */) {
1531                 /* Escalate timers from the longterm queue */
1532                 timer_longterm_scan(tlp, mach_absolute_time());
1533         } else /* decrease or addition  */ {
1534                 /*
1535                  * We scan the local/master queue for timers now longterm.
1536                  * To be strictly correct, we should scan all processor queues
1537                  * but timer migration results in most timers gravitating to the
1538                  * master processor in any case.
1539                  */
1540                 timer_master_scan(tlp, mach_absolute_time());
1541         }
1542
1543         /* Set new timer accordingly */
1544         tlp->threshold.deadline_set = tlp->threshold.deadline;
1545         if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
1546                 tlp->threshold.deadline_set -= tlp->threshold.margin;
1547                 tlp->threshold.deadline_set -= tlp->threshold.latency;
1548                 timer_call_enter(
1549                         &tlp->threshold.timer,
1550                         tlp->threshold.deadline_set,
1551                         TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
1552         }
1553
1554         /* Reset stats */
1555         tlp->enqueues = 0;
1556         tlp->dequeues = 0;
1557         tlp->escalates = 0;
1558         tlp->threshold.scans = 0;
1559         tlp->threshold.preempts = 0;
1560         tlp->threshold.latency = 0;
1561         tlp->threshold.latency_min = EndOfAllTime;
1562         tlp->threshold.latency_max = 0;
1563
1564         timer_queue_unlock(timer_longterm_queue);
1565         splx(s);
1566 }
1567
1568 int
1569 timer_sysctl_set(int oid, uint64_t value)
1570 {
1571         switch (oid) {
1572         case THRESHOLD:
1573                 timer_call_cpu(
1574                         master_cpu,
1575                         (void (*)(void *)) timer_sysctl_set_threshold,
1576                         (void *) value);
1577                 return KERN_SUCCESS;
1578         default:
1579                 return KERN_INVALID_ARGUMENT;
1580         }
1581 }
1582
1583
1584 /* Select timer coalescing window based on per-task quality-of-service hints */
1585 static boolean_t tcoal_qos_adjust(thread_t t, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited) {
1586         uint32_t latency_qos;
1587         boolean_t adjusted = FALSE;
1588         task_t ctask = t->task;
1589
1590         if (ctask) {
1591                 latency_qos = proc_get_effective_thread_policy(t, TASK_POLICY_LATENCY_QOS);
1592
1593                 assert(latency_qos <= NUM_LATENCY_QOS_TIERS);
1594
1595                 if (latency_qos) {
1596                         *tshift = tcoal_prio_params.latency_qos_scale[latency_qos - 1];
1597                         *tmax_abstime = tcoal_prio_params.latency_qos_abstime_max[latency_qos - 1];
1598                         *pratelimited = tcoal_prio_params.latency_tier_rate_limited[latency_qos - 1];
1599                         adjusted = TRUE;
1600                 }
1601         }
1602         return adjusted;
1603 }
1604
1605
1606 /* Adjust timer deadlines based on priority of the thread and the
1607  * urgency value provided at timeout establishment. With this mechanism,
1608  * timers are no longer necessarily sorted in order of soft deadline
1609  * on a given timer queue, i.e. they may be differentially skewed.
1610  * In the current scheme, this could lead to fewer pending timers
1611  * processed than is technically possible when the HW deadline arrives.
1612  */
1613 static void
1614 timer_compute_leeway(thread_t cthread, int32_t urgency, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited) {
1615         int16_t tpri = cthread->sched_pri;
1616         if ((urgency & TIMER_CALL_USER_MASK) != 0) {
1617                 if (tpri >= BASEPRI_RTQUEUES ||
1618                 urgency == TIMER_CALL_USER_CRITICAL) {
1619                         *tshift = tcoal_prio_params.timer_coalesce_rt_shift;
1620                         *tmax_abstime = tcoal_prio_params.timer_coalesce_rt_abstime_max;
1621                         TCOAL_PRIO_STAT(rt_tcl);
1622                 } else if (proc_get_effective_thread_policy(cthread, TASK_POLICY_DARWIN_BG) ||
1623                 (urgency == TIMER_CALL_USER_BACKGROUND)) {
1624                         /* Determine if timer should be subjected to a lower QoS */
1625                         if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
1626                                 if (*tmax_abstime > tcoal_prio_params.timer_coalesce_bg_abstime_max) {
1627                                         return;
1628                                 } else {
1629                                         *pratelimited = FALSE;
1630                                 }
1631                         }
1632                         *tshift = tcoal_prio_params.timer_coalesce_bg_shift;
1633                         *tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
1634                         TCOAL_PRIO_STAT(bg_tcl);
1635                 } else if (tpri >= MINPRI_KERNEL) {
1636                         *tshift = tcoal_prio_params.timer_coalesce_kt_shift;
1637                         *tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
1638                         TCOAL_PRIO_STAT(kt_tcl);
1639                 } else if (cthread->sched_mode == TH_MODE_FIXED) {
1640                         *tshift = tcoal_prio_params.timer_coalesce_fp_shift;
1641                         *tmax_abstime = tcoal_prio_params.timer_coalesce_fp_abstime_max;
1642                         TCOAL_PRIO_STAT(fp_tcl);
1643                 } else if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
1644                         TCOAL_PRIO_STAT(qos_tcl);
1645                 } else if (cthread->sched_mode == TH_MODE_TIMESHARE) {
1646                         *tshift = tcoal_prio_params.timer_coalesce_ts_shift;
1647                         *tmax_abstime = tcoal_prio_params.timer_coalesce_ts_abstime_max;
1648                         TCOAL_PRIO_STAT(ts_tcl);
1649                 } else {
1650                         TCOAL_PRIO_STAT(nc_tcl);
1651                 }
1652         } else if (urgency == TIMER_CALL_SYS_BACKGROUND) {
1653                 *tshift = tcoal_prio_params.timer_coalesce_bg_shift;
1654                 *tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
1655                 TCOAL_PRIO_STAT(bg_tcl);
1656         } else {
1657                 *tshift = tcoal_prio_params.timer_coalesce_kt_shift;
1658                 *tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
1659                 TCOAL_PRIO_STAT(kt_tcl);
1660         }
1661 }
1662
1663
1664 int timer_user_idle_level;
1665
1666 uint64_t
1667 timer_call_slop(uint64_t deadline, uint64_t now, uint32_t flags, thread_t cthread, boolean_t *pratelimited)
1668 {
1669         int32_t tcs_shift = 0;
1670         uint64_t tcs_max_abstime = 0;
1671         uint64_t adjval;
1672         uint32_t urgency = (flags & TIMER_CALL_URGENCY_MASK);
1673
1674         if (mach_timer_coalescing_enabled &&
1675             (deadline > now) && (urgency != TIMER_CALL_SYS_CRITICAL)) {
1676                 timer_compute_leeway(cthread, urgency, &tcs_shift, &tcs_max_abstime, pratelimited);
1677
1678                 if (tcs_shift >= 0)
1679                         adjval =  MIN((deadline - now) >> tcs_shift, tcs_max_abstime);
1680                 else
1681                         adjval =  MIN((deadline - now) << (-tcs_shift), tcs_max_abstime);
1682                 /* Apply adjustments derived from "user idle level" heuristic */
1683                 adjval += (adjval * timer_user_idle_level) >> 7;
1684                 return adjval;
1685         } else {
1686                 return 0;
1687         }
1688 }
1689
1690 int
1691 timer_get_user_idle_level(void) {
1692         return timer_user_idle_level;
1693 }
1694
1695 kern_return_t timer_set_user_idle_level(int ilevel) {
1696         boolean_t do_reeval = FALSE;
1697
1698         if ((ilevel < 0) || (ilevel > 128))
1699                 return KERN_INVALID_ARGUMENT;
1700
1701         if (ilevel < timer_user_idle_level) {
1702                 do_reeval = TRUE;
1703         }
1704
1705         timer_user_idle_level = ilevel;
1706
1707         if (do_reeval)
1708                 ml_timer_evaluate();
1709
1710         return KERN_SUCCESS;
1711 }