osfmk/kern/timer_call.c

   1 /*
   2  * Copyright (c) 1993-2008 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Timer interrupt callout module.
  30  */
  31
  32 #include <mach/mach_types.h>
  33
  34 #include <kern/clock.h>
  35 #include <kern/smp.h>
  36 #include <kern/processor.h>
  37 #include <kern/timer_call.h>
  38 #include <kern/timer_queue.h>
  39 #include <kern/call_entry.h>
  40 #include <kern/thread.h>
  41
  42 #include <sys/kdebug.h>
  43
  44 #if CONFIG_DTRACE
  45 #include <mach/sdt.h>
  46 #endif
  47
  48
  49 #if DEBUG
  50 #define TIMER_ASSERT    1
  51 #endif
  52
  53 //#define TIMER_ASSERT  1
  54 //#define TIMER_DBG     1
  55
  56 #if TIMER_DBG
  57 #define DBG(x...) kprintf("DBG: " x);
  58 #else
  59 #define DBG(x...)
  60 #endif
  61
  62 #if TIMER_TRACE
  63 #define TIMER_KDEBUG_TRACE      KERNEL_DEBUG_CONSTANT_IST
  64 #else
  65 #define TIMER_KDEBUG_TRACE(x...)
  66 #endif
  67
  68
  69 lck_grp_t               timer_call_lck_grp;
  70 lck_attr_t              timer_call_lck_attr;
  71 lck_grp_attr_t          timer_call_lck_grp_attr;
  72
  73 lck_grp_t               timer_longterm_lck_grp;
  74 lck_attr_t              timer_longterm_lck_attr;
  75 lck_grp_attr_t          timer_longterm_lck_grp_attr;
  76
  77 /* Timer queue lock must be acquired with interrupts disabled (under splclock()) */
  78 #if __SMP__
  79 #define timer_queue_lock_spin(queue)                                    \
  80         lck_mtx_lock_spin_always(&queue->lock_data)
  81
  82 #define timer_queue_unlock(queue)               \
  83         lck_mtx_unlock_always(&queue->lock_data)
  84 #else
  85 #define timer_queue_lock_spin(queue)    (void)1
  86 #define timer_queue_unlock(queue)               (void)1
  87 #endif
  88
  89 #define QUEUE(x)        ((queue_t)(x))
  90 #define MPQUEUE(x)      ((mpqueue_head_t *)(x))
  91 #define TIMER_CALL(x)   ((timer_call_t)(x))
  92 #define TCE(x)          (&(x->call_entry))
  93 /*
  94  * The longterm timer object is a global structure holding all timers
  95  * beyond the short-term, local timer queue threshold. The boot processor
  96  * is responsible for moving each timer to its local timer queue
  97  * if and when that timer becomes due within the threshold.
  98  */
  99 #define TIMER_LONGTERM_NONE             EndOfAllTime
 100 #if defined(__x86_64__)
 101 #define TIMER_LONGTERM_THRESHOLD        (1ULL * NSEC_PER_SEC)
 102 #else
 103 #define TIMER_LONGTERM_THRESHOLD        TIMER_LONGTERM_NONE
 104 #endif
 105
 106 typedef struct {
 107         uint64_t        interval;       /* longterm timer interval */
 108         uint64_t        margin;         /* fudge factor (10% of interval */
 109         uint64_t        deadline;       /* first/soonest longterm deadline */
 110         uint64_t        preempted;      /* sooner timer has pre-empted */
 111         timer_call_t    call;           /* first/soonest longterm timer call */
 112         uint64_t        deadline_set;   /* next timer set */
 113         timer_call_data_t timer;        /* timer used by threshold management */
 114                                         /* Stats: */
 115         uint64_t        scans;          /*   num threshold timer scans */
 116         uint64_t        preempts;       /*   num threshold reductions */
 117         uint64_t        latency;        /*   average threshold latency */
 118         uint64_t        latency_min;    /*   minimum threshold latency */
 119         uint64_t        latency_max;    /*   maximum threshold latency */
 120 } threshold_t;
 121
 122 typedef struct {
 123         mpqueue_head_t  queue;          /* longterm timer list */
 124         uint64_t        enqueues;       /* num timers queued */
 125         uint64_t        dequeues;       /* num timers dequeued */
 126         uint64_t        escalates;      /* num timers becoming shortterm */
 127         uint64_t        scan_time;      /* last time the list was scanned */
 128         threshold_t     threshold;      /* longterm timer threshold */
 129 } timer_longterm_t;
 130
 131 timer_longterm_t                timer_longterm;
 132
 133 static mpqueue_head_t           *timer_longterm_queue = NULL;
 134
 135 static void                     timer_longterm_init(void);
 136 static void                     timer_longterm_callout(
 137                                         timer_call_param_t      p0,
 138                                         timer_call_param_t      p1);
 139 extern void                     timer_longterm_scan(
 140                                         timer_longterm_t        *tlp,
 141                                         uint64_t                now);
 142 static void                     timer_longterm_update(
 143                                         timer_longterm_t *tlp);
 144 static void                     timer_longterm_update_locked(
 145                                         timer_longterm_t *tlp);
 146 static mpqueue_head_t *         timer_longterm_enqueue_unlocked(
 147                                         timer_call_t            call,
 148                                         uint64_t                now,
 149                                         uint64_t                deadline,
 150                                         mpqueue_head_t **       old_queue,
 151                                         uint64_t                soft_deadline,
 152                                         uint64_t                ttd,
 153                                         timer_call_param_t      param1,
 154                                         uint32_t                callout_flags);
 155 static void                     timer_longterm_dequeued_locked(
 156                                         timer_call_t            call);
 157
 158 uint64_t past_deadline_timers;
 159 uint64_t past_deadline_deltas;
 160 uint64_t past_deadline_longest;
 161 uint64_t past_deadline_shortest = ~0ULL;
 162 enum {PAST_DEADLINE_TIMER_ADJUSTMENT_NS = 10 * 1000};
 163
 164 uint64_t past_deadline_timer_adjustment;
 165
 166 static boolean_t timer_call_enter_internal(timer_call_t call, timer_call_param_t param1, uint64_t deadline, uint64_t leeway, uint32_t flags, boolean_t ratelimited);
 167 boolean_t       mach_timer_coalescing_enabled = TRUE;
 168
 169 mpqueue_head_t  *timer_call_enqueue_deadline_unlocked(
 170                         timer_call_t            call,
 171                         mpqueue_head_t          *queue,
 172                         uint64_t                deadline,
 173                         uint64_t                soft_deadline,
 174                         uint64_t                ttd,
 175                         timer_call_param_t      param1,
 176                         uint32_t                flags);
 177
 178 mpqueue_head_t  *timer_call_dequeue_unlocked(
 179                         timer_call_t            call);
 180
 181 timer_coalescing_priority_params_t tcoal_prio_params;
 182
 183 #if TCOAL_PRIO_STATS
 184 int32_t nc_tcl, rt_tcl, bg_tcl, kt_tcl, fp_tcl, ts_tcl, qos_tcl;
 185 #define TCOAL_PRIO_STAT(x) (x++)
 186 #else
 187 #define TCOAL_PRIO_STAT(x)
 188 #endif
 189
 190 static void
 191 timer_call_init_abstime(void)
 192 {
 193         int i;
 194         uint64_t result;
 195         timer_coalescing_priority_params_ns_t * tcoal_prio_params_init = timer_call_get_priority_params();
 196         nanoseconds_to_absolutetime(PAST_DEADLINE_TIMER_ADJUSTMENT_NS, &past_deadline_timer_adjustment);
 197         nanoseconds_to_absolutetime(tcoal_prio_params_init->idle_entry_timer_processing_hdeadline_threshold_ns, &result);
 198         tcoal_prio_params.idle_entry_timer_processing_hdeadline_threshold_abstime = (uint32_t)result;
 199         nanoseconds_to_absolutetime(tcoal_prio_params_init->interrupt_timer_coalescing_ilat_threshold_ns, &result);
 200         tcoal_prio_params.interrupt_timer_coalescing_ilat_threshold_abstime = (uint32_t)result;
 201         nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_resort_threshold_ns, &result);
 202         tcoal_prio_params.timer_resort_threshold_abstime = (uint32_t)result;
 203         tcoal_prio_params.timer_coalesce_rt_shift = tcoal_prio_params_init->timer_coalesce_rt_shift;
 204         tcoal_prio_params.timer_coalesce_bg_shift = tcoal_prio_params_init->timer_coalesce_bg_shift;
 205         tcoal_prio_params.timer_coalesce_kt_shift = tcoal_prio_params_init->timer_coalesce_kt_shift;
 206         tcoal_prio_params.timer_coalesce_fp_shift = tcoal_prio_params_init->timer_coalesce_fp_shift;
 207         tcoal_prio_params.timer_coalesce_ts_shift = tcoal_prio_params_init->timer_coalesce_ts_shift;
 208
 209         nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_rt_ns_max,
 210             &tcoal_prio_params.timer_coalesce_rt_abstime_max);
 211         nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_bg_ns_max,
 212             &tcoal_prio_params.timer_coalesce_bg_abstime_max);
 213         nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_kt_ns_max,
 214             &tcoal_prio_params.timer_coalesce_kt_abstime_max);
 215         nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_fp_ns_max,
 216             &tcoal_prio_params.timer_coalesce_fp_abstime_max);
 217         nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_ts_ns_max,
 218             &tcoal_prio_params.timer_coalesce_ts_abstime_max);
 219
 220         for (i = 0; i < NUM_LATENCY_QOS_TIERS; i++) {
 221                 tcoal_prio_params.latency_qos_scale[i] = tcoal_prio_params_init->latency_qos_scale[i];
 222                 nanoseconds_to_absolutetime(tcoal_prio_params_init->latency_qos_ns_max[i],
 223                     &tcoal_prio_params.latency_qos_abstime_max[i]);
 224                 tcoal_prio_params.latency_tier_rate_limited[i] = tcoal_prio_params_init->latency_tier_rate_limited[i];
 225         }
 226 }
 227
 228
 229 void
 230 timer_call_init(void)
 231 {
 232         lck_attr_setdefault(&timer_call_lck_attr);
 233         lck_grp_attr_setdefault(&timer_call_lck_grp_attr);
 234         lck_grp_init(&timer_call_lck_grp, "timer_call", &timer_call_lck_grp_attr);
 235
 236         timer_longterm_init();
 237         timer_call_init_abstime();
 238 }
 239
 240
 241 void
 242 timer_call_queue_init(mpqueue_head_t *queue)
 243 {
 244         DBG("timer_call_queue_init(%p)\n", queue);
 245         mpqueue_init(queue, &timer_call_lck_grp, &timer_call_lck_attr);
 246 }
 247
 248
 249 void
 250 timer_call_setup(
 251         timer_call_t                    call,
 252         timer_call_func_t               func,
 253         timer_call_param_t              param0)
 254 {
 255         DBG("timer_call_setup(%p,%p,%p)\n", call, func, param0);
 256         call_entry_setup(TCE(call), func, param0);
 257         simple_lock_init(&(call)->lock, 0);
 258         call->async_dequeue = FALSE;
 259 }
 260 #if TIMER_ASSERT
 261 static __inline__ mpqueue_head_t *
 262 timer_call_entry_dequeue(
 263         timer_call_t            entry)
 264 {
 265         mpqueue_head_t  *old_queue = MPQUEUE(TCE(entry)->queue);
 266
 267         if (!hw_lock_held((hw_lock_t)&entry->lock))
 268                 panic("_call_entry_dequeue() "
 269                         "entry %p is not locked\n", entry);
 270         /*
 271          * XXX The queue lock is actually a mutex in spin mode
 272          *     but there's no way to test for it being held
 273          *     so we pretend it's a spinlock!
 274          */
 275         if (!hw_lock_held((hw_lock_t)&old_queue->lock_data))
 276                 panic("_call_entry_dequeue() "
 277                         "queue %p is not locked\n", old_queue);
 278
 279         call_entry_dequeue(TCE(entry));
 280         old_queue->count--;
 281
 282         return (old_queue);
 283 }
 284
 285 static __inline__ mpqueue_head_t *
 286 timer_call_entry_enqueue_deadline(
 287         timer_call_t            entry,
 288         mpqueue_head_t          *queue,
 289         uint64_t                deadline)
 290 {
 291         mpqueue_head_t  *old_queue = MPQUEUE(TCE(entry)->queue);
 292
 293         if (!hw_lock_held((hw_lock_t)&entry->lock))
 294                 panic("_call_entry_enqueue_deadline() "
 295                         "entry %p is not locked\n", entry);
 296         /* XXX More lock pretense:  */
 297         if (!hw_lock_held((hw_lock_t)&queue->lock_data))
 298                 panic("_call_entry_enqueue_deadline() "
 299                         "queue %p is not locked\n", queue);
 300         if (old_queue != NULL && old_queue != queue)
 301                 panic("_call_entry_enqueue_deadline() "
 302                         "old_queue %p != queue", old_queue);
 303
 304         call_entry_enqueue_deadline(TCE(entry), QUEUE(queue), deadline);
 305
 306 /* For efficiency, track the earliest soft deadline on the queue, so that
 307  * fuzzy decisions can be made without lock acquisitions.
 308  */
 309         timer_call_t thead = (timer_call_t)queue_first(&queue->head);
 310
 311         queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline;
 312
 313         if (old_queue)
 314                 old_queue->count--;
 315         queue->count++;
 316
 317         return (old_queue);
 318 }
 319
 320 #else
 321
 322 static __inline__ mpqueue_head_t *
 323 timer_call_entry_dequeue(
 324         timer_call_t            entry)
 325 {
 326         mpqueue_head_t  *old_queue = MPQUEUE(TCE(entry)->queue);
 327
 328         call_entry_dequeue(TCE(entry));
 329         old_queue->count--;
 330
 331         return old_queue;
 332 }
 333
 334 static __inline__ mpqueue_head_t *
 335 timer_call_entry_enqueue_deadline(
 336         timer_call_t                    entry,
 337         mpqueue_head_t                  *queue,
 338         uint64_t                        deadline)
 339 {
 340         mpqueue_head_t  *old_queue = MPQUEUE(TCE(entry)->queue);
 341
 342         call_entry_enqueue_deadline(TCE(entry), QUEUE(queue), deadline);
 343
 344         /* For efficiency, track the earliest soft deadline on the queue,
 345          * so that fuzzy decisions can be made without lock acquisitions.
 346          */
 347
 348         timer_call_t thead = (timer_call_t)queue_first(&queue->head);
 349         queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline;
 350
 351         if (old_queue)
 352                 old_queue->count--;
 353         queue->count++;
 354
 355         return old_queue;
 356 }
 357
 358 #endif
 359
 360 static __inline__ void
 361 timer_call_entry_enqueue_tail(
 362         timer_call_t                    entry,
 363         mpqueue_head_t                  *queue)
 364 {
 365         call_entry_enqueue_tail(TCE(entry), QUEUE(queue));
 366         queue->count++;
 367         return;
 368 }
 369
 370 /*
 371  * Remove timer entry from its queue but don't change the queue pointer
 372  * and set the async_dequeue flag. This is locking case 2b.
 373  */
 374 static __inline__ void
 375 timer_call_entry_dequeue_async(
 376         timer_call_t            entry)
 377 {
 378         mpqueue_head_t  *old_queue = MPQUEUE(TCE(entry)->queue);
 379         if (old_queue) {
 380                 old_queue->count--;
 381                 (void) remque(qe(entry));
 382                 entry->async_dequeue = TRUE;
 383         }
 384         return;
 385 }
 386
 387 #if TIMER_ASSERT
 388 unsigned timer_call_enqueue_deadline_unlocked_async1;
 389 unsigned timer_call_enqueue_deadline_unlocked_async2;
 390 #endif
 391 /*
 392  * Assumes call_entry and queues unlocked, interrupts disabled.
 393  */
 394 __inline__ mpqueue_head_t *
 395 timer_call_enqueue_deadline_unlocked(
 396         timer_call_t                    call,
 397         mpqueue_head_t                  *queue,
 398         uint64_t                        deadline,
 399         uint64_t                        soft_deadline,
 400         uint64_t                        ttd,
 401         timer_call_param_t              param1,
 402         uint32_t                        callout_flags)
 403 {
 404         call_entry_t    entry = TCE(call);
 405         mpqueue_head_t  *old_queue;
 406
 407         DBG("timer_call_enqueue_deadline_unlocked(%p,%p,)\n", call, queue);
 408
 409         simple_lock(&call->lock);
 410
 411         old_queue = MPQUEUE(entry->queue);
 412
 413         if (old_queue != NULL) {
 414                 timer_queue_lock_spin(old_queue);
 415                 if (call->async_dequeue) {
 416                         /* collision (1c): timer already dequeued, clear flag */
 417 #if TIMER_ASSERT
 418                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 419                                 DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
 420                                 VM_KERNEL_UNSLIDE_OR_PERM(call),
 421                                 call->async_dequeue,
 422                                 VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
 423                                 0x1c, 0);
 424                         timer_call_enqueue_deadline_unlocked_async1++;
 425 #endif
 426                         call->async_dequeue = FALSE;
 427                         entry->queue = NULL;
 428                 } else if (old_queue != queue) {
 429                         timer_call_entry_dequeue(call);
 430 #if TIMER_ASSERT
 431                         timer_call_enqueue_deadline_unlocked_async2++;
 432 #endif
 433                 }
 434                 if (old_queue == timer_longterm_queue)
 435                         timer_longterm_dequeued_locked(call);
 436                 if (old_queue != queue) {
 437                         timer_queue_unlock(old_queue);
 438                         timer_queue_lock_spin(queue);
 439                 }
 440         } else {
 441                 timer_queue_lock_spin(queue);
 442         }
 443
 444         call->soft_deadline = soft_deadline;
 445         call->flags = callout_flags;
 446         TCE(call)->param1 = param1;
 447         call->ttd = ttd;
 448
 449         timer_call_entry_enqueue_deadline(call, queue, deadline);
 450         timer_queue_unlock(queue);
 451         simple_unlock(&call->lock);
 452
 453         return (old_queue);
 454 }
 455
 456 #if TIMER_ASSERT
 457 unsigned timer_call_dequeue_unlocked_async1;
 458 unsigned timer_call_dequeue_unlocked_async2;
 459 #endif
 460 mpqueue_head_t *
 461 timer_call_dequeue_unlocked(
 462         timer_call_t            call)
 463 {
 464         call_entry_t    entry = TCE(call);
 465         mpqueue_head_t  *old_queue;
 466
 467         DBG("timer_call_dequeue_unlocked(%p)\n", call);
 468
 469         simple_lock(&call->lock);
 470         old_queue = MPQUEUE(entry->queue);
 471 #if TIMER_ASSERT
 472         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 473                 DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
 474                 VM_KERNEL_UNSLIDE_OR_PERM(call),
 475                 call->async_dequeue,
 476                 VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
 477                 0, 0);
 478 #endif
 479         if (old_queue != NULL) {
 480                 timer_queue_lock_spin(old_queue);
 481                 if (call->async_dequeue) {
 482                         /* collision (1c): timer already dequeued, clear flag */
 483 #if TIMER_ASSERT
 484                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 485                                 DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
 486                             VM_KERNEL_UNSLIDE_OR_PERM(call),
 487                                 call->async_dequeue,
 488                                 VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
 489                                 0x1c, 0);
 490                         timer_call_dequeue_unlocked_async1++;
 491 #endif
 492                         call->async_dequeue = FALSE;
 493                         entry->queue = NULL;
 494                 } else {
 495                         timer_call_entry_dequeue(call);
 496                 }
 497                 if (old_queue == timer_longterm_queue)
 498                         timer_longterm_dequeued_locked(call);
 499                 timer_queue_unlock(old_queue);
 500         }
 501         simple_unlock(&call->lock);
 502         return (old_queue);
 503 }
 504
 505
 506 /*
 507  * Timer call entry locking model
 508  * ==============================
 509  *
 510  * Timer call entries are linked on per-cpu timer queues which are protected
 511  * by the queue lock and the call entry lock. The locking protocol is:
 512  *
 513  *  0) The canonical locking order is timer call entry followed by queue.
 514  *
 515  *  1) With only the entry lock held, entry.queue is valid:
 516  *    1a) NULL: the entry is not queued, or
 517  *    1b) non-NULL: this queue must be locked before the entry is modified.
 518  *        After locking the queue, the call.async_dequeue flag must be checked:
 519  *    1c) TRUE: the entry was removed from the queue by another thread
 520  *              and we must NULL the entry.queue and reset this flag, or
 521  *    1d) FALSE: (ie. queued), the entry can be manipulated.
 522  *
 523  *  2) If a queue lock is obtained first, the queue is stable:
 524  *    2a) If a try-lock of a queued entry succeeds, the call can be operated on
 525  *        and dequeued.
 526  *    2b) If a try-lock fails, it indicates that another thread is attempting
 527  *        to change the entry and move it to a different position in this queue
 528  *        or to different queue. The entry can be dequeued but it should not be
 529  *        operated upon since it is being changed. Furthermore, we don't null
 530  *        the entry.queue pointer (protected by the entry lock we don't own).
 531  *        Instead, we set the async_dequeue flag -- see (1c).
 532  *    2c) Same as 2b but occurring when a longterm timer is matured.
 533  *  3) A callout's parameters (deadline, flags, parameters, soft deadline &c.)
 534  *     should be manipulated with the appropriate timer queue lock held,
 535  *     to prevent queue traversal observations from observing inconsistent
 536  *     updates to an in-flight callout.
 537  */
 538
 539 /*
 540  * Inlines timer_call_entry_dequeue() and timer_call_entry_enqueue_deadline()
 541  * cast between pointer types (mpqueue_head_t *) and (queue_t) so that
 542  * we can use the call_entry_dequeue() and call_entry_enqueue_deadline()
 543  * methods to operate on timer_call structs as if they are call_entry structs.
 544  * These structures are identical except for their queue head pointer fields.
 545  *
 546  * In the debug case, we assert that the timer call locking protocol
 547  * is being obeyed.
 548  */
 549
 550 static boolean_t
 551 timer_call_enter_internal(
 552         timer_call_t            call,
 553         timer_call_param_t      param1,
 554         uint64_t                deadline,
 555         uint64_t                leeway,
 556         uint32_t                flags,
 557         boolean_t               ratelimited)
 558 {
 559         mpqueue_head_t          *queue = NULL;
 560         mpqueue_head_t          *old_queue;
 561         spl_t                   s;
 562         uint64_t                slop;
 563         uint32_t                urgency;
 564         uint64_t                sdeadline, ttd;
 565
 566         s = splclock();
 567
 568         sdeadline = deadline;
 569         uint64_t ctime = mach_absolute_time();
 570
 571         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 572                 DECR_TIMER_ENTER | DBG_FUNC_START,
 573             VM_KERNEL_UNSLIDE_OR_PERM(call),
 574             VM_KERNEL_UNSLIDE_OR_PERM(param1), deadline, flags, 0);
 575
 576         urgency = (flags & TIMER_CALL_URGENCY_MASK);
 577
 578         boolean_t slop_ratelimited = FALSE;
 579         slop = timer_call_slop(deadline, ctime, urgency, current_thread(), &slop_ratelimited);
 580
 581         if ((flags & TIMER_CALL_LEEWAY) != 0 && leeway > slop)
 582                 slop = leeway;
 583
 584         if (UINT64_MAX - deadline <= slop) {
 585                 deadline = UINT64_MAX;
 586         } else {
 587                 deadline += slop;
 588         }
 589
 590         if (__improbable(deadline < ctime)) {
 591                 uint64_t delta = (ctime - deadline);
 592
 593                 past_deadline_timers++;
 594                 past_deadline_deltas += delta;
 595                 if (delta > past_deadline_longest)
 596                         past_deadline_longest = deadline;
 597                 if (delta < past_deadline_shortest)
 598                         past_deadline_shortest = delta;
 599
 600                 deadline = ctime + past_deadline_timer_adjustment;
 601                 sdeadline = deadline;
 602         }
 603
 604         if (ratelimited || slop_ratelimited) {
 605                 flags |= TIMER_CALL_RATELIMITED;
 606         } else {
 607                 flags &= ~TIMER_CALL_RATELIMITED;
 608         }
 609
 610         ttd =  sdeadline - ctime;
 611 #if CONFIG_DTRACE
 612         DTRACE_TMR7(callout__create, timer_call_func_t, TCE(call)->func,
 613         timer_call_param_t, TCE(call)->param0, uint32_t, flags,
 614             (deadline - sdeadline),
 615             (ttd >> 32), (unsigned) (ttd & 0xFFFFFFFF), call);
 616 #endif
 617
 618         /* Program timer callout parameters under the appropriate per-CPU or
 619          * longterm queue lock. The callout may have been previously enqueued
 620          * and in-flight on this or another timer queue.
 621          */
 622         if (!ratelimited && !slop_ratelimited) {
 623                 queue = timer_longterm_enqueue_unlocked(call, ctime, deadline, &old_queue, sdeadline, ttd, param1, flags);
 624         }
 625
 626         if (queue == NULL) {
 627                 queue = timer_queue_assign(deadline);
 628                 old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline, sdeadline, ttd, param1, flags);
 629         }
 630
 631 #if TIMER_TRACE
 632         TCE(call)->entry_time = ctime;
 633 #endif
 634
 635         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 636                 DECR_TIMER_ENTER | DBG_FUNC_END,
 637                 VM_KERNEL_UNSLIDE_OR_PERM(call),
 638                 (old_queue != NULL), deadline, queue->count, 0);
 639
 640         splx(s);
 641
 642         return (old_queue != NULL);
 643 }
 644
 645 /*
 646  * timer_call_*()
 647  *      return boolean indicating whether the call was previously queued.
 648  */
 649 boolean_t
 650 timer_call_enter(
 651         timer_call_t            call,
 652         uint64_t                deadline,
 653         uint32_t                flags)
 654 {
 655         return timer_call_enter_internal(call, NULL, deadline, 0, flags, FALSE);
 656 }
 657
 658 boolean_t
 659 timer_call_enter1(
 660         timer_call_t            call,
 661         timer_call_param_t      param1,
 662         uint64_t                deadline,
 663         uint32_t                flags)
 664 {
 665         return timer_call_enter_internal(call, param1, deadline, 0, flags, FALSE);
 666 }
 667
 668 boolean_t
 669 timer_call_enter_with_leeway(
 670         timer_call_t            call,
 671         timer_call_param_t      param1,
 672         uint64_t                deadline,
 673         uint64_t                leeway,
 674         uint32_t                flags,
 675         boolean_t               ratelimited)
 676 {
 677         return timer_call_enter_internal(call, param1, deadline, leeway, flags, ratelimited);
 678 }
 679
 680 boolean_t
 681 timer_call_cancel(
 682         timer_call_t            call)
 683 {
 684         mpqueue_head_t          *old_queue;
 685         spl_t                   s;
 686
 687         s = splclock();
 688
 689         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 690                 DECR_TIMER_CANCEL | DBG_FUNC_START,
 691                 VM_KERNEL_UNSLIDE_OR_PERM(call),
 692                 TCE(call)->deadline, call->soft_deadline, call->flags, 0);
 693
 694         old_queue = timer_call_dequeue_unlocked(call);
 695
 696         if (old_queue != NULL) {
 697                 timer_queue_lock_spin(old_queue);
 698                 if (!queue_empty(&old_queue->head)) {
 699                         timer_queue_cancel(old_queue, TCE(call)->deadline, CE(queue_first(&old_queue->head))->deadline);
 700                         timer_call_t thead = (timer_call_t)queue_first(&old_queue->head);
 701                         old_queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline;
 702                 }
 703                 else {
 704                         timer_queue_cancel(old_queue, TCE(call)->deadline, UINT64_MAX);
 705                         old_queue->earliest_soft_deadline = UINT64_MAX;
 706                 }
 707                 timer_queue_unlock(old_queue);
 708         }
 709         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 710                 DECR_TIMER_CANCEL | DBG_FUNC_END,
 711                 VM_KERNEL_UNSLIDE_OR_PERM(call),
 712                 VM_KERNEL_UNSLIDE_OR_PERM(old_queue),
 713                 TCE(call)->deadline - mach_absolute_time(),
 714                 TCE(call)->deadline - TCE(call)->entry_time, 0);
 715         splx(s);
 716
 717 #if CONFIG_DTRACE
 718         DTRACE_TMR6(callout__cancel, timer_call_func_t, TCE(call)->func,
 719             timer_call_param_t, TCE(call)->param0, uint32_t, call->flags, 0,
 720             (call->ttd >> 32), (unsigned) (call->ttd & 0xFFFFFFFF));
 721 #endif
 722
 723         return (old_queue != NULL);
 724 }
 725
 726 static uint32_t timer_queue_shutdown_lock_skips;
 727 static uint32_t timer_queue_shutdown_discarded;
 728
 729 void
 730 timer_queue_shutdown(
 731         mpqueue_head_t          *queue)
 732 {
 733         timer_call_t            call;
 734         mpqueue_head_t          *new_queue;
 735         spl_t                   s;
 736
 737
 738         DBG("timer_queue_shutdown(%p)\n", queue);
 739
 740         s = splclock();
 741
 742         /* Note comma operator in while expression re-locking each iteration */
 743         while (timer_queue_lock_spin(queue), !queue_empty(&queue->head)) {
 744                 call = TIMER_CALL(queue_first(&queue->head));
 745
 746                 if (!simple_lock_try(&call->lock)) {
 747                         /*
 748                          * case (2b) lock order inversion, dequeue and skip
 749                          * Don't change the call_entry queue back-pointer
 750                          * but set the async_dequeue field.
 751                          */
 752                         timer_queue_shutdown_lock_skips++;
 753                         timer_call_entry_dequeue_async(call);
 754 #if TIMER_ASSERT
 755                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 756                                 DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
 757                                 VM_KERNEL_UNSLIDE_OR_PERM(call),
 758                                 call->async_dequeue,
 759                                 VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
 760                                 0x2b, 0);
 761 #endif
 762                         timer_queue_unlock(queue);
 763                         continue;
 764                 }
 765
 766                 boolean_t call_local = ((call->flags & TIMER_CALL_LOCAL) != 0);
 767
 768                 /* remove entry from old queue */
 769                 timer_call_entry_dequeue(call);
 770                 timer_queue_unlock(queue);
 771
 772                 if (call_local == FALSE) {
 773                         /* and queue it on new, discarding LOCAL timers */
 774                         new_queue = timer_queue_assign(TCE(call)->deadline);
 775                         timer_queue_lock_spin(new_queue);
 776                         timer_call_entry_enqueue_deadline(
 777                                 call, new_queue, TCE(call)->deadline);
 778                         timer_queue_unlock(new_queue);
 779                 } else {
 780                         timer_queue_shutdown_discarded++;
 781                 }
 782
 783                 /* The only lingering LOCAL timer should be this thread's
 784                  * quantum expiration timer.
 785                  */
 786                 assert((call_local == FALSE) ||
 787                     (TCE(call)->func == thread_quantum_expire));
 788
 789                 simple_unlock(&call->lock);
 790         }
 791
 792         timer_queue_unlock(queue);
 793         splx(s);
 794 }
 795
 796 static uint32_t timer_queue_expire_lock_skips;
 797 uint64_t
 798 timer_queue_expire_with_options(
 799         mpqueue_head_t          *queue,
 800         uint64_t                deadline,
 801         boolean_t               rescan)
 802 {
 803         timer_call_t    call = NULL;
 804         uint32_t tc_iterations = 0;
 805         DBG("timer_queue_expire(%p,)\n", queue);
 806
 807         uint64_t cur_deadline = deadline;
 808         timer_queue_lock_spin(queue);
 809
 810         while (!queue_empty(&queue->head)) {
 811                 /* Upon processing one or more timer calls, refresh the
 812                  * deadline to account for time elapsed in the callout
 813                  */
 814                 if (++tc_iterations > 1)
 815                         cur_deadline = mach_absolute_time();
 816
 817                 if (call == NULL)
 818                         call = TIMER_CALL(queue_first(&queue->head));
 819
 820                 if (call->soft_deadline <= cur_deadline) {
 821                         timer_call_func_t               func;
 822                         timer_call_param_t              param0, param1;
 823
 824                         TCOAL_DEBUG(0xDDDD0000, queue->earliest_soft_deadline, call->soft_deadline, 0, 0, 0);
 825                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 826                                 DECR_TIMER_EXPIRE | DBG_FUNC_NONE,
 827                                 VM_KERNEL_UNSLIDE_OR_PERM(call),
 828                                 call->soft_deadline,
 829                                 TCE(call)->deadline,
 830                                 TCE(call)->entry_time, 0);
 831
 832                         if ((call->flags & TIMER_CALL_RATELIMITED) &&
 833                             (TCE(call)->deadline > cur_deadline)) {
 834                                 if (rescan == FALSE)
 835                                         break;
 836                         }
 837
 838                         if (!simple_lock_try(&call->lock)) {
 839                                 /* case (2b) lock inversion, dequeue and skip */
 840                                 timer_queue_expire_lock_skips++;
 841                                 timer_call_entry_dequeue_async(call);
 842                                 call = NULL;
 843                                 continue;
 844                         }
 845
 846                         timer_call_entry_dequeue(call);
 847
 848                         func = TCE(call)->func;
 849                         param0 = TCE(call)->param0;
 850                         param1 = TCE(call)->param1;
 851
 852                         simple_unlock(&call->lock);
 853                         timer_queue_unlock(queue);
 854
 855                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 856                                 DECR_TIMER_CALLOUT | DBG_FUNC_START,
 857                                 VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
 858                                 VM_KERNEL_UNSLIDE_OR_PERM(param0),
 859                                 VM_KERNEL_UNSLIDE_OR_PERM(param1),
 860                                 0);
 861
 862 #if CONFIG_DTRACE
 863                         DTRACE_TMR7(callout__start, timer_call_func_t, func,
 864                             timer_call_param_t, param0, unsigned, call->flags,
 865                             0, (call->ttd >> 32),
 866                             (unsigned) (call->ttd & 0xFFFFFFFF), call);
 867 #endif
 868                         /* Maintain time-to-deadline in per-processor data
 869                          * structure for thread wakeup deadline statistics.
 870                          */
 871                         uint64_t *ttdp = &(PROCESSOR_DATA(current_processor(), timer_call_ttd));
 872                         *ttdp = call->ttd;
 873                         (*func)(param0, param1);
 874                         *ttdp = 0;
 875 #if CONFIG_DTRACE
 876                         DTRACE_TMR4(callout__end, timer_call_func_t, func,
 877                             param0, param1, call);
 878 #endif
 879
 880                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 881                                 DECR_TIMER_CALLOUT | DBG_FUNC_END,
 882                                 VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
 883                                 VM_KERNEL_UNSLIDE_OR_PERM(param0),
 884                                 VM_KERNEL_UNSLIDE_OR_PERM(param1),
 885                                 0);
 886                         call = NULL;
 887                         timer_queue_lock_spin(queue);
 888                 } else {
 889                         if (__probable(rescan == FALSE)) {
 890                                 break;
 891                         } else {
 892                                 int64_t skew = TCE(call)->deadline - call->soft_deadline;
 893                                 assert(TCE(call)->deadline >= call->soft_deadline);
 894
 895                                 /* DRK: On a latency quality-of-service level change,
 896                                  * re-sort potentially rate-limited timers. The platform
 897                                  * layer determines which timers require
 898                                  * this. In the absence of the per-callout
 899                                  * synchronization requirement, a global resort could
 900                                  * be more efficient. The re-sort effectively
 901                                  * annuls all timer adjustments, i.e. the "soft
 902                                  * deadline" is the sort key.
 903                                  */
 904
 905                                 if (timer_resort_threshold(skew)) {
 906                                         if (__probable(simple_lock_try(&call->lock))) {
 907                                                 timer_call_entry_dequeue(call);
 908                                                 timer_call_entry_enqueue_deadline(call, queue, call->soft_deadline);
 909                                                 simple_unlock(&call->lock);
 910                                                 call = NULL;
 911                                         }
 912                                 }
 913                                 if (call) {
 914                                         call = TIMER_CALL(queue_next(qe(call)));
 915                                         if (queue_end(&queue->head, qe(call)))
 916                                                 break;
 917                                 }
 918                         }
 919                 }
 920         }
 921
 922         if (!queue_empty(&queue->head)) {
 923                 call = TIMER_CALL(queue_first(&queue->head));
 924                 cur_deadline = TCE(call)->deadline;
 925                 queue->earliest_soft_deadline = (call->flags & TIMER_CALL_RATELIMITED) ? TCE(call)->deadline: call->soft_deadline;
 926         } else {
 927                 queue->earliest_soft_deadline = cur_deadline = UINT64_MAX;
 928         }
 929
 930         timer_queue_unlock(queue);
 931
 932         return (cur_deadline);
 933 }
 934
 935 uint64_t
 936 timer_queue_expire(
 937         mpqueue_head_t          *queue,
 938         uint64_t                deadline)
 939 {
 940         return timer_queue_expire_with_options(queue, deadline, FALSE);
 941 }
 942
 943 extern int serverperfmode;
 944 static uint32_t timer_queue_migrate_lock_skips;
 945 /*
 946  * timer_queue_migrate() is called by timer_queue_migrate_cpu()
 947  * to move timer requests from the local processor (queue_from)
 948  * to a target processor's (queue_to).
 949  */
 950 int
 951 timer_queue_migrate(mpqueue_head_t *queue_from, mpqueue_head_t *queue_to)
 952 {
 953         timer_call_t    call;
 954         timer_call_t    head_to;
 955         int             timers_migrated = 0;
 956
 957         DBG("timer_queue_migrate(%p,%p)\n", queue_from, queue_to);
 958
 959         assert(!ml_get_interrupts_enabled());
 960         assert(queue_from != queue_to);
 961
 962         if (serverperfmode) {
 963                 /*
 964                  * if we're running a high end server
 965                  * avoid migrations... they add latency
 966                  * and don't save us power under typical
 967                  * server workloads
 968                  */
 969                 return -4;
 970         }
 971
 972         /*
 973          * Take both local (from) and target (to) timer queue locks while
 974          * moving the timers from the local queue to the target processor.
 975          * We assume that the target is always the boot processor.
 976          * But only move if all of the following is true:
 977          *  - the target queue is non-empty
 978          *  - the local queue is non-empty
 979          *  - the local queue's first deadline is later than the target's
 980          *  - the local queue contains no non-migrateable "local" call
 981          * so that we need not have the target resync.
 982          */
 983
 984         timer_queue_lock_spin(queue_to);
 985
 986         head_to = TIMER_CALL(queue_first(&queue_to->head));
 987         if (queue_empty(&queue_to->head)) {
 988                 timers_migrated = -1;
 989                 goto abort1;
 990         }
 991
 992         timer_queue_lock_spin(queue_from);
 993
 994         if (queue_empty(&queue_from->head)) {
 995                 timers_migrated = -2;
 996                 goto abort2;
 997         }
 998
 999         call = TIMER_CALL(queue_first(&queue_from->head));
1000         if (TCE(call)->deadline < TCE(head_to)->deadline) {
1001                 timers_migrated = 0;
1002                 goto abort2;
1003         }
1004
1005         /* perform scan for non-migratable timers */
1006         do {
1007                 if (call->flags & TIMER_CALL_LOCAL) {
1008                         timers_migrated = -3;
1009                         goto abort2;
1010                 }
1011                 call = TIMER_CALL(queue_next(qe(call)));
1012         } while (!queue_end(&queue_from->head, qe(call)));
1013
1014         /* migration loop itself -- both queues are locked */
1015         while (!queue_empty(&queue_from->head)) {
1016                 call = TIMER_CALL(queue_first(&queue_from->head));
1017                 if (!simple_lock_try(&call->lock)) {
1018                         /* case (2b) lock order inversion, dequeue only */
1019 #ifdef TIMER_ASSERT
1020                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1021                                 DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
1022                                 VM_KERNEL_UNSLIDE_OR_PERM(call),
1023                                 VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
1024                                 VM_KERNEL_UNSLIDE_OR_PERM(call->lock.interlock.lock_data),
1025                                 0x2b, 0);
1026 #endif
1027                         timer_queue_migrate_lock_skips++;
1028                         timer_call_entry_dequeue_async(call);
1029                         continue;
1030                 }
1031                 timer_call_entry_dequeue(call);
1032                 timer_call_entry_enqueue_deadline(
1033                         call, queue_to, TCE(call)->deadline);
1034                 timers_migrated++;
1035                 simple_unlock(&call->lock);
1036         }
1037         queue_from->earliest_soft_deadline = UINT64_MAX;
1038 abort2:
1039         timer_queue_unlock(queue_from);
1040 abort1:
1041         timer_queue_unlock(queue_to);
1042
1043         return timers_migrated;
1044 }
1045
1046 void
1047 timer_queue_trace_cpu(int ncpu)
1048 {
1049         timer_call_nosync_cpu(
1050                 ncpu,
1051                 (void(*)())timer_queue_trace,
1052                 (void*) timer_queue_cpu(ncpu));
1053 }
1054
1055 void
1056 timer_queue_trace(
1057         mpqueue_head_t                  *queue)
1058 {
1059         timer_call_t    call;
1060         spl_t           s;
1061
1062         if (!kdebug_enable)
1063                 return;
1064
1065         s = splclock();
1066         timer_queue_lock_spin(queue);
1067
1068         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1069                 DECR_TIMER_QUEUE | DBG_FUNC_START,
1070                 queue->count, mach_absolute_time(), 0, 0, 0);
1071
1072         if (!queue_empty(&queue->head)) {
1073                 call = TIMER_CALL(queue_first(&queue->head));
1074                 do {
1075                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1076                                 DECR_TIMER_QUEUE | DBG_FUNC_NONE,
1077                                 call->soft_deadline,
1078                                 TCE(call)->deadline,
1079                                 TCE(call)->entry_time,
1080                                 VM_KERNEL_UNSLIDE(TCE(call)->func),
1081                                 0);
1082                         call = TIMER_CALL(queue_next(qe(call)));
1083                 } while (!queue_end(&queue->head, qe(call)));
1084         }
1085
1086         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1087                 DECR_TIMER_QUEUE | DBG_FUNC_END,
1088                 queue->count, mach_absolute_time(), 0, 0, 0);
1089
1090         timer_queue_unlock(queue);
1091         splx(s);
1092 }
1093
1094 void
1095 timer_longterm_dequeued_locked(timer_call_t call)
1096 {
1097         timer_longterm_t        *tlp = &timer_longterm;
1098
1099         tlp->dequeues++;
1100         if (call == tlp->threshold.call)
1101                 tlp->threshold.call = NULL;
1102 }
1103
1104 /*
1105  * Place a timer call in the longterm list
1106  * and adjust the next timer callout deadline if the new timer is first.
1107  */
1108 mpqueue_head_t *
1109 timer_longterm_enqueue_unlocked(timer_call_t    call,
1110                                 uint64_t        now,
1111                                 uint64_t        deadline,
1112                                 mpqueue_head_t  **old_queue,
1113                                 uint64_t        soft_deadline,
1114                                 uint64_t        ttd,
1115                                 timer_call_param_t      param1,
1116                                 uint32_t        callout_flags)
1117 {
1118         timer_longterm_t        *tlp = &timer_longterm;
1119         boolean_t               update_required = FALSE;
1120         uint64_t                longterm_threshold;
1121
1122         longterm_threshold = now + tlp->threshold.interval;
1123
1124         /*
1125          * Return NULL without doing anything if:
1126          *  - this timer is local, or
1127          *  - the longterm mechanism is disabled, or
1128          *  - this deadline is too short.
1129          */
1130         if ((callout_flags & TIMER_CALL_LOCAL) != 0 ||
1131             (tlp->threshold.interval == TIMER_LONGTERM_NONE) ||
1132                 (deadline <= longterm_threshold))
1133                 return NULL;
1134
1135         /*
1136          * Remove timer from its current queue, if any.
1137          */
1138         *old_queue = timer_call_dequeue_unlocked(call);
1139
1140         /*
1141          * Lock the longterm queue, queue timer and determine
1142          * whether an update is necessary.
1143          */
1144         assert(!ml_get_interrupts_enabled());
1145         simple_lock(&call->lock);
1146         timer_queue_lock_spin(timer_longterm_queue);
1147         TCE(call)->deadline = deadline;
1148         TCE(call)->param1 = param1;
1149         call->ttd = ttd;
1150         call->soft_deadline = soft_deadline;
1151         call->flags = callout_flags;
1152         timer_call_entry_enqueue_tail(call, timer_longterm_queue);
1153
1154         tlp->enqueues++;
1155
1156         /*
1157          * We'll need to update the currently set threshold timer
1158          * if the new deadline is sooner and no sooner update is in flight.
1159          */
1160         if (deadline < tlp->threshold.deadline &&
1161             deadline < tlp->threshold.preempted) {
1162                 tlp->threshold.preempted = deadline;
1163                 tlp->threshold.call = call;
1164                 update_required = TRUE;
1165         }
1166         timer_queue_unlock(timer_longterm_queue);
1167         simple_unlock(&call->lock);
1168
1169         if (update_required) {
1170                 /*
1171                  * Note: this call expects that calling the master cpu
1172                  * alone does not involve locking the topo lock.
1173                  */
1174                 timer_call_nosync_cpu(
1175                         master_cpu,
1176                         (void (*)(void *)) timer_longterm_update,
1177                         (void *)tlp);
1178         }
1179
1180         return timer_longterm_queue;
1181 }
1182
1183 /*
1184  * Scan for timers below the longterm threshold.
1185  * Move these to the local timer queue (of the boot processor on which the
1186  * calling thread is running).
1187  * Both the local (boot) queue and the longterm queue are locked.
1188  * The scan is similar to the timer migrate sequence but is performed by
1189  * successively examining each timer on the longterm queue:
1190  *  - if within the short-term threshold
1191  *    - enter on the local queue (unless being deleted),
1192  *  - otherwise:
1193  *    - if sooner, deadline becomes the next threshold deadline.
1194  */
1195 void
1196 timer_longterm_scan(timer_longterm_t    *tlp,
1197                     uint64_t            now)
1198 {
1199         queue_entry_t   qe;
1200         timer_call_t    call;
1201         uint64_t        threshold;
1202         uint64_t        deadline;
1203         mpqueue_head_t  *timer_master_queue;
1204
1205         assert(!ml_get_interrupts_enabled());
1206         assert(cpu_number() == master_cpu);
1207
1208         if (tlp->threshold.interval != TIMER_LONGTERM_NONE)
1209                 threshold = now + tlp->threshold.interval;
1210         else
1211                 threshold = TIMER_LONGTERM_NONE;
1212
1213         tlp->threshold.deadline = TIMER_LONGTERM_NONE;
1214         tlp->threshold.call = NULL;
1215
1216         if (queue_empty(&timer_longterm_queue->head))
1217                 return;
1218
1219         timer_master_queue = timer_queue_cpu(master_cpu);
1220         timer_queue_lock_spin(timer_master_queue);
1221
1222         qe = queue_first(&timer_longterm_queue->head);
1223         while (!queue_end(&timer_longterm_queue->head, qe)) {
1224                 call = TIMER_CALL(qe);
1225                 deadline = call->soft_deadline;
1226                 qe = queue_next(qe);
1227                 if (!simple_lock_try(&call->lock)) {
1228                         /* case (2c) lock order inversion, dequeue only */
1229 #ifdef TIMER_ASSERT
1230                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1231                                 DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
1232                                 VM_KERNEL_UNSLIDE_OR_PERM(call),
1233                                 VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
1234                                 VM_KERNEL_UNSLIDE_OR_PERM(call->lock.interlock.lock_data),
1235                                 0x2c, 0);
1236 #endif
1237                         timer_call_entry_dequeue_async(call);
1238                         continue;
1239                 }
1240                 if (deadline < threshold) {
1241                         /*
1242                          * This timer needs moving (escalating)
1243                          * to the local (boot) processor's queue.
1244                          */
1245 #ifdef TIMER_ASSERT
1246                         if (deadline < now)
1247                                 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1248                                         DECR_TIMER_OVERDUE | DBG_FUNC_NONE,
1249                                         VM_KERNEL_UNSLIDE_OR_PERM(call),
1250                                         deadline,
1251                                         now,
1252                                         threshold,
1253                                         0);
1254 #endif
1255                         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1256                                 DECR_TIMER_ESCALATE | DBG_FUNC_NONE,
1257                                 VM_KERNEL_UNSLIDE_OR_PERM(call),
1258                                 TCE(call)->deadline,
1259                                 TCE(call)->entry_time,
1260                                 VM_KERNEL_UNSLIDE(TCE(call)->func),
1261                                 0);
1262                         tlp->escalates++;
1263                         timer_call_entry_dequeue(call);
1264                         timer_call_entry_enqueue_deadline(
1265                                 call, timer_master_queue, TCE(call)->deadline);
1266                         /*
1267                          * A side-effect of the following call is to update
1268                          * the actual hardware deadline if required.
1269                          */
1270                         (void) timer_queue_assign(deadline);
1271                 } else {
1272                         if (deadline < tlp->threshold.deadline) {
1273                                 tlp->threshold.deadline = deadline;
1274                                 tlp->threshold.call = call;
1275                         }
1276                 }
1277                 simple_unlock(&call->lock);
1278         }
1279
1280         timer_queue_unlock(timer_master_queue);
1281 }
1282
1283 void
1284 timer_longterm_callout(timer_call_param_t p0, __unused timer_call_param_t p1)
1285 {
1286         timer_longterm_t        *tlp = (timer_longterm_t *) p0;
1287
1288         timer_longterm_update(tlp);
1289 }
1290
1291 void
1292 timer_longterm_update_locked(timer_longterm_t *tlp)
1293 {
1294         uint64_t        latency;
1295
1296         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1297                 DECR_TIMER_UPDATE | DBG_FUNC_START,
1298                 VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
1299                 tlp->threshold.deadline,
1300                 tlp->threshold.preempted,
1301                 tlp->queue.count, 0);
1302
1303         tlp->scan_time = mach_absolute_time();
1304         if (tlp->threshold.preempted != TIMER_LONGTERM_NONE) {
1305                 tlp->threshold.preempts++;
1306                 tlp->threshold.deadline = tlp->threshold.preempted;
1307                 tlp->threshold.preempted = TIMER_LONGTERM_NONE;
1308                 /*
1309                  * Note: in the unlikely event that a pre-empted timer has
1310                  * itself been cancelled, we'll simply re-scan later at the
1311                  * time of the preempted/cancelled timer.
1312                  */
1313         } else {
1314                 tlp->threshold.scans++;
1315
1316                 /*
1317                  * Maintain a moving average of our wakeup latency.
1318                  * Clamp latency to 0 and ignore above threshold interval.
1319                  */
1320                 if (tlp->scan_time > tlp->threshold.deadline_set)
1321                         latency = tlp->scan_time - tlp->threshold.deadline_set;
1322                 else
1323                         latency = 0;
1324                 if (latency < tlp->threshold.interval) {
1325                         tlp->threshold.latency_min =
1326                                 MIN(tlp->threshold.latency_min, latency);
1327                         tlp->threshold.latency_max =
1328                                 MAX(tlp->threshold.latency_max, latency);
1329                         tlp->threshold.latency =
1330                                 (tlp->threshold.latency*99 + latency) / 100;
1331                 }
1332
1333                 timer_longterm_scan(tlp, tlp->scan_time);
1334         }
1335
1336         tlp->threshold.deadline_set = tlp->threshold.deadline;
1337         /* The next deadline timer to be set is adjusted */
1338         if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
1339                 tlp->threshold.deadline_set -= tlp->threshold.margin;
1340                 tlp->threshold.deadline_set -= tlp->threshold.latency;
1341         }
1342
1343         TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1344                 DECR_TIMER_UPDATE | DBG_FUNC_END,
1345                 VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
1346                 tlp->threshold.deadline,
1347                 tlp->threshold.scans,
1348                 tlp->queue.count, 0);
1349 }
1350
1351 void
1352 timer_longterm_update(timer_longterm_t *tlp)
1353 {
1354         spl_t   s = splclock();
1355
1356         timer_queue_lock_spin(timer_longterm_queue);
1357
1358         if (cpu_number() != master_cpu)
1359                 panic("timer_longterm_update_master() on non-boot cpu");
1360
1361         timer_longterm_update_locked(tlp);
1362
1363         if (tlp->threshold.deadline != TIMER_LONGTERM_NONE)
1364                 timer_call_enter(
1365                         &tlp->threshold.timer,
1366                         tlp->threshold.deadline_set,
1367                         TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
1368
1369         timer_queue_unlock(timer_longterm_queue);
1370         splx(s);
1371 }
1372
1373 void
1374 timer_longterm_init(void)
1375 {
1376         uint32_t                longterm;
1377         timer_longterm_t        *tlp = &timer_longterm;
1378
1379         DBG("timer_longterm_init() tlp: %p, queue: %p\n", tlp, &tlp->queue);
1380
1381         /*
1382          * Set the longterm timer threshold. Defaults to TIMER_LONGTERM_THRESHOLD
1383          * or TIMER_LONGTERM_NONE (disabled) for server;
1384          * overridden longterm boot-arg
1385          */
1386         tlp->threshold.interval = serverperfmode ? TIMER_LONGTERM_NONE
1387                                                  : TIMER_LONGTERM_THRESHOLD;
1388         if (PE_parse_boot_argn("longterm", &longterm, sizeof (longterm))) {
1389                 tlp->threshold.interval = (longterm == 0) ?
1390                                                 TIMER_LONGTERM_NONE :
1391                                                 longterm * NSEC_PER_MSEC;
1392         }
1393         if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
1394                 printf("Longterm timer threshold: %llu ms\n",
1395                         tlp->threshold.interval / NSEC_PER_MSEC);
1396                 kprintf("Longterm timer threshold: %llu ms\n",
1397                         tlp->threshold.interval / NSEC_PER_MSEC);
1398                 nanoseconds_to_absolutetime(tlp->threshold.interval,
1399                                             &tlp->threshold.interval);
1400                 tlp->threshold.margin = tlp->threshold.interval / 10;
1401                 tlp->threshold.latency_min = EndOfAllTime;
1402                 tlp->threshold.latency_max = 0;
1403         }
1404
1405         tlp->threshold.preempted = TIMER_LONGTERM_NONE;
1406         tlp->threshold.deadline = TIMER_LONGTERM_NONE;
1407
1408         lck_attr_setdefault(&timer_longterm_lck_attr);
1409         lck_grp_attr_setdefault(&timer_longterm_lck_grp_attr);
1410         lck_grp_init(&timer_longterm_lck_grp,
1411                      "timer_longterm", &timer_longterm_lck_grp_attr);
1412         mpqueue_init(&tlp->queue,
1413                      &timer_longterm_lck_grp, &timer_longterm_lck_attr);
1414
1415         timer_call_setup(&tlp->threshold.timer,
1416                          timer_longterm_callout, (timer_call_param_t) tlp);
1417
1418         timer_longterm_queue = &tlp->queue;
1419 }
1420
1421 enum {
1422         THRESHOLD, QCOUNT,
1423         ENQUEUES, DEQUEUES, ESCALATES, SCANS, PREEMPTS,
1424         LATENCY, LATENCY_MIN, LATENCY_MAX
1425 };
1426 uint64_t
1427 timer_sysctl_get(int oid)
1428 {
1429         timer_longterm_t        *tlp = &timer_longterm;
1430
1431         switch (oid) {
1432         case THRESHOLD:
1433                 return (tlp->threshold.interval == TIMER_LONGTERM_NONE) ?
1434                         0 : tlp->threshold.interval / NSEC_PER_MSEC;
1435         case QCOUNT:
1436                 return tlp->queue.count;
1437         case ENQUEUES:
1438                 return tlp->enqueues;
1439         case DEQUEUES:
1440                 return tlp->dequeues;
1441         case ESCALATES:
1442                 return tlp->escalates;
1443         case SCANS:
1444                 return tlp->threshold.scans;
1445         case PREEMPTS:
1446                 return tlp->threshold.preempts;
1447         case LATENCY:
1448                 return tlp->threshold.latency;
1449         case LATENCY_MIN:
1450                 return tlp->threshold.latency_min;
1451         case LATENCY_MAX:
1452                 return tlp->threshold.latency_max;
1453         default:
1454                 return 0;
1455         }
1456 }
1457
1458 /*
1459  * timer_master_scan() is the inverse of timer_longterm_scan()
1460  * since it un-escalates timers to the longterm queue.
1461  */
1462 static void
1463 timer_master_scan(timer_longterm_t      *tlp,
1464                   uint64_t              now)
1465 {
1466         queue_entry_t   qe;
1467         timer_call_t    call;
1468         uint64_t        threshold;
1469         uint64_t        deadline;
1470         mpqueue_head_t  *timer_master_queue;
1471
1472         if (tlp->threshold.interval != TIMER_LONGTERM_NONE)
1473                 threshold = now + tlp->threshold.interval;
1474         else
1475                 threshold = TIMER_LONGTERM_NONE;
1476
1477         timer_master_queue = timer_queue_cpu(master_cpu);
1478         timer_queue_lock_spin(timer_master_queue);
1479
1480         qe = queue_first(&timer_master_queue->head);
1481         while (!queue_end(&timer_master_queue->head, qe)) {
1482                 call = TIMER_CALL(qe);
1483                 deadline = TCE(call)->deadline;
1484                 qe = queue_next(qe);
1485                 if ((call->flags & TIMER_CALL_LOCAL) != 0)
1486                         continue;
1487                 if (!simple_lock_try(&call->lock)) {
1488                         /* case (2c) lock order inversion, dequeue only */
1489                         timer_call_entry_dequeue_async(call);
1490                         continue;
1491                 }
1492                 if (deadline > threshold) {
1493                         /* move from master to longterm */
1494                         timer_call_entry_dequeue(call);
1495                         timer_call_entry_enqueue_tail(call, timer_longterm_queue);
1496                         if (deadline < tlp->threshold.deadline) {
1497                                 tlp->threshold.deadline = deadline;
1498                                 tlp->threshold.call = call;
1499                         }
1500                 }
1501                 simple_unlock(&call->lock);
1502         }
1503         timer_queue_unlock(timer_master_queue);
1504 }
1505
1506 static void
1507 timer_sysctl_set_threshold(uint64_t value)
1508 {
1509         timer_longterm_t        *tlp = &timer_longterm;
1510         spl_t                   s = splclock();
1511         boolean_t               threshold_increase;
1512
1513         timer_queue_lock_spin(timer_longterm_queue);
1514
1515         timer_call_cancel(&tlp->threshold.timer);
1516
1517         /*
1518          * Set the new threshold and note whther it's increasing.
1519          */
1520         if (value == 0) {
1521                 tlp->threshold.interval = TIMER_LONGTERM_NONE;
1522                 threshold_increase = TRUE;
1523                 timer_call_cancel(&tlp->threshold.timer);
1524         } else {
1525                 uint64_t        old_interval = tlp->threshold.interval;
1526                 tlp->threshold.interval = value * NSEC_PER_MSEC;
1527                 nanoseconds_to_absolutetime(tlp->threshold.interval,
1528                                             &tlp->threshold.interval);
1529                 tlp->threshold.margin = tlp->threshold.interval / 10;
1530                 if  (old_interval == TIMER_LONGTERM_NONE)
1531                         threshold_increase = FALSE;
1532                 else
1533                         threshold_increase = (tlp->threshold.interval > old_interval);
1534         }
1535
1536         if (threshold_increase /* or removal */) {
1537                 /* Escalate timers from the longterm queue */
1538                 timer_longterm_scan(tlp, mach_absolute_time());
1539         } else /* decrease or addition  */ {
1540                 /*
1541                  * We scan the local/master queue for timers now longterm.
1542                  * To be strictly correct, we should scan all processor queues
1543                  * but timer migration results in most timers gravitating to the
1544                  * master processor in any case.
1545                  */
1546                 timer_master_scan(tlp, mach_absolute_time());
1547         }
1548
1549         /* Set new timer accordingly */
1550         tlp->threshold.deadline_set = tlp->threshold.deadline;
1551         if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
1552                 tlp->threshold.deadline_set -= tlp->threshold.margin;
1553                 tlp->threshold.deadline_set -= tlp->threshold.latency;
1554                 timer_call_enter(
1555                         &tlp->threshold.timer,
1556                         tlp->threshold.deadline_set,
1557                         TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
1558         }
1559
1560         /* Reset stats */
1561         tlp->enqueues = 0;
1562         tlp->dequeues = 0;
1563         tlp->escalates = 0;
1564         tlp->threshold.scans = 0;
1565         tlp->threshold.preempts = 0;
1566         tlp->threshold.latency = 0;
1567         tlp->threshold.latency_min = EndOfAllTime;
1568         tlp->threshold.latency_max = 0;
1569
1570         timer_queue_unlock(timer_longterm_queue);
1571         splx(s);
1572 }
1573
1574 int
1575 timer_sysctl_set(int oid, uint64_t value)
1576 {
1577         switch (oid) {
1578         case THRESHOLD:
1579                 timer_call_cpu(
1580                         master_cpu,
1581                         (void (*)(void *)) timer_sysctl_set_threshold,
1582                         (void *) value);
1583                 return KERN_SUCCESS;
1584         default:
1585                 return KERN_INVALID_ARGUMENT;
1586         }
1587 }
1588
1589
1590 /* Select timer coalescing window based on per-task quality-of-service hints */
1591 static boolean_t tcoal_qos_adjust(thread_t t, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited) {
1592         uint32_t latency_qos;
1593         boolean_t adjusted = FALSE;
1594         task_t ctask = t->task;
1595
1596         if (ctask) {
1597                 latency_qos = proc_get_effective_thread_policy(t, TASK_POLICY_LATENCY_QOS);
1598
1599                 assert(latency_qos <= NUM_LATENCY_QOS_TIERS);
1600
1601                 if (latency_qos) {
1602                         *tshift = tcoal_prio_params.latency_qos_scale[latency_qos - 1];
1603                         *tmax_abstime = tcoal_prio_params.latency_qos_abstime_max[latency_qos - 1];
1604                         *pratelimited = tcoal_prio_params.latency_tier_rate_limited[latency_qos - 1];
1605                         adjusted = TRUE;
1606                 }
1607         }
1608         return adjusted;
1609 }
1610
1611
1612 /* Adjust timer deadlines based on priority of the thread and the
1613  * urgency value provided at timeout establishment. With this mechanism,
1614  * timers are no longer necessarily sorted in order of soft deadline
1615  * on a given timer queue, i.e. they may be differentially skewed.
1616  * In the current scheme, this could lead to fewer pending timers
1617  * processed than is technically possible when the HW deadline arrives.
1618  */
1619 static void
1620 timer_compute_leeway(thread_t cthread, int32_t urgency, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited) {
1621         int16_t tpri = cthread->sched_pri;
1622         if ((urgency & TIMER_CALL_USER_MASK) != 0) {
1623                 if (tpri >= BASEPRI_RTQUEUES ||
1624                 urgency == TIMER_CALL_USER_CRITICAL) {
1625                         *tshift = tcoal_prio_params.timer_coalesce_rt_shift;
1626                         *tmax_abstime = tcoal_prio_params.timer_coalesce_rt_abstime_max;
1627                         TCOAL_PRIO_STAT(rt_tcl);
1628                 } else if (proc_get_effective_thread_policy(cthread, TASK_POLICY_DARWIN_BG) ||
1629                 (urgency == TIMER_CALL_USER_BACKGROUND)) {
1630                         /* Determine if timer should be subjected to a lower QoS */
1631                         if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
1632                                 if (*tmax_abstime > tcoal_prio_params.timer_coalesce_bg_abstime_max) {
1633                                         return;
1634                                 } else {
1635                                         *pratelimited = FALSE;
1636                                 }
1637                         }
1638                         *tshift = tcoal_prio_params.timer_coalesce_bg_shift;
1639                         *tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
1640                         TCOAL_PRIO_STAT(bg_tcl);
1641                 } else if (tpri >= MINPRI_KERNEL) {
1642                         *tshift = tcoal_prio_params.timer_coalesce_kt_shift;
1643                         *tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
1644                         TCOAL_PRIO_STAT(kt_tcl);
1645                 } else if (cthread->sched_mode == TH_MODE_FIXED) {
1646                         *tshift = tcoal_prio_params.timer_coalesce_fp_shift;
1647                         *tmax_abstime = tcoal_prio_params.timer_coalesce_fp_abstime_max;
1648                         TCOAL_PRIO_STAT(fp_tcl);
1649                 } else if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
1650                         TCOAL_PRIO_STAT(qos_tcl);
1651                 } else if (cthread->sched_mode == TH_MODE_TIMESHARE) {
1652                         *tshift = tcoal_prio_params.timer_coalesce_ts_shift;
1653                         *tmax_abstime = tcoal_prio_params.timer_coalesce_ts_abstime_max;
1654                         TCOAL_PRIO_STAT(ts_tcl);
1655                 } else {
1656                         TCOAL_PRIO_STAT(nc_tcl);
1657                 }
1658         } else if (urgency == TIMER_CALL_SYS_BACKGROUND) {
1659                 *tshift = tcoal_prio_params.timer_coalesce_bg_shift;
1660                 *tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
1661                 TCOAL_PRIO_STAT(bg_tcl);
1662         } else {
1663                 *tshift = tcoal_prio_params.timer_coalesce_kt_shift;
1664                 *tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
1665                 TCOAL_PRIO_STAT(kt_tcl);
1666         }
1667 }
1668
1669
1670 int timer_user_idle_level;
1671
1672 uint64_t
1673 timer_call_slop(uint64_t deadline, uint64_t now, uint32_t flags, thread_t cthread, boolean_t *pratelimited)
1674 {
1675         int32_t tcs_shift = 0;
1676         uint64_t tcs_max_abstime = 0;
1677         uint64_t adjval;
1678         uint32_t urgency = (flags & TIMER_CALL_URGENCY_MASK);
1679
1680         if (mach_timer_coalescing_enabled &&
1681             (deadline > now) && (urgency != TIMER_CALL_SYS_CRITICAL)) {
1682                 timer_compute_leeway(cthread, urgency, &tcs_shift, &tcs_max_abstime, pratelimited);
1683
1684                 if (tcs_shift >= 0)
1685                         adjval =  MIN((deadline - now) >> tcs_shift, tcs_max_abstime);
1686                 else
1687                         adjval =  MIN((deadline - now) << (-tcs_shift), tcs_max_abstime);
1688                 /* Apply adjustments derived from "user idle level" heuristic */
1689                 adjval += (adjval * timer_user_idle_level) >> 7;
1690                 return adjval;
1691         } else {
1692                 return 0;
1693         }
1694 }
1695
1696 int
1697 timer_get_user_idle_level(void) {
1698         return timer_user_idle_level;
1699 }
1700
1701 kern_return_t timer_set_user_idle_level(int ilevel) {
1702         boolean_t do_reeval = FALSE;
1703
1704         if ((ilevel < 0) || (ilevel > 128))
1705                 return KERN_INVALID_ARGUMENT;
1706
1707         if (ilevel < timer_user_idle_level) {
1708                 do_reeval = TRUE;
1709         }
1710
1711         timer_user_idle_level = ilevel;
1712
1713         if (do_reeval)
1714                 ml_timer_evaluate();
1715
1716         return KERN_SUCCESS;
1717 }