/*
 * Copyright (c) 1993-2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Timer interrupt callout module.
 */

#include <mach/mach_types.h>

#include <kern/clock.h>
#include <kern/smp.h>
#include <kern/processor.h>
#include <kern/timer_call.h>
#include <kern/timer_queue.h>
#include <kern/call_entry.h>
#include <kern/thread.h>
#include <kern/policy_internal.h>

#include <sys/kdebug.h>

#if CONFIG_DTRACE
#include <mach/sdt.h>
#endif


#if DEBUG
#define TIMER_ASSERT	1
#endif

//#define TIMER_ASSERT	1
//#define TIMER_DBG	1

#if TIMER_DBG
#define DBG(x...) kprintf("DBG: " x);
#else
#define DBG(x...)
#endif

#if TIMER_TRACE
#define TIMER_KDEBUG_TRACE	KERNEL_DEBUG_CONSTANT_IST
#else
#define TIMER_KDEBUG_TRACE(x...)
#endif


lck_grp_t	timer_call_lck_grp;
lck_attr_t	timer_call_lck_attr;
lck_grp_attr_t	timer_call_lck_grp_attr;

lck_grp_t	timer_longterm_lck_grp;
lck_attr_t	timer_longterm_lck_attr;
lck_grp_attr_t	timer_longterm_lck_grp_attr;

/* Timer queue lock must be acquired with interrupts disabled (under splclock()) */
#if __SMP__
#define timer_queue_lock_spin(queue)					\
	lck_mtx_lock_spin_always(&queue->lock_data)

#define timer_queue_unlock(queue)					\
	lck_mtx_unlock_always(&queue->lock_data)
#else
#define timer_queue_lock_spin(queue)	(void)1
#define timer_queue_unlock(queue)	(void)1
#endif

#define QUEUE(x)	((queue_t)(x))
#define MPQUEUE(x)	((mpqueue_head_t *)(x))
#define TIMER_CALL(x)	((timer_call_t)(x))
#define TCE(x)		(&(x->call_entry))
/*
 * The longterm timer object is a global structure holding all timers
 * beyond the short-term, local timer queue threshold. The boot processor
 * is responsible for moving each timer to its local timer queue
 * if and when that timer becomes due within the threshold.
 */
#define TIMER_LONGTERM_NONE		EndOfAllTime
#if defined(__x86_64__)
#define TIMER_LONGTERM_THRESHOLD	(1ULL * NSEC_PER_SEC)
#else
#define TIMER_LONGTERM_THRESHOLD	TIMER_LONGTERM_NONE
#endif
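/*
 * Worked example (illustrative; the threshold is tunable via the "longterm"
 * boot-arg parsed in timer_longterm_init() below): with the default 1 s
 * threshold on x86_64, a timer armed 5 s out is parked on the global
 * longterm queue, and the boot processor's threshold callout escalates it
 * onto a per-cpu (short-term) queue roughly 1 s before it becomes due.
 */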

typedef struct {
	uint64_t	interval;	/* longterm timer interval */
	uint64_t	margin;		/* fudge factor (10% of interval) */
	uint64_t	deadline;	/* first/soonest longterm deadline */
	uint64_t	preempted;	/* sooner timer has pre-empted */
	timer_call_t	call;		/* first/soonest longterm timer call */
	uint64_t	deadline_set;	/* next timer set */
	timer_call_data_t timer;	/* timer used by threshold management */
					/* Stats: */
	uint64_t	scans;		/* num threshold timer scans */
	uint64_t	preempts;	/* num threshold reductions */
	uint64_t	latency;	/* average threshold latency */
	uint64_t	latency_min;	/* minimum threshold latency */
	uint64_t	latency_max;	/* maximum threshold latency */
} threshold_t;

typedef struct {
	mpqueue_head_t	queue;		/* longterm timer list */
	uint64_t	enqueues;	/* num timers queued */
	uint64_t	dequeues;	/* num timers dequeued */
	uint64_t	escalates;	/* num timers becoming shortterm */
	uint64_t	scan_time;	/* last time the list was scanned */
	threshold_t	threshold;	/* longterm timer threshold */
} timer_longterm_t;

timer_longterm_t		timer_longterm;

static mpqueue_head_t		*timer_longterm_queue = NULL;

static void			timer_longterm_init(void);
static void			timer_longterm_callout(
					timer_call_param_t	p0,
					timer_call_param_t	p1);
extern void			timer_longterm_scan(
					timer_longterm_t	*tlp,
					uint64_t		now);
static void			timer_longterm_update(
					timer_longterm_t	*tlp);
static void			timer_longterm_update_locked(
					timer_longterm_t	*tlp);
static mpqueue_head_t *		timer_longterm_enqueue_unlocked(
					timer_call_t		call,
					uint64_t		now,
					uint64_t		deadline,
					mpqueue_head_t		**old_queue,
					uint64_t		soft_deadline,
					uint64_t		ttd,
					timer_call_param_t	param1,
					uint32_t		callout_flags);
static void			timer_longterm_dequeued_locked(
					timer_call_t		call);

uint64_t past_deadline_timers;
uint64_t past_deadline_deltas;
uint64_t past_deadline_longest;
uint64_t past_deadline_shortest = ~0ULL;
enum {PAST_DEADLINE_TIMER_ADJUSTMENT_NS = 10 * 1000};

uint64_t past_deadline_timer_adjustment;

static boolean_t timer_call_enter_internal(timer_call_t call, timer_call_param_t param1, uint64_t deadline, uint64_t leeway, uint32_t flags, boolean_t ratelimited);
boolean_t	mach_timer_coalescing_enabled = TRUE;

mpqueue_head_t	*timer_call_enqueue_deadline_unlocked(
			timer_call_t		call,
			mpqueue_head_t		*queue,
			uint64_t		deadline,
			uint64_t		soft_deadline,
			uint64_t		ttd,
			timer_call_param_t	param1,
			uint32_t		flags);

mpqueue_head_t	*timer_call_dequeue_unlocked(
			timer_call_t		call);

timer_coalescing_priority_params_t tcoal_prio_params;

#if TCOAL_PRIO_STATS
int32_t nc_tcl, rt_tcl, bg_tcl, kt_tcl, fp_tcl, ts_tcl, qos_tcl;
#define TCOAL_PRIO_STAT(x) (x++)
#else
#define TCOAL_PRIO_STAT(x)
#endif

static void
timer_call_init_abstime(void)
{
	int i;
	uint64_t result;
	timer_coalescing_priority_params_ns_t * tcoal_prio_params_init = timer_call_get_priority_params();
	nanoseconds_to_absolutetime(PAST_DEADLINE_TIMER_ADJUSTMENT_NS, &past_deadline_timer_adjustment);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->idle_entry_timer_processing_hdeadline_threshold_ns, &result);
	tcoal_prio_params.idle_entry_timer_processing_hdeadline_threshold_abstime = (uint32_t)result;
	nanoseconds_to_absolutetime(tcoal_prio_params_init->interrupt_timer_coalescing_ilat_threshold_ns, &result);
	tcoal_prio_params.interrupt_timer_coalescing_ilat_threshold_abstime = (uint32_t)result;
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_resort_threshold_ns, &result);
	tcoal_prio_params.timer_resort_threshold_abstime = (uint32_t)result;
	tcoal_prio_params.timer_coalesce_rt_shift = tcoal_prio_params_init->timer_coalesce_rt_shift;
	tcoal_prio_params.timer_coalesce_bg_shift = tcoal_prio_params_init->timer_coalesce_bg_shift;
	tcoal_prio_params.timer_coalesce_kt_shift = tcoal_prio_params_init->timer_coalesce_kt_shift;
	tcoal_prio_params.timer_coalesce_fp_shift = tcoal_prio_params_init->timer_coalesce_fp_shift;
	tcoal_prio_params.timer_coalesce_ts_shift = tcoal_prio_params_init->timer_coalesce_ts_shift;

	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_rt_ns_max,
	    &tcoal_prio_params.timer_coalesce_rt_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_bg_ns_max,
	    &tcoal_prio_params.timer_coalesce_bg_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_kt_ns_max,
	    &tcoal_prio_params.timer_coalesce_kt_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_fp_ns_max,
	    &tcoal_prio_params.timer_coalesce_fp_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_ts_ns_max,
	    &tcoal_prio_params.timer_coalesce_ts_abstime_max);

	for (i = 0; i < NUM_LATENCY_QOS_TIERS; i++) {
		tcoal_prio_params.latency_qos_scale[i] = tcoal_prio_params_init->latency_qos_scale[i];
		nanoseconds_to_absolutetime(tcoal_prio_params_init->latency_qos_ns_max[i],
		    &tcoal_prio_params.latency_qos_abstime_max[i]);
		tcoal_prio_params.latency_tier_rate_limited[i] = tcoal_prio_params_init->latency_tier_rate_limited[i];
	}
}


void
timer_call_init(void)
{
	lck_attr_setdefault(&timer_call_lck_attr);
	lck_grp_attr_setdefault(&timer_call_lck_grp_attr);
	lck_grp_init(&timer_call_lck_grp, "timer_call", &timer_call_lck_grp_attr);

	timer_longterm_init();
	timer_call_init_abstime();
}


void
timer_call_queue_init(mpqueue_head_t *queue)
{
	DBG("timer_call_queue_init(%p)\n", queue);
	mpqueue_init(queue, &timer_call_lck_grp, &timer_call_lck_attr);
}


void
timer_call_setup(
	timer_call_t		call,
	timer_call_func_t	func,
	timer_call_param_t	param0)
{
	DBG("timer_call_setup(%p,%p,%p)\n", call, func, param0);
	call_entry_setup(TCE(call), func, param0);
	simple_lock_init(&(call)->lock, 0);
	call->async_dequeue = FALSE;
}
#if TIMER_ASSERT
static __inline__ mpqueue_head_t *
timer_call_entry_dequeue(
	timer_call_t		entry)
{
	mpqueue_head_t	*old_queue = MPQUEUE(TCE(entry)->queue);

	if (!hw_lock_held((hw_lock_t)&entry->lock))
		panic("_call_entry_dequeue() "
			"entry %p is not locked\n", entry);
	/*
	 * XXX The queue lock is actually a mutex in spin mode
	 * but there's no way to test for it being held
	 * so we pretend it's a spinlock!
	 */
	if (!hw_lock_held((hw_lock_t)&old_queue->lock_data))
		panic("_call_entry_dequeue() "
			"queue %p is not locked\n", old_queue);

	call_entry_dequeue(TCE(entry));
	old_queue->count--;

	return (old_queue);
}

static __inline__ mpqueue_head_t *
timer_call_entry_enqueue_deadline(
	timer_call_t		entry,
	mpqueue_head_t		*queue,
	uint64_t		deadline)
{
	mpqueue_head_t	*old_queue = MPQUEUE(TCE(entry)->queue);

	if (!hw_lock_held((hw_lock_t)&entry->lock))
		panic("_call_entry_enqueue_deadline() "
			"entry %p is not locked\n", entry);
	/* XXX More lock pretense: */
	if (!hw_lock_held((hw_lock_t)&queue->lock_data))
		panic("_call_entry_enqueue_deadline() "
			"queue %p is not locked\n", queue);
	if (old_queue != NULL && old_queue != queue)
		panic("_call_entry_enqueue_deadline() "
			"old_queue %p != queue", old_queue);

	call_entry_enqueue_deadline(TCE(entry), QUEUE(queue), deadline);

	/* For efficiency, track the earliest soft deadline on the queue, so that
	 * fuzzy decisions can be made without lock acquisitions.
	 */
	timer_call_t thead = (timer_call_t)queue_first(&queue->head);

	queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline;

	if (old_queue)
		old_queue->count--;
	queue->count++;

	return (old_queue);
}

#else

static __inline__ mpqueue_head_t *
timer_call_entry_dequeue(
	timer_call_t		entry)
{
	mpqueue_head_t	*old_queue = MPQUEUE(TCE(entry)->queue);

	call_entry_dequeue(TCE(entry));
	old_queue->count--;

	return old_queue;
}

static __inline__ mpqueue_head_t *
timer_call_entry_enqueue_deadline(
	timer_call_t		entry,
	mpqueue_head_t		*queue,
	uint64_t		deadline)
{
	mpqueue_head_t	*old_queue = MPQUEUE(TCE(entry)->queue);

	call_entry_enqueue_deadline(TCE(entry), QUEUE(queue), deadline);

	/* For efficiency, track the earliest soft deadline on the queue,
	 * so that fuzzy decisions can be made without lock acquisitions.
	 */

	timer_call_t thead = (timer_call_t)queue_first(&queue->head);
	queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline;

	if (old_queue)
		old_queue->count--;
	queue->count++;

	return old_queue;
}

#endif

static __inline__ void
timer_call_entry_enqueue_tail(
	timer_call_t		entry,
	mpqueue_head_t		*queue)
{
	call_entry_enqueue_tail(TCE(entry), QUEUE(queue));
	queue->count++;
	return;
}

/*
 * Remove timer entry from its queue but don't change the queue pointer
 * and set the async_dequeue flag. This is locking case 2b.
 */
static __inline__ void
timer_call_entry_dequeue_async(
	timer_call_t		entry)
{
	mpqueue_head_t	*old_queue = MPQUEUE(TCE(entry)->queue);
	if (old_queue) {
		old_queue->count--;
		(void) remque(qe(entry));
		entry->async_dequeue = TRUE;
	}
	return;
}

#if TIMER_ASSERT
unsigned timer_call_enqueue_deadline_unlocked_async1;
unsigned timer_call_enqueue_deadline_unlocked_async2;
#endif
/*
 * Assumes call_entry and queues unlocked, interrupts disabled.
 */
__inline__ mpqueue_head_t *
timer_call_enqueue_deadline_unlocked(
	timer_call_t			call,
	mpqueue_head_t			*queue,
	uint64_t			deadline,
	uint64_t			soft_deadline,
	uint64_t			ttd,
	timer_call_param_t		param1,
	uint32_t			callout_flags)
{
	call_entry_t	entry = TCE(call);
	mpqueue_head_t	*old_queue;

	DBG("timer_call_enqueue_deadline_unlocked(%p,%p,)\n", call, queue);

	simple_lock(&call->lock);

	old_queue = MPQUEUE(entry->queue);

	if (old_queue != NULL) {
		timer_queue_lock_spin(old_queue);
		if (call->async_dequeue) {
			/* collision (1c): timer already dequeued, clear flag */
#if TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
				VM_KERNEL_UNSLIDE_OR_PERM(call),
				call->async_dequeue,
				VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
				0x1c, 0);
			timer_call_enqueue_deadline_unlocked_async1++;
#endif
			call->async_dequeue = FALSE;
			entry->queue = NULL;
		} else if (old_queue != queue) {
			timer_call_entry_dequeue(call);
#if TIMER_ASSERT
			timer_call_enqueue_deadline_unlocked_async2++;
#endif
		}
		if (old_queue == timer_longterm_queue)
			timer_longterm_dequeued_locked(call);
		if (old_queue != queue) {
			timer_queue_unlock(old_queue);
			timer_queue_lock_spin(queue);
		}
	} else {
		timer_queue_lock_spin(queue);
	}

	call->soft_deadline = soft_deadline;
	call->flags = callout_flags;
	TCE(call)->param1 = param1;
	call->ttd = ttd;

	timer_call_entry_enqueue_deadline(call, queue, deadline);
	timer_queue_unlock(queue);
	simple_unlock(&call->lock);

	return (old_queue);
}

#if TIMER_ASSERT
unsigned timer_call_dequeue_unlocked_async1;
unsigned timer_call_dequeue_unlocked_async2;
#endif
mpqueue_head_t *
timer_call_dequeue_unlocked(
	timer_call_t		call)
{
	call_entry_t	entry = TCE(call);
	mpqueue_head_t	*old_queue;

	DBG("timer_call_dequeue_unlocked(%p)\n", call);

	simple_lock(&call->lock);
	old_queue = MPQUEUE(entry->queue);
#if TIMER_ASSERT
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
		VM_KERNEL_UNSLIDE_OR_PERM(call),
		call->async_dequeue,
		VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
		0, 0);
#endif
	if (old_queue != NULL) {
		timer_queue_lock_spin(old_queue);
		if (call->async_dequeue) {
			/* collision (1c): timer already dequeued, clear flag */
#if TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
				VM_KERNEL_UNSLIDE_OR_PERM(call),
				call->async_dequeue,
				VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
				0x1c, 0);
			timer_call_dequeue_unlocked_async1++;
#endif
			call->async_dequeue = FALSE;
			entry->queue = NULL;
		} else {
			timer_call_entry_dequeue(call);
		}
		if (old_queue == timer_longterm_queue)
			timer_longterm_dequeued_locked(call);
		timer_queue_unlock(old_queue);
	}
	simple_unlock(&call->lock);
	return (old_queue);
}


/*
 * Timer call entry locking model
 * ==============================
 *
 * Timer call entries are linked on per-cpu timer queues which are protected
 * by the queue lock and the call entry lock. The locking protocol is:
 *
 *  0) The canonical locking order is timer call entry followed by queue.
 *
 *  1) With only the entry lock held, entry.queue is valid:
 *    1a) NULL: the entry is not queued, or
 *    1b) non-NULL: this queue must be locked before the entry is modified.
 *        After locking the queue, the call.async_dequeue flag must be checked:
 *    1c) TRUE: the entry was removed from the queue by another thread
 *        and we must NULL the entry.queue and reset this flag, or
 *    1d) FALSE: (i.e. queued), the entry can be manipulated.
 *
 *  2) If a queue lock is obtained first, the queue is stable:
 *    2a) If a try-lock of a queued entry succeeds, the call can be operated on
 *        and dequeued.
 *    2b) If a try-lock fails, it indicates that another thread is attempting
 *        to change the entry and move it to a different position in this queue
 *        or to a different queue. The entry can be dequeued but it should not
 *        be operated upon since it is being changed. Furthermore, we don't
 *        null the entry.queue pointer (protected by the entry lock we don't
 *        own). Instead, we set the async_dequeue flag -- see (1c).
 *    2c) Same as 2b but occurring when a longterm timer is matured.
 *  3) A callout's parameters (deadline, flags, parameters, soft deadline &c.)
 *     should be manipulated with the appropriate timer queue lock held,
 *     to prevent queue traversals from observing inconsistent
 *     updates to an in-flight callout.
 */
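/*
 * Condensed sketch of the protocol above (illustrative only; the real code
 * paths are timer_call_dequeue_unlocked() and the simple_lock_try() users
 * such as timer_queue_shutdown() and timer_queue_expire_with_options()):
 *
 *	Entry-side (case 1): take the entry lock, then the queue lock.
 *		simple_lock(&call->lock);
 *		queue = MPQUEUE(TCE(call)->queue);
 *		if (queue != NULL) {
 *			timer_queue_lock_spin(queue);
 *			if (call->async_dequeue) {
 *				call->async_dequeue = FALSE;	(case 1c)
 *				TCE(call)->queue = NULL;
 *			} else {
 *				timer_call_entry_dequeue(call);	(case 1d)
 *			}
 *			timer_queue_unlock(queue);
 *		}
 *		simple_unlock(&call->lock);
 *
 *	Queue-side (case 2): queue lock held, entry lock only try-locked.
 *		if (simple_lock_try(&call->lock)) {
 *			timer_call_entry_dequeue(call);		(case 2a)
 *			simple_unlock(&call->lock);
 *		} else {
 *			timer_call_entry_dequeue_async(call);	(case 2b)
 *		}
 */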

/*
 * Inlines timer_call_entry_dequeue() and timer_call_entry_enqueue_deadline()
 * cast between pointer types (mpqueue_head_t *) and (queue_t) so that
 * we can use the call_entry_dequeue() and call_entry_enqueue_deadline()
 * methods to operate on timer_call structs as if they are call_entry structs.
 * These structures are identical except for their queue head pointer fields.
 *
 * In the debug case, we assert that the timer call locking protocol
 * is being obeyed.
 */

static boolean_t
timer_call_enter_internal(
	timer_call_t		call,
	timer_call_param_t	param1,
	uint64_t		deadline,
	uint64_t		leeway,
	uint32_t		flags,
	boolean_t		ratelimited)
{
	mpqueue_head_t		*queue = NULL;
	mpqueue_head_t		*old_queue;
	spl_t			s;
	uint64_t		slop;
	uint32_t		urgency;
	uint64_t		sdeadline, ttd;

	assert(call->call_entry.func != NULL);
	s = splclock();

	sdeadline = deadline;
	uint64_t ctime = mach_absolute_time();

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		DECR_TIMER_ENTER | DBG_FUNC_START,
		VM_KERNEL_UNSLIDE_OR_PERM(call),
		VM_KERNEL_UNSLIDE_OR_PERM(param1), deadline, flags, 0);

	urgency = (flags & TIMER_CALL_URGENCY_MASK);

	boolean_t slop_ratelimited = FALSE;
	slop = timer_call_slop(deadline, ctime, urgency, current_thread(), &slop_ratelimited);

	if ((flags & TIMER_CALL_LEEWAY) != 0 && leeway > slop)
		slop = leeway;

	if (UINT64_MAX - deadline <= slop) {
		deadline = UINT64_MAX;
	} else {
		deadline += slop;
	}

	if (__improbable(deadline < ctime)) {
		uint64_t delta = (ctime - deadline);

		past_deadline_timers++;
		past_deadline_deltas += delta;
		if (delta > past_deadline_longest)
			past_deadline_longest = delta;
		if (delta < past_deadline_shortest)
			past_deadline_shortest = delta;

		deadline = ctime + past_deadline_timer_adjustment;
		sdeadline = deadline;
	}

	if (ratelimited || slop_ratelimited) {
		flags |= TIMER_CALL_RATELIMITED;
	} else {
		flags &= ~TIMER_CALL_RATELIMITED;
	}

	ttd = sdeadline - ctime;
#if CONFIG_DTRACE
	DTRACE_TMR7(callout__create, timer_call_func_t, TCE(call)->func,
	    timer_call_param_t, TCE(call)->param0, uint32_t, flags,
	    (deadline - sdeadline),
	    (ttd >> 32), (unsigned) (ttd & 0xFFFFFFFF), call);
#endif

	/* Program timer callout parameters under the appropriate per-CPU or
	 * longterm queue lock. The callout may have been previously enqueued
	 * and in-flight on this or another timer queue.
	 */
	if (!ratelimited && !slop_ratelimited) {
		queue = timer_longterm_enqueue_unlocked(call, ctime, deadline, &old_queue, sdeadline, ttd, param1, flags);
	}

	if (queue == NULL) {
		queue = timer_queue_assign(deadline);
		old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline, sdeadline, ttd, param1, flags);
	}

#if TIMER_TRACE
	TCE(call)->entry_time = ctime;
#endif

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		DECR_TIMER_ENTER | DBG_FUNC_END,
		VM_KERNEL_UNSLIDE_OR_PERM(call),
		(old_queue != NULL), deadline, queue->count, 0);

	splx(s);

	return (old_queue != NULL);
}

/*
 * timer_call_*()
 *	return boolean indicating whether the call was previously queued.
 */
boolean_t
timer_call_enter(
	timer_call_t		call,
	uint64_t		deadline,
	uint32_t		flags)
{
	return timer_call_enter_internal(call, NULL, deadline, 0, flags, FALSE);
}

boolean_t
timer_call_enter1(
	timer_call_t		call,
	timer_call_param_t	param1,
	uint64_t		deadline,
	uint32_t		flags)
{
	return timer_call_enter_internal(call, param1, deadline, 0, flags, FALSE);
}

boolean_t
timer_call_enter_with_leeway(
	timer_call_t		call,
	timer_call_param_t	param1,
	uint64_t		deadline,
	uint64_t		leeway,
	uint32_t		flags,
	boolean_t		ratelimited)
{
	return timer_call_enter_internal(call, param1, deadline, leeway, flags, ratelimited);
}
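
/*
 * Usage sketch for the interfaces above (illustrative; "my_func", "my_obj"
 * and the 100 ms interval are hypothetical, not part of this module):
 *
 *	static timer_call_data_t my_timer;
 *
 *	static void
 *	my_func(timer_call_param_t p0, timer_call_param_t p1)
 *	{
 *		... runs in interrupt context when the deadline fires ...
 *	}
 *
 *	void
 *	my_init(void *my_obj)
 *	{
 *		timer_call_setup(&my_timer, my_func, my_obj);
 *	}
 *
 *	void
 *	my_arm(void)
 *	{
 *		uint64_t interval, deadline;
 *
 *		nanoseconds_to_absolutetime(100 * NSEC_PER_MSEC, &interval);
 *		clock_absolutetime_interval_to_deadline(interval, &deadline);
 *		(void) timer_call_enter1(&my_timer, NULL, deadline,
 *		    TIMER_CALL_SYS_NORMAL);
 *	}
 *
 *	void
 *	my_disarm(void)
 *	{
 *		(void) timer_call_cancel(&my_timer);
 *	}
 *
 * timer_call_enter*() and timer_call_cancel() return TRUE if the call was
 * already/still queued at the time of the request.
 */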

boolean_t
timer_call_cancel(
	timer_call_t		call)
{
	mpqueue_head_t		*old_queue;
	spl_t			s;

	s = splclock();

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		DECR_TIMER_CANCEL | DBG_FUNC_START,
		VM_KERNEL_UNSLIDE_OR_PERM(call),
		TCE(call)->deadline, call->soft_deadline, call->flags, 0);

	old_queue = timer_call_dequeue_unlocked(call);

	if (old_queue != NULL) {
		timer_queue_lock_spin(old_queue);
		if (!queue_empty(&old_queue->head)) {
			timer_queue_cancel(old_queue, TCE(call)->deadline, CE(queue_first(&old_queue->head))->deadline);
			timer_call_t thead = (timer_call_t)queue_first(&old_queue->head);
			old_queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline;
		}
		else {
			timer_queue_cancel(old_queue, TCE(call)->deadline, UINT64_MAX);
			old_queue->earliest_soft_deadline = UINT64_MAX;
		}
		timer_queue_unlock(old_queue);
	}
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		DECR_TIMER_CANCEL | DBG_FUNC_END,
		VM_KERNEL_UNSLIDE_OR_PERM(call),
		VM_KERNEL_UNSLIDE_OR_PERM(old_queue),
		TCE(call)->deadline - mach_absolute_time(),
		TCE(call)->deadline - TCE(call)->entry_time, 0);
	splx(s);

#if CONFIG_DTRACE
	DTRACE_TMR6(callout__cancel, timer_call_func_t, TCE(call)->func,
	    timer_call_param_t, TCE(call)->param0, uint32_t, call->flags, 0,
	    (call->ttd >> 32), (unsigned) (call->ttd & 0xFFFFFFFF));
#endif

	return (old_queue != NULL);
}

static uint32_t	timer_queue_shutdown_lock_skips;
static uint32_t timer_queue_shutdown_discarded;

void
timer_queue_shutdown(
	mpqueue_head_t		*queue)
{
	timer_call_t		call;
	mpqueue_head_t		*new_queue;
	spl_t			s;


	DBG("timer_queue_shutdown(%p)\n", queue);

	s = splclock();

	/* Note comma operator in while expression re-locking each iteration */
	while ((void)timer_queue_lock_spin(queue), !queue_empty(&queue->head)) {
		call = TIMER_CALL(queue_first(&queue->head));

		if (!simple_lock_try(&call->lock)) {
			/*
			 * case (2b) lock order inversion, dequeue and skip
			 * Don't change the call_entry queue back-pointer
			 * but set the async_dequeue field.
			 */
			timer_queue_shutdown_lock_skips++;
			timer_call_entry_dequeue_async(call);
#if TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
				VM_KERNEL_UNSLIDE_OR_PERM(call),
				call->async_dequeue,
				VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
				0x2b, 0);
#endif
			timer_queue_unlock(queue);
			continue;
		}

		boolean_t call_local = ((call->flags & TIMER_CALL_LOCAL) != 0);

		/* remove entry from old queue */
		timer_call_entry_dequeue(call);
		timer_queue_unlock(queue);

		if (call_local == FALSE) {
			/* and queue it on new, discarding LOCAL timers */
			new_queue = timer_queue_assign(TCE(call)->deadline);
			timer_queue_lock_spin(new_queue);
			timer_call_entry_enqueue_deadline(
				call, new_queue, TCE(call)->deadline);
			timer_queue_unlock(new_queue);
		} else {
			timer_queue_shutdown_discarded++;
		}

		/* The only lingering LOCAL timer should be this thread's
		 * quantum expiration timer.
		 */
		assert((call_local == FALSE) ||
		    (TCE(call)->func == thread_quantum_expire));

		simple_unlock(&call->lock);
	}

	timer_queue_unlock(queue);
	splx(s);
}

static uint32_t	timer_queue_expire_lock_skips;
uint64_t
timer_queue_expire_with_options(
	mpqueue_head_t		*queue,
	uint64_t		deadline,
	boolean_t		rescan)
{
	timer_call_t	call = NULL;
	uint32_t tc_iterations = 0;
	DBG("timer_queue_expire(%p,)\n", queue);

	uint64_t cur_deadline = deadline;
	timer_queue_lock_spin(queue);

	while (!queue_empty(&queue->head)) {
		/* Upon processing one or more timer calls, refresh the
		 * deadline to account for time elapsed in the callout
		 */
		if (++tc_iterations > 1)
			cur_deadline = mach_absolute_time();

		if (call == NULL)
			call = TIMER_CALL(queue_first(&queue->head));

		if (call->soft_deadline <= cur_deadline) {
			timer_call_func_t		func;
			timer_call_param_t		param0, param1;

			TCOAL_DEBUG(0xDDDD0000, queue->earliest_soft_deadline, call->soft_deadline, 0, 0, 0);
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_EXPIRE | DBG_FUNC_NONE,
				VM_KERNEL_UNSLIDE_OR_PERM(call),
				call->soft_deadline,
				TCE(call)->deadline,
				TCE(call)->entry_time, 0);

			if ((call->flags & TIMER_CALL_RATELIMITED) &&
			    (TCE(call)->deadline > cur_deadline)) {
				if (rescan == FALSE)
					break;
			}

			if (!simple_lock_try(&call->lock)) {
				/* case (2b) lock inversion, dequeue and skip */
				timer_queue_expire_lock_skips++;
				timer_call_entry_dequeue_async(call);
				call = NULL;
				continue;
			}

			timer_call_entry_dequeue(call);

			func = TCE(call)->func;
			param0 = TCE(call)->param0;
			param1 = TCE(call)->param1;

			simple_unlock(&call->lock);
			timer_queue_unlock(queue);

			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_CALLOUT | DBG_FUNC_START,
				VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
				VM_KERNEL_UNSLIDE_OR_PERM(param0),
				VM_KERNEL_UNSLIDE_OR_PERM(param1),
				0);

#if CONFIG_DTRACE
			DTRACE_TMR7(callout__start, timer_call_func_t, func,
			    timer_call_param_t, param0, unsigned, call->flags,
			    0, (call->ttd >> 32),
			    (unsigned) (call->ttd & 0xFFFFFFFF), call);
#endif
			/* Maintain time-to-deadline in per-processor data
			 * structure for thread wakeup deadline statistics.
			 */
			uint64_t *ttdp = &(PROCESSOR_DATA(current_processor(), timer_call_ttd));
			*ttdp = call->ttd;
			(*func)(param0, param1);
			*ttdp = 0;
#if CONFIG_DTRACE
			DTRACE_TMR4(callout__end, timer_call_func_t, func,
			    param0, param1, call);
#endif

			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_CALLOUT | DBG_FUNC_END,
				VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
				VM_KERNEL_UNSLIDE_OR_PERM(param0),
				VM_KERNEL_UNSLIDE_OR_PERM(param1),
				0);
			call = NULL;
			timer_queue_lock_spin(queue);
		} else {
			if (__probable(rescan == FALSE)) {
				break;
			} else {
				int64_t skew = TCE(call)->deadline - call->soft_deadline;
				assert(TCE(call)->deadline >= call->soft_deadline);

				/* DRK: On a latency quality-of-service level change,
				 * re-sort potentially rate-limited timers. The platform
				 * layer determines which timers require
				 * this. In the absence of the per-callout
				 * synchronization requirement, a global resort could
				 * be more efficient. The re-sort effectively
				 * annuls all timer adjustments, i.e. the "soft
				 * deadline" is the sort key.
				 */

				if (timer_resort_threshold(skew)) {
					if (__probable(simple_lock_try(&call->lock))) {
						timer_call_entry_dequeue(call);
						timer_call_entry_enqueue_deadline(call, queue, call->soft_deadline);
						simple_unlock(&call->lock);
						call = NULL;
					}
				}
				if (call) {
					call = TIMER_CALL(queue_next(qe(call)));
					if (queue_end(&queue->head, qe(call)))
						break;
				}
			}
		}
	}

	if (!queue_empty(&queue->head)) {
		call = TIMER_CALL(queue_first(&queue->head));
		cur_deadline = TCE(call)->deadline;
		queue->earliest_soft_deadline = (call->flags & TIMER_CALL_RATELIMITED) ? TCE(call)->deadline: call->soft_deadline;
	} else {
		queue->earliest_soft_deadline = cur_deadline = UINT64_MAX;
	}

	timer_queue_unlock(queue);

	return (cur_deadline);
}

uint64_t
timer_queue_expire(
	mpqueue_head_t		*queue,
	uint64_t		deadline)
{
	return timer_queue_expire_with_options(queue, deadline, FALSE);
}
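
/*
 * Note on expected use (a sketch of the caller's side, which lives in the
 * platform layer rather than in this file): the timer-interrupt path
 * typically expires the local queue with the current time and uses the
 * returned value -- the earliest remaining hard deadline, or UINT64_MAX if
 * the queue drained -- to decide when the hardware timer should next fire:
 *
 *	uint64_t next = timer_queue_expire(queue, mach_absolute_time());
 *	if (next != UINT64_MAX)
 *		... reprogram the hardware decrementer for "next" ...
 */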

extern int serverperfmode;
static uint32_t	timer_queue_migrate_lock_skips;
/*
 * timer_queue_migrate() is called by timer_queue_migrate_cpu()
 * to move timer requests from the local processor (queue_from)
 * to a target processor's queue (queue_to). It returns the number of
 * timers moved, zero if the local queue's first deadline is sooner than
 * the target's, or a negative code if migration is not attempted.
 */
int
timer_queue_migrate(mpqueue_head_t *queue_from, mpqueue_head_t *queue_to)
{
	timer_call_t	call;
	timer_call_t	head_to;
	int		timers_migrated = 0;

	DBG("timer_queue_migrate(%p,%p)\n", queue_from, queue_to);

	assert(!ml_get_interrupts_enabled());
	assert(queue_from != queue_to);

	if (serverperfmode) {
		/*
		 * if we're running a high end server
		 * avoid migrations... they add latency
		 * and don't save us power under typical
		 * server workloads
		 */
		return -4;
	}

	/*
	 * Take both local (from) and target (to) timer queue locks while
	 * moving the timers from the local queue to the target processor.
	 * We assume that the target is always the boot processor.
	 * But only move if all of the following is true:
	 *  - the target queue is non-empty
	 *  - the local queue is non-empty
	 *  - the local queue's first deadline is later than the target's
	 *  - the local queue contains no non-migrateable "local" call
	 * so that we need not have the target resync.
	 */

	timer_queue_lock_spin(queue_to);

	head_to = TIMER_CALL(queue_first(&queue_to->head));
	if (queue_empty(&queue_to->head)) {
		timers_migrated = -1;
		goto abort1;
	}

	timer_queue_lock_spin(queue_from);

	if (queue_empty(&queue_from->head)) {
		timers_migrated = -2;
		goto abort2;
	}

	call = TIMER_CALL(queue_first(&queue_from->head));
	if (TCE(call)->deadline < TCE(head_to)->deadline) {
		timers_migrated = 0;
		goto abort2;
	}

	/* perform scan for non-migratable timers */
	do {
		if (call->flags & TIMER_CALL_LOCAL) {
			timers_migrated = -3;
			goto abort2;
		}
		call = TIMER_CALL(queue_next(qe(call)));
	} while (!queue_end(&queue_from->head, qe(call)));

	/* migration loop itself -- both queues are locked */
	while (!queue_empty(&queue_from->head)) {
		call = TIMER_CALL(queue_first(&queue_from->head));
		if (!simple_lock_try(&call->lock)) {
			/* case (2b) lock order inversion, dequeue only */
#ifdef TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
				VM_KERNEL_UNSLIDE_OR_PERM(call),
				VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
				VM_KERNEL_UNSLIDE_OR_PERM(call->lock.interlock.lock_data),
				0x2b, 0);
#endif
			timer_queue_migrate_lock_skips++;
			timer_call_entry_dequeue_async(call);
			continue;
		}
		timer_call_entry_dequeue(call);
		timer_call_entry_enqueue_deadline(
			call, queue_to, TCE(call)->deadline);
		timers_migrated++;
		simple_unlock(&call->lock);
	}
	queue_from->earliest_soft_deadline = UINT64_MAX;
abort2:
	timer_queue_unlock(queue_from);
abort1:
	timer_queue_unlock(queue_to);

	return timers_migrated;
}

void
timer_queue_trace_cpu(int ncpu)
{
	timer_call_nosync_cpu(
		ncpu,
		(void(*)())timer_queue_trace,
		(void*) timer_queue_cpu(ncpu));
}

void
timer_queue_trace(
	mpqueue_head_t			*queue)
{
	timer_call_t	call;
	spl_t		s;

	if (!kdebug_enable)
		return;

	s = splclock();
	timer_queue_lock_spin(queue);

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		DECR_TIMER_QUEUE | DBG_FUNC_START,
		queue->count, mach_absolute_time(), 0, 0, 0);

	if (!queue_empty(&queue->head)) {
		call = TIMER_CALL(queue_first(&queue->head));
		do {
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_QUEUE | DBG_FUNC_NONE,
				call->soft_deadline,
				TCE(call)->deadline,
				TCE(call)->entry_time,
				VM_KERNEL_UNSLIDE(TCE(call)->func),
				0);
			call = TIMER_CALL(queue_next(qe(call)));
		} while (!queue_end(&queue->head, qe(call)));
	}

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		DECR_TIMER_QUEUE | DBG_FUNC_END,
		queue->count, mach_absolute_time(), 0, 0, 0);

	timer_queue_unlock(queue);
	splx(s);
}

void
timer_longterm_dequeued_locked(timer_call_t call)
{
	timer_longterm_t	*tlp = &timer_longterm;

	tlp->dequeues++;
	if (call == tlp->threshold.call)
		tlp->threshold.call = NULL;
}

/*
 * Place a timer call in the longterm list
 * and adjust the next timer callout deadline if the new timer is first.
 */
mpqueue_head_t *
timer_longterm_enqueue_unlocked(timer_call_t	call,
				uint64_t	now,
				uint64_t	deadline,
				mpqueue_head_t	**old_queue,
				uint64_t	soft_deadline,
				uint64_t	ttd,
				timer_call_param_t	param1,
				uint32_t	callout_flags)
{
	timer_longterm_t	*tlp = &timer_longterm;
	boolean_t		update_required = FALSE;
	uint64_t		longterm_threshold;

	longterm_threshold = now + tlp->threshold.interval;

	/*
	 * Return NULL without doing anything if:
	 *  - this timer is local, or
	 *  - the longterm mechanism is disabled, or
	 *  - this deadline is too short.
	 */
	if ((callout_flags & TIMER_CALL_LOCAL) != 0 ||
	    (tlp->threshold.interval == TIMER_LONGTERM_NONE) ||
	    (deadline <= longterm_threshold))
		return NULL;

	/*
	 * Remove timer from its current queue, if any.
	 */
	*old_queue = timer_call_dequeue_unlocked(call);

	/*
	 * Lock the longterm queue, queue timer and determine
	 * whether an update is necessary.
	 */
	assert(!ml_get_interrupts_enabled());
	simple_lock(&call->lock);
	timer_queue_lock_spin(timer_longterm_queue);
	TCE(call)->deadline = deadline;
	TCE(call)->param1 = param1;
	call->ttd = ttd;
	call->soft_deadline = soft_deadline;
	call->flags = callout_flags;
	timer_call_entry_enqueue_tail(call, timer_longterm_queue);

	tlp->enqueues++;

	/*
	 * We'll need to update the currently set threshold timer
	 * if the new deadline is sooner and no sooner update is in flight.
	 */
	if (deadline < tlp->threshold.deadline &&
	    deadline < tlp->threshold.preempted) {
		tlp->threshold.preempted = deadline;
		tlp->threshold.call = call;
		update_required = TRUE;
	}
	timer_queue_unlock(timer_longterm_queue);
	simple_unlock(&call->lock);

	if (update_required) {
		/*
		 * Note: this call expects that calling the master cpu
		 * alone does not involve locking the topo lock.
		 */
		timer_call_nosync_cpu(
			master_cpu,
			(void (*)(void *)) timer_longterm_update,
			(void *)tlp);
	}

	return timer_longterm_queue;
}

/*
 * Scan for timers below the longterm threshold.
 * Move these to the local timer queue (of the boot processor on which the
 * calling thread is running).
 * Both the local (boot) queue and the longterm queue are locked.
 * The scan is similar to the timer migrate sequence but is performed by
 * successively examining each timer on the longterm queue:
 *  - if within the short-term threshold
 *    - enter on the local queue (unless being deleted),
 *  - otherwise:
 *    - if sooner, deadline becomes the next threshold deadline.
 */
void
timer_longterm_scan(timer_longterm_t	*tlp,
		    uint64_t		now)
{
	queue_entry_t	qe;
	timer_call_t	call;
	uint64_t	threshold;
	uint64_t	deadline;
	mpqueue_head_t	*timer_master_queue;

	assert(!ml_get_interrupts_enabled());
	assert(cpu_number() == master_cpu);

	if (tlp->threshold.interval != TIMER_LONGTERM_NONE)
		threshold = now + tlp->threshold.interval;
	else
		threshold = TIMER_LONGTERM_NONE;

	tlp->threshold.deadline = TIMER_LONGTERM_NONE;
	tlp->threshold.call = NULL;

	if (queue_empty(&timer_longterm_queue->head))
		return;

	timer_master_queue = timer_queue_cpu(master_cpu);
	timer_queue_lock_spin(timer_master_queue);

	qe = queue_first(&timer_longterm_queue->head);
	while (!queue_end(&timer_longterm_queue->head, qe)) {
		call = TIMER_CALL(qe);
		deadline = call->soft_deadline;
		qe = queue_next(qe);
		if (!simple_lock_try(&call->lock)) {
			/* case (2c) lock order inversion, dequeue only */
#ifdef TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
				VM_KERNEL_UNSLIDE_OR_PERM(call),
				VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue),
				VM_KERNEL_UNSLIDE_OR_PERM(call->lock.interlock.lock_data),
				0x2c, 0);
#endif
			timer_call_entry_dequeue_async(call);
			continue;
		}
		if (deadline < threshold) {
			/*
			 * This timer needs moving (escalating)
			 * to the local (boot) processor's queue.
			 */
#ifdef TIMER_ASSERT
			if (deadline < now)
				TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
					DECR_TIMER_OVERDUE | DBG_FUNC_NONE,
					VM_KERNEL_UNSLIDE_OR_PERM(call),
					deadline,
					now,
					threshold,
					0);
#endif
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				DECR_TIMER_ESCALATE | DBG_FUNC_NONE,
				VM_KERNEL_UNSLIDE_OR_PERM(call),
				TCE(call)->deadline,
				TCE(call)->entry_time,
				VM_KERNEL_UNSLIDE(TCE(call)->func),
				0);
			tlp->escalates++;
			timer_call_entry_dequeue(call);
			timer_call_entry_enqueue_deadline(
				call, timer_master_queue, TCE(call)->deadline);
			/*
			 * A side-effect of the following call is to update
			 * the actual hardware deadline if required.
			 */
			(void) timer_queue_assign(deadline);
		} else {
			if (deadline < tlp->threshold.deadline) {
				tlp->threshold.deadline = deadline;
				tlp->threshold.call = call;
			}
		}
		simple_unlock(&call->lock);
	}

	timer_queue_unlock(timer_master_queue);
}

void
timer_longterm_callout(timer_call_param_t p0, __unused timer_call_param_t p1)
{
	timer_longterm_t	*tlp = (timer_longterm_t *) p0;

	timer_longterm_update(tlp);
}

void
timer_longterm_update_locked(timer_longterm_t *tlp)
{
	uint64_t	latency;

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		DECR_TIMER_UPDATE | DBG_FUNC_START,
		VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
		tlp->threshold.deadline,
		tlp->threshold.preempted,
		tlp->queue.count, 0);

	tlp->scan_time = mach_absolute_time();
	if (tlp->threshold.preempted != TIMER_LONGTERM_NONE) {
		tlp->threshold.preempts++;
		tlp->threshold.deadline = tlp->threshold.preempted;
		tlp->threshold.preempted = TIMER_LONGTERM_NONE;
		/*
		 * Note: in the unlikely event that a pre-empted timer has
		 * itself been cancelled, we'll simply re-scan later at the
		 * time of the preempted/cancelled timer.
		 */
	} else {
		tlp->threshold.scans++;

		/*
		 * Maintain a moving average of our wakeup latency.
		 * Clamp latency to 0 and ignore above threshold interval.
		 */
		if (tlp->scan_time > tlp->threshold.deadline_set)
			latency = tlp->scan_time - tlp->threshold.deadline_set;
		else
			latency = 0;
		if (latency < tlp->threshold.interval) {
			tlp->threshold.latency_min =
				MIN(tlp->threshold.latency_min, latency);
			tlp->threshold.latency_max =
				MAX(tlp->threshold.latency_max, latency);
			tlp->threshold.latency =
				(tlp->threshold.latency*99 + latency) / 100;
		}

		timer_longterm_scan(tlp, tlp->scan_time);
	}

	tlp->threshold.deadline_set = tlp->threshold.deadline;
	/* The next deadline timer to be set is adjusted */
	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
		tlp->threshold.deadline_set -= tlp->threshold.margin;
		tlp->threshold.deadline_set -= tlp->threshold.latency;
	}

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		DECR_TIMER_UPDATE | DBG_FUNC_END,
		VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
		tlp->threshold.deadline,
		tlp->threshold.scans,
		tlp->queue.count, 0);
}

void
timer_longterm_update(timer_longterm_t *tlp)
{
	spl_t	s = splclock();

	timer_queue_lock_spin(timer_longterm_queue);

	if (cpu_number() != master_cpu)
		panic("timer_longterm_update() on non-boot cpu");

	timer_longterm_update_locked(tlp);

	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE)
		timer_call_enter(
			&tlp->threshold.timer,
			tlp->threshold.deadline_set,
			TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);

	timer_queue_unlock(timer_longterm_queue);
	splx(s);
}

void
timer_longterm_init(void)
{
	uint32_t		longterm;
	timer_longterm_t	*tlp = &timer_longterm;

	DBG("timer_longterm_init() tlp: %p, queue: %p\n", tlp, &tlp->queue);

	/*
	 * Set the longterm timer threshold. Defaults to TIMER_LONGTERM_THRESHOLD
	 * or TIMER_LONGTERM_NONE (disabled) for server;
	 * overridden by the "longterm" boot-arg.
	 */
	tlp->threshold.interval = serverperfmode ? TIMER_LONGTERM_NONE
						 : TIMER_LONGTERM_THRESHOLD;
	if (PE_parse_boot_argn("longterm", &longterm, sizeof (longterm))) {
		tlp->threshold.interval = (longterm == 0) ?
						TIMER_LONGTERM_NONE :
						longterm * NSEC_PER_MSEC;
	}
	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
		printf("Longterm timer threshold: %llu ms\n",
			tlp->threshold.interval / NSEC_PER_MSEC);
		kprintf("Longterm timer threshold: %llu ms\n",
			tlp->threshold.interval / NSEC_PER_MSEC);
		nanoseconds_to_absolutetime(tlp->threshold.interval,
					    &tlp->threshold.interval);
		tlp->threshold.margin = tlp->threshold.interval / 10;
		tlp->threshold.latency_min = EndOfAllTime;
		tlp->threshold.latency_max = 0;
	}

	tlp->threshold.preempted = TIMER_LONGTERM_NONE;
	tlp->threshold.deadline = TIMER_LONGTERM_NONE;

	lck_attr_setdefault(&timer_longterm_lck_attr);
	lck_grp_attr_setdefault(&timer_longterm_lck_grp_attr);
	lck_grp_init(&timer_longterm_lck_grp,
		     "timer_longterm", &timer_longterm_lck_grp_attr);
	mpqueue_init(&tlp->queue,
		     &timer_longterm_lck_grp, &timer_longterm_lck_attr);

	timer_call_setup(&tlp->threshold.timer,
			 timer_longterm_callout, (timer_call_param_t) tlp);

	timer_longterm_queue = &tlp->queue;
}
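
/*
 * Boot-arg examples for the parsing above (values are milliseconds):
 *	longterm=0	disable the longterm queue entirely
 *	longterm=5000	park timers more than ~5 s out on the longterm queue
 * With no boot-arg, the threshold defaults to TIMER_LONGTERM_THRESHOLD
 * (1 s on x86_64) or is disabled when serverperfmode is set.
 */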

enum {
	THRESHOLD, QCOUNT,
	ENQUEUES, DEQUEUES, ESCALATES, SCANS, PREEMPTS,
	LATENCY, LATENCY_MIN, LATENCY_MAX
};
uint64_t
timer_sysctl_get(int oid)
{
	timer_longterm_t	*tlp = &timer_longterm;

	switch (oid) {
	case THRESHOLD:
		return (tlp->threshold.interval == TIMER_LONGTERM_NONE) ?
			0 : tlp->threshold.interval / NSEC_PER_MSEC;
	case QCOUNT:
		return tlp->queue.count;
	case ENQUEUES:
		return tlp->enqueues;
	case DEQUEUES:
		return tlp->dequeues;
	case ESCALATES:
		return tlp->escalates;
	case SCANS:
		return tlp->threshold.scans;
	case PREEMPTS:
		return tlp->threshold.preempts;
	case LATENCY:
		return tlp->threshold.latency;
	case LATENCY_MIN:
		return tlp->threshold.latency_min;
	case LATENCY_MAX:
		return tlp->threshold.latency_max;
	default:
		return 0;
	}
}

/*
 * timer_master_scan() is the inverse of timer_longterm_scan()
 * since it un-escalates timers to the longterm queue.
 */
static void
timer_master_scan(timer_longterm_t	*tlp,
		  uint64_t		now)
{
	queue_entry_t	qe;
	timer_call_t	call;
	uint64_t	threshold;
	uint64_t	deadline;
	mpqueue_head_t	*timer_master_queue;

	if (tlp->threshold.interval != TIMER_LONGTERM_NONE)
		threshold = now + tlp->threshold.interval;
	else
		threshold = TIMER_LONGTERM_NONE;

	timer_master_queue = timer_queue_cpu(master_cpu);
	timer_queue_lock_spin(timer_master_queue);

	qe = queue_first(&timer_master_queue->head);
	while (!queue_end(&timer_master_queue->head, qe)) {
		call = TIMER_CALL(qe);
		deadline = TCE(call)->deadline;
		qe = queue_next(qe);
		if ((call->flags & TIMER_CALL_LOCAL) != 0)
			continue;
		if (!simple_lock_try(&call->lock)) {
			/* case (2c) lock order inversion, dequeue only */
			timer_call_entry_dequeue_async(call);
			continue;
		}
		if (deadline > threshold) {
			/* move from master to longterm */
			timer_call_entry_dequeue(call);
			timer_call_entry_enqueue_tail(call, timer_longterm_queue);
			if (deadline < tlp->threshold.deadline) {
				tlp->threshold.deadline = deadline;
				tlp->threshold.call = call;
			}
		}
		simple_unlock(&call->lock);
	}
	timer_queue_unlock(timer_master_queue);
}

static void
timer_sysctl_set_threshold(uint64_t value)
{
	timer_longterm_t	*tlp = &timer_longterm;
	spl_t			s = splclock();
	boolean_t		threshold_increase;

	timer_queue_lock_spin(timer_longterm_queue);

	timer_call_cancel(&tlp->threshold.timer);

	/*
	 * Set the new threshold and note whether it's increasing.
	 */
	if (value == 0) {
		tlp->threshold.interval = TIMER_LONGTERM_NONE;
		threshold_increase = TRUE;
		timer_call_cancel(&tlp->threshold.timer);
	} else {
		uint64_t	old_interval = tlp->threshold.interval;
		tlp->threshold.interval = value * NSEC_PER_MSEC;
		nanoseconds_to_absolutetime(tlp->threshold.interval,
					    &tlp->threshold.interval);
		tlp->threshold.margin = tlp->threshold.interval / 10;
		if (old_interval == TIMER_LONGTERM_NONE)
			threshold_increase = FALSE;
		else
			threshold_increase = (tlp->threshold.interval > old_interval);
	}

	if (threshold_increase /* or removal */) {
		/* Escalate timers from the longterm queue */
		timer_longterm_scan(tlp, mach_absolute_time());
	} else /* decrease or addition */ {
		/*
		 * We scan the local/master queue for timers now longterm.
		 * To be strictly correct, we should scan all processor queues
		 * but timer migration results in most timers gravitating to the
		 * master processor in any case.
		 */
		timer_master_scan(tlp, mach_absolute_time());
	}

	/* Set new timer accordingly */
	tlp->threshold.deadline_set = tlp->threshold.deadline;
	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
		tlp->threshold.deadline_set -= tlp->threshold.margin;
		tlp->threshold.deadline_set -= tlp->threshold.latency;
		timer_call_enter(
			&tlp->threshold.timer,
			tlp->threshold.deadline_set,
			TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
	}

	/* Reset stats */
	tlp->enqueues = 0;
	tlp->dequeues = 0;
	tlp->escalates = 0;
	tlp->threshold.scans = 0;
	tlp->threshold.preempts = 0;
	tlp->threshold.latency = 0;
	tlp->threshold.latency_min = EndOfAllTime;
	tlp->threshold.latency_max = 0;

	timer_queue_unlock(timer_longterm_queue);
	splx(s);
}

int
timer_sysctl_set(int oid, uint64_t value)
{
	switch (oid) {
	case THRESHOLD:
		timer_call_cpu(
			master_cpu,
			(void (*)(void *)) timer_sysctl_set_threshold,
			(void *) value);
		return KERN_SUCCESS;
	default:
		return KERN_INVALID_ARGUMENT;
	}
}
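
/*
 * Sketch of the control interface above (oids are the enum values preceding
 * timer_sysctl_get(); THRESHOLD is in milliseconds, 0 meaning disabled, and
 * writes are forwarded to the master cpu; the remaining oids are read-only
 * statistics):
 *
 *	timer_sysctl_set(THRESHOLD, 2000);	set a 2 s longterm threshold
 *	timer_sysctl_get(QCOUNT);		timers now on the longterm queue
 */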


/* Select timer coalescing window based on per-task quality-of-service hints */
static boolean_t tcoal_qos_adjust(thread_t t, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited) {
	uint32_t latency_qos;
	boolean_t adjusted = FALSE;
	task_t ctask = t->task;

	if (ctask) {
		latency_qos = proc_get_effective_thread_policy(t, TASK_POLICY_LATENCY_QOS);

		assert(latency_qos <= NUM_LATENCY_QOS_TIERS);

		if (latency_qos) {
			*tshift = tcoal_prio_params.latency_qos_scale[latency_qos - 1];
			*tmax_abstime = tcoal_prio_params.latency_qos_abstime_max[latency_qos - 1];
			*pratelimited = tcoal_prio_params.latency_tier_rate_limited[latency_qos - 1];
			adjusted = TRUE;
		}
	}
	return adjusted;
}


/* Adjust timer deadlines based on priority of the thread and the
 * urgency value provided at timeout establishment. With this mechanism,
 * timers are no longer necessarily sorted in order of soft deadline
 * on a given timer queue, i.e. they may be differentially skewed.
 * In the current scheme, this could lead to fewer pending timers
 * processed than is technically possible when the HW deadline arrives.
 */
static void
timer_compute_leeway(thread_t cthread, int32_t urgency, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited) {
	int16_t tpri = cthread->sched_pri;
	if ((urgency & TIMER_CALL_USER_MASK) != 0) {
		if (tpri >= BASEPRI_RTQUEUES ||
		    urgency == TIMER_CALL_USER_CRITICAL) {
			*tshift = tcoal_prio_params.timer_coalesce_rt_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_rt_abstime_max;
			TCOAL_PRIO_STAT(rt_tcl);
		} else if (proc_get_effective_thread_policy(cthread, TASK_POLICY_DARWIN_BG) ||
		    (urgency == TIMER_CALL_USER_BACKGROUND)) {
			/* Determine if timer should be subjected to a lower QoS */
			if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
				if (*tmax_abstime > tcoal_prio_params.timer_coalesce_bg_abstime_max) {
					return;
				} else {
					*pratelimited = FALSE;
				}
			}
			*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
			TCOAL_PRIO_STAT(bg_tcl);
		} else if (tpri >= MINPRI_KERNEL) {
			*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
			TCOAL_PRIO_STAT(kt_tcl);
		} else if (cthread->sched_mode == TH_MODE_FIXED) {
			*tshift = tcoal_prio_params.timer_coalesce_fp_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_fp_abstime_max;
			TCOAL_PRIO_STAT(fp_tcl);
		} else if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
			TCOAL_PRIO_STAT(qos_tcl);
		} else if (cthread->sched_mode == TH_MODE_TIMESHARE) {
			*tshift = tcoal_prio_params.timer_coalesce_ts_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_ts_abstime_max;
			TCOAL_PRIO_STAT(ts_tcl);
		} else {
			TCOAL_PRIO_STAT(nc_tcl);
		}
	} else if (urgency == TIMER_CALL_SYS_BACKGROUND) {
		*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
		*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
		TCOAL_PRIO_STAT(bg_tcl);
	} else {
		*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
		*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
		TCOAL_PRIO_STAT(kt_tcl);
	}
}


int timer_user_idle_level;

uint64_t
timer_call_slop(uint64_t deadline, uint64_t now, uint32_t flags, thread_t cthread, boolean_t *pratelimited)
{
	int32_t tcs_shift = 0;
	uint64_t tcs_max_abstime = 0;
	uint64_t adjval;
	uint32_t urgency = (flags & TIMER_CALL_URGENCY_MASK);

	if (mach_timer_coalescing_enabled &&
	    (deadline > now) && (urgency != TIMER_CALL_SYS_CRITICAL)) {
		timer_compute_leeway(cthread, urgency, &tcs_shift, &tcs_max_abstime, pratelimited);

		if (tcs_shift >= 0)
			adjval = MIN((deadline - now) >> tcs_shift, tcs_max_abstime);
		else
			adjval = MIN((deadline - now) << (-tcs_shift), tcs_max_abstime);
		/* Apply adjustments derived from "user idle level" heuristic */
		adjval += (adjval * timer_user_idle_level) >> 7;
		return adjval;
	} else {
		return 0;
	}
}
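
/*
 * Worked example of the computation above (illustrative numbers only; the
 * shift and cap actually come from the platform's coalescing parameters):
 * for a timeshare thread with timer_coalesce_ts_shift == 3 and a deadline
 * 80 ms away, the base window is (80 ms >> 3) == 10 ms, clamped to
 * timer_coalesce_ts_abstime_max. With timer_user_idle_level == 64, the idle
 * heuristic adds (10 ms * 64) >> 7 == 5 ms, so 15 ms of slop is added to
 * the hard deadline while the soft deadline stays at the requested time.
 */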

int
timer_get_user_idle_level(void) {
	return timer_user_idle_level;
}

kern_return_t timer_set_user_idle_level(int ilevel) {
	boolean_t do_reeval = FALSE;

	if ((ilevel < 0) || (ilevel > 128))
		return KERN_INVALID_ARGUMENT;

	if (ilevel < timer_user_idle_level) {
		do_reeval = TRUE;
	}

	timer_user_idle_level = ilevel;

	if (do_reeval)
		ml_timer_evaluate();

	return KERN_SUCCESS;
}