1 /*
2 * Copyright (c) 1993-1995, 1999-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <mach/mach_types.h>
30 #include <mach/thread_act.h>
31
32 #include <kern/kern_types.h>
33 #include <kern/zalloc.h>
34 #include <kern/sched_prim.h>
35 #include <kern/clock.h>
36 #include <kern/task.h>
37 #include <kern/thread.h>
38 #include <kern/waitq.h>
39 #include <kern/ledger.h>
40 #include <kern/policy_internal.h>
41
42 #include <vm/vm_pageout.h>
43
44 #include <kern/thread_call.h>
45 #include <kern/call_entry.h>
46 #include <kern/timer_call.h>
47
48 #include <libkern/OSAtomic.h>
49 #include <kern/timer_queue.h>
50
51 #include <sys/kdebug.h>
52 #if CONFIG_DTRACE
53 #include <mach/sdt.h>
54 #endif
55 #include <machine/machine_routines.h>
56
57 static zone_t thread_call_zone;
58 static struct waitq daemon_waitq;
59
60 typedef enum {
61 TCF_ABSOLUTE = 0,
62 TCF_CONTINUOUS = 1,
63 TCF_COUNT = 2,
64 } thread_call_flavor_t;
65
66 typedef enum {
67 TCG_NONE = 0x0,
68 TCG_PARALLEL = 0x1,
69 TCG_DEALLOC_ACTIVE = 0x2,
70 } thread_call_group_flags_t;
71
72 static struct thread_call_group {
73 const char * tcg_name;
74
75 queue_head_t pending_queue;
76 uint32_t pending_count;
77
78 queue_head_t delayed_queues[TCF_COUNT];
79 timer_call_data_t delayed_timers[TCF_COUNT];
80
81 timer_call_data_t dealloc_timer;
82
83 struct waitq idle_waitq;
84 uint32_t idle_count, active_count, blocked_count;
85
86 uint32_t tcg_thread_pri;
87 uint32_t target_thread_count;
88 uint64_t idle_timestamp;
89
90 thread_call_group_flags_t flags;
91 } thread_call_groups[THREAD_CALL_INDEX_MAX] = {
92 [THREAD_CALL_INDEX_HIGH] = {
93 .tcg_name = "high",
94 .tcg_thread_pri = BASEPRI_PREEMPT_HIGH,
95 .target_thread_count = 4,
96 .flags = TCG_NONE,
97 },
98 [THREAD_CALL_INDEX_KERNEL] = {
99 .tcg_name = "kernel",
100 .tcg_thread_pri = BASEPRI_KERNEL,
101 .target_thread_count = 1,
102 .flags = TCG_PARALLEL,
103 },
104 [THREAD_CALL_INDEX_USER] = {
105 .tcg_name = "user",
106 .tcg_thread_pri = BASEPRI_DEFAULT,
107 .target_thread_count = 1,
108 .flags = TCG_PARALLEL,
109 },
110 [THREAD_CALL_INDEX_LOW] = {
111 .tcg_name = "low",
112 .tcg_thread_pri = MAXPRI_THROTTLE,
113 .target_thread_count = 1,
114 .flags = TCG_PARALLEL,
115 },
116 [THREAD_CALL_INDEX_KERNEL_HIGH] = {
117 .tcg_name = "kernel-high",
118 .tcg_thread_pri = BASEPRI_PREEMPT,
119 .target_thread_count = 2,
120 .flags = TCG_NONE,
121 },
122 [THREAD_CALL_INDEX_QOS_UI] = {
123 .tcg_name = "qos-ui",
124 .tcg_thread_pri = BASEPRI_FOREGROUND,
125 .target_thread_count = 1,
126 .flags = TCG_NONE,
127 },
128 [THREAD_CALL_INDEX_QOS_IN] = {
129 .tcg_name = "qos-in",
130 .tcg_thread_pri = BASEPRI_USER_INITIATED,
131 .target_thread_count = 1,
132 .flags = TCG_NONE,
133 },
134 [THREAD_CALL_INDEX_QOS_UT] = {
135 .tcg_name = "qos-ut",
136 .tcg_thread_pri = BASEPRI_UTILITY,
137 .target_thread_count = 1,
138 .flags = TCG_NONE,
139 },
140 };
141
142 typedef struct thread_call_group *thread_call_group_t;
143
144 #define INTERNAL_CALL_COUNT 768
145 #define THREAD_CALL_DEALLOC_INTERVAL_NS (5 * NSEC_PER_MSEC) /* 5 ms */
146 #define THREAD_CALL_ADD_RATIO 4
147 #define THREAD_CALL_MACH_FACTOR_CAP 3
148 #define THREAD_CALL_GROUP_MAX_THREADS 500
149
150 static boolean_t thread_call_daemon_awake;
151 static thread_call_data_t internal_call_storage[INTERNAL_CALL_COUNT];
152 static queue_head_t thread_call_internal_queue;
153 int thread_call_internal_queue_count = 0;
154 static uint64_t thread_call_dealloc_interval_abs;
155
156 static __inline__ thread_call_t _internal_call_allocate(thread_call_func_t func, thread_call_param_t param0);
157 static __inline__ void _internal_call_release(thread_call_t call);
158 static __inline__ boolean_t _pending_call_enqueue(thread_call_t call, thread_call_group_t group);
159 static boolean_t _delayed_call_enqueue(thread_call_t call, thread_call_group_t group,
160 uint64_t deadline, thread_call_flavor_t flavor);
161 static __inline__ boolean_t _call_dequeue(thread_call_t call, thread_call_group_t group);
162 static __inline__ void thread_call_wake(thread_call_group_t group);
163 static void thread_call_daemon(void *arg);
164 static void thread_call_thread(thread_call_group_t group, wait_result_t wres);
165 static void thread_call_dealloc_timer(timer_call_param_t p0, timer_call_param_t p1);
166 static void thread_call_group_setup(thread_call_group_t group);
167 static void sched_call_thread(int type, thread_t thread);
168 static void thread_call_start_deallocate_timer(thread_call_group_t group);
169 static void thread_call_wait_locked(thread_call_t call, spl_t s);
170 static boolean_t thread_call_wait_once_locked(thread_call_t call, spl_t s);
171
172 static boolean_t thread_call_enter_delayed_internal(thread_call_t call,
173 thread_call_func_t alt_func, thread_call_param_t alt_param0,
174 thread_call_param_t param1, uint64_t deadline,
175 uint64_t leeway, unsigned int flags);
176
177 /* non-static so dtrace can find it rdar://problem/31156135&31379348 */
178 extern void thread_call_delayed_timer(timer_call_param_t p0, timer_call_param_t p1);
179
180 lck_grp_t thread_call_lck_grp;
181 lck_mtx_t thread_call_lock_data;
182
183 #define thread_call_lock_spin() \
184 lck_mtx_lock_spin_always(&thread_call_lock_data)
185
186 #define thread_call_unlock() \
187 lck_mtx_unlock_always(&thread_call_lock_data)
188
189 #define tc_deadline tc_call.deadline
190
191 extern boolean_t mach_timer_coalescing_enabled;
192
193 static inline spl_t
194 disable_ints_and_lock(void)
195 {
196 spl_t s = splsched();
197 thread_call_lock_spin();
198
199 return s;
200 }
201
202 static inline void
203 enable_ints_and_unlock(spl_t s)
204 {
205 thread_call_unlock();
206 splx(s);
207 }
208
209 static inline boolean_t
210 group_isparallel(thread_call_group_t group)
211 {
212 return (group->flags & TCG_PARALLEL) != 0;
213 }
214
215 static boolean_t
216 thread_call_group_should_add_thread(thread_call_group_t group)
217 {
218 if ((group->active_count + group->blocked_count + group->idle_count) >= THREAD_CALL_GROUP_MAX_THREADS) {
219 panic("thread_call group '%s' reached max thread cap (%d): active: %d, blocked: %d, idle: %d",
220 group->tcg_name, THREAD_CALL_GROUP_MAX_THREADS,
221 group->active_count, group->blocked_count, group->idle_count);
222 }
223
224 if (group_isparallel(group) == FALSE) {
225 if (group->pending_count > 0 && group->active_count == 0) {
226 return TRUE;
227 }
228
229 return FALSE;
230 }
231
232 if (group->pending_count > 0) {
233 if (group->idle_count > 0) {
234 return FALSE;
235 }
236
237 uint32_t thread_count = group->active_count;
238
239 /*
240 * Add a thread if either there are no threads,
241 * the group has fewer than its target number of
242 * threads, or the amount of work is large relative
243 * to the number of threads. In the last case, pay attention
244 * to the total load on the system, and back off if
245 * it's high.
246 */
247 if ((thread_count == 0) ||
248 (thread_count < group->target_thread_count) ||
249 ((group->pending_count > THREAD_CALL_ADD_RATIO * thread_count) &&
250 (sched_mach_factor < THREAD_CALL_MACH_FACTOR_CAP))) {
251 return TRUE;
252 }
253 }
254
255 return FALSE;
256 }
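
/*
 * Worked illustration of the growth policy above, with hypothetical numbers:
 * for a parallel group with target_thread_count = 1, active_count = 2,
 * idle_count = 0 and pending_count = 9, the first two tests fail
 * (thread_count is neither 0 nor below target), but
 * 9 > THREAD_CALL_ADD_RATIO * 2 = 8, so a thread is added only while
 * sched_mach_factor stays below THREAD_CALL_MACH_FACTOR_CAP (3), i.e.
 * while overall system load is still moderate.
 */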
257
258 /* Lock held */
259 static inline thread_call_group_t
260 thread_call_get_group(thread_call_t call)
261 {
262 thread_call_index_t index = call->tc_index;
263
264 assert(index >= 0 && index < THREAD_CALL_INDEX_MAX);
265
266 return &thread_call_groups[index];
267 }
268
269 /* Lock held */
270 static inline thread_call_flavor_t
271 thread_call_get_flavor(thread_call_t call)
272 {
273 return (call->tc_flags & THREAD_CALL_CONTINUOUS) ? TCF_CONTINUOUS : TCF_ABSOLUTE;
274 }
275
276 static void
277 thread_call_group_setup(thread_call_group_t group)
278 {
279 queue_init(&group->pending_queue);
280 queue_init(&group->delayed_queues[TCF_ABSOLUTE]);
281 queue_init(&group->delayed_queues[TCF_CONTINUOUS]);
282
283 /* TODO: Consolidate to one hard timer for each group */
284 timer_call_setup(&group->delayed_timers[TCF_ABSOLUTE], thread_call_delayed_timer, group);
285 timer_call_setup(&group->delayed_timers[TCF_CONTINUOUS], thread_call_delayed_timer, group);
286 timer_call_setup(&group->dealloc_timer, thread_call_dealloc_timer, group);
287
288 /* Reverse the wait order so we re-use the most recently parked thread from the pool */
289 waitq_init(&group->idle_waitq, SYNC_POLICY_REVERSED | SYNC_POLICY_DISABLE_IRQ);
290 }
291
292 /*
293 * Simple wrapper for creating threads bound to
294 * thread call groups.
295 */
296 static kern_return_t
297 thread_call_thread_create(
298 thread_call_group_t group)
299 {
300 thread_t thread;
301 kern_return_t result;
302
303 int thread_pri = group->tcg_thread_pri;
304
305 result = kernel_thread_start_priority((thread_continue_t)thread_call_thread,
306 group, thread_pri, &thread);
307 if (result != KERN_SUCCESS) {
308 return result;
309 }
310
311 if (thread_pri <= BASEPRI_KERNEL) {
312 /*
313 * THREAD_CALL_PRIORITY_KERNEL and lower don't get to run to completion
314 * in kernel if there are higher priority threads available.
315 */
316 thread_set_eager_preempt(thread);
317 }
318
319 char name[MAXTHREADNAMESIZE] = "";
320
321 int group_thread_count = group->idle_count + group->active_count + group->blocked_count;
322
323 snprintf(name, sizeof(name), "thread call %s #%d", group->tcg_name, group_thread_count);
324 thread_set_thread_name(thread, name);
325
326 thread_deallocate(thread);
327 return KERN_SUCCESS;
328 }
329
330 /*
331 * thread_call_initialize:
332 *
333 * Initialize this module, called
334 * early during system initialization.
335 */
336 void
337 thread_call_initialize(void)
338 {
339 int tc_size = sizeof(thread_call_data_t);
340 thread_call_zone = zinit(tc_size, 4096 * tc_size, 16 * tc_size, "thread_call");
341 zone_change(thread_call_zone, Z_CALLERACCT, FALSE);
342 zone_change(thread_call_zone, Z_NOENCRYPT, TRUE);
343
344 lck_grp_init(&thread_call_lck_grp, "thread_call", LCK_GRP_ATTR_NULL);
345 lck_mtx_init(&thread_call_lock_data, &thread_call_lck_grp, LCK_ATTR_NULL);
346
347 nanotime_to_absolutetime(0, THREAD_CALL_DEALLOC_INTERVAL_NS, &thread_call_dealloc_interval_abs);
348 waitq_init(&daemon_waitq, SYNC_POLICY_DISABLE_IRQ | SYNC_POLICY_FIFO);
349
350 for (uint32_t i = 0; i < THREAD_CALL_INDEX_MAX; i++) {
351 thread_call_group_setup(&thread_call_groups[i]);
352 }
353
354 spl_t s = disable_ints_and_lock();
355
356 queue_init(&thread_call_internal_queue);
357 for (
358 thread_call_t call = internal_call_storage;
359 call < &internal_call_storage[INTERNAL_CALL_COUNT];
360 call++) {
361 enqueue_tail(&thread_call_internal_queue, &call->tc_call.q_link);
362 thread_call_internal_queue_count++;
363 }
364
365 thread_call_daemon_awake = TRUE;
366
367 enable_ints_and_unlock(s);
368
369 thread_t thread;
370 kern_return_t result;
371
372 result = kernel_thread_start_priority((thread_continue_t)thread_call_daemon,
373 NULL, BASEPRI_PREEMPT_HIGH + 1, &thread);
374 if (result != KERN_SUCCESS) {
375 panic("thread_call_initialize");
376 }
377
378 thread_deallocate(thread);
379 }
380
381 void
382 thread_call_setup(
383 thread_call_t call,
384 thread_call_func_t func,
385 thread_call_param_t param0)
386 {
387 bzero(call, sizeof(*call));
388 call_entry_setup((call_entry_t)call, func, param0);
389
390 /* Thread calls default to the HIGH group unless otherwise specified */
391 call->tc_index = THREAD_CALL_INDEX_HIGH;
392
393 /* THREAD_CALL_ALLOC not set, memory owned by caller */
394 }
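
/*
 * Minimal usage sketch for caller-owned storage, assuming a hypothetical
 * client callback example_fn; none of these client-side names exist in
 * this file:
 *
 *	static void example_fn(thread_call_param_t p0, thread_call_param_t p1);
 *	static thread_call_data_t example_call;
 *
 *	thread_call_setup(&example_call, example_fn, NULL);
 *	thread_call_enter(&example_call);
 *
 * Because THREAD_CALL_ALLOC is not set, the storage must outlive any
 * pending or running invocation; thread_call_free() is only for calls
 * created by thread_call_allocate() and friends.
 */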
395
396 /*
397 * _internal_call_allocate:
398 *
399 * Allocate an internal callout entry.
400 *
401 * Called with thread_call_lock held.
402 */
403 static __inline__ thread_call_t
404 _internal_call_allocate(thread_call_func_t func, thread_call_param_t param0)
405 {
406 thread_call_t call;
407
408 if (queue_empty(&thread_call_internal_queue)) {
409 panic("_internal_call_allocate");
410 }
411
412 call = qe_dequeue_head(&thread_call_internal_queue, struct thread_call, tc_call.q_link);
413
414 thread_call_internal_queue_count--;
415
416 thread_call_setup(call, func, param0);
417 call->tc_refs = 0;
418 call->tc_flags = 0; /* THREAD_CALL_ALLOC not set, do not free back to zone */
419
420 return call;
421 }
422
423 /*
424 * _internal_call_release:
425 *
426 * Release an internal callout entry which
427 * is no longer pending (or delayed). This is
428 * safe to call on a non-internal entry, in which
429 * case nothing happens.
430 *
431 * Called with thread_call_lock held.
432 */
433 static __inline__ void
434 _internal_call_release(thread_call_t call)
435 {
436 if (call >= internal_call_storage &&
437 call < &internal_call_storage[INTERNAL_CALL_COUNT]) {
438 assert((call->tc_flags & THREAD_CALL_ALLOC) == 0);
439 enqueue_head(&thread_call_internal_queue, &call->tc_call.q_link);
440 thread_call_internal_queue_count++;
441 }
442 }
443
444 /*
445 * _pending_call_enqueue:
446 *
447 * Place an entry at the end of the
448 * pending queue, to be executed soon.
449 *
450 * Returns TRUE if the entry was already
451 * on a queue.
452 *
453 * Called with thread_call_lock held.
454 */
455 static __inline__ boolean_t
456 _pending_call_enqueue(thread_call_t call,
457 thread_call_group_t group)
458 {
459 if ((THREAD_CALL_ONCE | THREAD_CALL_RUNNING)
460 == (call->tc_flags & (THREAD_CALL_ONCE | THREAD_CALL_RUNNING))) {
461 call->tc_deadline = 0;
462
463 uint32_t flags = call->tc_flags;
464 call->tc_flags |= THREAD_CALL_RESCHEDULE;
465
466 if ((flags & THREAD_CALL_RESCHEDULE) != 0) {
467 return TRUE;
468 } else {
469 return FALSE;
470 }
471 }
472
473 queue_head_t *old_queue = call_entry_enqueue_tail(CE(call), &group->pending_queue);
474
475 if (old_queue == NULL) {
476 call->tc_submit_count++;
477 } else if (old_queue != &group->pending_queue &&
478 old_queue != &group->delayed_queues[TCF_ABSOLUTE] &&
479 old_queue != &group->delayed_queues[TCF_CONTINUOUS]) {
480 panic("tried to move a thread call (%p) between groups (old_queue: %p)", call, old_queue);
481 }
482
483 group->pending_count++;
484
485 thread_call_wake(group);
486
487 return old_queue != NULL;
488 }
489
490 /*
491 * _delayed_call_enqueue:
492 *
493 * Place an entry on the delayed queue,
494 * after existing entries with an earlier
495 * (or identical) deadline.
496 *
497 * Returns TRUE if the entry was already
498 * on a queue.
499 *
500 * Called with thread_call_lock held.
501 */
502 static boolean_t
503 _delayed_call_enqueue(
504 thread_call_t call,
505 thread_call_group_t group,
506 uint64_t deadline,
507 thread_call_flavor_t flavor)
508 {
509 if ((THREAD_CALL_ONCE | THREAD_CALL_RUNNING)
510 == (call->tc_flags & (THREAD_CALL_ONCE | THREAD_CALL_RUNNING))) {
511 call->tc_deadline = deadline;
512
513 uint32_t flags = call->tc_flags;
514 call->tc_flags |= THREAD_CALL_RESCHEDULE;
515
516 if ((flags & THREAD_CALL_RESCHEDULE) != 0) {
517 return TRUE;
518 } else {
519 return FALSE;
520 }
521 }
522
523 queue_head_t *old_queue = call_entry_enqueue_deadline(CE(call),
524 &group->delayed_queues[flavor],
525 deadline);
526
527 if (old_queue == &group->pending_queue) {
528 group->pending_count--;
529 } else if (old_queue == NULL) {
530 call->tc_submit_count++;
531 } else if (old_queue == &group->delayed_queues[TCF_ABSOLUTE] ||
532 old_queue == &group->delayed_queues[TCF_CONTINUOUS]) {
533 /* TODO: if it's in the other delayed queue, that might not be OK */
534 // we did nothing, and that's fine
535 } else {
536 panic("tried to move a thread call (%p) between groups (old_queue: %p)", call, old_queue);
537 }
538
539 return old_queue != NULL;
540 }
541
542 /*
543 * _call_dequeue:
544 *
545 * Remove an entry from a queue.
546 *
547 * Returns TRUE if the entry was on a queue.
548 *
549 * Called with thread_call_lock held.
550 */
551 static __inline__ boolean_t
552 _call_dequeue(
553 thread_call_t call,
554 thread_call_group_t group)
555 {
556 queue_head_t *old_queue;
557
558 old_queue = call_entry_dequeue(CE(call));
559
560 if (old_queue != NULL) {
561 assert(old_queue == &group->pending_queue ||
562 old_queue == &group->delayed_queues[TCF_ABSOLUTE] ||
563 old_queue == &group->delayed_queues[TCF_CONTINUOUS]);
564
565 call->tc_finish_count++;
566 if (old_queue == &group->pending_queue) {
567 group->pending_count--;
568 }
569 }
570
571 return old_queue != NULL;
572 }
573
574 /*
575 * _arm_delayed_call_timer:
576 *
577 * Check if the timer needs to be armed for this flavor,
578 * and if so, arm it.
579 *
580 * If call is non-NULL, only re-arm the timer if the specified call
581 * is the first in the queue.
582 *
583 * Returns true if the timer was armed/re-armed, false if it was left unset;
584 * the caller should cancel the timer if need be.
585 *
586 * Called with thread_call_lock held.
587 */
588 static bool
589 _arm_delayed_call_timer(thread_call_t new_call,
590 thread_call_group_t group,
591 thread_call_flavor_t flavor)
592 {
593 /* No calls implies no timer needed */
594 if (queue_empty(&group->delayed_queues[flavor])) {
595 return false;
596 }
597
598 thread_call_t call = qe_queue_first(&group->delayed_queues[flavor], struct thread_call, tc_call.q_link);
599
600 /* We only need to change the hard timer if this new call is the first in the list */
601 if (new_call != NULL && new_call != call) {
602 return false;
603 }
604
605 assert((call->tc_soft_deadline != 0) && ((call->tc_soft_deadline <= call->tc_call.deadline)));
606
607 uint64_t fire_at = call->tc_soft_deadline;
608
609 if (flavor == TCF_CONTINUOUS) {
610 assert((call->tc_flags & THREAD_CALL_CONTINUOUS) == THREAD_CALL_CONTINUOUS);
611 fire_at = continuoustime_to_absolutetime(fire_at);
612 } else {
613 assert((call->tc_flags & THREAD_CALL_CONTINUOUS) == 0);
614 }
615
616 /*
617 * Note: This picks the soonest-deadline call's leeway as the hard timer's leeway,
618 * which does not take into account later-deadline timers with a larger leeway.
619 * This is a valid coalescing behavior, but masks a possible window to
620 * fire a timer instead of going idle.
621 */
622 uint64_t leeway = call->tc_call.deadline - call->tc_soft_deadline;
623
624 timer_call_enter_with_leeway(&group->delayed_timers[flavor], (timer_call_param_t)flavor,
625 fire_at, leeway,
626 TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LEEWAY,
627 ((call->tc_flags & THREAD_CALL_RATELIMITED) == THREAD_CALL_RATELIMITED));
628
629 return true;
630 }
631
632 /*
633 * _cancel_func_from_queue:
634 *
635 * Remove the first (or all) matching
636 * entries from the specified queue.
637 *
638 * Returns TRUE if any matching entries
639 * were found.
640 *
641 * Called with thread_call_lock held.
642 */
643 static boolean_t
644 _cancel_func_from_queue(thread_call_func_t func,
645 thread_call_param_t param0,
646 thread_call_group_t group,
647 boolean_t remove_all,
648 queue_head_t *queue)
649 {
650 boolean_t call_removed = FALSE;
651 thread_call_t call;
652
653 qe_foreach_element_safe(call, queue, tc_call.q_link) {
654 if (call->tc_call.func != func ||
655 call->tc_call.param0 != param0) {
656 continue;
657 }
658
659 _call_dequeue(call, group);
660
661 _internal_call_release(call);
662
663 call_removed = TRUE;
664 if (!remove_all) {
665 break;
666 }
667 }
668
669 return call_removed;
670 }
671
672 /*
673 * thread_call_func_delayed:
674 *
675 * Enqueue a function callout to
676 * occur at the stated time.
677 */
678 void
679 thread_call_func_delayed(
680 thread_call_func_t func,
681 thread_call_param_t param,
682 uint64_t deadline)
683 {
684 (void)thread_call_enter_delayed_internal(NULL, func, param, 0, deadline, 0, 0);
685 }
686
687 /*
688 * thread_call_func_delayed_with_leeway:
689 *
690 * Same as thread_call_func_delayed(), but with
691 * leeway/flags threaded through.
692 */
693
694 void
695 thread_call_func_delayed_with_leeway(
696 thread_call_func_t func,
697 thread_call_param_t param,
698 uint64_t deadline,
699 uint64_t leeway,
700 uint32_t flags)
701 {
702 (void)thread_call_enter_delayed_internal(NULL, func, param, 0, deadline, leeway, flags);
703 }
704
705 /*
706 * thread_call_func_cancel:
707 *
708 * Dequeue a function callout.
709 *
710 * Removes one (or all) { function, argument }
711 * instance(s) from either (or both)
712 * the pending and the delayed queue,
713 * in that order.
714 *
715 * Returns TRUE if any calls were cancelled.
716 *
717 * This iterates all of the pending or delayed thread calls in the group,
718 * which is really inefficient. Switch to an allocated thread call instead.
719 */
720 boolean_t
721 thread_call_func_cancel(
722 thread_call_func_t func,
723 thread_call_param_t param,
724 boolean_t cancel_all)
725 {
726 boolean_t result;
727
728 assert(func != NULL);
729
730 spl_t s = disable_ints_and_lock();
731
732 /* Function-only thread calls are only kept in the default HIGH group */
733 thread_call_group_t group = &thread_call_groups[THREAD_CALL_INDEX_HIGH];
734
735 if (cancel_all) {
736 /* exhaustively search every queue, and return true if any search found something */
737 result = _cancel_func_from_queue(func, param, group, cancel_all, &group->pending_queue) |
738 _cancel_func_from_queue(func, param, group, cancel_all, &group->delayed_queues[TCF_ABSOLUTE]) |
739 _cancel_func_from_queue(func, param, group, cancel_all, &group->delayed_queues[TCF_CONTINUOUS]);
740 } else {
741 /* early-exit as soon as we find something, don't search other queues */
742 result = _cancel_func_from_queue(func, param, group, cancel_all, &group->pending_queue) ||
743 _cancel_func_from_queue(func, param, group, cancel_all, &group->delayed_queues[TCF_ABSOLUTE]) ||
744 _cancel_func_from_queue(func, param, group, cancel_all, &group->delayed_queues[TCF_CONTINUOUS]);
745 }
746
747 enable_ints_and_unlock(s);
748
749 return result;
750 }
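
/*
 * Sketch of the func/param style this cancel supports (hypothetical
 * callback example_fn, context pointer ctx, and a deadline computed by
 * the caller); as noted above, an allocated thread_call_t is preferred
 * because thread_call_cancel() dequeues a single entry instead of
 * scanning the group's queues:
 *
 *	thread_call_func_delayed(example_fn, ctx, deadline);
 *	...
 *	(void) thread_call_func_cancel(example_fn, ctx, FALSE);
 */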
751
752 /*
753 * Allocate a thread call with a given priority. Importances other than
754 * THREAD_CALL_PRIORITY_HIGH or THREAD_CALL_PRIORITY_KERNEL_HIGH will be run in threads
755 * with eager preemption enabled (i.e. may be aggressively preempted by higher-priority
756 * threads which are not in the normal "urgent" bands).
757 */
758 thread_call_t
759 thread_call_allocate_with_priority(
760 thread_call_func_t func,
761 thread_call_param_t param0,
762 thread_call_priority_t pri)
763 {
764 return thread_call_allocate_with_options(func, param0, pri, 0);
765 }
766
767 thread_call_t
768 thread_call_allocate_with_options(
769 thread_call_func_t func,
770 thread_call_param_t param0,
771 thread_call_priority_t pri,
772 thread_call_options_t options)
773 {
774 thread_call_t call = thread_call_allocate(func, param0);
775
776 switch (pri) {
777 case THREAD_CALL_PRIORITY_HIGH:
778 call->tc_index = THREAD_CALL_INDEX_HIGH;
779 break;
780 case THREAD_CALL_PRIORITY_KERNEL:
781 call->tc_index = THREAD_CALL_INDEX_KERNEL;
782 break;
783 case THREAD_CALL_PRIORITY_USER:
784 call->tc_index = THREAD_CALL_INDEX_USER;
785 break;
786 case THREAD_CALL_PRIORITY_LOW:
787 call->tc_index = THREAD_CALL_INDEX_LOW;
788 break;
789 case THREAD_CALL_PRIORITY_KERNEL_HIGH:
790 call->tc_index = THREAD_CALL_INDEX_KERNEL_HIGH;
791 break;
792 default:
793 panic("Invalid thread call pri value: %d", pri);
794 break;
795 }
796
797 if (options & THREAD_CALL_OPTIONS_ONCE) {
798 call->tc_flags |= THREAD_CALL_ONCE;
799 }
800 if (options & THREAD_CALL_OPTIONS_SIGNAL) {
801 call->tc_flags |= THREAD_CALL_SIGNAL | THREAD_CALL_ONCE;
802 }
803
804 return call;
805 }
806
807 thread_call_t
808 thread_call_allocate_with_qos(thread_call_func_t func,
809 thread_call_param_t param0,
810 int qos_tier,
811 thread_call_options_t options)
812 {
813 thread_call_t call = thread_call_allocate(func, param0);
814
815 switch (qos_tier) {
816 case THREAD_QOS_UNSPECIFIED:
817 call->tc_index = THREAD_CALL_INDEX_HIGH;
818 break;
819 case THREAD_QOS_LEGACY:
820 call->tc_index = THREAD_CALL_INDEX_USER;
821 break;
822 case THREAD_QOS_MAINTENANCE:
823 case THREAD_QOS_BACKGROUND:
824 call->tc_index = THREAD_CALL_INDEX_LOW;
825 break;
826 case THREAD_QOS_UTILITY:
827 call->tc_index = THREAD_CALL_INDEX_QOS_UT;
828 break;
829 case THREAD_QOS_USER_INITIATED:
830 call->tc_index = THREAD_CALL_INDEX_QOS_IN;
831 break;
832 case THREAD_QOS_USER_INTERACTIVE:
833 call->tc_index = THREAD_CALL_INDEX_QOS_UI;
834 break;
835 default:
836 panic("Invalid thread call qos value: %d", qos_tier);
837 break;
838 }
839
840 if (options & THREAD_CALL_OPTIONS_ONCE) {
841 call->tc_flags |= THREAD_CALL_ONCE;
842 }
843
844 /* does not support THREAD_CALL_OPTIONS_SIGNAL */
845
846 return call;
847 }
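
/*
 * Example allocation by QoS tier (hypothetical callback example_fn and
 * context ctx): THREAD_QOS_UTILITY routes invocations to the "qos-ut"
 * group defined above, and THREAD_CALL_OPTIONS_ONCE enables the
 * cancel-wait/wait-once semantics implemented later in this file.
 *
 *	thread_call_t call = thread_call_allocate_with_qos(example_fn, ctx,
 *	    THREAD_QOS_UTILITY, THREAD_CALL_OPTIONS_ONCE);
 */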
848
849
850 /*
851 * thread_call_allocate:
852 *
853 * Allocate a callout entry.
854 */
855 thread_call_t
856 thread_call_allocate(
857 thread_call_func_t func,
858 thread_call_param_t param0)
859 {
860 thread_call_t call = zalloc(thread_call_zone);
861
862 thread_call_setup(call, func, param0);
863 call->tc_refs = 1;
864 call->tc_flags = THREAD_CALL_ALLOC;
865
866 return call;
867 }
868
869 /*
870 * thread_call_free:
871 *
872 * Release a callout. If the callout is currently
873 * executing, it will be freed when all invocations
874 * finish.
875 *
876 * If the callout is currently armed to fire again, freeing is not
877 * allowed and this function returns FALSE; the client must cancel
878 * the pending invocation before freeing.
879 */
880 boolean_t
881 thread_call_free(
882 thread_call_t call)
883 {
884 spl_t s = disable_ints_and_lock();
885
886 if (call->tc_call.queue != NULL ||
887 ((call->tc_flags & THREAD_CALL_RESCHEDULE) != 0)) {
888 thread_call_unlock();
889 splx(s);
890
891 return FALSE;
892 }
893
894 int32_t refs = --call->tc_refs;
895 if (refs < 0) {
896 panic("Refcount negative: %d\n", refs);
897 }
898
899 if ((THREAD_CALL_SIGNAL | THREAD_CALL_RUNNING)
900 == ((THREAD_CALL_SIGNAL | THREAD_CALL_RUNNING) & call->tc_flags)) {
901 thread_call_wait_once_locked(call, s);
902 /* thread call lock has been unlocked */
903 } else {
904 enable_ints_and_unlock(s);
905 }
906
907 if (refs == 0) {
908 assert(call->tc_finish_count == call->tc_submit_count);
909 zfree(thread_call_zone, call);
910 }
911
912 return TRUE;
913 }
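
/*
 * Typical lifecycle for a heap-allocated call (hypothetical callback
 * example_fn and context ctx): allocate once, enter as often as needed,
 * cancel before freeing.
 *
 *	thread_call_t call = thread_call_allocate(example_fn, ctx);
 *	thread_call_enter(call);
 *	...
 *	(void) thread_call_cancel(call);   // dequeue if still pending
 *	(void) thread_call_free(call);     // deferred if an invocation is in flight
 */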
914
915 /*
916 * thread_call_enter:
917 *
918 * Enqueue a callout entry to occur "soon".
919 *
920 * Returns TRUE if the call was
921 * already on a queue.
922 */
923 boolean_t
924 thread_call_enter(
925 thread_call_t call)
926 {
927 return thread_call_enter1(call, 0);
928 }
929
930 boolean_t
931 thread_call_enter1(
932 thread_call_t call,
933 thread_call_param_t param1)
934 {
935 boolean_t result = TRUE;
936 thread_call_group_t group;
937
938 assert(call->tc_call.func != NULL);
939
940 assert((call->tc_flags & THREAD_CALL_SIGNAL) == 0);
941
942 group = thread_call_get_group(call);
943
944 spl_t s = disable_ints_and_lock();
945
946 if (call->tc_call.queue != &group->pending_queue) {
947 result = _pending_call_enqueue(call, group);
948 }
949
950 call->tc_call.param1 = param1;
951
952 enable_ints_and_unlock(s);
953
954 return result;
955 }
956
957 /*
958 * thread_call_enter_delayed:
959 *
960 * Enqueue a callout entry to occur
961 * at the stated time.
962 *
963 * Returns TRUE if the call was
964 * already on a queue.
965 */
966 boolean_t
967 thread_call_enter_delayed(
968 thread_call_t call,
969 uint64_t deadline)
970 {
971 assert(call != NULL);
972 return thread_call_enter_delayed_internal(call, NULL, 0, 0, deadline, 0, 0);
973 }
974
975 boolean_t
976 thread_call_enter1_delayed(
977 thread_call_t call,
978 thread_call_param_t param1,
979 uint64_t deadline)
980 {
981 assert(call != NULL);
982 return thread_call_enter_delayed_internal(call, NULL, 0, param1, deadline, 0, 0);
983 }
984
985 boolean_t
986 thread_call_enter_delayed_with_leeway(
987 thread_call_t call,
988 thread_call_param_t param1,
989 uint64_t deadline,
990 uint64_t leeway,
991 unsigned int flags)
992 {
993 assert(call != NULL);
994 return thread_call_enter_delayed_internal(call, NULL, 0, param1, deadline, leeway, flags);
995 }
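
/*
 * Sketch of arming a delayed call 10 ms out with 1 ms of coalescing leeway,
 * for a previously allocated call (hypothetical caller); both deadline and
 * leeway are expressed in absolute time units:
 *
 *	uint64_t deadline, leeway;
 *
 *	clock_interval_to_deadline(10, NSEC_PER_MSEC, &deadline);
 *	nanoseconds_to_absolutetime(1 * NSEC_PER_MSEC, &leeway);
 *	thread_call_enter_delayed_with_leeway(call, NULL, deadline, leeway,
 *	    THREAD_CALL_DELAY_LEEWAY);
 */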
996
997
998 /*
999 * thread_call_enter_delayed_internal:
1000 * enqueue a callout entry to occur at the stated time
1001 *
1002 * Returns TRUE if the call was already on a queue
1003 * params:
1004 * call - structure encapsulating state of the callout
1005 * alt_func/alt_param0 - if call is NULL, allocate temporary storage using these parameters
1006 * deadline - deadline to fire at, in absolute time units (mach_continuous_time units if THREAD_CALL_CONTINUOUS is set)
1007 * leeway - timer slack, expressed as a delta from the deadline.
1008 * flags - THREAD_CALL_DELAY_XXX : classification of the caller's preferences with respect to timer coalescing.
1009 * THREAD_CALL_DELAY_LEEWAY : value in leeway is used for timer coalescing.
1010 * THREAD_CALL_CONTINUOUS: thread call will be called according to mach_continuous_time rather
1011 * than mach_absolute_time
1012 */
1013 boolean_t
1014 thread_call_enter_delayed_internal(
1015 thread_call_t call,
1016 thread_call_func_t alt_func,
1017 thread_call_param_t alt_param0,
1018 thread_call_param_t param1,
1019 uint64_t deadline,
1020 uint64_t leeway,
1021 unsigned int flags)
1022 {
1023 boolean_t result = TRUE;
1024 thread_call_group_t group;
1025 uint64_t now, sdeadline, slop;
1026 uint32_t urgency;
1027
1028 thread_call_flavor_t flavor = (flags & THREAD_CALL_CONTINUOUS) ? TCF_CONTINUOUS : TCF_ABSOLUTE;
1029
1030 /* direct mapping between thread_call, timer_call, and timeout_urgency values */
1031 urgency = (flags & TIMEOUT_URGENCY_MASK);
1032
1033 spl_t s = disable_ints_and_lock();
1034
1035 if (call == NULL) {
1036 /* allocate a structure out of internal storage, as a convenience for BSD callers */
1037 call = _internal_call_allocate(alt_func, alt_param0);
1038 }
1039
1040 assert(call->tc_call.func != NULL);
1041 group = thread_call_get_group(call);
1042
1043 /* TODO: assert that call is not enqueued before flipping the flag */
1044 if (flavor == TCF_CONTINUOUS) {
1045 now = mach_continuous_time();
1046 call->tc_flags |= THREAD_CALL_CONTINUOUS;
1047 } else {
1048 now = mach_absolute_time();
1049 call->tc_flags &= ~THREAD_CALL_CONTINUOUS;
1050 }
1051
1052 call->tc_flags |= THREAD_CALL_DELAYED;
1053
1054 call->tc_soft_deadline = sdeadline = deadline;
1055
1056 boolean_t ratelimited = FALSE;
1057 slop = timer_call_slop(deadline, now, urgency, current_thread(), &ratelimited);
1058
1059 if ((flags & THREAD_CALL_DELAY_LEEWAY) != 0 && leeway > slop) {
1060 slop = leeway;
1061 }
1062
1063 if (UINT64_MAX - deadline <= slop) {
1064 deadline = UINT64_MAX;
1065 } else {
1066 deadline += slop;
1067 }
1068
1069 if (ratelimited) {
1070 call->tc_flags |= TIMER_CALL_RATELIMITED;
1071 } else {
1072 call->tc_flags &= ~TIMER_CALL_RATELIMITED;
1073 }
1074
1075 call->tc_call.param1 = param1;
1076
1077 call->tc_ttd = (sdeadline > now) ? (sdeadline - now) : 0;
1078
1079 result = _delayed_call_enqueue(call, group, deadline, flavor);
1080
1081 _arm_delayed_call_timer(call, group, flavor);
1082
1083 #if CONFIG_DTRACE
1084 DTRACE_TMR5(thread_callout__create, thread_call_func_t, call->tc_call.func,
1085 uint64_t, (deadline - sdeadline), uint64_t, (call->tc_ttd >> 32),
1086 (unsigned) (call->tc_ttd & 0xFFFFFFFF), call);
1087 #endif
1088
1089 enable_ints_and_unlock(s);
1090
1091 return result;
1092 }
1093
1094 /*
1095 * Remove a callout entry from the queue
1096 * Called with thread_call_lock held
1097 */
1098 static boolean_t
1099 thread_call_cancel_locked(thread_call_t call)
1100 {
1101 boolean_t canceled = (0 != (THREAD_CALL_RESCHEDULE & call->tc_flags));
1102 call->tc_flags &= ~THREAD_CALL_RESCHEDULE;
1103
1104 if (canceled) {
1105 /* if reschedule was set, it must not have been queued */
1106 assert(call->tc_call.queue == NULL);
1107 } else {
1108 boolean_t do_cancel_callout = FALSE;
1109
1110 thread_call_flavor_t flavor = thread_call_get_flavor(call);
1111 thread_call_group_t group = thread_call_get_group(call);
1112
1113 if ((call->tc_call.deadline != 0) &&
1114 (call == qe_queue_first(&group->delayed_queues[flavor], struct thread_call, tc_call.q_link))) {
1115 assert(call->tc_call.queue == &group->delayed_queues[flavor]);
1116 do_cancel_callout = TRUE;
1117 }
1118
1119 canceled = _call_dequeue(call, group);
1120
1121 if (do_cancel_callout) {
1122 if (_arm_delayed_call_timer(NULL, group, flavor) == false) {
1123 timer_call_cancel(&group->delayed_timers[flavor]);
1124 }
1125 }
1126 }
1127
1128 #if CONFIG_DTRACE
1129 DTRACE_TMR4(thread_callout__cancel, thread_call_func_t, call->tc_call.func,
1130 0, (call->tc_ttd >> 32), (unsigned) (call->tc_ttd & 0xFFFFFFFF));
1131 #endif
1132
1133 return canceled;
1134 }
1135
1136 /*
1137 * thread_call_cancel:
1138 *
1139 * Dequeue a callout entry.
1140 *
1141 * Returns TRUE if the call was
1142 * on a queue.
1143 */
1144 boolean_t
1145 thread_call_cancel(thread_call_t call)
1146 {
1147 spl_t s = disable_ints_and_lock();
1148
1149 boolean_t result = thread_call_cancel_locked(call);
1150
1151 enable_ints_and_unlock(s);
1152
1153 return result;
1154 }
1155
1156 /*
1157 * Cancel a thread call. If it cannot be cancelled (i.e.
1158 * is already in flight), waits for the most recent invocation
1159 * to finish. Note that if clients re-submit this thread call,
1160 * it may still be pending or in flight when thread_call_cancel_wait
1161 * returns, but all requests to execute this work item prior
1162 * to the call to thread_call_cancel_wait will have finished.
1163 */
1164 boolean_t
1165 thread_call_cancel_wait(thread_call_t call)
1166 {
1167 if ((call->tc_flags & THREAD_CALL_ALLOC) == 0) {
1168 panic("thread_call_cancel_wait: can't wait on thread call whose storage I don't own");
1169 }
1170
1171 if (!ml_get_interrupts_enabled()) {
1172 panic("unsafe thread_call_cancel_wait");
1173 }
1174
1175 if (current_thread()->thc_state.thc_call == call) {
1176 panic("thread_call_cancel_wait: deadlock waiting on self from inside call: %p to function %p",
1177 call, call->tc_call.func);
1178 }
1179
1180 spl_t s = disable_ints_and_lock();
1181
1182 boolean_t canceled = thread_call_cancel_locked(call);
1183
1184 if ((call->tc_flags & THREAD_CALL_ONCE) == THREAD_CALL_ONCE) {
1185 /*
1186 * A cancel-wait on a 'once' call will both cancel
1187 * the pending call and wait for the in-flight call
1188 */
1189
1190 thread_call_wait_once_locked(call, s);
1191 /* thread call lock unlocked */
1192 } else {
1193 /*
1194 * A cancel-wait on a normal call will only wait for the in-flight calls
1195 * if it did not cancel the pending call.
1196 *
1197 * TODO: This seems less than useful - shouldn't it do the wait as well?
1198 */
1199
1200 if (canceled == FALSE) {
1201 thread_call_wait_locked(call, s);
1202 /* thread call lock unlocked */
1203 } else {
1204 enable_ints_and_unlock(s);
1205 }
1206 }
1207
1208 return canceled;
1209 }
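
/*
 * Sketch of a safe teardown for a THREAD_CALL_ONCE call (hypothetical
 * caller that has stopped re-submitting the call, interrupts enabled,
 * no locks held that the callout itself might take):
 *
 *	thread_call_cancel_wait(call);   // nothing pending or in flight past here
 *	thread_call_free(call);
 */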
1210
1211
1212 /*
1213 * thread_call_wake:
1214 *
1215 * Wake a call thread to service
1216 * pending call entries. May wake
1217 * the daemon thread in order to
1218 * create additional call threads.
1219 *
1220 * Called with thread_call_lock held.
1221 *
1222 * For high-priority group, only does wakeup/creation if there are no threads
1223 * running.
1224 */
1225 static __inline__ void
1226 thread_call_wake(
1227 thread_call_group_t group)
1228 {
1229 /*
1230 * New behavior: use threads if you've got 'em.
1231 * Traditional behavior: wake only if no threads running.
1232 */
1233 if (group_isparallel(group) || group->active_count == 0) {
1234 if (waitq_wakeup64_one(&group->idle_waitq, NO_EVENT64,
1235 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES) == KERN_SUCCESS) {
1236 group->idle_count--; group->active_count++;
1237
1238 if (group->idle_count == 0 && (group->flags & TCG_DEALLOC_ACTIVE) == TCG_DEALLOC_ACTIVE) {
1239 if (timer_call_cancel(&group->dealloc_timer) == TRUE) {
1240 group->flags &= ~TCG_DEALLOC_ACTIVE;
1241 }
1242 }
1243 } else {
1244 if (!thread_call_daemon_awake && thread_call_group_should_add_thread(group)) {
1245 thread_call_daemon_awake = TRUE;
1246 waitq_wakeup64_one(&daemon_waitq, NO_EVENT64,
1247 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
1248 }
1249 }
1250 }
1251 }
1252
1253 /*
1254 * sched_call_thread:
1255 *
1256 * Call out invoked by the scheduler.
1257 */
1258 static void
1259 sched_call_thread(
1260 int type,
1261 thread_t thread)
1262 {
1263 thread_call_group_t group;
1264
1265 group = thread->thc_state.thc_group;
1266 assert((group - &thread_call_groups[0]) < THREAD_CALL_INDEX_MAX);
1267
1268 thread_call_lock_spin();
1269
1270 switch (type) {
1271 case SCHED_CALL_BLOCK:
1272 assert(group->active_count);
1273 --group->active_count;
1274 group->blocked_count++;
1275 if (group->pending_count > 0) {
1276 thread_call_wake(group);
1277 }
1278 break;
1279
1280 case SCHED_CALL_UNBLOCK:
1281 assert(group->blocked_count);
1282 --group->blocked_count;
1283 group->active_count++;
1284 break;
1285 }
1286
1287 thread_call_unlock();
1288 }
1289
1290 /*
1291 * Interrupts disabled, lock held; returns the same way.
1292 * Only called on thread calls whose storage we own. Wakes up
1293 * anyone who might be waiting on this work item and frees it
1294 * if the client has so requested.
1295 */
1296 static boolean_t
1297 thread_call_finish(thread_call_t call, thread_call_group_t group, spl_t *s)
1298 {
1299 uint64_t time;
1300 uint32_t flags;
1301 boolean_t signal;
1302 boolean_t repend = FALSE;
1303
1304 call->tc_finish_count++;
1305 flags = call->tc_flags;
1306 signal = ((THREAD_CALL_SIGNAL & flags) != 0);
1307
1308 if (!signal) {
1309 /* The thread call thread owns a ref until the call is finished */
1310 if (call->tc_refs <= 0) {
1311 panic("thread_call_finish: detected over-released thread call: %p", call);
1312 }
1313 call->tc_refs--;
1314 }
1315
1316 call->tc_flags &= ~(THREAD_CALL_RESCHEDULE | THREAD_CALL_RUNNING | THREAD_CALL_WAIT);
1317
1318 if ((call->tc_refs != 0) && ((flags & THREAD_CALL_RESCHEDULE) != 0)) {
1319 assert(flags & THREAD_CALL_ONCE);
1320 thread_call_flavor_t flavor = thread_call_get_flavor(call);
1321
1322 if (THREAD_CALL_DELAYED & flags) {
1323 time = mach_absolute_time();
1324 if (flavor == TCF_CONTINUOUS) {
1325 time = absolutetime_to_continuoustime(time);
1326 }
1327 if (call->tc_soft_deadline <= time) {
1328 call->tc_flags &= ~(THREAD_CALL_DELAYED | TIMER_CALL_RATELIMITED);
1329 call->tc_deadline = 0;
1330 }
1331 }
1332 if (call->tc_deadline) {
1333 _delayed_call_enqueue(call, group, call->tc_deadline, flavor);
1334 if (!signal) {
1335 _arm_delayed_call_timer(call, group, flavor);
1336 }
1337 } else if (signal) {
1338 call->tc_submit_count++;
1339 repend = TRUE;
1340 } else {
1341 _pending_call_enqueue(call, group);
1342 }
1343 }
1344
1345 if (!signal && (call->tc_refs == 0)) {
1346 if ((flags & THREAD_CALL_WAIT) != 0) {
1347 panic("Someone waiting on a thread call that is scheduled for free: %p\n", call->tc_call.func);
1348 }
1349
1350 assert(call->tc_finish_count == call->tc_submit_count);
1351
1352 enable_ints_and_unlock(*s);
1353
1354 zfree(thread_call_zone, call);
1355
1356 *s = disable_ints_and_lock();
1357 }
1358
1359 if ((flags & THREAD_CALL_WAIT) != 0) {
1360 /*
1361 * Dropping lock here because the sched call for the
1362 * high-pri group can take the big lock from under
1363 * a thread lock.
1364 */
1365 thread_call_unlock();
1366 thread_wakeup((event_t)call);
1367 thread_call_lock_spin();
1368 /* THREAD_CALL_SIGNAL call may have been freed */
1369 }
1370
1371 return repend;
1372 }
1373
1374 /*
1375 * thread_call_invoke
1376 *
1377 * Invoke the function provided for this thread call
1378 *
1379 * Note that the thread call object can be deallocated by the function if we do not control its storage.
1380 */
1381 static void __attribute__((noinline))
1382 thread_call_invoke(thread_call_func_t func, thread_call_param_t param0, thread_call_param_t param1, thread_call_t call)
1383 {
1384 current_thread()->thc_state.thc_call = call;
1385
1386 #if DEVELOPMENT || DEBUG
1387 KERNEL_DEBUG_CONSTANT(
1388 MACHDBG_CODE(DBG_MACH_SCHED, MACH_CALLOUT) | DBG_FUNC_START,
1389 VM_KERNEL_UNSLIDE(func), VM_KERNEL_ADDRHIDE(param0), VM_KERNEL_ADDRHIDE(param1), 0, 0);
1390 #endif /* DEVELOPMENT || DEBUG */
1391
1392 #if CONFIG_DTRACE
1393 uint64_t tc_ttd = call->tc_ttd;
1394 boolean_t is_delayed = call->tc_flags & THREAD_CALL_DELAYED;
1395 DTRACE_TMR6(thread_callout__start, thread_call_func_t, func, int, 0, int, (tc_ttd >> 32),
1396 (unsigned) (tc_ttd & 0xFFFFFFFF), is_delayed, call);
1397 #endif
1398
1399 (*func)(param0, param1);
1400
1401 #if CONFIG_DTRACE
1402 DTRACE_TMR6(thread_callout__end, thread_call_func_t, func, int, 0, int, (tc_ttd >> 32),
1403 (unsigned) (tc_ttd & 0xFFFFFFFF), is_delayed, call);
1404 #endif
1405
1406 #if DEVELOPMENT || DEBUG
1407 KERNEL_DEBUG_CONSTANT(
1408 MACHDBG_CODE(DBG_MACH_SCHED, MACH_CALLOUT) | DBG_FUNC_END,
1409 VM_KERNEL_UNSLIDE(func), 0, 0, 0, 0);
1410 #endif /* DEVELOPMENT || DEBUG */
1411
1412 current_thread()->thc_state.thc_call = NULL;
1413 }
1414
1415 /*
1416 * thread_call_thread:
1417 */
1418 static void
1419 thread_call_thread(
1420 thread_call_group_t group,
1421 wait_result_t wres)
1422 {
1423 thread_t self = current_thread();
1424 boolean_t canwait;
1425
1426 if ((thread_get_tag_internal(self) & THREAD_TAG_CALLOUT) == 0) {
1427 (void)thread_set_tag_internal(self, THREAD_TAG_CALLOUT);
1428 }
1429
1430 /*
1431 * A wakeup with THREAD_INTERRUPTED indicates that
1432 * we should terminate.
1433 */
1434 if (wres == THREAD_INTERRUPTED) {
1435 thread_terminate(self);
1436
1437 /* NOTREACHED */
1438 panic("thread_terminate() returned?");
1439 }
1440
1441 spl_t s = disable_ints_and_lock();
1442
1443 self->thc_state.thc_group = group;
1444 thread_sched_call(self, sched_call_thread);
1445
1446 while (group->pending_count > 0) {
1447 thread_call_t call;
1448 thread_call_func_t func;
1449 thread_call_param_t param0, param1;
1450
1451 call = qe_dequeue_head(&group->pending_queue, struct thread_call, tc_call.q_link);
1452 assert(call != NULL);
1453 group->pending_count--;
1454
1455 func = call->tc_call.func;
1456 param0 = call->tc_call.param0;
1457 param1 = call->tc_call.param1;
1458
1459 call->tc_call.queue = NULL;
1460
1461 _internal_call_release(call);
1462
1463 /*
1464 * Can only do wakeups for thread calls whose storage
1465 * we control.
1466 */
1467 if ((call->tc_flags & THREAD_CALL_ALLOC) != 0) {
1468 canwait = TRUE;
1469 call->tc_flags |= THREAD_CALL_RUNNING;
1470 call->tc_refs++; /* Delay free until we're done */
1471 } else {
1472 canwait = FALSE;
1473 }
1474
1475 enable_ints_and_unlock(s);
1476
1477 thread_call_invoke(func, param0, param1, call);
1478
1479 if (get_preemption_level() != 0) {
1480 int pl = get_preemption_level();
1481 panic("thread_call_thread: preemption_level %d, last callout %p(%p, %p)",
1482 pl, (void *)VM_KERNEL_UNSLIDE(func), param0, param1);
1483 }
1484
1485 s = disable_ints_and_lock();
1486
1487 if (canwait) {
1488 /* Frees if so desired */
1489 thread_call_finish(call, group, &s);
1490 }
1491 }
1492
1493 thread_sched_call(self, NULL);
1494 group->active_count--;
1495
1496 if (self->callout_woken_from_icontext && !self->callout_woke_thread) {
1497 ledger_credit(self->t_ledger, task_ledgers.interrupt_wakeups, 1);
1498 if (self->callout_woken_from_platform_idle) {
1499 ledger_credit(self->t_ledger, task_ledgers.platform_idle_wakeups, 1);
1500 }
1501 }
1502
1503 self->callout_woken_from_icontext = FALSE;
1504 self->callout_woken_from_platform_idle = FALSE;
1505 self->callout_woke_thread = FALSE;
1506
1507 if (group_isparallel(group)) {
1508 /*
1509 * For new style of thread group, thread always blocks.
1510 * If we have more than the target number of threads,
1511 * and this is the first to block, and it isn't active
1512 * already, set a timer for deallocating a thread if we
1513 * continue to have a surplus.
1514 */
1515 group->idle_count++;
1516
1517 if (group->idle_count == 1) {
1518 group->idle_timestamp = mach_absolute_time();
1519 }
1520
1521 if (((group->flags & TCG_DEALLOC_ACTIVE) == 0) &&
1522 ((group->active_count + group->idle_count) > group->target_thread_count)) {
1523 thread_call_start_deallocate_timer(group);
1524 }
1525
1526 /* Wait for more work (or termination) */
1527 wres = waitq_assert_wait64(&group->idle_waitq, NO_EVENT64, THREAD_INTERRUPTIBLE, 0);
1528 if (wres != THREAD_WAITING) {
1529 panic("kcall worker unable to assert wait?");
1530 }
1531
1532 enable_ints_and_unlock(s);
1533
1534 thread_block_parameter((thread_continue_t)thread_call_thread, group);
1535 } else {
1536 if (group->idle_count < group->target_thread_count) {
1537 group->idle_count++;
1538
1539 waitq_assert_wait64(&group->idle_waitq, NO_EVENT64, THREAD_UNINT, 0); /* Interrupted means to exit */
1540
1541 enable_ints_and_unlock(s);
1542
1543 thread_block_parameter((thread_continue_t)thread_call_thread, group);
1544 /* NOTREACHED */
1545 }
1546 }
1547
1548 enable_ints_and_unlock(s);
1549
1550 thread_terminate(self);
1551 /* NOTREACHED */
1552 }
1553
1554 /*
1555 * thread_call_daemon: walk list of groups, allocating
1556 * threads if appropriate (as determined by
1557 * thread_call_group_should_add_thread()).
1558 */
1559 static void
1560 thread_call_daemon_continue(__unused void *arg)
1561 {
1562 spl_t s = disable_ints_and_lock();
1563
1564 /* Starting at zero happens to be high-priority first. */
1565 for (int i = 0; i < THREAD_CALL_INDEX_MAX; i++) {
1566 thread_call_group_t group = &thread_call_groups[i];
1567 while (thread_call_group_should_add_thread(group)) {
1568 group->active_count++;
1569
1570 enable_ints_and_unlock(s);
1571
1572 kern_return_t kr = thread_call_thread_create(group);
1573 if (kr != KERN_SUCCESS) {
1574 /*
1575 * On failure, just pause for a moment and give up.
1576 * We can try again later.
1577 */
1578 delay(10000); /* 10 ms */
1579 s = disable_ints_and_lock();
1580 goto out;
1581 }
1582
1583 s = disable_ints_and_lock();
1584 }
1585 }
1586
1587 out:
1588 thread_call_daemon_awake = FALSE;
1589 waitq_assert_wait64(&daemon_waitq, NO_EVENT64, THREAD_UNINT, 0);
1590
1591 enable_ints_and_unlock(s);
1592
1593 thread_block_parameter((thread_continue_t)thread_call_daemon_continue, NULL);
1594 /* NOTREACHED */
1595 }
1596
1597 static void
1598 thread_call_daemon(
1599 __unused void *arg)
1600 {
1601 thread_t self = current_thread();
1602
1603 self->options |= TH_OPT_VMPRIV;
1604 vm_page_free_reserve(2); /* XXX */
1605
1606 thread_set_thread_name(self, "thread_call_daemon");
1607
1608 thread_call_daemon_continue(NULL);
1609 /* NOTREACHED */
1610 }
1611
1612 /*
1613 * Schedule timer to deallocate a worker thread if we have a surplus
1614 * of threads (in excess of the group's target) and at least one thread
1615 * is idle the whole time.
1616 */
1617 static void
1618 thread_call_start_deallocate_timer(thread_call_group_t group)
1619 {
1620 __assert_only boolean_t already_enqueued;
1621
1622 assert(group->idle_count > 0);
1623 assert((group->flags & TCG_DEALLOC_ACTIVE) == 0);
1624
1625 group->flags |= TCG_DEALLOC_ACTIVE;
1626
1627 uint64_t deadline = group->idle_timestamp + thread_call_dealloc_interval_abs;
1628
1629 already_enqueued = timer_call_enter(&group->dealloc_timer, deadline, 0);
1630
1631 assert(already_enqueued == FALSE);
1632 }
1633
1634 /* non-static so dtrace can find it rdar://problem/31156135&31379348 */
1635 void
1636 thread_call_delayed_timer(timer_call_param_t p0, timer_call_param_t p1)
1637 {
1638 thread_call_group_t group = (thread_call_group_t) p0;
1639 thread_call_flavor_t flavor = (thread_call_flavor_t) p1;
1640
1641 thread_call_t call;
1642 uint64_t now;
1643 boolean_t restart;
1644 boolean_t repend;
1645
1646 thread_call_lock_spin();
1647
1648 if (flavor == TCF_CONTINUOUS) {
1649 now = mach_continuous_time();
1650 } else if (flavor == TCF_ABSOLUTE) {
1651 now = mach_absolute_time();
1652 } else {
1653 panic("invalid timer flavor: %d", flavor);
1654 }
1655
1656 do {
1657 restart = FALSE;
1658 qe_foreach_element_safe(call, &group->delayed_queues[flavor], tc_call.q_link) {
1659 if (flavor == TCF_CONTINUOUS) {
1660 assert((call->tc_flags & THREAD_CALL_CONTINUOUS) == THREAD_CALL_CONTINUOUS);
1661 } else {
1662 assert((call->tc_flags & THREAD_CALL_CONTINUOUS) == 0);
1663 }
1664
1665 /*
1666 * If we hit a call that isn't yet ready to expire,
1667 * then we're done for now.
1668 * TODO: The next timer in the list could have a larger leeway
1669 * and therefore be ready to expire.
1670 * Sort by deadline then by soft deadline to avoid this
1671 */
1672 if (call->tc_soft_deadline > now) {
1673 break;
1674 }
1675
1676 /*
1677 * If we hit a rate-limited timer, don't eagerly wake it up.
1678 * Wait until it reaches the end of the leeway window.
1679 *
1680 * TODO: What if the next timer is not rate-limited?
1681 * Have a separate rate-limited queue to avoid this
1682 */
1683 if ((call->tc_flags & THREAD_CALL_RATELIMITED) &&
1684 (call->tc_call.deadline > now) &&
1685 (ml_timer_forced_evaluation() == FALSE)) {
1686 break;
1687 }
1688
1689 if (THREAD_CALL_SIGNAL & call->tc_flags) {
1690 __assert_only queue_head_t *old_queue;
1691 old_queue = call_entry_dequeue(&call->tc_call);
1692 assert(old_queue == &group->delayed_queues[flavor]);
1693
1694 do {
1695 thread_call_func_t func = call->tc_call.func;
1696 thread_call_param_t param0 = call->tc_call.param0;
1697 thread_call_param_t param1 = call->tc_call.param1;
1698
1699 call->tc_flags |= THREAD_CALL_RUNNING;
1700 thread_call_unlock();
1701 thread_call_invoke(func, param0, param1, call);
1702 thread_call_lock_spin();
1703
1704 repend = thread_call_finish(call, group, NULL);
1705 } while (repend);
1706
1707 /* call may have been freed */
1708 restart = TRUE;
1709 break;
1710 } else {
1711 _pending_call_enqueue(call, group);
1712 }
1713 }
1714 } while (restart);
1715
1716 _arm_delayed_call_timer(call, group, flavor);
1717
1718 thread_call_unlock();
1719 }
1720
1721 static void
1722 thread_call_delayed_timer_rescan(thread_call_group_t group,
1723 thread_call_flavor_t flavor)
1724 {
1725 thread_call_t call;
1726 uint64_t now;
1727
1728 spl_t s = disable_ints_and_lock();
1729
1730 assert(ml_timer_forced_evaluation() == TRUE);
1731
1732 if (flavor == TCF_CONTINUOUS) {
1733 now = mach_continuous_time();
1734 } else {
1735 now = mach_absolute_time();
1736 }
1737
1738 qe_foreach_element_safe(call, &group->delayed_queues[flavor], tc_call.q_link) {
1739 if (call->tc_soft_deadline <= now) {
1740 _pending_call_enqueue(call, group);
1741 } else {
1742 uint64_t skew = call->tc_call.deadline - call->tc_soft_deadline;
1743 assert(call->tc_call.deadline >= call->tc_soft_deadline);
1744 /*
1745 * On a latency quality-of-service level change,
1746 * re-sort potentially rate-limited callout. The platform
1747 * layer determines which timers require this.
1748 */
1749 if (timer_resort_threshold(skew)) {
1750 _call_dequeue(call, group);
1751 _delayed_call_enqueue(call, group, call->tc_soft_deadline, flavor);
1752 }
1753 }
1754 }
1755
1756 _arm_delayed_call_timer(NULL, group, flavor);
1757
1758 enable_ints_and_unlock(s);
1759 }
1760
1761 void
1762 thread_call_delayed_timer_rescan_all(void)
1763 {
1764 for (int i = 0; i < THREAD_CALL_INDEX_MAX; i++) {
1765 thread_call_delayed_timer_rescan(&thread_call_groups[i], TCF_ABSOLUTE);
1766 thread_call_delayed_timer_rescan(&thread_call_groups[i], TCF_CONTINUOUS);
1767 }
1768 }
1769
1770 /*
1771 * Timer callback to tell a thread to terminate if
1772 * we have an excess of threads and at least one has been
1773 * idle for a long time.
1774 */
1775 static void
1776 thread_call_dealloc_timer(
1777 timer_call_param_t p0,
1778 __unused timer_call_param_t p1)
1779 {
1780 thread_call_group_t group = (thread_call_group_t)p0;
1781 uint64_t now;
1782 kern_return_t res;
1783 boolean_t terminated = FALSE;
1784
1785 thread_call_lock_spin();
1786
1787 assert((group->flags & TCG_DEALLOC_ACTIVE) == TCG_DEALLOC_ACTIVE);
1788
1789 now = mach_absolute_time();
1790
1791 if (group->idle_count > 0) {
1792 if (now > group->idle_timestamp + thread_call_dealloc_interval_abs) {
1793 terminated = TRUE;
1794 group->idle_count--;
1795 res = waitq_wakeup64_one(&group->idle_waitq, NO_EVENT64,
1796 THREAD_INTERRUPTED, WAITQ_ALL_PRIORITIES);
1797 if (res != KERN_SUCCESS) {
1798 panic("Unable to wake up idle thread for termination?");
1799 }
1800 }
1801 }
1802
1803 group->flags &= ~TCG_DEALLOC_ACTIVE;
1804
1805 /*
1806 * If we still have an excess of threads, schedule another
1807 * invocation of this function.
1808 */
1809 if (group->idle_count > 0 && (group->idle_count + group->active_count > group->target_thread_count)) {
1810 /*
1811 * If we killed someone just now, push out the
1812 * next deadline.
1813 */
1814 if (terminated) {
1815 group->idle_timestamp = now;
1816 }
1817
1818 thread_call_start_deallocate_timer(group);
1819 }
1820
1821 thread_call_unlock();
1822 }
1823
1824 /*
1825 * Wait for the invocation of the thread call to complete.
1826 * We know there's only one in flight because of the 'once' flag.
1827 *
1828 * If a subsequent invocation comes in before we wake up, that's OK.
1829 *
1830 * TODO: Here is where we will add priority inheritance to the thread executing
1831 * the thread call in case it's lower priority than the current thread
1832 * <rdar://problem/30321792> Priority inheritance for thread_call_wait_once
1833 *
1834 * Takes the thread call lock locked, returns unlocked
1835 * This lets us avoid a spurious take/drop after waking up from thread_block
1836 */
1837 static boolean_t
1838 thread_call_wait_once_locked(thread_call_t call, spl_t s)
1839 {
1840 assert(call->tc_flags & THREAD_CALL_ALLOC);
1841 assert(call->tc_flags & THREAD_CALL_ONCE);
1842
1843 if ((call->tc_flags & THREAD_CALL_RUNNING) == 0) {
1844 enable_ints_and_unlock(s);
1845 return FALSE;
1846 }
1847
1848 /* call is running, so we have to wait for it */
1849 call->tc_flags |= THREAD_CALL_WAIT;
1850
1851 wait_result_t res = assert_wait(call, THREAD_UNINT);
1852 if (res != THREAD_WAITING) {
1853 panic("Unable to assert wait: %d", res);
1854 }
1855
1856 enable_ints_and_unlock(s);
1857
1858 res = thread_block(THREAD_CONTINUE_NULL);
1859 if (res != THREAD_AWAKENED) {
1860 panic("Awoken with %d?", res);
1861 }
1862
1863 /* returns unlocked */
1864 return TRUE;
1865 }
1866
1867 /*
1868 * Wait for an in-flight invocation to complete.
1869 * Does NOT try to cancel, so the client doesn't need to hold their
1870 * lock while calling this function.
1871 *
1872 * Returns whether or not it had to wait.
1873 *
1874 * Only works for THREAD_CALL_ONCE calls.
1875 */
1876 boolean_t
1877 thread_call_wait_once(thread_call_t call)
1878 {
1879 if ((call->tc_flags & THREAD_CALL_ALLOC) == 0) {
1880 panic("thread_call_wait_once: can't wait on thread call whose storage I don't own");
1881 }
1882
1883 if ((call->tc_flags & THREAD_CALL_ONCE) == 0) {
1884 panic("thread_call_wait_once: can't wait_once on a non-once call");
1885 }
1886
1887 if (!ml_get_interrupts_enabled()) {
1888 panic("unsafe thread_call_wait_once");
1889 }
1890
1891 if (current_thread()->thc_state.thc_call == call) {
1892 panic("thread_call_wait_once: deadlock waiting on self from inside call: %p to function %p",
1893 call, call->tc_call.func);
1894 }
1895
1896 spl_t s = disable_ints_and_lock();
1897
1898 boolean_t waited = thread_call_wait_once_locked(call, s);
1899 /* thread call lock unlocked */
1900
1901 return waited;
1902 }
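
/*
 * Sketch: quiescing a previously allocated THREAD_CALL_ONCE call without
 * cancelling it (hypothetical caller). On return, any invocation that was
 * in flight when this was called has completed; a later re-submission may
 * of course already be running again.
 *
 *	boolean_t waited = thread_call_wait_once(call);
 */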
1903
1904
1905 /*
1906 * Wait for all requested invocations of a thread call prior to now
1907 * to finish. Can only be invoked on thread calls whose storage we manage.
1908 * Just waits for the finish count to catch up to the submit count we find
1909 * at the beginning of our wait.
1910 *
1911 * Called with thread_call_lock held. Returns with lock released.
1912 */
1913 static void
1914 thread_call_wait_locked(thread_call_t call, spl_t s)
1915 {
1916 uint64_t submit_count;
1917 wait_result_t res;
1918
1919 assert(call->tc_flags & THREAD_CALL_ALLOC);
1920
1921 submit_count = call->tc_submit_count;
1922
1923 while (call->tc_finish_count < submit_count) {
1924 call->tc_flags |= THREAD_CALL_WAIT;
1925
1926 res = assert_wait(call, THREAD_UNINT);
1927 if (res != THREAD_WAITING) {
1928 panic("Unable to assert wait: %d", res);
1929 }
1930
1931 enable_ints_and_unlock(s);
1932
1933 res = thread_block(THREAD_CONTINUE_NULL);
1934 if (res != THREAD_AWAKENED) {
1935 panic("Awoken with %d?", res);
1936 }
1937
1938 s = disable_ints_and_lock();
1939 }
1940
1941 enable_ints_and_unlock(s);
1942 }
1943
1944 /*
1945 * Determine whether a thread call is either on a queue or
1946 * currently being executed.
1947 */
1948 boolean_t
1949 thread_call_isactive(thread_call_t call)
1950 {
1951 boolean_t active;
1952
1953 spl_t s = disable_ints_and_lock();
1954 active = (call->tc_submit_count > call->tc_finish_count);
1955 enable_ints_and_unlock(s);
1956
1957 return active;
1958 }
1959
1960 /*
1961 * adjust_cont_time_thread_calls
1962 * On wake, re-enqueue the delayed call timer for continuous-time thread call groups.
1963 */
1964 void
1965 adjust_cont_time_thread_calls(void)
1966 {
1967 spl_t s = disable_ints_and_lock();
1968
1969 for (int i = 0; i < THREAD_CALL_INDEX_MAX; i++) {
1970 thread_call_group_t group = &thread_call_groups[i];
1971
1972 /* only the continuous timers need to be re-armed */
1973
1974 _arm_delayed_call_timer(NULL, group, TCF_CONTINUOUS);
1975 }
1976
1977 enable_ints_and_unlock(s);
1978 }