xnu-7195.101.1.tar.gz

[apple/xnu.git] / osfmk / kern / thread_call.c
diff --git a/osfmk/kern/thread_call.c b/osfmk/kern/thread_call.c

index a50c6d7d312df8627fa0c83c0c3af7f217ff71d8..6bcca37203eed56ffa6d2fa344194163881a0ca9 100644 (file)
--- a/osfmk/kern/thread_call.c
+++ b/osfmk/kern/thread_call.c
@@ -1,8 +1,8 @@
  /*
- * Copyright (c) 1993-1995, 1999-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 1993-1995, 1999-2020 Apple Inc. All rights reserved.
   *
   * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
   * This file contains Original Code and/or Modifications of Original Code
   * as defined in and that are subject to the Apple Public Source License
   * Version 2.0 (the 'License'). You may not use this file except in
@@ -11,10 +11,10 @@
   * unlawful or unlicensed copies of an Apple operating system, or to
   * circumvent, violate, or enable the circumvention or violation of, any
   * terms of an Apple operating system software license agreement.
- * 
+ *
   * Please obtain a copy of the License at
   * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
   * The Original Code and all software distributed under the License are
   * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
   * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
@@ -22,10 +22,10 @@
   * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
   * Please see the License for the specific language governing rights and
   * limitations under the License.
- * 
+ *
   * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
   */
- 
+
  #include <mach/mach_types.h>
  #include <mach/thread_act.h>
  
@@ -35,90 +35,471 @@
  #include <kern/clock.h>
  #include <kern/task.h>
  #include <kern/thread.h>
-#include <kern/wait_queue.h>
+#include <kern/waitq.h>
+#include <kern/ledger.h>
+#include <kern/policy_internal.h>
  
  #include <vm/vm_pageout.h>
  
  #include <kern/thread_call.h>
-#include <kern/call_entry.h>
-
  #include <kern/timer_call.h>
  
+#include <libkern/OSAtomic.h>
+#include <kern/timer_queue.h>
+
  #include <sys/kdebug.h>
+#if CONFIG_DTRACE
+#include <mach/sdt.h>
+#endif
+#include <machine/machine_routines.h>
+
+static ZONE_DECLARE(thread_call_zone, "thread_call",    /* zone variable and its name string */
+    sizeof(thread_call_data_t), ZC_NOENCRYPT);           /* element size is one thread_call_data_t; NOTE(review): presumably backs dynamically allocated thread calls - confirm at the allocation sites */
+
+typedef enum {          /* Flavor of a delayed call; used as the index into a group's per-flavor queues, priority queues and timers. */
+       TCF_ABSOLUTE    = 0,    /* chosen when THREAD_CALL_FLAG_CONTINUOUS is clear in tc_flags */
+       TCF_CONTINUOUS  = 1,    /* chosen when THREAD_CALL_FLAG_CONTINUOUS is set in tc_flags */
+       TCF_COUNT       = 2,    /* number of flavors; sizes the per-flavor arrays */
+} thread_call_flavor_t;
+
+__options_decl(thread_call_group_flags_t, uint32_t, {    /* Bit flags stored in thread_call_group.tcg_flags. */
+       TCG_NONE                = 0x0,
+       TCG_PARALLEL            = 0x1,   /* tested by group_isparallel() */
+       TCG_DEALLOC_ACTIVE      = 0x2,   /* NOTE(review): presumably set while the group's dealloc_timer is armed - confirm */
+});
+
+static struct thread_call_group {       /* Per-group state; thread_call_groups[] holds one statically initialized entry per THREAD_CALL_INDEX_* slot. */
+       __attribute__((aligned(128))) lck_ticket_t tcg_lock;     /* taken by thread_call_lock_spin(), released by thread_call_unlock() */
+
+       const char *            tcg_name;        /* short name; appears in worker thread names and panic strings */
+
+       queue_head_t            pending_queue;   /* calls queued by _pending_call_enqueue() */
+       uint32_t                pending_count;   /* adjusted alongside moves on/off pending_queue */
+
+       queue_head_t            delayed_queues[TCF_COUNT];       /* one delayed queue per flavor */
+       struct priority_queue_deadline_min delayed_pqueues[TCF_COUNT];   /* same delayed calls, keyed by tc_pqlink.deadline */
+       timer_call_data_t       delayed_timers[TCF_COUNT];       /* set up with thread_call_delayed_timer */
+
+       timer_call_data_t       dealloc_timer;   /* set up with thread_call_dealloc_timer */
+
+       struct waitq            idle_waitq;      /* initialized with SYNC_POLICY_REVERSED to re-use the most recently parked thread */
+       uint64_t                idle_timestamp;
+       uint32_t                idle_count, active_count, blocked_count; /* their sum is checked against THREAD_CALL_GROUP_MAX_THREADS */
+
+       uint32_t                tcg_thread_pri;  /* priority passed to kernel_thread_start_priority() for this group's threads */
+       uint32_t                target_thread_count;     /* compared with active_count when deciding to add a thread */
+
+       thread_call_group_flags_t tcg_flags;
+
+       struct waitq            waiters_waitq;
+} thread_call_groups[THREAD_CALL_INDEX_MAX] = {
+       [THREAD_CALL_INDEX_HIGH] = {
+               .tcg_name               = "high",
+               .tcg_thread_pri         = BASEPRI_PREEMPT_HIGH,
+               .target_thread_count    = 4,
+               .tcg_flags              = TCG_NONE,
+       },
+       [THREAD_CALL_INDEX_KERNEL] = {
+               .tcg_name               = "kernel",
+               .tcg_thread_pri         = BASEPRI_KERNEL,
+               .target_thread_count    = 1,
+               .tcg_flags              = TCG_PARALLEL,
+       },
+       [THREAD_CALL_INDEX_USER] = {
+               .tcg_name               = "user",
+               .tcg_thread_pri         = BASEPRI_DEFAULT,
+               .target_thread_count    = 1,
+               .tcg_flags              = TCG_PARALLEL,
+       },
+       [THREAD_CALL_INDEX_LOW] = {
+               .tcg_name               = "low",
+               .tcg_thread_pri         = MAXPRI_THROTTLE,
+               .target_thread_count    = 1,
+               .tcg_flags              = TCG_PARALLEL,
+       },
+       [THREAD_CALL_INDEX_KERNEL_HIGH] = {
+               .tcg_name               = "kernel-high",
+               .tcg_thread_pri         = BASEPRI_PREEMPT,
+               .target_thread_count    = 2,
+               .tcg_flags              = TCG_NONE,
+       },
+       [THREAD_CALL_INDEX_QOS_UI] = {
+               .tcg_name               = "qos-ui",
+               .tcg_thread_pri         = BASEPRI_FOREGROUND,
+               .target_thread_count    = 1,
+               .tcg_flags              = TCG_NONE,
+       },
+       [THREAD_CALL_INDEX_QOS_IN] = {
+               .tcg_name               = "qos-in",
+               .tcg_thread_pri         = BASEPRI_USER_INITIATED,
+               .target_thread_count    = 1,
+               .tcg_flags              = TCG_NONE,
+       },
+       [THREAD_CALL_INDEX_QOS_UT] = {
+               .tcg_name               = "qos-ut",
+               .tcg_thread_pri         = BASEPRI_UTILITY,
+               .target_thread_count    = 1,
+               .tcg_flags              = TCG_NONE,
+       },
+};
+
+typedef struct thread_call_group        *thread_call_group_t;
+
+#define INTERNAL_CALL_COUNT             768 /* number of entries in internal_call_storage[] */
+#define THREAD_CALL_DEALLOC_INTERVAL_NS (5 * NSEC_PER_MSEC) /* 5 ms; converted to absolute time in thread_call_initialize() */
+#define THREAD_CALL_ADD_RATIO           4 /* add a thread when pending_count exceeds this multiple of active_count */
+#define THREAD_CALL_MACH_FACTOR_CAP     3 /* ...but only while sched_mach_factor is below this value */
+#define THREAD_CALL_GROUP_MAX_THREADS   500 /* thread_call_group_should_add_thread() panics once a group reaches this many threads */
+
+struct thread_call_thread_state {       /* NOTE(review): presumably a per-worker-thread record of the call being invoked; its users are outside this chunk - confirm */
+       struct thread_call_group * thc_group;
+       struct thread_call *       thc_call;    /* debug only, may be deallocated */
+       uint64_t thc_call_start;
+       uint64_t thc_call_soft_deadline;
+       uint64_t thc_call_hard_deadline;
+       uint64_t thc_call_pending_timestamp;
+       uint64_t thc_IOTES_invocation_timestamp;
+       thread_call_func_t  thc_func;   /* thc_func/thc_param0/thc_param1 mirror a call's function and its two parameters by type */
+       thread_call_param_t thc_param0;
+       thread_call_param_t thc_param1;
+};
+
+static bool                     thread_call_daemon_awake = true;
+/*
+ * This special waitq exists because the daemon thread
+ * might need to be woken while already holding a global waitq locked.
+ */
+static struct waitq             daemon_waitq;
+
+static thread_call_data_t       internal_call_storage[INTERNAL_CALL_COUNT];
+static queue_head_t             thread_call_internal_queue;
+int                                             thread_call_internal_queue_count = 0;
+static uint64_t                 thread_call_dealloc_interval_abs;
+
+static void                     _internal_call_init(void);
+
+static thread_call_t            _internal_call_allocate(thread_call_func_t func, thread_call_param_t param0);
+static bool                     _is_internal_call(thread_call_t call);
+static void                     _internal_call_release(thread_call_t call);
+static bool                     _pending_call_enqueue(thread_call_t call, thread_call_group_t group, uint64_t now);
+static bool                     _delayed_call_enqueue(thread_call_t call, thread_call_group_t group,
+    uint64_t deadline, thread_call_flavor_t flavor);
+static bool                     _call_dequeue(thread_call_t call, thread_call_group_t group);
+static void                     thread_call_wake(thread_call_group_t group);
+static void                     thread_call_daemon(void *arg);
+static void                     thread_call_thread(thread_call_group_t group, wait_result_t wres);
+static void                     thread_call_dealloc_timer(timer_call_param_t p0, timer_call_param_t p1);
+static void                     thread_call_group_setup(thread_call_group_t group);
+static void                     sched_call_thread(int type, thread_t thread);
+static void                     thread_call_start_deallocate_timer(thread_call_group_t group);
+static void                     thread_call_wait_locked(thread_call_t call, spl_t s);
+static bool                     thread_call_wait_once_locked(thread_call_t call, spl_t s);
+
+static boolean_t                thread_call_enter_delayed_internal(thread_call_t call,
+    thread_call_func_t alt_func, thread_call_param_t alt_param0,
+    thread_call_param_t param1, uint64_t deadline,
+    uint64_t leeway, unsigned int flags);
+
+/* non-static so dtrace can find it rdar://problem/31156135&31379348 */
+extern void thread_call_delayed_timer(timer_call_param_t p0, timer_call_param_t p1);
+
+LCK_GRP_DECLARE(thread_call_lck_grp, "thread_call");
+
+/*
+ *     thread_call_lock_spin:
+ *
+ *     Acquire the group's ticket lock (tcg_lock) under the
+ *     thread_call_lck_grp lock group.  This helper does not touch the
+ *     interrupt level itself; disable_ints_and_lock() calls splsched()
+ *     before calling it.
+ */
+static void
+thread_call_lock_spin(thread_call_group_t group)
+{
+       lck_ticket_lock(&group->tcg_lock, &thread_call_lck_grp);
+}
+/*
+ *     thread_call_unlock:
+ *
+ *     Release the group's ticket lock (tcg_lock), the counterpart of
+ *     thread_call_lock_spin().
+ */
+static void
+thread_call_unlock(thread_call_group_t group)
+{
+       lck_ticket_unlock(&group->tcg_lock);
+}
  
-decl_simple_lock_data(static,thread_call_lock)
+static void __assert_only       /* NOTE(review): __assert_only presumably silences unused warnings when assertions are compiled out - confirm */
+thread_call_assert_locked(thread_call_group_t group)
+{
+       lck_ticket_assert_owned(&group->tcg_lock);       /* ownership check on the group's ticket lock */
+}
  
-static zone_t          thread_call_zone;
  
-struct thread_call_group {
-       queue_head_t            pending_queue;
-       uint32_t                        pending_count;
+static spl_t    /* Call splsched(), then take the group lock; returns the value splsched() handed back. */
+disable_ints_and_lock(thread_call_group_t group)
+{
+       spl_t s = splsched();           /* interrupt level is changed before the lock is taken */
+       thread_call_lock_spin(group);
  
-       queue_head_t            delayed_queue;
+       return s;                       /* caller passes this to enable_ints_and_unlock() */
+}
  
-       timer_call_data_t       delayed_timer;
+static void     /* Undo disable_ints_and_lock(): drop the group lock, then splx(s). */
+enable_ints_and_unlock(thread_call_group_t group, spl_t s)
+{
+       thread_call_unlock(group);      /* unlock first: reverse of the acquisition order */
+       splx(s);                        /* s is the value returned by disable_ints_and_lock() */
+}
  
-       struct wait_queue       idle_wqueue;
-       struct wait_queue       daemon_wqueue;
-       uint32_t                        idle_count, active_count;
-};
+/*
+ * Return the group a call belongs to, i.e. the thread_call_groups[]
+ * entry selected by call->tc_index.  The index is only range-checked
+ * by an assert.
+ *
+ * Lock held
+ */
+static thread_call_group_t
+thread_call_get_group(thread_call_t call)
+{
+       thread_call_index_t index = call->tc_index;
+
+       assert(index >= 0 && index < THREAD_CALL_INDEX_MAX);
+
+       return &thread_call_groups[index];
+}
+
+/*
+ * Return the call's flavor: TCF_CONTINUOUS when
+ * THREAD_CALL_FLAG_CONTINUOUS is set in tc_flags, else TCF_ABSOLUTE.
+ *
+ * Lock held
+ */
+static thread_call_flavor_t
+thread_call_get_flavor(thread_call_t call)
+{
+       return (call->tc_flags & THREAD_CALL_FLAG_CONTINUOUS) ? TCF_CONTINUOUS : TCF_ABSOLUTE;
+}
+
+/*
+ * Set the call's flavor by setting or clearing
+ * THREAD_CALL_FLAG_CONTINUOUS in tc_flags, and return the flavor the
+ * call had before.  tc_flags is left untouched when the flavor is
+ * unchanged.  This only edits the flag; it does not move the call
+ * between queues.
+ *
+ * Lock held
+ */
+static thread_call_flavor_t
+thread_call_set_flavor(thread_call_t call, thread_call_flavor_t flavor)
+{
+       assert(flavor == TCF_CONTINUOUS || flavor == TCF_ABSOLUTE);
+       thread_call_flavor_t old_flavor = thread_call_get_flavor(call);
+
+       if (old_flavor != flavor) {
+               if (flavor == TCF_CONTINUOUS) {
+                       call->tc_flags |= THREAD_CALL_FLAG_CONTINUOUS;
+               } else {
+                       call->tc_flags &= ~THREAD_CALL_FLAG_CONTINUOUS;
+               }
+       }
+
+       return old_flavor;
+}
+
+/*
+ * Append the call to the tail of new_queue.
+ *
+ * The call must currently be on no queue or on its group's delayed
+ * queue for its current flavor; any other queue (including the
+ * pending queue) panics.  A call coming from the delayed queue is
+ * also removed from the matching delayed priority queue.
+ *
+ * returns true if it was on a queue
+ */
+static bool
+thread_call_enqueue_tail(
+       thread_call_t           call,
+       queue_t                 new_queue)
+{
+       queue_t                 old_queue = call->tc_queue;
+
+       thread_call_group_t     group = thread_call_get_group(call);
+       thread_call_flavor_t    flavor = thread_call_get_flavor(call);
+
+       if (old_queue != NULL &&
+           old_queue != &group->delayed_queues[flavor]) {
+               panic("thread call (%p) on bad queue (old_queue: %p)", call, old_queue);
+       }
+
+       if (old_queue == &group->delayed_queues[flavor]) {
+               priority_queue_remove(&group->delayed_pqueues[flavor], &call->tc_pqlink);
+       }
+
+       if (old_queue == NULL) {
+               enqueue_tail(new_queue, &call->tc_qlink);
+       } else {
+               re_queue_tail(new_queue, &call->tc_qlink);       /* already linked: move rather than insert */
+       }
+
+       call->tc_queue = new_queue;
+
+       return old_queue != NULL;
+}
+/*
+ * Take the call off whatever queue it is on and return that queue
+ * (NULL if it was not queued).
+ *
+ * Accepted queues are the group's pending queue and the delayed queue
+ * for the call's current flavor; anything else panics.  A call on the
+ * delayed queue is also removed from the delayed priority queue.
+ * Counters such as pending_count are not adjusted here.
+ */
+static queue_head_t *
+thread_call_dequeue(
+       thread_call_t            call)
+{
+       queue_t                 old_queue = call->tc_queue;
+
+       thread_call_group_t     group = thread_call_get_group(call);
+       thread_call_flavor_t    flavor = thread_call_get_flavor(call);
+
+       if (old_queue != NULL &&
+           old_queue != &group->pending_queue &&
+           old_queue != &group->delayed_queues[flavor]) {
+               panic("thread call (%p) on bad queue (old_queue: %p)", call, old_queue);
+       }
+
+       if (old_queue == &group->delayed_queues[flavor]) {
+               priority_queue_remove(&group->delayed_pqueues[flavor], &call->tc_pqlink);
+       }
+
+       if (old_queue != NULL) {
+               remqueue(&call->tc_qlink);
  
-typedef struct thread_call_group       *thread_call_group_t;
+               call->tc_queue = NULL;
+       }
+       return old_queue;
+}
+/*
+ * Put the call on the group's delayed queue for `flavor` with the
+ * given deadline, and return the queue it was on before (NULL if
+ * none).
+ *
+ * The call's flavor flag is switched to `flavor` first; the previous
+ * flavor is kept so the old delayed queue can still be identified.
+ * The call may come from no queue, the pending queue, or the delayed
+ * queue of its previous flavor; any other queue panics.
+ * pending_count is not adjusted here.
+ */
+static queue_head_t *
+thread_call_enqueue_deadline(
+       thread_call_t           call,
+       thread_call_group_t     group,
+       thread_call_flavor_t    flavor,
+       uint64_t                deadline)
+{
+       queue_t old_queue = call->tc_queue;
+       queue_t new_queue = &group->delayed_queues[flavor];
+
+       thread_call_flavor_t old_flavor = thread_call_set_flavor(call, flavor);
+
+       if (old_queue != NULL &&
+           old_queue != &group->pending_queue &&
+           old_queue != &group->delayed_queues[old_flavor]) {
+               panic("thread call (%p) on bad queue (old_queue: %p)", call, old_queue);
+       }
+
+       if (old_queue == new_queue) {
+               /* optimize the same-queue case to avoid a full re-insert */
+               uint64_t old_deadline = call->tc_pqlink.deadline;
+               call->tc_pqlink.deadline = deadline;
+
+               if (old_deadline < deadline) {
+                       priority_queue_entry_increased(&group->delayed_pqueues[flavor],
+                           &call->tc_pqlink);
+               } else {
+                       priority_queue_entry_decreased(&group->delayed_pqueues[flavor],  /* also taken when the deadline is unchanged */
+                           &call->tc_pqlink);
+               }
+       } else {
+               if (old_queue == &group->delayed_queues[old_flavor]) {
+                       priority_queue_remove(&group->delayed_pqueues[old_flavor],       /* leaving the other flavor's priority queue */
+                           &call->tc_pqlink);
+               }
+
+               call->tc_pqlink.deadline = deadline;
+
+               priority_queue_insert(&group->delayed_pqueues[flavor], &call->tc_pqlink);
+       }
+
+       if (old_queue == NULL) {
+               enqueue_tail(new_queue, &call->tc_qlink);
+       } else if (old_queue != new_queue) {
+               re_queue_tail(new_queue, &call->tc_qlink);
+       }
+
+       call->tc_queue = new_queue;
+
+       return old_queue;
+}
+/*
+ *     thread_call_get_armed_deadline:
+ *
+ *     Return the deadline currently stored in the call's priority
+ *     queue link (tc_pqlink.deadline).  No lock is taken here.
+ *     NOTE(review): callers presumably serialize against re-arming - confirm.
+ */
+uint64_t
+thread_call_get_armed_deadline(thread_call_t call)
+{
+       return call->tc_pqlink.deadline;
+}
+
+/*
+ * Return true if the group has TCG_PARALLEL set in tcg_flags.
+ */
+static bool
+group_isparallel(thread_call_group_t group)
+{
+       return (group->tcg_flags & TCG_PARALLEL) != 0;
+}
+/*
+ * Decide whether the group should get another worker thread.
+ *
+ * Panics if the group already has THREAD_CALL_GROUP_MAX_THREADS
+ * threads (active + blocked + idle).  A non-parallel group only wants
+ * a thread when work is pending and nothing is active.  A parallel
+ * group with pending work and no idle thread applies the heuristic
+ * described in the body.
+ */
+static bool
+thread_call_group_should_add_thread(thread_call_group_t group)
+{
+       if ((group->active_count + group->blocked_count + group->idle_count) >= THREAD_CALL_GROUP_MAX_THREADS) {
+               panic("thread_call group '%s' reached max thread cap (%d): active: %d, blocked: %d, idle: %d",
+                   group->tcg_name, THREAD_CALL_GROUP_MAX_THREADS,
+                   group->active_count, group->blocked_count, group->idle_count);
+       }
+
+       if (group_isparallel(group) == false) {
+               if (group->pending_count > 0 && group->active_count == 0) {
+                       return true;
+               }
+
+               return false;
+       }
  
-static struct thread_call_group                thread_call_group0;
+       if (group->pending_count > 0) {
+               if (group->idle_count > 0) {
+                       return false;   /* an idle thread already exists; no new one is needed */
+               }
+
+               uint32_t thread_count = group->active_count;     /* only active threads are counted here, not blocked ones */
+
+               /*
+                * Add a thread if either there are no threads,
+                * the group has fewer than its target number of
+                * threads, or the amount of work is large relative
+                * to the number of threads.  In the last case, pay attention
+                * to the total load on the system, and back off if
+                * it's high.
+                */
+               if ((thread_count == 0) ||
+                   (thread_count < group->target_thread_count) ||
+                   ((group->pending_count > THREAD_CALL_ADD_RATIO * thread_count) &&
+                   (sched_mach_factor < THREAD_CALL_MACH_FACTOR_CAP))) {
+                       return true;
+               }
+       }
+
+       return false;
+}
+/*
+ * One-time initialization of a group's lock, queues, timers and wait
+ * queues.  thread_call_initialize() calls this for every entry of
+ * thread_call_groups[].
+ */
+static void
+thread_call_group_setup(thread_call_group_t group)
+{
+       lck_ticket_init(&group->tcg_lock, &thread_call_lck_grp);
+
+       queue_init(&group->pending_queue);
  
-static boolean_t                       thread_call_daemon_awake;
+       for (thread_call_flavor_t flavor = 0; flavor < TCF_COUNT; flavor++) {     /* per-flavor delayed queue, priority queue and timer */
+               queue_init(&group->delayed_queues[flavor]);
+               priority_queue_init(&group->delayed_pqueues[flavor]);
+               timer_call_setup(&group->delayed_timers[flavor], thread_call_delayed_timer, group);      /* the group is the timer's first parameter */
+       }
  
-#define thread_call_thread_min 4
+       timer_call_setup(&group->dealloc_timer, thread_call_dealloc_timer, group);
  
-#define internal_call_count    768
+       waitq_init(&group->waiters_waitq, SYNC_POLICY_DISABLE_IRQ);
  
-static thread_call_data_t      internal_call_storage[internal_call_count];
-static queue_head_t                    thread_call_internal_queue;
+       /* Reverse the wait order so we re-use the most recently parked thread from the pool */
+       waitq_init(&group->idle_waitq, SYNC_POLICY_REVERSED | SYNC_POLICY_DISABLE_IRQ);
+}
  
-static __inline__ thread_call_t                _internal_call_allocate(void);
+/*
+ * Simple wrapper for creating threads bound to
+ * thread call groups.
+ *
+ * Starts a kernel thread running thread_call_thread() with the group
+ * as its argument, at the group's tcg_thread_pri.  Panics if the
+ * thread cannot be created.  The thread is named
+ * "thread call <group name> #<n>", then the reference returned by
+ * kernel_thread_start_priority() is dropped.
+ */
+static void
+thread_call_thread_create(
+       thread_call_group_t             group)
+{
+       thread_t thread;
+       kern_return_t result;
  
-static __inline__ void _internal_call_release(
-                                                       thread_call_t           call);
+       int thread_pri = group->tcg_thread_pri;
  
-static __inline__ boolean_t    _pending_call_enqueue(
-                                                               thread_call_t           call,
-                                                               thread_call_group_t     group),
-                                                       _delayed_call_enqueue(
-                                                               thread_call_t           call,
-                                                               thread_call_group_t     group,
-                                                               uint64_t                        deadline),
-                                                       _call_dequeue(
-                                                               thread_call_t           call,
-                                                               thread_call_group_t     group);
+       result = kernel_thread_start_priority((thread_continue_t)thread_call_thread,
+           group, thread_pri, &thread);
+       if (result != KERN_SUCCESS) {
+               panic("cannot create new thread call thread %d", result);
+       }
  
-static __inline__ void thread_call_wake(
-                                                       thread_call_group_t     group);
+       if (thread_pri <= BASEPRI_KERNEL) {
+               /*
+                * THREAD_CALL_PRIORITY_KERNEL and lower don't get to run to completion
+                * in kernel if there are higher priority threads available.
+                */
+               thread_set_eager_preempt(thread);
+       }
  
-static __inline__ void _set_delayed_call_timer(
-                                                       thread_call_t           call,
-                                                       thread_call_group_t     group);
-                                       
-static boolean_t       _remove_from_pending_queue(
-                                               thread_call_func_t              func,
-                                               thread_call_param_t             param0,
-                                               boolean_t                               remove_all),
-                                       _remove_from_delayed_queue(
-                                               thread_call_func_t              func,
-                                               thread_call_param_t             param0,
-                                               boolean_t                               remove_all);
+       char name[MAXTHREADNAMESIZE] = "";
  
-static void            thread_call_daemon(
-                                       thread_call_group_t             group),
-                               thread_call_thread(
-                                       thread_call_group_t             group);
+       int group_thread_count = group->idle_count + group->active_count + group->blocked_count; /* the group's counters as read here; used only as the "#n" in the thread name */
  
-static void            thread_call_delayed_timer(
-                                       timer_call_param_t              p0,
-                                       timer_call_param_t              p1);
+       snprintf(name, sizeof(name), "thread call %s #%d", group->tcg_name, group_thread_count);
+       thread_set_thread_name(thread, name);
  
-#define qe(x)          ((queue_entry_t)(x))
-#define TC(x)          ((thread_call_t)(x))
+       thread_deallocate(thread);       /* drop the reference returned through &thread above */
+}
  
  /*
   *     thread_call_initialize:
@@ -129,58 +510,97 @@ static void               thread_call_delayed_timer(
  void
  thread_call_initialize(void)
  {
-    thread_call_t                      call;
-       thread_call_group_t             group = &thread_call_group0;
-       kern_return_t                   result;
-       thread_t                                thread;
-       int                                             i;
-       spl_t                                   s;
+       nanotime_to_absolutetime(0, THREAD_CALL_DEALLOC_INTERVAL_NS, &thread_call_dealloc_interval_abs); /* cache the dealloc interval in absolute-time units */
+       waitq_init(&daemon_waitq, SYNC_POLICY_DISABLE_IRQ | SYNC_POLICY_FIFO);
  
-       i = sizeof (thread_call_data_t);
-       thread_call_zone = zinit(i, 4096 * i, 16 * i, "thread_call");
-       zone_change(thread_call_zone, Z_NOENCRYPT, TRUE);
-
-    simple_lock_init(&thread_call_lock, 0);
-
-       s = splsched();
-       simple_lock(&thread_call_lock);
-
-    queue_init(&group->pending_queue);
-    queue_init(&group->delayed_queue);
+       for (uint32_t i = 0; i < THREAD_CALL_INDEX_MAX; i++) {    /* initialize every group, not only those with a static initializer */
+               thread_call_group_setup(&thread_call_groups[i]);
+       }
  
-       timer_call_setup(&group->delayed_timer, thread_call_delayed_timer, group);
+       _internal_call_init();   /* runs after group setup: it takes the HIGH group's lock */
  
-       wait_queue_init(&group->idle_wqueue, SYNC_POLICY_FIFO);
-       wait_queue_init(&group->daemon_wqueue, SYNC_POLICY_FIFO);
+       thread_t thread;
+       kern_return_t result;
  
-    queue_init(&thread_call_internal_queue);
-    for (
-               call = internal_call_storage;
-                       call < &internal_call_storage[internal_call_count];
-                       call++) {
+       result = kernel_thread_start_priority((thread_continue_t)thread_call_daemon,     /* start the daemon thread with a NULL argument */
+           NULL, BASEPRI_PREEMPT_HIGH + 1, &thread);
+       if (result != KERN_SUCCESS) {
+               panic("thread_call_initialize");
+       }
  
-               enqueue_tail(&thread_call_internal_queue, qe(call));
-    }
+       thread_deallocate(thread);       /* drop the reference returned through &thread */
+}
  
-       thread_call_daemon_awake = TRUE;
+void    /* Initialize caller-provided thread call storage: function, first parameter, group (from pri) and ONCE/SIGNAL flags. */
+thread_call_setup_with_options(
+       thread_call_t                   call,
+       thread_call_func_t              func,
+       thread_call_param_t             param0,
+       thread_call_priority_t          pri,
+       thread_call_options_t           options)
+{
+       bzero(call, sizeof(*call));      /* clear the whole structure before the assignment below */
  
-       simple_unlock(&thread_call_lock);
-       splx(s);
+       *call = (struct thread_call) {   /* only tc_func and tc_param0 are named in the initializer */
+               .tc_func = func,
+               .tc_param0 = param0,
+       };
  
-       result = kernel_thread_start_priority((thread_continue_t)thread_call_daemon, group, BASEPRI_PREEMPT + 1, &thread);
-       if (result != KERN_SUCCESS)
-               panic("thread_call_initialize");
+       switch (pri) {   /* map the priority to a group index; no case here selects a THREAD_CALL_INDEX_QOS_* group */
+       case THREAD_CALL_PRIORITY_HIGH:
+               call->tc_index = THREAD_CALL_INDEX_HIGH;
+               break;
+       case THREAD_CALL_PRIORITY_KERNEL:
+               call->tc_index = THREAD_CALL_INDEX_KERNEL;
+               break;
+       case THREAD_CALL_PRIORITY_USER:
+               call->tc_index = THREAD_CALL_INDEX_USER;
+               break;
+       case THREAD_CALL_PRIORITY_LOW:
+               call->tc_index = THREAD_CALL_INDEX_LOW;
+               break;
+       case THREAD_CALL_PRIORITY_KERNEL_HIGH:
+               call->tc_index = THREAD_CALL_INDEX_KERNEL_HIGH;
+               break;
+       default:
+               panic("Invalid thread call pri value: %d", pri);
+               break;
+       }
  
-       thread_deallocate(thread);
+       if (options & THREAD_CALL_OPTIONS_ONCE) {
+               call->tc_flags |= THREAD_CALL_ONCE;
+       }
+       if (options & THREAD_CALL_OPTIONS_SIGNAL) {
+               call->tc_flags |= THREAD_CALL_SIGNAL | THREAD_CALL_ONCE; /* SIGNAL always sets ONCE as well */
+       }
  }
  
  void
  thread_call_setup(
-       thread_call_t                   call,
-       thread_call_func_t              func,
-       thread_call_param_t             param0)
+       thread_call_t                   call,
+       thread_call_func_t              func,
+       thread_call_param_t             param0)
+{
+       thread_call_setup_with_options(call, func, param0,       /* same setup with the defaults: HIGH priority, no options */
+           THREAD_CALL_PRIORITY_HIGH, 0);
+}
+/*
+ * Build the free list of internal thread calls: initialize
+ * thread_call_internal_queue and enqueue every entry of
+ * internal_call_storage[], counting them in
+ * thread_call_internal_queue_count.  Runs under the HIGH group's
+ * lock, so that group must already be set up.
+ */
+static void
+_internal_call_init(void)
  {
-       call_entry_setup(call, func, param0);
+       /* Function-only thread calls are only kept in the default HIGH group */
+       thread_call_group_t group = &thread_call_groups[THREAD_CALL_INDEX_HIGH];
+
+       spl_t s = disable_ints_and_lock(group);
+
+       queue_init(&thread_call_internal_queue);
+
+       for (unsigned i = 0; i < INTERNAL_CALL_COUNT; i++) {
+               enqueue_tail(&thread_call_internal_queue, &internal_call_storage[i].tc_qlink);
+               thread_call_internal_queue_count++;
+       }
+
+       enable_ints_and_unlock(group, s);
  }
  
  /*
@@ -190,17 +610,41 @@ thread_call_setup(
   *
   *     Called with thread_call_lock held.
   */
-static __inline__ thread_call_t
-_internal_call_allocate(void)
+static thread_call_t    /* Take one entry off the internal free list and set it up for func/param0; panics if the list is empty. */
+_internal_call_allocate(thread_call_func_t func, thread_call_param_t param0)
+{
+       /* Function-only thread calls are only kept in the default HIGH group */
+       thread_call_group_t group = &thread_call_groups[THREAD_CALL_INDEX_HIGH];
+
+       spl_t s = disable_ints_and_lock(group);  /* this function takes the HIGH group's lock itself and drops it before returning */
+
+       thread_call_t call = qe_dequeue_head(&thread_call_internal_queue,
+           struct thread_call, tc_qlink);
+
+       if (call == NULL) {
+               panic("_internal_call_allocate: thread_call_internal_queue empty");
+       }
+
+       thread_call_internal_queue_count--;
+
+       thread_call_setup(call, func, param0);   /* resets the entry and places it in the HIGH group */
+       /* THREAD_CALL_ALLOC not set, do not free back to zone */
+       assert((call->tc_flags & THREAD_CALL_ALLOC) == 0);
+       enable_ints_and_unlock(group, s);
+
+       return call;
+}
+
+/*
+ * Check if a call is internal and needs to be returned to the internal pool.
+ *
+ * A call is internal when its address lies inside
+ * internal_call_storage[]; such a call must not have
+ * THREAD_CALL_ALLOC set (asserted).
+ */
+static bool
+_is_internal_call(thread_call_t call)
  {
-    thread_call_t              call;
-    
-    if (queue_empty(&thread_call_internal_queue))
-       panic("_internal_call_allocate");
-       
-    call = TC(dequeue_head(&thread_call_internal_queue));
-    
-    return (call);
+       if (call >= internal_call_storage &&
+           call < &internal_call_storage[INTERNAL_CALL_COUNT]) {
+               assert((call->tc_flags & THREAD_CALL_ALLOC) == 0);
+               return true;
+       }
+       return false;
  }
  
  /*
@@ -209,15 +653,20 @@ _internal_call_allocate(void)
   *     Release an internal callout entry which
   *     is no longer pending (or delayed).
   *
- *     Called with thread_call_lock held.
+ *      Called with the THREAD_CALL_INDEX_HIGH group's lock held.
   */
-static __inline__ void
-_internal_call_release(
-    thread_call_t              call)
+static void     /* Return an internal call to the head of the free list and bump the free count. */
+_internal_call_release(thread_call_t call)
  {
-    if (    call >= internal_call_storage                                              &&
-                   call < &internal_call_storage[internal_call_count]          )
-               enqueue_head(&thread_call_internal_queue, qe(call));
+       assert(_is_internal_call(call)); /* NOTE(review): the removed code checked the storage range at runtime; here it is only asserted, so callers must check _is_internal_call() first */
+
+       thread_call_group_t group = thread_call_get_group(call);
+
+       assert(group == &thread_call_groups[THREAD_CALL_INDEX_HIGH]);    /* internal calls are set up in the HIGH group */
+       thread_call_assert_locked(group);        /* the caller must already hold that group's lock */
+
+       enqueue_head(&thread_call_internal_queue, &call->tc_qlink);
+       thread_call_internal_queue_count++;
  }
  
  /*
@@ -231,18 +680,36 @@ _internal_call_release(
   *
   *     Called with thread_call_lock held.
   */
-static __inline__ boolean_t
-_pending_call_enqueue(
-    thread_call_t              call,
-       thread_call_group_t     group)
+static bool     /* Returns true if the call was already queued, or (ONCE call that is running) already marked for reschedule. */
+_pending_call_enqueue(thread_call_t call,
+    thread_call_group_t group,
+    uint64_t now)
  {
-       queue_t         old_queue;
+       if ((THREAD_CALL_ONCE | THREAD_CALL_RUNNING)     /* a ONCE call that is currently running is not queued; it is only flagged */
+           == (call->tc_flags & (THREAD_CALL_ONCE | THREAD_CALL_RUNNING))) {
+               call->tc_pqlink.deadline = 0;    /* deadline 0 is recorded for the pending (non-delayed) request */
+
+               thread_call_flags_t flags = call->tc_flags;      /* snapshot taken before RESCHEDULE is set */
+               call->tc_flags |= THREAD_CALL_RESCHEDULE;
+
+               assert(call->tc_queue == NULL);
+
+               return flags & THREAD_CALL_RESCHEDULE;   /* true only if a reschedule was already requested */
+       }
+
+       call->tc_pending_timestamp = now;
+
+       bool was_on_queue = thread_call_enqueue_tail(call, &group->pending_queue);
  
-       old_queue = call_entry_enqueue_tail(call, &group->pending_queue);
+       if (!was_on_queue) {
+               call->tc_submit_count++; /* counted only for a fresh submission, not for a move from the delayed queue */
+       }
  
         group->pending_count++;
  
-       return (old_queue != NULL);
+       thread_call_wake(group); /* called on every enqueue to the pending queue */
+
+       return was_on_queue;
  }
  
  /*
@@ -250,27 +717,42 @@ _pending_call_enqueue(
   *
   *     Place an entry on the delayed queue,
   *     after existing entries with an earlier
- *     (or identical) deadline.
+ *      (or identical) deadline.
   *
   *     Returns TRUE if the entry was already
   *     on a queue.
   *
   *     Called with thread_call_lock held.
   */
-static __inline__ boolean_t
+static bool     /* Returns true if the call was already queued, or (ONCE call that is running) already marked for reschedule. */
  _delayed_call_enqueue(
-    thread_call_t              call,
-       thread_call_group_t     group,
-       uint64_t                        deadline)
+       thread_call_t           call,
+       thread_call_group_t     group,
+       uint64_t                deadline,
+       thread_call_flavor_t    flavor)
  {
-       queue_t                 old_queue;
+       if ((THREAD_CALL_ONCE | THREAD_CALL_RUNNING)     /* a ONCE call that is currently running is not queued; it is only flagged */
+           == (call->tc_flags & (THREAD_CALL_ONCE | THREAD_CALL_RUNNING))) {
+               call->tc_pqlink.deadline = deadline;     /* record the requested deadline on the call */
+
+               thread_call_flags_t flags = call->tc_flags;      /* snapshot taken before RESCHEDULE is set */
+               call->tc_flags |= THREAD_CALL_RESCHEDULE;
+
+               assert(call->tc_queue == NULL);
+               thread_call_set_flavor(call, flavor);    /* record the requested flavor as well */
  
-       old_queue = call_entry_enqueue_deadline(call, &group->delayed_queue, deadline);
+               return flags & THREAD_CALL_RESCHEDULE;   /* true only if a reschedule was already requested */
+       }
+
+       queue_head_t *old_queue = thread_call_enqueue_deadline(call, group, flavor, deadline);
  
-       if (old_queue == &group->pending_queue)
+       if (old_queue == &group->pending_queue) {
                 group->pending_count--;
+       } else if (old_queue == NULL) {
+               call->tc_submit_count++; /* counted only when the call was not on any queue before */
+       }
  
-       return (old_queue != NULL);
+       return old_queue != NULL;
  }
  
  /*
@@ -282,179 +764,126 @@ _delayed_call_enqueue(
   *
   *     Called with thread_call_lock held.
   */
-static __inline__ boolean_t
+static bool
  _call_dequeue(
-       thread_call_t           call,
-       thread_call_group_t     group)
+       thread_call_t           call,
+       thread_call_group_t     group)
  {
-       queue_t                 old_queue;
+       queue_head_t *old_queue = thread_call_dequeue(call);
+
+       if (old_queue == NULL) {
+               return false;
+       }
  
-       old_queue = call_entry_dequeue(call);
+       call->tc_finish_count++;
  
-       if (old_queue == &group->pending_queue)
+       if (old_queue == &group->pending_queue) {
                 group->pending_count--;
+       }
  
-       return (old_queue != NULL);
+       return true;
  }
  
  /*
- *     _set_delayed_call_timer:
+ * _arm_delayed_call_timer:
   *
- *     Reset the timer so that it
- *     next expires when the entry is due.
+ * Check if the timer needs to be armed for this flavor,
+ * and if so, arm it.
   *
- *     Called with thread_call_lock held.
+ * If new_call is non-NULL, only re-arm the timer if new_call is at the
+ * head of this flavor's delayed priority queue.
+ *
+ * Returns true if the timer was armed/re-armed, false if it was left unset.
+ * Caller should cancel the timer if need be.
+ *
+ * Called with thread_call_lock held.
   */
-static __inline__ void
-_set_delayed_call_timer(
-    thread_call_t              call,
-       thread_call_group_t     group)
+static bool
+_arm_delayed_call_timer(thread_call_t           new_call,
+    thread_call_group_t     group,
+    thread_call_flavor_t    flavor)
  {
-    timer_call_enter(&group->delayed_timer, call->deadline);
+       /* No calls implies no timer needed */
+       if (queue_empty(&group->delayed_queues[flavor])) {
+               return false;
+       }
+
+       thread_call_t call = priority_queue_min(&group->delayed_pqueues[flavor], struct thread_call, tc_pqlink);
+
+       /* We only need to change the hard timer if this new call is the first in the list */
+       if (new_call != NULL && new_call != call) {
+               return false;
+       }
+
+       assert((call->tc_soft_deadline != 0) && ((call->tc_soft_deadline <= call->tc_pqlink.deadline)));
+
+       uint64_t fire_at = call->tc_soft_deadline;
+
+       if (flavor == TCF_CONTINUOUS) {
+               assert(call->tc_flags & THREAD_CALL_FLAG_CONTINUOUS);
+               fire_at = continuoustime_to_absolutetime(fire_at);
+       } else {
+               assert((call->tc_flags & THREAD_CALL_FLAG_CONTINUOUS) == 0);
+       }
+
+       /*
+        * Note: This picks the soonest-deadline call's leeway as the hard timer's leeway,
+        * which does not take into account later-deadline timers with a larger leeway.
+        * This is a valid coalescing behavior, but masks a possible window to
+        * fire a timer instead of going idle.
+        */
+       uint64_t leeway = call->tc_pqlink.deadline - call->tc_soft_deadline;
+
+       timer_call_enter_with_leeway(&group->delayed_timers[flavor], (timer_call_param_t)flavor,
+           fire_at, leeway,
+           TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LEEWAY,
+           ((call->tc_flags & THREAD_CALL_RATELIMITED) == THREAD_CALL_RATELIMITED));
+
+       return true;
  }
  
  /*
- *     _remove_from_pending_queue:
+ *     _cancel_func_from_queue:
   *
   *     Remove the first (or all) matching
- *     entries from the pending queue.
+ *     entries from the specified queue.
   *
- *     Returns TRUE if any matching entries
+ *     Returns TRUE if any matching entries
   *     were found.
   *
   *     Called with thread_call_lock held.
   */
  static boolean_t
-_remove_from_pending_queue(
-    thread_call_func_t         func,
-    thread_call_param_t                param0,
-    boolean_t                          remove_all)
-{
-       boolean_t                               call_removed = FALSE;
-       thread_call_t                   call;
-       thread_call_group_t             group = &thread_call_group0;
-    
-    call = TC(queue_first(&group->pending_queue));
-    
-    while (!queue_end(&group->pending_queue, qe(call))) {
-       if (    call->func == func                      &&
-                               call->param0 == param0                  ) {
-                       thread_call_t   next = TC(queue_next(qe(call)));
-               
-                       _call_dequeue(call, group);
+_cancel_func_from_queue(thread_call_func_t      func,
+    thread_call_param_t     param0,
+    thread_call_group_t     group,
+    boolean_t               remove_all,
+    queue_head_t            *queue)
+{
+       boolean_t call_removed = FALSE;
+       thread_call_t call;
  
-                       _internal_call_release(call);
-           
-                       call_removed = TRUE;
-                       if (!remove_all)
-                               break;
-               
-                       call = next;
+       qe_foreach_element_safe(call, queue, tc_qlink) {
+               if (call->tc_func != func ||
+                   call->tc_param0 != param0) {
+                       continue;
                 }
-               else    
-                       call = TC(queue_next(qe(call)));
-    }
-    
-    return (call_removed);
-}
  
-/*
- *     _remove_from_delayed_queue:
- *
- *     Remove the first (or all) matching
- *     entries from the delayed queue.
- *
- *     Returns TRUE if any matching entries
- *     were found.
- *
- *     Called with thread_call_lock held.
- */
-static boolean_t
-_remove_from_delayed_queue(
-    thread_call_func_t         func,
-    thread_call_param_t                param0,
-    boolean_t                          remove_all)
-{
-    boolean_t                          call_removed = FALSE;
-    thread_call_t                      call;
-       thread_call_group_t             group = &thread_call_group0;
-    
-    call = TC(queue_first(&group->delayed_queue));
-    
-    while (!queue_end(&group->delayed_queue, qe(call))) {
-       if (    call->func == func                      &&
-                               call->param0 == param0                  ) {
-                       thread_call_t   next = TC(queue_next(qe(call)));
-               
-                       _call_dequeue(call, group);
-           
+               _call_dequeue(call, group);
+
+               if (_is_internal_call(call)) {
                         _internal_call_release(call);
-           
-                       call_removed = TRUE;
-                       if (!remove_all)
-                               break;
-               
-                       call = next;
                 }
-               else    
-                       call = TC(queue_next(qe(call)));
-    }
-    
-    return (call_removed);
-}
  
-#ifndef        __LP64__
-
-/*
- *     thread_call_func:
- *
- *     Enqueue a function callout.
- *
- *     Guarantees { function, argument }
- *     uniqueness if unique_call is TRUE.
- */
-void
-thread_call_func(
-    thread_call_func_t         func,
-    thread_call_param_t                param,
-    boolean_t                          unique_call)
-{
-    thread_call_t                      call;
-       thread_call_group_t             group = &thread_call_group0;
-    spl_t                                      s;
-    
-    s = splsched();
-    simple_lock(&thread_call_lock);
-    
-    call = TC(queue_first(&group->pending_queue));
-    
-       while (unique_call && !queue_end(&group->pending_queue, qe(call))) {
-       if (    call->func == func                      &&
-                               call->param0 == param                   ) {
+               call_removed = TRUE;
+               if (!remove_all) {
                         break;
                 }
-       
-               call = TC(queue_next(qe(call)));
-    }
-    
-    if (!unique_call || queue_end(&group->pending_queue, qe(call))) {
-               call = _internal_call_allocate();
-               call->func                      = func;
-               call->param0            = param;
-               call->param1            = NULL;
-       
-               _pending_call_enqueue(call, group);
-               
-               if (group->active_count == 0)
-                       thread_call_wake(group);
-    }
+       }
  
-       simple_unlock(&thread_call_lock);
-    splx(s);
+       return call_removed;
  }
  
-#endif /* __LP64__ */
-
  /*
   *     thread_call_func_delayed:
   *
@@ -463,29 +892,29 @@ thread_call_func(
   */
  void
  thread_call_func_delayed(
-    thread_call_func_t         func,
-    thread_call_param_t                param,
-    uint64_t                           deadline)
-{
-    thread_call_t                      call;
-       thread_call_group_t             group = &thread_call_group0;
-    spl_t                                      s;
-    
-    s = splsched();
-    simple_lock(&thread_call_lock);
-    
-    call = _internal_call_allocate();
-    call->func                 = func;
-    call->param0               = param;
-    call->param1               = 0;
-    
-    _delayed_call_enqueue(call, group, deadline);
-    
-    if (queue_first(&group->delayed_queue) == qe(call))
-       _set_delayed_call_timer(call, group);
-    
-    simple_unlock(&thread_call_lock);
-    splx(s);
+       thread_call_func_t              func,
+       thread_call_param_t             param,
+       uint64_t                        deadline)
+{
+       (void)thread_call_enter_delayed_internal(NULL, func, param, 0, deadline, 0, 0);
+}
+
+/*
+ * thread_call_func_delayed_with_leeway:
+ *
+ * Same as thread_call_func_delayed(), but with
+ * leeway/flags threaded through.
+ */
+
+void
+thread_call_func_delayed_with_leeway(
+       thread_call_func_t              func,
+       thread_call_param_t             param,
+       uint64_t                deadline,
+       uint64_t                leeway,
+       uint32_t                flags)
+{
+       (void)thread_call_enter_delayed_internal(NULL, func, param, 0, deadline, leeway, flags);
  }
  
  /*
@@ -499,137 +928,218 @@ thread_call_func_delayed(
   *     in that order.
   *
   *     Returns TRUE if any calls were cancelled.
+ *
+ *     This walks the group's pending and delayed queues looking for matches,
+ *     which is really inefficient.  Callers should use an allocated thread call instead.
+ *
+ *     TODO: Give 'func' thread calls their own group, so this linear search doesn't
+ *     affect the main 'high' group.
   */
  boolean_t
  thread_call_func_cancel(
-    thread_call_func_t         func,
-    thread_call_param_t                param,
-    boolean_t                          cancel_all)
+       thread_call_func_t              func,
+       thread_call_param_t             param,
+       boolean_t                       cancel_all)
  {
-       boolean_t                       result;
-    spl_t                              s;
-    
-    s = splsched();
-    simple_lock(&thread_call_lock);
+       boolean_t       result;
+
+       assert(func != NULL);
+
+       /* Function-only thread calls are only kept in the default HIGH group */
+       thread_call_group_t group = &thread_call_groups[THREAD_CALL_INDEX_HIGH];
  
-    if (cancel_all)
-               result = _remove_from_pending_queue(func, param, cancel_all) |
-                                               _remove_from_delayed_queue(func, param, cancel_all);
-       else
-               result = _remove_from_pending_queue(func, param, cancel_all) ||
-                                               _remove_from_delayed_queue(func, param, cancel_all);
-    
-    simple_unlock(&thread_call_lock);
-    splx(s);
+       spl_t s = disable_ints_and_lock(group);
  
-       return (result);
+       if (cancel_all) {
+               /* exhaustively search every queue, and return true if any search found something */
+               result = _cancel_func_from_queue(func, param, group, cancel_all, &group->pending_queue) |
+                   _cancel_func_from_queue(func, param, group, cancel_all, &group->delayed_queues[TCF_ABSOLUTE])  |
+                   _cancel_func_from_queue(func, param, group, cancel_all, &group->delayed_queues[TCF_CONTINUOUS]);
+       } else {
+               /* early-exit as soon as we find something, don't search other queues */
+               result = _cancel_func_from_queue(func, param, group, cancel_all, &group->pending_queue) ||
+                   _cancel_func_from_queue(func, param, group, cancel_all, &group->delayed_queues[TCF_ABSOLUTE]) ||
+                   _cancel_func_from_queue(func, param, group, cancel_all, &group->delayed_queues[TCF_CONTINUOUS]);
+       }
+
+       enable_ints_and_unlock(group, s);
+
+       return result;
  }
  
  /*
- *     thread_call_allocate:
- *
- *     Allocate a callout entry.
+ * Allocate a thread call with a given priority.  Priorities other than
+ * THREAD_CALL_PRIORITY_HIGH or THREAD_CALL_PRIORITY_KERNEL_HIGH will be run in threads
+ * with eager preemption enabled (i.e. may be aggressively preempted by higher-priority
+ * threads which are not in the normal "urgent" bands).
   */
  thread_call_t
-thread_call_allocate(
-    thread_call_func_t         func,
-    thread_call_param_t                param0)
+thread_call_allocate_with_priority(
+       thread_call_func_t              func,
+       thread_call_param_t             param0,
+       thread_call_priority_t          pri)
  {
-    thread_call_t              call = zalloc(thread_call_zone);
-
-       call_entry_setup(call, func, param0);
-
-    return (call);
+       return thread_call_allocate_with_options(func, param0, pri, 0);
  }
  
-/*
- *     thread_call_free:
- *
- *     Free a callout entry.
- */
-boolean_t
-thread_call_free(
-    thread_call_t              call)
-{
-    spl_t              s;
-    
-    s = splsched();
-    simple_lock(&thread_call_lock);
-    
-    if (call->queue != NULL) {
-       simple_unlock(&thread_call_lock);
-               splx(s);
+thread_call_t
+thread_call_allocate_with_options(
+       thread_call_func_t              func,
+       thread_call_param_t             param0,
+       thread_call_priority_t          pri,
+       thread_call_options_t           options)
+{
+       thread_call_t call = zalloc(thread_call_zone);
  
-               return (FALSE);
-    }
-    
-    simple_unlock(&thread_call_lock);
-    splx(s);
-    
-       zfree(thread_call_zone, call);
+       thread_call_setup_with_options(call, func, param0, pri, options);
+       call->tc_refs = 1;
+       call->tc_flags |= THREAD_CALL_ALLOC;
  
-       return (TRUE);
+       return call;
  }
  
-/*
- *     thread_call_enter:
- *
- *     Enqueue a callout entry to occur "soon".
- *
- *     Returns TRUE if the call was
- *     already on a queue.
- */
-boolean_t
-thread_call_enter(
-    thread_call_t              call)
-{
-       boolean_t                               result = TRUE;
-       thread_call_group_t             group = &thread_call_group0;
-    spl_t                                      s;
-    
-    s = splsched();
-    simple_lock(&thread_call_lock);
-    
-    if (call->queue != &group->pending_queue) {
-       result = _pending_call_enqueue(call, group);
-               
-               if (group->active_count == 0)
-                       thread_call_wake(group);
+thread_call_t
+thread_call_allocate_with_qos(thread_call_func_t        func,
+    thread_call_param_t       param0,
+    int                       qos_tier,
+    thread_call_options_t     options)
+{
+       thread_call_t call = thread_call_allocate(func, param0);
+
+       switch (qos_tier) {
+       case THREAD_QOS_UNSPECIFIED:
+               call->tc_index = THREAD_CALL_INDEX_HIGH;
+               break;
+       case THREAD_QOS_LEGACY:
+               call->tc_index = THREAD_CALL_INDEX_USER;
+               break;
+       case THREAD_QOS_MAINTENANCE:
+       case THREAD_QOS_BACKGROUND:
+               call->tc_index = THREAD_CALL_INDEX_LOW;
+               break;
+       case THREAD_QOS_UTILITY:
+               call->tc_index = THREAD_CALL_INDEX_QOS_UT;
+               break;
+       case THREAD_QOS_USER_INITIATED:
+               call->tc_index = THREAD_CALL_INDEX_QOS_IN;
+               break;
+       case THREAD_QOS_USER_INTERACTIVE:
+               call->tc_index = THREAD_CALL_INDEX_QOS_UI;
+               break;
+       default:
+               panic("Invalid thread call qos value: %d", qos_tier);
+               break;
         }
  
-       call->param1 = 0;
+       if (options & THREAD_CALL_OPTIONS_ONCE) {
+               call->tc_flags |= THREAD_CALL_ONCE;
+       }
  
-       simple_unlock(&thread_call_lock);
-    splx(s);
+       /* does not support THREAD_CALL_OPTIONS_SIGNAL */
  
-       return (result);
+       return call;
+}
+
+
+/*
+ *     thread_call_allocate:
+ *
+ *     Allocate a callout entry.
+ */
+thread_call_t
+thread_call_allocate(
+       thread_call_func_t              func,
+       thread_call_param_t             param0)
+{
+       return thread_call_allocate_with_options(func, param0,
+                  THREAD_CALL_PRIORITY_HIGH, 0);
+}
+
+/*
+ *     thread_call_free:
+ *
+ *     Release a callout.  If the callout is currently
+ *     executing, it will be freed when all invocations
+ *     finish.
+ *
+ *     If the callout is currently armed to fire again, then
+ *     freeing is not allowed and returns FALSE.  The
+ *     client must have canceled the pending invocation before freeing.
+ */
+boolean_t
+thread_call_free(
+       thread_call_t           call)
+{
+       thread_call_group_t group = thread_call_get_group(call);
+
+       spl_t s = disable_ints_and_lock(group);
+
+       if (call->tc_queue != NULL ||
+           ((call->tc_flags & THREAD_CALL_RESCHEDULE) != 0)) {
+               thread_call_unlock(group);
+               splx(s);
+
+               return FALSE;
+       }
+
+       int32_t refs = --call->tc_refs;
+       if (refs < 0) {
+               panic("Refcount negative: %d\n", refs);
+       }
+
+       if ((THREAD_CALL_SIGNAL | THREAD_CALL_RUNNING)
+           == ((THREAD_CALL_SIGNAL | THREAD_CALL_RUNNING) & call->tc_flags)) {
+               thread_call_wait_once_locked(call, s);
+               /* thread call lock has been unlocked */
+       } else {
+               enable_ints_and_unlock(group, s);
+       }
+
+       if (refs == 0) {
+               assert(call->tc_finish_count == call->tc_submit_count);
+               zfree(thread_call_zone, call);
+       }
+
+       return TRUE;
+}
+
+/*
+ *     thread_call_enter:
+ *
+ *     Enqueue a callout entry to occur "soon".
+ *
+ *     Returns TRUE if the call was
+ *     already on a queue.
+ */
+boolean_t
+thread_call_enter(
+       thread_call_t           call)
+{
+       return thread_call_enter1(call, 0);
  }
  
  boolean_t
  thread_call_enter1(
-    thread_call_t                      call,
-    thread_call_param_t                param1)
-{
-       boolean_t                               result = TRUE;
-       thread_call_group_t             group = &thread_call_group0;
-    spl_t                                      s;
-    
-    s = splsched();
-    simple_lock(&thread_call_lock);
-    
-    if (call->queue != &group->pending_queue) {
-       result = _pending_call_enqueue(call, group);
-               
-               if (group->active_count == 0)
-                       thread_call_wake(group);
+       thread_call_t                   call,
+       thread_call_param_t             param1)
+{
+       assert(call->tc_func != NULL);
+       assert((call->tc_flags & THREAD_CALL_SIGNAL) == 0);
+
+       thread_call_group_t group = thread_call_get_group(call);
+       bool result = true;
+
+       spl_t s = disable_ints_and_lock(group);
+
+       if (call->tc_queue != &group->pending_queue) {
+               result = _pending_call_enqueue(call, group, mach_absolute_time());
         }
  
-       call->param1 = param1;
+       call->tc_param1 = param1;
  
-       simple_unlock(&thread_call_lock);
-    splx(s);
+       enable_ints_and_unlock(group, s);
  
-       return (result);
+       return result;
  }
  
  /*
@@ -643,53 +1153,172 @@ thread_call_enter1(
   */
  boolean_t
  thread_call_enter_delayed(
-    thread_call_t              call,
-    uint64_t                   deadline)
+       thread_call_t           call,
+       uint64_t                deadline)
+{
+       assert(call != NULL);
+       return thread_call_enter_delayed_internal(call, NULL, 0, 0, deadline, 0, 0);
+}
+
+boolean_t
+thread_call_enter1_delayed(
+       thread_call_t                   call,
+       thread_call_param_t             param1,
+       uint64_t                        deadline)
+{
+       assert(call != NULL);
+       return thread_call_enter_delayed_internal(call, NULL, 0, param1, deadline, 0, 0);
+}
+
+boolean_t
+thread_call_enter_delayed_with_leeway(
+       thread_call_t           call,
+       thread_call_param_t     param1,
+       uint64_t                deadline,
+       uint64_t                leeway,
+       unsigned int            flags)
+{
+       assert(call != NULL);
+       return thread_call_enter_delayed_internal(call, NULL, 0, param1, deadline, leeway, flags);
+}
+
+
+/*
+ * thread_call_enter_delayed_internal:
+ * Enqueue a callout entry to occur at the stated time.
+ *
+ * Returns TRUE if the call was already on a queue.
+ * params:
+ * call     - structure encapsulating state of the callout
+ * alt_func/alt_param0 - if call is NULL, allocate temporary storage using these parameters
+ * param1   - value stored in the call's tc_param1
+ * deadline - deadline in mach_absolute_time units (mach_continuous_time units if THREAD_CALL_CONTINUOUS is set)
+ * leeway   - timer slack represented as delta of deadline.
+ * flags    - THREAD_CALL_DELAY_XXX : classification of caller's desires wrt timer coalescing.
+ *            THREAD_CALL_DELAY_LEEWAY : value in leeway is used for timer coalescing.
+ *            THREAD_CALL_CONTINUOUS : thread call will be called according to mach_continuous_time rather than mach_absolute_time
+ */
+boolean_t
+thread_call_enter_delayed_internal(
+       thread_call_t           call,
+       thread_call_func_t      alt_func,
+       thread_call_param_t     alt_param0,
+       thread_call_param_t     param1,
+       uint64_t                deadline,
+       uint64_t                leeway,
+       unsigned int            flags)
  {
-       boolean_t                               result = TRUE;
-       thread_call_group_t             group = &thread_call_group0;
-    spl_t                                      s;
+       uint64_t                now, sdeadline;
+
+       thread_call_flavor_t flavor = (flags & THREAD_CALL_CONTINUOUS) ? TCF_CONTINUOUS : TCF_ABSOLUTE;
+
+       /* direct mapping between thread_call, timer_call, and timeout_urgency values */
+       uint32_t urgency = (flags & TIMEOUT_URGENCY_MASK);
+
+       if (call == NULL) {
+               /* allocate a structure out of internal storage, as a convenience for BSD callers */
+               call = _internal_call_allocate(alt_func, alt_param0);
+       }
+
+       assert(call->tc_func != NULL);
+       thread_call_group_t group = thread_call_get_group(call);
+
+       spl_t s = disable_ints_and_lock(group);
+
+       /*
+        * kevent and IOTES let you change flavor for an existing timer, so we have to
+        * support flipping flavors for enqueued thread calls.
+        */
+       if (flavor == TCF_CONTINUOUS) {
+               now = mach_continuous_time();
+       } else {
+               now = mach_absolute_time();
+       }
+
+       call->tc_flags |= THREAD_CALL_DELAYED;
  
-    s = splsched();
-    simple_lock(&thread_call_lock);
+       call->tc_soft_deadline = sdeadline = deadline;
  
-       result = _delayed_call_enqueue(call, group, deadline);
+       boolean_t ratelimited = FALSE;
+       uint64_t slop = timer_call_slop(deadline, now, urgency, current_thread(), &ratelimited);
  
-       if (queue_first(&group->delayed_queue) == qe(call))
-               _set_delayed_call_timer(call, group);
+       if ((flags & THREAD_CALL_DELAY_LEEWAY) != 0 && leeway > slop) {
+               slop = leeway;
+       }
+
+       if (UINT64_MAX - deadline <= slop) {
+               deadline = UINT64_MAX;
+       } else {
+               deadline += slop;
+       }
+
+       if (ratelimited) {
+               call->tc_flags |= THREAD_CALL_RATELIMITED;
+       } else {
+               call->tc_flags &= ~THREAD_CALL_RATELIMITED;
+       }
+
+       call->tc_param1 = param1;
  
-       call->param1 = 0;
+       call->tc_ttd = (sdeadline > now) ? (sdeadline - now) : 0;
  
-    simple_unlock(&thread_call_lock);
-    splx(s);
+       bool result = _delayed_call_enqueue(call, group, deadline, flavor);
  
-       return (result);
+       _arm_delayed_call_timer(call, group, flavor);
+
+#if CONFIG_DTRACE
+       DTRACE_TMR5(thread_callout__create, thread_call_func_t, call->tc_func,
+           uint64_t, (deadline - sdeadline), uint64_t, (call->tc_ttd >> 32),
+           (unsigned) (call->tc_ttd & 0xFFFFFFFF), call);
+#endif
+
+       enable_ints_and_unlock(group, s);
+
+       return result;
  }
  
-boolean_t
-thread_call_enter1_delayed(
-    thread_call_t                      call,
-    thread_call_param_t                param1,
-    uint64_t                           deadline)
+/*
+ * Remove a callout entry from the queue
+ * Called with thread_call_lock held
+ */
+static bool
+thread_call_cancel_locked(thread_call_t call)
  {
-       boolean_t                               result = TRUE;
-       thread_call_group_t             group = &thread_call_group0;
-    spl_t                                      s;
+       bool canceled;
+
+       if (call->tc_flags & THREAD_CALL_RESCHEDULE) {
+               call->tc_flags &= ~THREAD_CALL_RESCHEDULE;
+               canceled = true;
+
+               /* if reschedule was set, it must not have been queued */
+               assert(call->tc_queue == NULL);
+       } else {
+               bool queue_head_changed = false;
  
-    s = splsched();
-    simple_lock(&thread_call_lock);
+               thread_call_flavor_t flavor = thread_call_get_flavor(call);
+               thread_call_group_t  group  = thread_call_get_group(call);
  
-       result = _delayed_call_enqueue(call, group, deadline);
+               if (call->tc_pqlink.deadline != 0 &&
+                   call == priority_queue_min(&group->delayed_pqueues[flavor], struct thread_call, tc_pqlink)) {
+                       assert(call->tc_queue == &group->delayed_queues[flavor]);
+                       queue_head_changed = true;
+               }
  
-       if (queue_first(&group->delayed_queue) == qe(call))
-               _set_delayed_call_timer(call, group);
+               canceled = _call_dequeue(call, group);
  
-       call->param1 = param1;
+               if (queue_head_changed) {
+                       if (_arm_delayed_call_timer(NULL, group, flavor) == false) {
+                               timer_call_cancel(&group->delayed_timers[flavor]);
+                       }
+               }
+       }
  
-    simple_unlock(&thread_call_lock);
-    splx(s);
+#if CONFIG_DTRACE
+       DTRACE_TMR4(thread_callout__cancel, thread_call_func_t, call->tc_func,
+           0, (call->tc_ttd >> 32), (unsigned) (call->tc_ttd & 0xFFFFFFFF));
+#endif
  
-       return (result);
+       return canceled;
  }
  
  /*
@@ -701,59 +1330,79 @@ thread_call_enter1_delayed(
   *     on a queue.
   */
  boolean_t
-thread_call_cancel(
-    thread_call_t              call)
+thread_call_cancel(thread_call_t call)
  {
-       boolean_t                               result;
-       thread_call_group_t             group = &thread_call_group0;
-    spl_t                                      s;
-    
-    s = splsched();
-    simple_lock(&thread_call_lock);
+       thread_call_group_t group = thread_call_get_group(call);
  
-       result = _call_dequeue(call, group);
-       
-    simple_unlock(&thread_call_lock);
-    splx(s);
+       spl_t s = disable_ints_and_lock(group);
  
-       return (result);
-}
+       boolean_t result = thread_call_cancel_locked(call);
+
+       enable_ints_and_unlock(group, s);
  
-#ifndef        __LP64__
+       return result;
+}
  
  /*
- *     thread_call_is_delayed:
- *
- *     Returns TRUE if the call is
- *     currently on a delayed queue.
- *
- *     Optionally returns the expiration time.
+ * Cancel a thread call.  If it cannot be cancelled (i.e.
+ * is already in flight), waits for the most recent invocation
+ * to finish.  Note that if clients re-submit this thread call,
+ * it may still be pending or in flight when thread_call_cancel_wait
+ * returns, but all requests to execute this work item prior
+ * to the call to thread_call_cancel_wait will have finished.
   */
  boolean_t
-thread_call_is_delayed(
-       thread_call_t           call,
-       uint64_t                        *deadline)
+thread_call_cancel_wait(thread_call_t call)
  {
-       boolean_t                               result = FALSE;
-       thread_call_group_t             group = &thread_call_group0;
-       spl_t                                   s;
+       thread_call_group_t group = thread_call_get_group(call);
  
-       s = splsched();
-       simple_lock(&thread_call_lock);
+       if ((call->tc_flags & THREAD_CALL_ALLOC) == 0) {
+               panic("thread_call_cancel_wait: can't wait on thread call whose storage I don't own");
+       }
  
-       if (call->queue == &group->delayed_queue) {
-               if (deadline != NULL)
-                       *deadline = call->deadline;
-               result = TRUE;
+       if (!ml_get_interrupts_enabled()) {
+               panic("unsafe thread_call_cancel_wait");
         }
  
-       simple_unlock(&thread_call_lock);
-       splx(s);
+       thread_t self = current_thread();
+
+       if ((thread_get_tag_internal(self) & THREAD_TAG_CALLOUT) &&
+           self->thc_state && self->thc_state->thc_call == call) {
+               panic("thread_call_cancel_wait: deadlock waiting on self from inside call: %p to function %p",
+                   call, call->tc_func);
+       }
  
-       return (result);
+       spl_t s = disable_ints_and_lock(group);
+
+       boolean_t canceled = thread_call_cancel_locked(call);
+
+       if ((call->tc_flags & THREAD_CALL_ONCE) == THREAD_CALL_ONCE) {
+               /*
+                * A cancel-wait on a 'once' call will both cancel
+                * the pending call and wait for the in-flight call
+                */
+
+               thread_call_wait_once_locked(call, s);
+               /* thread call lock unlocked */
+       } else {
+               /*
+                * A cancel-wait on a normal call will only wait for the in-flight calls
+                * if it did not cancel the pending call.
+                *
+                * TODO: This seems less than useful - shouldn't it do the wait as well?
+                */
+
+               if (canceled == FALSE) {
+                       thread_call_wait_locked(call, s);
+                       /* thread call lock unlocked */
+               } else {
+                       enable_ints_and_unlock(group, s);
+               }
+       }
+
+       return canceled;
  }
  
-#endif /* __LP64__ */
  
  /*
   *     thread_call_wake:
@@ -764,18 +1413,42 @@ thread_call_is_delayed(
   *     create additional call threads.
   *
   *     Called with thread_call_lock held.
+ *
+ *     For high-priority group, only does wakeup/creation if there are no threads
+ *     running.
+ *
+ *     Nothing happens unless the group is parallel or has no active thread.
+ *     If the group has an idle thread, one is woken and moved from the idle
+ *     count to the active count; when that empties the idle pool while
+ *     TCG_DEALLOC_ACTIVE is set, cancellation of the dealloc timer is
+ *     attempted.  With no idle thread, the daemon is woken instead, but only
+ *     when thread_call_group_should_add_thread() agrees and this caller is
+ *     the one that flips thread_call_daemon_awake from false to true.
   */
-static __inline__ void
+static void
  thread_call_wake(
-       thread_call_group_t             group)
+       thread_call_group_t             group)
  {
-       if (group->idle_count > 0 && wait_queue_wakeup_one(&group->idle_wqueue, NULL, THREAD_AWAKENED) == KERN_SUCCESS) {
-               group->idle_count--; group->active_count++;
-       }
-       else
-       if (!thread_call_daemon_awake) {
-               thread_call_daemon_awake = TRUE;
-               wait_queue_wakeup_one(&group->daemon_wqueue, NULL, THREAD_AWAKENED);
+       /*
+        * New behavior: use threads if you've got 'em.
+        * Traditional behavior: wake only if no threads running.
+        */
+       if (group_isparallel(group) || group->active_count == 0) {
+               if (group->idle_count) {
+                       __assert_only kern_return_t kr;
+
+                       kr = waitq_wakeup64_one(&group->idle_waitq, CAST_EVENT64_T(group),
+                           THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
+                       assert(kr == KERN_SUCCESS);
+
+                       group->idle_count--;
+                       group->active_count++; /* the woken worker is accounted as active from here on */
+
+                       if (group->idle_count == 0 && (group->tcg_flags & TCG_DEALLOC_ACTIVE) == TCG_DEALLOC_ACTIVE) {
+                               if (timer_call_cancel(&group->dealloc_timer) == TRUE) { /* NOTE(review): on any other result the flag stays set; thread_call_dealloc_timer() clears it when it runs */
+                                       group->tcg_flags &= ~TCG_DEALLOC_ACTIVE;
+                               }
+                       }
+               } else {
+                       if (thread_call_group_should_add_thread(group) &&
+                           os_atomic_cmpxchg(&thread_call_daemon_awake,
+                           false, true, relaxed)) {
+                               waitq_wakeup64_all(&daemon_waitq, CAST_EVENT64_T(&thread_call_daemon_awake),
+                                   THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
+                       }
+               }
         }
  }
  
@@ -786,26 +1459,171 @@ thread_call_wake(
   */
  static void
  sched_call_thread(
-       int                             type,
-__unused       thread_t                thread)
+       int                             type,
+       thread_t                thread)
  {
-       thread_call_group_t             group = &thread_call_group0;
+       thread_call_group_t             group;
  
-       simple_lock(&thread_call_lock);
+       assert(thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT); /* thread_call_thread() installs this hook after tagging the worker */
+       assert(thread->thc_state != NULL);
  
-       switch (type) {
+       group = thread->thc_state->thc_group; /* per-worker group, set when thread_call_thread() builds thc_state */
+       assert((group - &thread_call_groups[0]) < THREAD_CALL_INDEX_MAX);
+
+       thread_call_lock_spin(group);
  
+       switch (type) {
         case SCHED_CALL_BLOCK:
-               if (--group->active_count == 0 && group->pending_count > 0)
+               assert(group->active_count);
+               --group->active_count; /* a blocked worker moves from the active count ... */
+               group->blocked_count++; /* ... to the blocked count */
+               if (group->pending_count > 0) { /* work is queued: thread_call_wake() decides whether to wake or create a thread */
                         thread_call_wake(group);
+               }
                 break;
  
         case SCHED_CALL_UNBLOCK:
+               assert(group->blocked_count);
+               --group->blocked_count; /* reverse of the SCHED_CALL_BLOCK accounting */
                 group->active_count++;
                 break;
         }
  
-       simple_unlock(&thread_call_lock);
+       thread_call_unlock(group);
+}
+
+/*
+ * Interrupts disabled, lock held; returns the same way.
+ * Only called on thread calls whose storage we own.  Wakes up
+ * anyone who might be waiting on this work item and frees it
+ * if the client has so requested.
+ *
+ * call:  the thread call whose invocation just completed.
+ * group: the call's group; asserted equal to thread_call_get_group(call).
+ * s:     saved interrupt state, rewritten when the lock is dropped to free
+ *        the call.  It is dereferenced only on that free path, which a
+ *        THREAD_CALL_SIGNAL call never takes, so thread_call_delayed_timer()
+ *        passes NULL.
+ *
+ * Returns true only for a THREAD_CALL_SIGNAL call that had
+ * THREAD_CALL_RESCHEDULE set and is left with no deadline; the caller in
+ * thread_call_delayed_timer() then invokes it again.  Returns false otherwise.
+ *
+ * The group lock is dropped and retaken around zfree() and around the
+ * waiter wakeup.
+ */
+static bool
+thread_call_finish(thread_call_t call, thread_call_group_t group, spl_t *s)
+{
+       assert(thread_call_get_group(call) == group);
+
+       bool repend = false;
+       bool signal = call->tc_flags & THREAD_CALL_SIGNAL;
+       bool alloc = call->tc_flags & THREAD_CALL_ALLOC;
+
+       call->tc_finish_count++;
+
+       if (!signal && alloc) {
+               /* The thread call thread owns a ref until the call is finished */
+               if (call->tc_refs <= 0) {
+                       panic("thread_call_finish: detected over-released thread call: %p", call);
+               }
+               call->tc_refs--;
+       }
+
+       thread_call_flags_t old_flags = call->tc_flags; /* snapshot: the bits cleared on the next line are still tested below */
+       call->tc_flags &= ~(THREAD_CALL_RESCHEDULE | THREAD_CALL_RUNNING | THREAD_CALL_WAIT);
+
+       if ((!alloc || call->tc_refs != 0) &&
+           (old_flags & THREAD_CALL_RESCHEDULE) != 0) { /* a reschedule was requested and the call is not about to be freed */
+               assert(old_flags & THREAD_CALL_ONCE);
+               thread_call_flavor_t flavor = thread_call_get_flavor(call);
+
+               if (old_flags & THREAD_CALL_DELAYED) {
+                       uint64_t now = mach_absolute_time();
+                       if (flavor == TCF_CONTINUOUS) {
+                               now = absolutetime_to_continuoustime(now);
+                       }
+                       if (call->tc_soft_deadline <= now) {
+                               /* The deadline has already expired, go straight to pending */
+                               call->tc_flags &= ~(THREAD_CALL_DELAYED | THREAD_CALL_RATELIMITED);
+                               call->tc_pqlink.deadline = 0;
+                       }
+               }
+
+               if (call->tc_pqlink.deadline) {
+                       _delayed_call_enqueue(call, group, call->tc_pqlink.deadline, flavor);
+                       if (!signal) {
+                               _arm_delayed_call_timer(call, group, flavor);
+                       }
+               } else if (signal) {
+                       call->tc_submit_count++;
+                       repend = true; /* the only path that makes this function return true */
+               } else {
+                       _pending_call_enqueue(call, group, mach_absolute_time());
+               }
+       }
+
+       if (!signal && alloc && call->tc_refs == 0) {
+               if ((old_flags & THREAD_CALL_WAIT) != 0) {
+                       panic("Someone waiting on a thread call that is scheduled for free: %p\n", call->tc_func);
+               }
+
+               assert(call->tc_finish_count == call->tc_submit_count);
+
+               enable_ints_and_unlock(group, *s); /* the free happens with the group lock dropped */
+
+               zfree(thread_call_zone, call);
+
+               *s = disable_ints_and_lock(group); /* hand the refreshed interrupt state back to the caller */
+       }
+
+       if ((old_flags & THREAD_CALL_WAIT) != 0) {
+               /*
+                * This may wake up a thread with a registered sched_call.
+                * That call might need the group lock, so we drop the lock
+                * to avoid deadlocking.
+                *
+                * We also must use a separate waitq from the idle waitq, as
+                * this path goes waitq lock->thread lock->group lock, but
+                * the idle wait goes group lock->waitq_lock->thread_lock.
+                */
+               thread_call_unlock(group);
+
+               waitq_wakeup64_all(&group->waiters_waitq, CAST_EVENT64_T(call),
+                   THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
+
+               thread_call_lock_spin(group);
+               /* THREAD_CALL_SIGNAL call may have been freed */
+       }
+
+       return repend;
+}
+
+/*
+ * thread_call_invoke
+ *
+ * Invoke the function provided for this thread call
+ *
+ * Note that the thread call object can be deallocated by the function if we do not control its storage.
+ * For that reason the values the DTrace probes need (tc_ttd and the
+ * THREAD_CALL_DELAYED bit) are copied out of 'call' before func runs;
+ * afterwards 'call' is only passed along as a value, never dereferenced.
+ *
+ * 'call' is only referenced when CONFIG_DTRACE is set, hence __unused.
+ * The KERNEL_DEBUG_CONSTANT start/end tracepoints are compiled in only for
+ * DEVELOPMENT || DEBUG builds.
+ */
+static void __attribute__((noinline))
+thread_call_invoke(thread_call_func_t func,
+    thread_call_param_t param0,
+    thread_call_param_t param1,
+    __unused thread_call_t call)
+{
+#if DEVELOPMENT || DEBUG
+       KERNEL_DEBUG_CONSTANT(
+               MACHDBG_CODE(DBG_MACH_SCHED, MACH_CALLOUT) | DBG_FUNC_START,
+               VM_KERNEL_UNSLIDE(func), VM_KERNEL_ADDRHIDE(param0), VM_KERNEL_ADDRHIDE(param1), 0, 0);
+#endif /* DEVELOPMENT || DEBUG */
+
+#if CONFIG_DTRACE
+       uint64_t tc_ttd = call->tc_ttd; /* read before func runs; reused by the end probe */
+       boolean_t is_delayed = call->tc_flags & THREAD_CALL_DELAYED;
+       DTRACE_TMR6(thread_callout__start, thread_call_func_t, func, int, 0, int, (tc_ttd >> 32),
+           (unsigned) (tc_ttd & 0xFFFFFFFF), is_delayed, call);
+#endif
+
+       (*func)(param0, param1);
+
+#if CONFIG_DTRACE
+       DTRACE_TMR6(thread_callout__end, thread_call_func_t, func, int, 0, int, (tc_ttd >> 32),
+           (unsigned) (tc_ttd & 0xFFFFFFFF), is_delayed, call);
+#endif
+
+#if DEVELOPMENT || DEBUG
+       KERNEL_DEBUG_CONSTANT(
+               MACHDBG_CODE(DBG_MACH_SCHED, MACH_CALLOUT) | DBG_FUNC_END,
+               VM_KERNEL_UNSLIDE(func), 0, 0, 0, 0);
+#endif /* DEVELOPMENT || DEBUG */
  }
  
  /*
@@ -813,153 +1631,619 @@ __unused       thread_t                thread)
   */
  static void
  thread_call_thread(
-       thread_call_group_t             group)
+       thread_call_group_t             group,
+       wait_result_t                   wres) /* compared against THREAD_INTERRUPTED below; presumably the result of the wait this continuation resumed from - confirm */
  {
-       thread_t                self = current_thread();
+       thread_t self = current_thread();
  
-    (void) splsched();
-    simple_lock(&thread_call_lock);
+       if ((thread_get_tag_internal(self) & THREAD_TAG_CALLOUT) == 0) {
+               (void)thread_set_tag_internal(self, THREAD_TAG_CALLOUT); /* sched_call_thread() and the wait/cancel self-deadlock checks test this tag */
+       }
+
+       /*
+        * A wakeup with THREAD_INTERRUPTED indicates that
+        * we should terminate.
+        * thread_call_dealloc_timer() issues such a wakeup on the idle waitq.
+        */
+       if (wres == THREAD_INTERRUPTED) {
+               thread_terminate(self);
+
+               /* NOTREACHED */
+               panic("thread_terminate() returned?");
+       }
+
+       spl_t s = disable_ints_and_lock(group);
+
+       struct thread_call_thread_state thc_state = { .thc_group = group }; /* local to this function; self->thc_state is reset to NULL after the work loop */
+       self->thc_state = &thc_state;
  
         thread_sched_call(self, sched_call_thread);
  
-    while (group->pending_count > 0) {
-               thread_call_t                   call;
-               thread_call_func_t              func;
-               thread_call_param_t             param0, param1;
+       while (group->pending_count > 0) { /* drain the group's pending queue, one call per iteration */
+               thread_call_t call = qe_dequeue_head(&group->pending_queue,
+                   struct thread_call, tc_qlink);
+               assert(call != NULL);
  
-               call = TC(dequeue_head(&group->pending_queue));
                 group->pending_count--;
+               if (group->pending_count == 0) {
+                       assert(queue_empty(&group->pending_queue));
+               }
  
-               func = call->func;
-               param0 = call->param0;
-               param1 = call->param1;
-       
-               call->queue = NULL;
+               thread_call_func_t  func   = call->tc_func; /* func/params are copied out while the lock is still held */
+               thread_call_param_t param0 = call->tc_param0;
+               thread_call_param_t param1 = call->tc_param1;
  
-               _internal_call_release(call);
+               call->tc_queue = NULL;
  
-               simple_unlock(&thread_call_lock);
-               (void) spllo();
+               if (_is_internal_call(call)) {
+                       _internal_call_release(call);
+               }
+
+               /*
+                * Can only do wakeups for thread calls whose storage
+                * we control.
+                */
+               bool needs_finish = false;
+               if (call->tc_flags & THREAD_CALL_ALLOC) {
+                       call->tc_refs++;        /* Delay free until we're done */
+               }
+               if (call->tc_flags & (THREAD_CALL_ALLOC | THREAD_CALL_ONCE)) {
+                       /*
+                        * If THREAD_CALL_ONCE is used, and the timer wasn't
+                        * THREAD_CALL_ALLOC, then clients swear they will use
+                        * thread_call_cancel_wait() before destroying
+                        * the thread call.
+                        *
+                        * Else, the storage for the thread call might have
+                        * disappeared when thread_call_invoke() ran.
+                        */
+                       needs_finish = true;
+                       call->tc_flags |= THREAD_CALL_RUNNING;
+               }
+
+               thc_state.thc_call = call; /* the wait/cancel self-deadlock checks compare against this */
+               thc_state.thc_call_pending_timestamp = call->tc_pending_timestamp;
+               thc_state.thc_call_soft_deadline = call->tc_soft_deadline;
+               thc_state.thc_call_hard_deadline = call->tc_pqlink.deadline;
+               thc_state.thc_func = func;
+               thc_state.thc_param0 = param0;
+               thc_state.thc_param1 = param1;
+               thc_state.thc_IOTES_invocation_timestamp = 0;
  
-               KERNEL_DEBUG_CONSTANT(
-                       MACHDBG_CODE(DBG_MACH_SCHED,MACH_CALLOUT) | DBG_FUNC_NONE,
-                               func, param0, param1, 0, 0);
+               enable_ints_and_unlock(group, s); /* the callout runs with the group lock dropped */
  
-               (*func)(param0, param1);
+               thc_state.thc_call_start = mach_absolute_time();
  
-               (void)thread_funnel_set(self->funnel_lock, FALSE);              /* XXX */
+               thread_call_invoke(func, param0, param1, call);
  
-               (void) splsched();
-               simple_lock(&thread_call_lock);
-    }
+               thc_state.thc_call = NULL;
+
+               if (get_preemption_level() != 0) { /* panic if the preemption level is nonzero after the callout returns */
+                       int pl = get_preemption_level();
+                       panic("thread_call_thread: preemption_level %d, last callout %p(%p, %p)",
+                           pl, (void *)VM_KERNEL_UNSLIDE(func), param0, param1);
+               }
+
+               s = disable_ints_and_lock(group);
+
+               if (needs_finish) {
+                       /* Release refcount, may free, may temporarily drop lock */
+                       thread_call_finish(call, group, &s);
+               }
+       }
  
         thread_sched_call(self, NULL);
         group->active_count--;
  
-    if (group->idle_count < thread_call_thread_min) {
+       if (self->callout_woken_from_icontext && !self->callout_woke_thread) {
+               ledger_credit(self->t_ledger, task_ledgers.interrupt_wakeups, 1);
+               if (self->callout_woken_from_platform_idle) {
+                       ledger_credit(self->t_ledger, task_ledgers.platform_idle_wakeups, 1);
+               }
+       }
+
+       self->callout_woken_from_icontext = FALSE;
+       self->callout_woken_from_platform_idle = FALSE;
+       self->callout_woke_thread = FALSE;
+
+       self->thc_state = NULL; /* thc_state is a local of this function; clear the pointer before blocking or terminating */
+
+       if (group_isparallel(group)) {
+               /*
+                * For new style of thread group, thread always blocks.
+                * If we have more than the target number of threads,
+                * and this is the first to block, and it isn't active
+                * already, set a timer for deallocating a thread if we
+                * continue to have a surplus.
+                */
                 group->idle_count++;
  
-               wait_queue_assert_wait(&group->idle_wqueue, NULL, THREAD_UNINT, 0);
-       
-               simple_unlock(&thread_call_lock);
-               (void) spllo();
+               if (group->idle_count == 1) {
+                       group->idle_timestamp = mach_absolute_time(); /* idle timestamp is taken when the first thread goes idle */
+               }
+
+               if (((group->tcg_flags & TCG_DEALLOC_ACTIVE) == 0) &&
+                   ((group->active_count + group->idle_count) > group->target_thread_count)) {
+                       thread_call_start_deallocate_timer(group);
+               }
+
+               /* Wait for more work (or termination) */
+               wres = waitq_assert_wait64(&group->idle_waitq, CAST_EVENT64_T(group), THREAD_INTERRUPTIBLE, 0);
+               if (wres != THREAD_WAITING) {
+                       panic("kcall worker unable to assert wait?");
+               }
+
+               enable_ints_and_unlock(group, s);
  
                 thread_block_parameter((thread_continue_t)thread_call_thread, group);
-               /* NOTREACHED */
-    }
+       } else {
+               if (group->idle_count < group->target_thread_count) { /* non-parallel group: park only while below the target, otherwise fall through and terminate */
+                       group->idle_count++;
+
+                       waitq_assert_wait64(&group->idle_waitq, CAST_EVENT64_T(group), THREAD_UNINT, 0); /* Interrupted means to exit */
+
+                       enable_ints_and_unlock(group, s);
+
+                       thread_block_parameter((thread_continue_t)thread_call_thread, group);
+                       /* NOTREACHED */
+               }
+       }
  
-    simple_unlock(&thread_call_lock);
-    (void) spllo();
-    
-    thread_terminate(self);
+       enable_ints_and_unlock(group, s);
+
+       thread_terminate(self);
         /* NOTREACHED */
  }
  
+void
+thread_call_start_iotes_invocation(__assert_only thread_call_t call)
+{ /* Stamps the current worker's thc_state with the time of this call; a no-op on threads without THREAD_TAG_CALLOUT. */
+       thread_t self = current_thread();
+
+       if ((thread_get_tag_internal(self) & THREAD_TAG_CALLOUT) == 0) {
+               /* not a thread call thread, might be a workloop IOTES */
+               return;
+       }
+
+       assert(self->thc_state);
+       assert(self->thc_state->thc_call == call); /* 'call' is used only by these asserts, hence __assert_only */
+
+       self->thc_state->thc_IOTES_invocation_timestamp = mach_absolute_time(); /* thread_call_thread() zeroes this field before each callout */
+}
+
+
  /*
- *     thread_call_daemon:
+ *     thread_call_daemon: walk list of groups, allocating
+ *     threads if appropriate (as determined by
+ *     thread_call_group_should_add_thread()).
+ *
+ *     thread_call_daemon_awake is cleared before each pass over the groups
+ *     and re-read afterwards, so a wake request posted by thread_call_wake()
+ *     during a pass causes another pass.  The flag is checked once more
+ *     after the wait has been asserted; if it is set by then, the wait is
+ *     cancelled with clear_wait() before blocking.
+ *
+ *     active_count is bumped for each new thread before the group lock is
+ *     dropped around thread_call_thread_create().
   */
  static void
-thread_call_daemon_continue(
-       thread_call_group_t             group)
+thread_call_daemon_continue(__unused void *arg)
  {
-       kern_return_t   result;
-       thread_t                thread;
+       do {
+               os_atomic_store(&thread_call_daemon_awake, false, relaxed);
  
-    (void) splsched();
-    simple_lock(&thread_call_lock);
-        
-       while (group->active_count == 0 && group->pending_count > 0) {
-               group->active_count++;
+               /* Starting at zero happens to be high-priority first. */
+               for (int i = 0; i < THREAD_CALL_INDEX_MAX; i++) {
+                       thread_call_group_t group = &thread_call_groups[i];
+
+                       spl_t s = disable_ints_and_lock(group);
+
+                       while (thread_call_group_should_add_thread(group)) {
+                               group->active_count++;
+
+                               enable_ints_and_unlock(group, s);
+
+                               thread_call_thread_create(group);
+
+                               s = disable_ints_and_lock(group);
+                       }
  
-               simple_unlock(&thread_call_lock);
-               (void) spllo();
-       
-               result = kernel_thread_start_priority((thread_continue_t)thread_call_thread, group, BASEPRI_PREEMPT, &thread);
-               if (result != KERN_SUCCESS)
-                       panic("thread_call_daemon");
-
-               thread_deallocate(thread);
-
-               (void) splsched();
-               simple_lock(&thread_call_lock);
-    }
-
-    thread_call_daemon_awake = FALSE;
-    wait_queue_assert_wait(&group->daemon_wqueue, NULL, THREAD_UNINT, 0);
-    
-    simple_unlock(&thread_call_lock);
-       (void) spllo();
-    
-       thread_block_parameter((thread_continue_t)thread_call_daemon_continue, group);
+                       enable_ints_and_unlock(group, s);
+               }
+       } while (os_atomic_load(&thread_call_daemon_awake, relaxed));
+
+       waitq_assert_wait64(&daemon_waitq, CAST_EVENT64_T(&thread_call_daemon_awake), THREAD_UNINT, 0);
+
+       if (os_atomic_load(&thread_call_daemon_awake, relaxed)) { /* flag set again since the loop exited */
+               clear_wait(current_thread(), THREAD_AWAKENED);
+       }
+
+       thread_block_parameter((thread_continue_t)thread_call_daemon_continue, NULL);
         /* NOTREACHED */
  }
  
  static void
  thread_call_daemon(
-       thread_call_group_t             group)
+       __unused void    *arg)
  {
-       thread_t        self = current_thread();
+       thread_t        self = current_thread();
  
         self->options |= TH_OPT_VMPRIV;
-       vm_page_free_reserve(2);        /* XXX */
-    
-    thread_call_daemon_continue(group);
-    /* NOTREACHED */
+       vm_page_free_reserve(2);        /* XXX */
+
+       thread_set_thread_name(self, "thread_call_daemon");
+
+       thread_call_daemon_continue(NULL); /* enters the daemon loop, which blocks with itself as continuation rather than returning */
+       /* NOTREACHED */
  }
  
+/*
+ * Schedule timer to deallocate a worker thread if we have a surplus
+ * of threads (in excess of the group's target) and at least one thread
+ * is idle the whole time.
+ *
+ * The caller must have at least one idle thread and must not already have
+ * TCG_DEALLOC_ACTIVE set (both are asserted).  Sets TCG_DEALLOC_ACTIVE and
+ * arms group->dealloc_timer for
+ * group->idle_timestamp + thread_call_dealloc_interval_abs.
+ * The surplus test itself is done by the callers, not here.
+ */
  static void
-thread_call_delayed_timer(
-       timer_call_param_t                              p0,
-       __unused timer_call_param_t             p1
-)
-{
-    thread_call_t                      call;
-       thread_call_group_t             group = p0;
-       boolean_t                               new_pending = FALSE;
-       uint64_t                                timestamp;
-
-    simple_lock(&thread_call_lock);
-
-       timestamp = mach_absolute_time();
-    
-    call = TC(queue_first(&group->delayed_queue));
-    
-    while (!queue_end(&group->delayed_queue, qe(call))) {
-       if (call->deadline <= timestamp) {
-                       _pending_call_enqueue(call, group);
-                       new_pending = TRUE;
-               }
-               else
+thread_call_start_deallocate_timer(thread_call_group_t group)
+{
+       __assert_only bool already_enqueued;
+
+       assert(group->idle_count > 0);
+       assert((group->tcg_flags & TCG_DEALLOC_ACTIVE) == 0);
+
+       group->tcg_flags |= TCG_DEALLOC_ACTIVE;
+
+       uint64_t deadline = group->idle_timestamp + thread_call_dealloc_interval_abs;
+
+       already_enqueued = timer_call_enter(&group->dealloc_timer, deadline, 0);
+
+       assert(already_enqueued == false); /* the timer is expected not to be queued already while the flag was clear */
+}
+
+/* non-static so dtrace can find it rdar://problem/31156135&31379348
+ *
+ * Timer callback for one flavor of a group's delayed calls.
+ *
+ * p0: the thread_call_group_t; p1: the thread_call_flavor_t (both cast).
+ * An unknown flavor panics.
+ *
+ * Under the group lock, repeatedly takes the minimum entry of
+ * delayed_pqueues[flavor] and stops at the first one whose soft deadline
+ * is still in the future, or that is rate-limited with its hard deadline
+ * in the future while ml_timer_forced_evaluation() is FALSE.  A due
+ * THREAD_CALL_SIGNAL call is invoked right here with the lock dropped,
+ * again for as long as thread_call_finish() returns true; any other due
+ * call is moved to the pending queue.  Finally the delayed-call timer is
+ * re-armed.
+ */
+void
+thread_call_delayed_timer(timer_call_param_t p0, timer_call_param_t p1)
+{
+       thread_call_group_t  group  = (thread_call_group_t)  p0;
+       thread_call_flavor_t flavor = (thread_call_flavor_t) p1;
+
+       thread_call_t   call;
+       uint64_t        now;
+
+       thread_call_lock_spin(group);
+
+       if (flavor == TCF_CONTINUOUS) {
+               now = mach_continuous_time();
+       } else if (flavor == TCF_ABSOLUTE) {
+               now = mach_absolute_time();
+       } else {
+               panic("invalid timer flavor: %d", flavor);
+       }
+
+       while ((call = priority_queue_min(&group->delayed_pqueues[flavor],
+           struct thread_call, tc_pqlink)) != NULL) {
+               assert(thread_call_get_group(call) == group);
+               assert(thread_call_get_flavor(call) == flavor);
+
+               /*
+                * if we hit a call that isn't yet ready to expire,
+                * then we're done for now
+                * TODO: The next timer in the list could have a larger leeway
+                *       and therefore be ready to expire.
+                */
+               if (call->tc_soft_deadline > now) {
+                       break;
+               }
+
+               /*
+                * If we hit a rate-limited timer, don't eagerly wake it up.
+                * Wait until it reaches the end of the leeway window.
+                *
+                * TODO: What if the next timer is not rate-limited?
+                *       Have a separate rate-limited queue to avoid this
+                */
+               if ((call->tc_flags & THREAD_CALL_RATELIMITED) &&
+                   (call->tc_pqlink.deadline > now) &&
+                   (ml_timer_forced_evaluation() == FALSE)) {
                         break;
-           
-               call = TC(queue_first(&group->delayed_queue));
-    }
+               }
+
+               if (THREAD_CALL_SIGNAL & call->tc_flags) { /* signal calls are invoked from this timer callback, not queued as pending */
+                       __assert_only queue_head_t *old_queue;
+                       old_queue = thread_call_dequeue(call);
+                       assert(old_queue == &group->delayed_queues[flavor]);
+
+                       do {
+                               thread_call_func_t  func   = call->tc_func;
+                               thread_call_param_t param0 = call->tc_param0;
+                               thread_call_param_t param1 = call->tc_param1;
+
+                               call->tc_flags |= THREAD_CALL_RUNNING;
+
+                               thread_call_unlock(group);
+                               thread_call_invoke(func, param0, param1, call);
+                               thread_call_lock_spin(group);
  
-       if (!queue_end(&group->delayed_queue, qe(call)))
-               _set_delayed_call_timer(call, group);
+                               /* finish may detect that the call has been re-pended */
+                       } while (thread_call_finish(call, group, NULL)); /* NULL spl: finish dereferences it only on its free path, which signal calls skip */
+                       /* call may have been freed by the finish */
+               } else {
+                       _pending_call_enqueue(call, group, now);
+               }
+       }
+
+       _arm_delayed_call_timer(call, group, flavor); /* call is NULL (nothing left) or the entry the loop stopped at */
+
+       thread_call_unlock(group);
+}
+
+static void
+thread_call_delayed_timer_rescan(thread_call_group_t group,
+    thread_call_flavor_t flavor)
+{ /* Walks delayed_queues[flavor] under the group lock: due calls go to pending, others may be re-queued at their soft deadline; then re-arms the timer. */
+       thread_call_t call;
+       uint64_t now;
+
+       spl_t s = disable_ints_and_lock(group);
+
+       assert(ml_timer_forced_evaluation() == TRUE);
+
+       if (flavor == TCF_CONTINUOUS) {
+               now = mach_continuous_time();
+       } else {
+               now = mach_absolute_time();
+       }
+
+       qe_foreach_element_safe(call, &group->delayed_queues[flavor], tc_qlink) {
+               if (call->tc_soft_deadline <= now) {
+                       _pending_call_enqueue(call, group, now);
+               } else {
+                       uint64_t skew = call->tc_pqlink.deadline - call->tc_soft_deadline; /* gap between the hard and soft deadlines */
+                       assert(call->tc_pqlink.deadline >= call->tc_soft_deadline);
+                       /*
+                        * On a latency quality-of-service level change,
+                        * re-sort potentially rate-limited callout. The platform
+                        * layer determines which timers require this.
+                        *
+                        * This trick works by updating the deadline value to
+                        * equal soft-deadline, effectively crushing away
+                        * timer coalescing slop values for any armed
+                        * timer in the queue.
+                        *
+                        * TODO: keep a hint on the timer to tell whether its inputs changed, so we
+                        * only have to crush coalescing for timers that need it.
+                        *
+                        * TODO: Keep a separate queue of timers above the re-sort
+                        * threshold, so we only have to look at those.
+                        */
+                       if (timer_resort_threshold(skew)) {
+                               _call_dequeue(call, group);
+                               _delayed_call_enqueue(call, group, call->tc_soft_deadline, flavor);
+                       }
+               }
+       }
+
+       _arm_delayed_call_timer(NULL, group, flavor); /* no specific call is passed here, unlike thread_call_delayed_timer() */
+
+       enable_ints_and_unlock(group, s);
+}
  
-    simple_unlock(&thread_call_lock);
+void
+thread_call_delayed_timer_rescan_all(void)
+{ /* Runs thread_call_delayed_timer_rescan() for every group index below THREAD_CALL_INDEX_MAX and every flavor below TCF_COUNT. */
+       for (int i = 0; i < THREAD_CALL_INDEX_MAX; i++) {
+               for (thread_call_flavor_t flavor = 0; flavor < TCF_COUNT; flavor++) {
+                       thread_call_delayed_timer_rescan(&thread_call_groups[i], flavor); /* each call takes and releases that group's lock itself */
+               }
+       }
+}
+
+/*
+ * Timer callback to tell a thread to terminate if
+ * we have an excess of threads and at least one has been
+ * idle for a long time.
+ *
+ * p0: the thread_call_group_t (cast); p1 is unused.
+ *
+ * Runs under the group lock.  If the group has an idle thread and `now` is
+ * past idle_timestamp + thread_call_dealloc_interval_abs, one idle thread
+ * is woken with THREAD_INTERRUPTED, which thread_call_thread() treats as a
+ * request to terminate; failing to wake one panics.  TCG_DEALLOC_ACTIVE is
+ * always cleared, and the timer is re-armed while idle threads remain and
+ * idle + active still exceeds the group's target.
+ */
+static void
+thread_call_dealloc_timer(
+       timer_call_param_t              p0,
+       __unused timer_call_param_t     p1)
+{
+       thread_call_group_t group = (thread_call_group_t)p0;
+       uint64_t now;
+       kern_return_t res;
+       bool terminated = false;
+
+       thread_call_lock_spin(group);
+
+       assert(group->tcg_flags & TCG_DEALLOC_ACTIVE);
+
+       now = mach_absolute_time();
+
+       if (group->idle_count > 0) {
+               if (now > group->idle_timestamp + thread_call_dealloc_interval_abs) {
+                       terminated = true;
+                       group->idle_count--; /* the thread being retired is no longer counted as idle */
+                       res = waitq_wakeup64_one(&group->idle_waitq, CAST_EVENT64_T(group),
+                           THREAD_INTERRUPTED, WAITQ_ALL_PRIORITIES);
+                       if (res != KERN_SUCCESS) {
+                               panic("Unable to wake up idle thread for termination?");
+                       }
+               }
+       }
+
+       group->tcg_flags &= ~TCG_DEALLOC_ACTIVE; /* thread_call_start_deallocate_timer() asserts the flag is clear */
+
+       /*
+        * If we still have an excess of threads, schedule another
+        * invocation of this function.
+        */
+       if (group->idle_count > 0 && (group->idle_count + group->active_count > group->target_thread_count)) {
+               /*
+                * If we killed someone just now, push out the
+                * next deadline.
+                */
+               if (terminated) {
+                       group->idle_timestamp = now;
+               }
+
+               thread_call_start_deallocate_timer(group);
+       }
+
+       thread_call_unlock(group);
+}
+
+/*
+ * Wait for the invocation of the thread call to complete
+ * We know there's only one in flight because of the 'once' flag.
+ *
+ * If a subsequent invocation comes in before we wake up, that's OK
+ *
+ * TODO: Here is where we will add priority inheritance to the thread executing
+ * the thread call in case it's lower priority than the current thread
+ *      <rdar://problem/30321792> Priority inheritance for thread_call_wait_once
+ *
+ * Takes the thread call lock locked, returns unlocked
+ *      This lets us avoid a spurious take/drop after waking up from thread_block
+ *
+ * This thread could be a thread call thread itself, blocking and therefore making a
+ * sched_call upcall into the thread call subsystem, needing the group lock.
+ * However, we're saved from deadlock because the 'block' upcall is made in
+ * thread_block, not in assert_wait.
+ *
+ * call: must have both THREAD_CALL_ALLOC and THREAD_CALL_ONCE set (asserted).
+ * s:    the interrupt state saved when the caller took the group lock.
+ *
+ * Returns false without blocking when THREAD_CALL_RUNNING is not set, and
+ * true after having blocked on the group's waiters_waitq until woken.
+ * Panics if the wait cannot be asserted or the wake result is not
+ * THREAD_AWAKENED.
+ */
+static bool
+thread_call_wait_once_locked(thread_call_t call, spl_t s)
+{
+       assert(call->tc_flags & THREAD_CALL_ALLOC);
+       assert(call->tc_flags & THREAD_CALL_ONCE);
+
+       thread_call_group_t group = thread_call_get_group(call);
+
+       if ((call->tc_flags & THREAD_CALL_RUNNING) == 0) {
+               enable_ints_and_unlock(group, s);
+               return false;
+       }
+
+       /* call is running, so we have to wait for it */
+       call->tc_flags |= THREAD_CALL_WAIT; /* thread_call_finish() issues the wakeup only when it sees this flag */
+
+       wait_result_t res = waitq_assert_wait64(&group->waiters_waitq, CAST_EVENT64_T(call), THREAD_UNINT, 0);
+       if (res != THREAD_WAITING) {
+               panic("Unable to assert wait: %d", res);
+       }
+
+       enable_ints_and_unlock(group, s);
+
+       res = thread_block(THREAD_CONTINUE_NULL);
+       if (res != THREAD_AWAKENED) {
+               panic("Awoken with %d?", res);
+       }
+
+       /* returns unlocked */
+       return true;
+}
+
+/*
+ * Wait for an in-flight invocation to complete
+ * Does NOT try to cancel, so the client doesn't need to hold their
+ * lock while calling this function.
+ *
+ * Returns whether or not it had to wait.
+ *
+ * Only works for THREAD_CALL_ONCE calls.
+ *
+ * Panics if the call lacks THREAD_CALL_ALLOC or THREAD_CALL_ONCE, if
+ * interrupts are disabled on entry, or if the caller is the callout
+ * thread currently running this very call.
+ */
+boolean_t
+thread_call_wait_once(thread_call_t call)
+{
+       if ((call->tc_flags & THREAD_CALL_ALLOC) == 0) {
+               panic("thread_call_wait_once: can't wait on thread call whose storage I don't own");
+       }
+
+       if ((call->tc_flags & THREAD_CALL_ONCE) == 0) {
+               panic("thread_call_wait_once: can't wait_once on a non-once call");
+       }
+
+       if (!ml_get_interrupts_enabled()) {
+               panic("unsafe thread_call_wait_once");
+       }
+
+       thread_t self = current_thread();
+
+       if ((thread_get_tag_internal(self) & THREAD_TAG_CALLOUT) &&
+           self->thc_state && self->thc_state->thc_call == call) { /* thc_call is set by thread_call_thread() for the duration of each callout */
+               panic("thread_call_wait_once: deadlock waiting on self from inside call: %p to function %p",
+                   call, call->tc_func);
+       }
+
+       thread_call_group_t group = thread_call_get_group(call);
+
+       spl_t s = disable_ints_and_lock(group);
+
+       bool waited = thread_call_wait_once_locked(call, s);
+       /* thread call lock unlocked */
+
+       return waited;
+}
+
+
+/*
+ * Wait for all requested invocations of a thread call prior to now
+ * to finish.  Can only be invoked on thread calls whose storage we manage.
+ * Just waits for the finish count to catch up to the submit count we find
+ * at the beginning of our wait.
+ *
+ * Called with thread_call_lock held.  Returns with lock released.
+ *
+ * 's' is the spl_t value that goes with the held lock; it is handed to
+ * enable_ints_and_unlock() and replaced by the result of each
+ * disable_ints_and_lock() when the lock is re-taken after a wakeup.
+ * Panics if the wait cannot be asserted or if thread_block() returns
+ * anything other than THREAD_AWAKENED.
+ */
+static void
+thread_call_wait_locked(thread_call_t call, spl_t s)
+{
+       thread_call_group_t group = thread_call_get_group(call);
+
+       assert(call->tc_flags & THREAD_CALL_ALLOC);
+       /* snapshot taken once: the loop below never re-reads tc_submit_count */
+       uint64_t submit_count = call->tc_submit_count;
+
+       while (call->tc_finish_count < submit_count) {
+               call->tc_flags |= THREAD_CALL_WAIT;
+               /* assert the wait on the group's waiter queue, keyed by the call pointer, before unlocking */
+               wait_result_t res = waitq_assert_wait64(&group->waiters_waitq,
+                   CAST_EVENT64_T(call), THREAD_UNINT, 0);
+
+               if (res != THREAD_WAITING) {
+                       panic("Unable to assert wait: %d", res);
+               }
+               /* drop the lock before blocking */
+               enable_ints_and_unlock(group, s);
+
+               res = thread_block(THREAD_CONTINUE_NULL);
+               if (res != THREAD_AWAKENED) {
+                       panic("Awoken with %d?", res);
+               }
+               /* re-take the lock so the loop condition re-reads tc_finish_count under it */
+               s = disable_ints_and_lock(group);
+       }
+       /* the loop exits with the lock held; release it as the header comment promises */
+       enable_ints_and_unlock(group, s);
+}
+
+/*
+ * Determine whether a thread call is either on a queue or
+ * currently being executed.
+ *
+ * Implemented as a comparison of the call's counters: returns true when
+ * tc_submit_count is greater than tc_finish_count, with both fields read
+ * while holding the lock of the call's group.
+ * NOTE(review): the lock is dropped before returning, so the answer is
+ * presumably only a snapshot -- confirm what callers rely on.
+ */
+boolean_t
+thread_call_isactive(thread_call_t call)
+{
+       thread_call_group_t group = thread_call_get_group(call);
+
+       spl_t s = disable_ints_and_lock(group);
+       boolean_t active = (call->tc_submit_count > call->tc_finish_count);
+       enable_ints_and_unlock(group, s);
+
+       return active;
+}
+
+/*
+ * adjust_cont_time_thread_calls
+ * on wake, reenqueue delayed call timer for continuous time thread call groups
+ *
+ * Visits each of the THREAD_CALL_INDEX_MAX entries of thread_call_groups[]
+ * and, while holding that group's lock, calls _arm_delayed_call_timer()
+ * for the TCF_CONTINUOUS flavor only.
+ * NOTE(review): the NULL first argument presumably means "no particular
+ * call is being added" -- confirm against _arm_delayed_call_timer.
+ */
+void
+adjust_cont_time_thread_calls(void)
+{
+       for (int i = 0; i < THREAD_CALL_INDEX_MAX; i++) {
+               thread_call_group_t group = &thread_call_groups[i];
+               spl_t s = disable_ints_and_lock(group);
+
+               /* only the continuous timers need to be re-armed */
+
+               _arm_delayed_call_timer(NULL, group, TCF_CONTINUOUS);
+               enable_ints_and_unlock(group, s);
+       }
  }