git.saurik.com Git - apple/xnu.git/blobdiff - osfmk/kern/waitq.c (xnu-7195.101.1)
index 1a38d76feb9a34676279de9f07859cbc6ddba3e5..1674f379aa0d8909d6e337ed3cd5539686683ffe 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2015-2020 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -66,6 +66,7 @@
 #include <kern/kern_types.h>
 #include <kern/ltable.h>
 #include <kern/mach_param.h>
+#include <kern/percpu.h>
 #include <kern/queue.h>
 #include <kern/sched_prim.h>
 #include <kern/simple_lock.h>
@@ -122,9 +123,16 @@ static kern_return_t waitq_select_thread_locked(struct waitq *waitq,
     event64_t event,
     thread_t thread, spl_t *spl);
 
-#define WAITQ_SET_MAX (task_max * 3)
-static zone_t waitq_set_zone;
+ZONE_DECLARE(waitq_set_zone, "waitq sets",
+    sizeof(struct waitq_set), ZC_NOENCRYPT);
 
+/* waitq prepost cache */
+#define WQP_CACHE_MAX   50
+struct wqp_cache {
+       uint64_t        head;
+       unsigned int    avail;
+};
+static struct wqp_cache PERCPU_DATA(wqp_cache);
 
 #define P2ROUNDUP(x, align) (-(-((uint32_t)(x)) & -(align)))
 #define ROUNDDOWN(x, y)  (((x)/(y))*(y))
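
The P2ROUNDUP/ROUNDDOWN macros above (unchanged context) rely on two's-complement negation: negating x, masking off the alignment's low bits, and negating back rounds x up to a power-of-two boundary. A minimal stand-alone check, for illustration only and not part of this commit (plain user-space C, same expressions as the macros):

    #include <assert.h>
    #include <stdint.h>

    #define P2ROUNDUP(x, align) (-(-((uint32_t)(x)) & -(align)))  /* align must be a power of two */
    #define ROUNDDOWN(x, y)     (((x)/(y))*(y))

    int
    main(void)
    {
        /* -(x) is rounded down to a multiple of align by the mask, then negated back up */
        assert(P2ROUNDUP(13, 8) == 16);  /* rounds up            */
        assert(P2ROUNDUP(16, 8) == 16);  /* exact multiples stay */
        assert(ROUNDDOWN(13, 8) == 8);
        return 0;
    }
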
@@ -134,7 +142,7 @@ static zone_t waitq_set_zone;
 static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip);
 #endif
 
-lck_grp_t waitq_lck_grp;
+LCK_GRP_DECLARE(waitq_lck_grp, "waitq");
 
 #if __arm64__
 
@@ -164,7 +172,7 @@ lck_grp_t waitq_lck_grp;
  * Prepost callback function for specially marked waitq sets
  * (prepost alternative)
  */
-extern void waitq_set__CALLING_PREPOST_HOOK__(void *ctx, void *memberctx, int priority);
+extern void waitq_set__CALLING_PREPOST_HOOK__(waitq_set_prepost_hook_t *ctx);
 
 #define DEFAULT_MIN_FREE_TABLE_ELEM    100
 static uint32_t g_min_free_table_elem;
@@ -744,7 +752,7 @@ wq_prepost_refill_cpu_cache(uint32_t nalloc)
        }
 
        disable_preemption();
-       cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
+       cache = PERCPU_GET(wqp_cache);
 
        /* check once more before putting these elements on the list */
        if (cache->avail >= WQP_CACHE_MAX) {
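
Every PROCESSOR_DATA -> PERCPU_GET conversion in this file follows the same shape: preemption is disabled so the thread cannot migrate off the CPU whose cache it is touching, the per-CPU slot declared with PERCPU_DATA(wqp_cache) is fetched, and preemption is re-enabled once the pointer is no longer needed. A condensed sketch of that pattern, paraphrased from the surrounding hunks (not a new function introduced by this commit):

    struct wqp_cache *cache;
    unsigned int avail;

    disable_preemption();            /* pin to the current CPU                               */
    cache = PERCPU_GET(wqp_cache);   /* was &PROCESSOR_DATA(current_processor(), wqp_cache)  */
    avail = cache->avail;            /* safe to dereference: no migration while preempt off  */
    enable_preemption();
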
@@ -776,14 +784,14 @@ wq_prepost_ensure_free_space(void)
        struct wqp_cache *cache;
 
        if (g_min_free_cache == 0) {
-               g_min_free_cache = (WQP_CACHE_MAX * ml_get_max_cpus());
+               g_min_free_cache = (WQP_CACHE_MAX * ml_wait_max_cpus());
        }
 
        /*
         * Ensure that we always have a pool of per-CPU prepost elements
         */
        disable_preemption();
-       cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
+       cache = PERCPU_GET(wqp_cache);
        free_elem = cache->avail;
        enable_preemption();
 
@@ -828,7 +836,7 @@ wq_prepost_alloc(int type, int nelem)
         * allocating RESERVED elements
         */
        disable_preemption();
-       cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
+       cache = PERCPU_GET(wqp_cache);
        if (nelem <= (int)cache->avail) {
                struct lt_elem *first, *next = NULL;
                int nalloc = nelem;
@@ -1048,7 +1056,7 @@ wq_prepost_release_rlist(struct wq_prepost *wqp)
         * if our cache is running low.
         */
        disable_preemption();
-       cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
+       cache = PERCPU_GET(wqp_cache);
        if (cache->avail < WQP_CACHE_MAX) {
                struct lt_elem *tmp = NULL;
                if (cache->head != LT_IDX_MAX) {
@@ -1121,14 +1129,14 @@ restart:
                        /* the caller wants to remove the only prepost here */
                        assert(wqp_id == wqset->wqset_prepost_id);
                        wqset->wqset_prepost_id = 0;
-               /* fall through */
+                       OS_FALLTHROUGH;
                case WQ_ITERATE_CONTINUE:
                        wq_prepost_put(wqp);
                        ret = WQ_ITERATE_SUCCESS;
                        break;
                case WQ_ITERATE_RESTART:
                        wq_prepost_put(wqp);
-               /* fall through */
+                       OS_FALLTHROUGH;
                case WQ_ITERATE_DROPPED:
                        goto restart;
                default:
@@ -1196,7 +1204,7 @@ restart:
                        goto next_prepost;
                case WQ_ITERATE_RESTART:
                        wq_prepost_put(wqp);
-               /* fall-through */
+                       OS_FALLTHROUGH;
                case WQ_ITERATE_DROPPED:
                        /* the callback dropped the ref to wqp: just restart */
                        goto restart;
@@ -1706,7 +1714,7 @@ waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip)
                skip = 0;
        }
        memset(buf, 0, (NWAITQ_BTFRAMES + skip) * sizeof(uintptr_t));
-       backtrace(buf, g_nwaitq_btframes + skip);
+       backtrace(buf, g_nwaitq_btframes + skip, NULL);
        memcpy(&bt[0], &buf[skip], NWAITQ_BTFRAMES * sizeof(uintptr_t));
 }
 #else /* no stats */
@@ -1809,19 +1817,29 @@ waitq_irq_safe(struct waitq *waitq)
        return waitq->waitq_irq;
 }
 
-struct waitq *
-waitq_get_safeq(struct waitq *waitq)
+static inline bool
+waitq_empty(struct waitq *wq)
 {
-       struct waitq *safeq;
+       if (waitq_is_turnstile_queue(wq)) {
+               return priority_queue_empty(&wq->waitq_prio_queue);
+       } else if (waitq_is_turnstile_proxy(wq)) {
+               struct turnstile *ts = wq->waitq_ts;
+               return ts == TURNSTILE_NULL ||
+                      priority_queue_empty(&ts->ts_waitq.waitq_prio_queue);
+       } else {
+               return queue_empty(&wq->waitq_queue);
+       }
+}
 
+static struct waitq *
+waitq_get_safeq(struct waitq *waitq)
+{
        /* Check if it's a port waitq */
-       if (waitq_is_port_queue(waitq)) {
-               assert(!waitq_irq_safe(waitq));
-               safeq = ipc_port_rcv_turnstile_waitq(waitq);
-       } else {
-               safeq = global_eventq(waitq);
+       if (waitq_is_turnstile_proxy(waitq)) {
+               struct turnstile *ts = waitq->waitq_ts;
+               return ts ? &ts->ts_waitq : NULL;
        }
-       return safeq;
+       return global_eventq(waitq);
 }
 
 static uint32_t
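
Note the changed contract in the hunk above: waitq_get_safeq() may now return NULL, namely for a SYNC_POLICY_TURNSTILE_PROXY waitq that has not been donated a turnstile yet, whereas the old code always resolved port waitqs through ipc_port_rcv_turnstile_waitq(). The later hunks update every caller with one of two idioms, summarized here (paraphrased from this diff, not additional new code):

    safeq = waitq_get_safeq(waitq);
    if (safeq == NULL) {
        /* select/clear paths: a proxy without a turnstile has no queue and
         * therefore no waiters, so fall through to the waitq-set walk */
        goto handle_waitq_set;
    }

    /* assert_wait/clear_wait paths instead treat a missing turnstile as fatal: */
    if (__improbable(safeq == NULL)) {
        panic("Trying to assert_wait on a turnstile proxy "
            "that hasn't been donated one (waitq: %p)", waitq);
    }
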
@@ -1850,29 +1868,8 @@ waitq_thread_insert(struct waitq *wq,
     thread_t thread, boolean_t fifo)
 {
        if (waitq_is_turnstile_queue(wq)) {
-               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-                   (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_ADDED_TO_TURNSTILE_WAITQ))) | DBG_FUNC_NONE,
-                   VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq)),
-                   thread_tid(thread),
-                   thread->base_pri, 0, 0);
-
                turnstile_stats_update(0, TSU_TURNSTILE_BLOCK_COUNT, NULL);
-
-               /*
-                * For turnstile queues (which use priority queues),
-                * insert the thread in the heap based on its current
-                * base_pri. Note that the priority queue implementation
-                * is currently not stable, so does not maintain fifo for
-                * threads at the same base_pri. Also, if the base_pri
-                * of the thread changes while its blocked in the waitq,
-                * the thread position should be updated in the priority
-                * queue by calling priority queue increase/decrease
-                * operations.
-                */
-               priority_queue_entry_init(&(thread->wait_prioq_links));
-               priority_queue_insert(&wq->waitq_prio_queue,
-                   &thread->wait_prioq_links, thread->base_pri,
-                   PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+               turnstile_waitq_add_thread_priority_queue(wq, thread);
        } else {
                turnstile_stats_update(0, TSU_REGULAR_WAITQ_BLOCK_COUNT, NULL);
                if (fifo) {
@@ -1893,8 +1890,7 @@ waitq_thread_remove(struct waitq *wq,
                    VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq)),
                    thread_tid(thread),
                    0, 0, 0);
-               priority_queue_remove(&wq->waitq_prio_queue, &thread->wait_prioq_links,
-                   PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+               priority_queue_remove(&wq->waitq_prio_queue, &thread->wait_prioq_links);
        } else {
                remqueue(&(thread->wait_links));
        }
@@ -1912,8 +1908,6 @@ waitq_bootstrap(void)
        }
        wqdbg("Minimum free table elements: %d", tmp32);
 
-       lck_grp_init(&waitq_lck_grp, "waitq", LCK_GRP_ATTR_NULL);
-
        /*
         * Determine the amount of memory we're willing to reserve for
         * the waitqueue hash table
@@ -1964,12 +1958,6 @@ waitq_bootstrap(void)
                waitq_init(&global_waitqs[i], SYNC_POLICY_FIFO | SYNC_POLICY_DISABLE_IRQ);
        }
 
-       waitq_set_zone = zinit(sizeof(struct waitq_set),
-           WAITQ_SET_MAX * sizeof(struct waitq_set),
-           sizeof(struct waitq_set),
-           "waitq sets");
-       zone_change(waitq_set_zone, Z_NOENCRYPT, TRUE);
-
        /* initialize the global waitq link table */
        wql_init();
 
@@ -2059,6 +2047,7 @@ struct waitq_select_args {
        event64_t        event;
        waitq_select_cb  select_cb;
        void            *select_ctx;
+       int             priority;
 
        uint64_t        *reserved_preposts;
 
@@ -2119,16 +2108,13 @@ waitq_select_walk_cb(struct waitq *waitq, void *ctx,
         */
        do_waitq_select_n_locked(&args);
 
-       if (*(args.nthreads) > 0 ||
-           (args.threadq && !queue_empty(args.threadq))) {
+       if (*args.nthreads > 0 || (args.threadq && !queue_empty(args.threadq))) {
                /* at least 1 thread was selected and returned: don't prepost */
-               if (args.max_threads > 0 &&
-                   *(args.nthreads) >= args.max_threads) {
+               if (args.max_threads > 0 && *args.nthreads >= args.max_threads) {
                        /* break out of the setid walk */
                        ret = WQ_ITERATE_FOUND;
                }
-               goto out_unlock;
-       } else {
+       } else if (args.event == NO_EVENT64) {
                /*
                 * No thread selected: prepost 'waitq' to 'wqset'
                 * if wqset can handle preposts and the event is set to 0.
@@ -2139,14 +2125,39 @@ waitq_select_walk_cb(struct waitq *waitq, void *ctx,
                 * callout function and pass the set's 'prepost_hook.' This
                 * could potentially release another thread to handle events.
                 */
-               if (args.event == NO_EVENT64) {
-                       if (waitq_set_can_prepost(wqset)) {
-                               wq_prepost_do_post_locked(
-                                       wqset, waitq, args.reserved_preposts);
-                       } else if (waitq_set_has_prepost_hook(wqset)) {
-                               waitq_set__CALLING_PREPOST_HOOK__(
-                                       wqset->wqset_prepost_hook, waitq, 0);
-                       }
+               if (waitq_set_can_prepost(wqset)) {
+                       wq_prepost_do_post_locked(
+                               wqset, waitq, args.reserved_preposts);
+               } else if (waitq_set_has_prepost_hook(wqset)) {
+                       waitq_set_prepost_hook_t *hook = wqset->wqset_prepost_hook;
+
+                       /*
+                        * When calling out to the prepost hook,
+                        * we drop the waitq lock, to allow for the kevent
+                        * subsystem to call into the waitq subsystem again,
+                        * without risking a deadlock.
+                        *
+                        * However, we need to guard against wqset going away,
+                        * so we increment the prepost hook use count
+                        * while the lock is dropped.
+                        *
+                        * This lets waitq_set_deinit() know to wait for the
+                        * prepost hook call to be done before it can proceed.
+                        *
+                        * Note: we need to keep preemption disabled the whole
+                        * time as waitq_set_deinit will spin on this.
+                        */
+
+                       disable_preemption();
+                       os_atomic_add(hook, (uint16_t)1, relaxed);
+                       waitq_set_unlock(wqset);
+
+                       waitq_set__CALLING_PREPOST_HOOK__(hook);
+
+                       /* Note: after this decrement, the wqset may be deallocated */
+                       os_atomic_add(hook, (uint16_t)-1, relaxed);
+                       enable_preemption();
+                       return ret;
                }
        }
 
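
The comment block added in the hunk above is the core of this change: the kevent prepost hook must be invoked with the wqset lock dropped, so the caller first takes a use on the hook's counter, and waitq_set_deinit() (see the hunk further down) spins until that counter drains before tearing the set down; preemption stays disabled so the window deinit may spin on is bounded. Below is a user-space model of just that drain handshake, with hypothetical names and pthread/stdatomic stand-ins for the kernel primitives; it is an illustration only and omits the wqset validity check the real code also performs under the lock:

    #include <pthread.h>
    #include <sched.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t wqset_lock = PTHREAD_MUTEX_INITIALIZER;
    static atomic_uint     hook_uses;       /* models the use count in wqset_prepost_hook */
    static int             wqset_alive = 1;

    static void
    prepost_hook(void)
    {
        usleep(1000);                       /* pretend kevent re-enters the waitq code here */
    }

    static void *
    poster(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&wqset_lock);
        /* take a use on the hook, then drop the lock before calling out */
        atomic_fetch_add_explicit(&hook_uses, 1, memory_order_relaxed);
        pthread_mutex_unlock(&wqset_lock);

        prepost_hook();

        /* after this decrement the set may be torn down at any moment */
        atomic_fetch_sub_explicit(&hook_uses, 1, memory_order_relaxed);
        return NULL;
    }

    static void *
    destroyer(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&wqset_lock);
        /* mirror of waitq_set_deinit(): wait for in-flight hook calls to drain */
        while (atomic_load_explicit(&hook_uses, memory_order_relaxed) != 0) {
            pthread_mutex_unlock(&wqset_lock);
            sched_yield();                  /* the kernel uses delay(1) instead */
            pthread_mutex_lock(&wqset_lock);
        }
        wqset_alive = 0;                    /* now safe to free the set */
        pthread_mutex_unlock(&wqset_lock);
        return NULL;
    }

    int
    main(void)
    {
        pthread_t t1, t2;
        pthread_create(&t1, NULL, poster, NULL);
        pthread_create(&t2, NULL, destroyer, NULL);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        printf("wqset_alive=%d\n", wqset_alive);
        return 0;
    }
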
@@ -2285,8 +2296,7 @@ waitq_prioq_iterate_locked(struct waitq *safeq, struct waitq *waitq,
 
                if (remove_op) {
                        thread = priority_queue_remove_max(&safeq->waitq_prio_queue,
-                           struct thread, wait_prioq_links,
-                           PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+                           struct thread, wait_prioq_links);
                } else {
                        /* For the peek operation, the only valid value for max_threads is 1 */
                        assert(max_threads == 1);
@@ -2324,6 +2334,13 @@ waitq_prioq_iterate_locked(struct waitq *safeq, struct waitq *waitq,
 
                if (first_thread == THREAD_NULL) {
                        first_thread = thread;
+                       /*
+                        * turnstile_kernel_update_inheritor_on_wake_locked will lock
+                        * first_thread, so call it before locking it.
+                        */
+                       if (args->priority == WAITQ_PROMOTE_ON_WAKE && first_thread != THREAD_NULL && waitq_is_turnstile_queue(safeq)) {
+                               turnstile_kernel_update_inheritor_on_wake_locked(waitq_to_turnstile(safeq), (turnstile_inheritor_t)first_thread, TURNSTILE_INHERITOR_THREAD);
+                       }
                }
 
                /* For the peek operation, break out early */
@@ -2378,6 +2395,15 @@ do_waitq_select_n_locked(struct waitq_select_args *args)
                /* JMM - add flag to waitq to avoid global lookup if no waiters */
                eventmask = _CAST_TO_EVENT_MASK(waitq);
                safeq = waitq_get_safeq(waitq);
+               if (safeq == NULL) {
+                       /*
+                        * in the WQT_TSPROXY case, if there's no turnstile,
+                        * there's no queue and no waiters, so we can move straight
+                        * to the waitq set recursion
+                        */
+                       goto handle_waitq_set;
+               }
+
                if (*nthreads == 0) {
                        spl = splsched();
                }
@@ -2431,6 +2457,7 @@ do_waitq_select_n_locked(struct waitq_select_args *args)
                /* we know this is the first (and only) thread */
                ++(*nthreads);
                *(args->spl) = (safeq != waitq) ? spl : splsched();
+
                thread_lock(first_thread);
                thread_clear_waitq_state(first_thread);
                waitq_thread_remove(safeq, first_thread);
@@ -2454,6 +2481,7 @@ do_waitq_select_n_locked(struct waitq_select_args *args)
                return;
        }
 
+handle_waitq_set:
        /*
         * wait queues that are not in any sets
         * are the bottom of the recursion
@@ -2510,7 +2538,8 @@ waitq_select_n_locked(struct waitq *waitq,
     void *select_ctx,
     uint64_t *reserved_preposts,
     queue_t threadq,
-    int max_threads, spl_t *spl)
+    int max_threads, spl_t *spl,
+    int priority)
 {
        int nthreads = 0;
 
@@ -2520,6 +2549,7 @@ waitq_select_n_locked(struct waitq *waitq,
                .event = event,
                .select_cb = select_cb,
                .select_ctx = select_ctx,
+               .priority = priority,
                .reserved_preposts = reserved_preposts,
                .threadq = threadq,
                .max_threads = max_threads,
@@ -2547,14 +2577,13 @@ waitq_select_one_locked(struct waitq *waitq, event64_t event,
     uint64_t *reserved_preposts,
     int priority, spl_t *spl)
 {
-       (void)priority;
        int nthreads;
        queue_head_t threadq;
 
        queue_init(&threadq);
 
        nthreads = waitq_select_n_locked(waitq, event, NULL, NULL,
-           reserved_preposts, &threadq, 1, spl);
+           reserved_preposts, &threadq, 1, spl, priority);
 
        /* if we selected a thread, return it (still locked) */
        if (!queue_empty(&threadq)) {
@@ -2569,96 +2598,6 @@ waitq_select_one_locked(struct waitq *waitq, event64_t event,
        return THREAD_NULL;
 }
 
-struct find_max_pri_ctx {
-       integer_t max_sched_pri;
-       integer_t max_base_pri;
-       thread_t highest_thread;
-};
-
-/**
- * callback function that finds the max priority thread
- *
- * Conditions:
- *      'waitq' is locked
- *      'thread' is not locked
- */
-static thread_t
-waitq_find_max_pri_cb(void         *ctx_in,
-    __unused struct waitq *waitq,
-    __unused int           is_global,
-    thread_t      thread)
-{
-       struct find_max_pri_ctx *ctx = (struct find_max_pri_ctx *)ctx_in;
-
-       /*
-        * thread is not locked, use pri as a hint only
-        * wake up the highest base pri, and find the highest sched pri at that base pri
-        */
-       integer_t sched_pri = *(volatile int16_t *)&thread->sched_pri;
-       integer_t base_pri  = *(volatile int16_t *)&thread->base_pri;
-
-       if (ctx->highest_thread == THREAD_NULL ||
-           (base_pri > ctx->max_base_pri) ||
-           (base_pri == ctx->max_base_pri && sched_pri > ctx->max_sched_pri)) {
-               /* don't select the thread, just update ctx */
-
-               ctx->max_sched_pri  = sched_pri;
-               ctx->max_base_pri   = base_pri;
-               ctx->highest_thread = thread;
-       }
-
-       return THREAD_NULL;
-}
-
-/**
- * select from a waitq the highest priority thread waiting for a given event
- *
- * Conditions:
- *     'waitq' is locked
- *
- * Returns:
- *     A locked thread that's been removed from the waitq, but has not
- *     yet been put on a run queue. Caller is responsible to call splx
- *     with the '*spl' value.
- */
-static thread_t
-waitq_select_max_locked(struct waitq *waitq, event64_t event,
-    uint64_t *reserved_preposts,
-    spl_t *spl)
-{
-       __assert_only int nthreads;
-       assert(!waitq->waitq_set_id); /* doesn't support recursive sets */
-
-       struct find_max_pri_ctx ctx = {
-               .max_sched_pri = 0,
-               .max_base_pri = 0,
-               .highest_thread = THREAD_NULL,
-       };
-
-       /*
-        * Scan the waitq to find the highest priority thread.
-        * This doesn't remove any thread from the queue
-        */
-       nthreads = waitq_select_n_locked(waitq, event,
-           waitq_find_max_pri_cb,
-           &ctx, reserved_preposts, NULL, 1, spl);
-
-       assert(nthreads == 0);
-
-       if (ctx.highest_thread != THREAD_NULL) {
-               __assert_only kern_return_t ret;
-
-               /* Remove only the thread we just found */
-               ret = waitq_select_thread_locked(waitq, event, ctx.highest_thread, spl);
-
-               assert(ret == KERN_SUCCESS);
-               return ctx.highest_thread;
-       }
-
-       return THREAD_NULL;
-}
-
-
 struct select_thread_ctx {
        thread_t      thread;
        event64_t     event;
 struct select_thread_ctx {
        thread_t      thread;
        event64_t     event;
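
The roughly ninety lines deleted above (find_max_pri_ctx, waitq_find_max_pri_cb and waitq_select_max_locked) implemented a separate scan-then-remove pass for max-priority wakeups. The removal appears to be possible because priority is now threaded through waitq_select_n_locked() (see the new 'priority' argument and the WAITQ_PROMOTE_ON_WAKE handling in the hunks above), and because turnstile-backed waitqs keep their waiters in a priority queue, so priority_queue_remove_max() already yields the highest-priority thread. The callers that used to branch on WAITQ_SELECT_MAX_PRI collapse to a single shape, shown here for orientation (this mirrors the waitq_wakeup64_one_locked and waitq_wakeup64_identify_locked hunks below; it is not additional new code):

    thread = waitq_select_one_locked(waitq, wake_event,
        reserved_preposts, priority, &th_spl);
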
@@ -2757,13 +2696,22 @@ waitq_select_thread_locked(struct waitq *waitq,
        kern_return_t kr;
        spl_t s;
 
-       s = splsched();
-
        /* Find and lock the interrupts disabled queue the thread is actually on */
        if (!waitq_irq_safe(waitq)) {
                safeq = waitq_get_safeq(waitq);
+               if (safeq == NULL) {
+                       /*
+                        * in the WQT_TSPROXY case, if there's no turnstile,
+                        * there's no queue and no waiters, so we can move straight
+                        * to the waitq set recursion
+                        */
+                       goto handle_waitq_set;
+               }
+
+               s = splsched();
                waitq_lock(safeq);
        } else {
+               s = splsched();
                safeq = waitq;
        }
 
@@ -2788,6 +2736,7 @@ waitq_select_thread_locked(struct waitq *waitq,
 
        splx(s);
 
+handle_waitq_set:
        if (!waitq->waitq_set_id) {
                return KERN_NOT_WAITING;
        }
@@ -2898,6 +2847,10 @@ waitq_assert_wait64_locked(struct waitq *waitq,
         */
        if (!waitq_irq_safe(waitq)) {
                safeq = waitq_get_safeq(waitq);
+               if (__improbable(safeq == NULL)) {
+                       panic("Trying to assert_wait on a turnstile proxy "
+                           "that hasn't been donated one (waitq: %p)", waitq);
+               }
                eventmask = _CAST_TO_EVENT_MASK(waitq);
                waitq_lock(safeq);
        } else {
@@ -3001,6 +2954,10 @@ waitq_pull_thread_locked(struct waitq *waitq, thread_t thread)
        /* Find the interrupts disabled queue thread is waiting on */
        if (!waitq_irq_safe(waitq)) {
                safeq = waitq_get_safeq(waitq);
+               if (__improbable(safeq == NULL)) {
+                       panic("Trying to clear_wait on a turnstile proxy "
+                           "that hasn't been donated one (waitq: %p)", waitq);
+               }
        } else {
                safeq = waitq;
        }
@@ -3051,9 +3008,6 @@ maybe_adjust_thread_pri(thread_t   thread,
                }
 
                sched_thread_promote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, trace_waitq);
-       } else if (priority > 0) {
-               /* Mutex subsystem wants to see this thread before we 'go' it */
-               lck_mtx_wakeup_adjust_pri(thread, priority);
        }
 }
 
@@ -3123,7 +3077,7 @@ waitq_wakeup64_all_locked(struct waitq *waitq,
 
        nthreads = waitq_select_n_locked(waitq, wake_event, NULL, NULL,
            reserved_preposts,
-           &wakeup_queue, -1, &th_spl);
+           &wakeup_queue, -1, &th_spl, priority);
 
        /* set each thread running */
        ret = KERN_NOT_WAITING;
@@ -3140,7 +3094,7 @@ waitq_wakeup64_all_locked(struct waitq *waitq,
                assert_thread_magic(thread);
                remqueue(&thread->wait_links);
                maybe_adjust_thread_pri(thread, priority, waitq);
-               ret = thread_go(thread, result);
+               ret = thread_go(thread, result, WQ_OPTION_NONE);
                assert(ret == KERN_SUCCESS);
                thread_unlock(thread);
        }
@@ -3168,23 +3122,17 @@ waitq_wakeup64_one_locked(struct waitq *waitq,
     wait_result_t result,
     uint64_t *reserved_preposts,
     int priority,
-    waitq_lock_state_t lock_state)
+    waitq_lock_state_t lock_state,
+    waitq_options_t option)
 {
        thread_t thread;
        spl_t th_spl;
 
        assert(waitq_held(waitq));
 
-       if (priority == WAITQ_SELECT_MAX_PRI) {
-               thread = waitq_select_max_locked(waitq, wake_event,
-                   reserved_preposts,
-                   &th_spl);
-       } else {
-               thread = waitq_select_one_locked(waitq, wake_event,
-                   reserved_preposts,
-                   priority, &th_spl);
-       }
-
+       thread = waitq_select_one_locked(waitq, wake_event,
+           reserved_preposts,
+           priority, &th_spl);
 
        if (thread != THREAD_NULL) {
                waitq_stats_count_wakeup(waitq);
@@ -3198,7 +3146,7 @@ waitq_wakeup64_one_locked(struct waitq *waitq,
 
        if (thread != THREAD_NULL) {
                maybe_adjust_thread_pri(thread, priority, waitq);
-               kern_return_t ret = thread_go(thread, result);
+               kern_return_t ret = thread_go(thread, result, option);
                assert(ret == KERN_SUCCESS);
                thread_unlock(thread);
                splx(th_spl);
@@ -3233,15 +3181,9 @@ waitq_wakeup64_identify_locked(struct waitq     *waitq,
 
        assert(waitq_held(waitq));
 
-       if (priority == WAITQ_SELECT_MAX_PRI) {
-               thread = waitq_select_max_locked(waitq, wake_event,
-                   reserved_preposts,
-                   spl);
-       } else {
-               thread = waitq_select_one_locked(waitq, wake_event,
-                   reserved_preposts,
-                   priority, spl);
-       }
+       thread = waitq_select_one_locked(waitq, wake_event,
+           reserved_preposts,
+           priority, spl);
 
        if (thread != THREAD_NULL) {
                waitq_stats_count_wakeup(waitq);
@@ -3255,7 +3197,7 @@ waitq_wakeup64_identify_locked(struct waitq     *waitq,
 
        if (thread != THREAD_NULL) {
                kern_return_t __assert_only ret;
-               ret = thread_go(thread, result);
+               ret = thread_go(thread, result, WQ_OPTION_NONE);
                assert(ret == KERN_SUCCESS);
        }
 
@@ -3309,7 +3251,7 @@ waitq_wakeup64_thread_locked(struct waitq *waitq,
                return KERN_NOT_WAITING;
        }
 
-       ret = thread_go(thread, result);
+       ret = thread_go(thread, result, WQ_OPTION_NONE);
        assert(ret == KERN_SUCCESS);
        thread_unlock(thread);
        splx(th_spl);
@@ -3341,8 +3283,12 @@ waitq_init(struct waitq *waitq, int policy)
        waitq->waitq_fifo = ((policy & SYNC_POLICY_REVERSED) == 0);
        waitq->waitq_irq = !!(policy & SYNC_POLICY_DISABLE_IRQ);
        waitq->waitq_prepost = 0;
-       waitq->waitq_type = WQT_QUEUE;
-       waitq->waitq_turnstile_or_port = !!(policy & SYNC_POLICY_TURNSTILE);
+       if (policy & SYNC_POLICY_TURNSTILE_PROXY) {
+               waitq->waitq_type = WQT_TSPROXY;
+       } else {
+               waitq->waitq_type = WQT_QUEUE;
+       }
+       waitq->waitq_turnstile = !!(policy & SYNC_POLICY_TURNSTILE);
        waitq->waitq_eventmask = 0;
 
        waitq->waitq_set_id = 0;
@@ -3351,9 +3297,11 @@ waitq_init(struct waitq *waitq, int policy)
        waitq_lock_init(waitq);
        if (waitq_is_turnstile_queue(waitq)) {
                /* For turnstile, initialize it as a priority queue */
-               priority_queue_init(&waitq->waitq_prio_queue,
-                   PRIORITY_QUEUE_BUILTIN_MAX_HEAP);
+               priority_queue_init(&waitq->waitq_prio_queue);
                assert(waitq->waitq_fifo == 0);
+       } else if (policy & SYNC_POLICY_TURNSTILE_PROXY) {
+               waitq->waitq_ts = TURNSTILE_NULL;
+               waitq->waitq_tspriv = NULL;
        } else {
                queue_init(&waitq->waitq_queue);
        }
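
With WQT_TSPROXY introduced in the hunks above, waitq_init() now sets up one of three backing representations depending on the policy bits. A hypothetical caller-side illustration (policy names and fields are taken from this diff; the calls themselves are not new API):

    struct waitq wq_fifo, wq_ts, wq_proxy;

    /* WQT_QUEUE backed by a plain queue_head_t (waitq_queue) */
    waitq_init(&wq_fifo, SYNC_POLICY_FIFO);

    /* WQT_QUEUE with waitq_turnstile set: backed by the waitq_prio_queue priority queue, never FIFO */
    waitq_init(&wq_ts, SYNC_POLICY_TURNSTILE);

    /* WQT_TSPROXY: no queue of its own; waitq_ts stays TURNSTILE_NULL until a
     * turnstile is donated, and waitq_get_safeq() returns NULL until then */
    waitq_init(&wq_proxy, SYNC_POLICY_TURNSTILE_PROXY);
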
@@ -3438,7 +3386,12 @@ waitq_deinit(struct waitq *waitq)
 {
        spl_t s;
 
-       if (!waitq || !waitq_is_queue(waitq)) {
+       assert(waitq);
+       if (!waitq_is_valid(waitq)) {
+               return;
+       }
+
+       if (!waitq_is_queue(waitq) && !waitq_is_turnstile_proxy(waitq)) {
                return;
        }
 
@@ -3446,25 +3399,33 @@ waitq_deinit(struct waitq *waitq)
                s = splsched();
        }
        waitq_lock(waitq);
-       if (!waitq_valid(waitq)) {
-               waitq_unlock(waitq);
-               if (waitq_irq_safe(waitq)) {
-                       splx(s);
+
+       if (waitq_valid(waitq)) {
+               waitq->waitq_isvalid = 0;
+               if (!waitq_irq_safe(waitq)) {
+                       waitq_unlink_all_unlock(waitq);
+                       /* waitq unlocked and set links deallocated */
+                       goto out;
                }
-               return;
        }
 
-       waitq->waitq_isvalid = 0;
-
-       if (!waitq_irq_safe(waitq)) {
-               waitq_unlink_all_unlock(waitq);
-               /* waitq unlocked and set links deallocated */
-       } else {
-               waitq_unlock(waitq);
+       waitq_unlock(waitq);
+       if (waitq_irq_safe(waitq)) {
                splx(s);
        }
 
-       assert(waitq_empty(waitq));
+out:
+#if MACH_ASSERT
+       if (waitq_is_turnstile_queue(waitq)) {
+               assert(priority_queue_empty(&waitq->waitq_prio_queue));
+       } else if (waitq_is_turnstile_proxy(waitq)) {
+               assert(waitq->waitq_ts == TURNSTILE_NULL);
+       } else {
+               assert(queue_empty(&waitq->waitq_queue));
+       }
+#else
+       (void)0;
+#endif // MACH_ASSERT
 }
 
 void
@@ -3508,7 +3469,7 @@ wqset_clear_prepost_chain_cb(struct waitq_set __unused *wqset,
  *     NULL on failure
  */
 struct waitq_set *
-waitq_set_alloc(int policy, void *prepost_hook)
+waitq_set_alloc(int policy, waitq_set_prepost_hook_t *prepost_hook)
 {
        struct waitq_set *wqset;
 
@@ -3537,7 +3498,7 @@ waitq_set_alloc(int policy, void *prepost_hook)
 kern_return_t
 waitq_set_init(struct waitq_set *wqset,
     int policy, uint64_t *reserved_link,
-    void *prepost_hook)
+    waitq_set_prepost_hook_t *prepost_hook)
 {
        struct waitq_link *link;
        kern_return_t ret;
@@ -3677,6 +3638,20 @@ waitq_set_deinit(struct waitq_set *wqset)
 
        waitq_set_lock(wqset);
 
+       if (waitq_set_has_prepost_hook(wqset)) {
+               waitq_set_prepost_hook_t *hook = wqset->wqset_prepost_hook;
+               /*
+                * If the wqset_prepost_hook value is non 0,
+                * then another core is currently posting to this waitq set
+                * and we need for it to finish what it's doing.
+                */
+               while (os_atomic_load(hook, relaxed) != 0) {
+                       waitq_set_unlock(wqset);
+                       delay(1);
+                       waitq_set_lock(wqset);
+               }
+       }
+
        set_id = wqset->wqset_id;
 
        if (waitqs_is_linked(wqset) || set_id == 0) {
@@ -4969,7 +4944,7 @@ waitq_alloc_prepost_reservation(int nalloc, struct waitq *waitq,
         */
        if (waitq) {
                disable_preemption();
-               cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
+               cache = PERCPU_GET(wqp_cache);
                if (nalloc <= (int)cache->avail) {
                        goto do_alloc;
                }
@@ -5569,7 +5544,7 @@ waitq_wakeup64_one(struct waitq *waitq, event64_t wake_event,
 
        /* waitq is locked upon return */
        kr = waitq_wakeup64_one_locked(waitq, wake_event, result,
-           &reserved_preposts, priority, WAITQ_UNLOCK);
+           &reserved_preposts, priority, WAITQ_UNLOCK, WQ_OPTION_NONE);
 
        if (waitq_irq_safe(waitq)) {
                splx(spl);
@@ -5661,7 +5636,7 @@ waitq_wakeup64_thread(struct waitq *waitq,
        waitq_unlock(waitq);
 
        if (ret == KERN_SUCCESS) {
-               ret = thread_go(thread, result);
+               ret = thread_go(thread, result, WQ_OPTION_NONE);
                assert(ret == KERN_SUCCESS);
                thread_unlock(thread);
                splx(th_spl);