git.saurik.com Git - apple/xnu.git/blobdiff - osfmk/kern/waitq.c (xnu-7195.101.1)
index 1a38d76feb9a34676279de9f07859cbc6ddba3e5..1674f379aa0d8909d6e337ed3cd5539686683ffe 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2015-2020 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -66,6 +66,7 @@
 #include <kern/kern_types.h>
 #include <kern/ltable.h>
 #include <kern/mach_param.h>
+#include <kern/percpu.h>
 #include <kern/queue.h>
 #include <kern/sched_prim.h>
 #include <kern/simple_lock.h>
@@ -122,9 +123,16 @@ static kern_return_t waitq_select_thread_locked(struct waitq *waitq,
     event64_t event,
     thread_t thread, spl_t *spl);
 
-#define WAITQ_SET_MAX (task_max * 3)
-static zone_t waitq_set_zone;
+ZONE_DECLARE(waitq_set_zone, "waitq sets",
+    sizeof(struct waitq_set), ZC_NOENCRYPT);
 
+/* waitq prepost cache */
+#define WQP_CACHE_MAX   50
+struct wqp_cache {
+       uint64_t        head;
+       unsigned int    avail;
+};
+static struct wqp_cache PERCPU_DATA(wqp_cache);
 
 #define P2ROUNDUP(x, align) (-(-((uint32_t)(x)) & -(align)))
 #define ROUNDDOWN(x, y)  (((x)/(y))*(y))
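
The P2ROUNDUP/ROUNDDOWN macros above (unchanged context) rely on two's-complement negation: negating x, masking off the alignment's low bits, and negating back rounds x up to a power-of-two boundary. A minimal stand-alone check, for illustration only and not part of this commit (plain user-space C, same expressions as the macros):

    #include <assert.h>
    #include <stdint.h>

    #define P2ROUNDUP(x, align) (-(-((uint32_t)(x)) & -(align)))  /* align must be a power of two */
    #define ROUNDDOWN(x, y)     (((x)/(y))*(y))

    int
    main(void)
    {
        /* -(x) is rounded down to a multiple of align by the mask, then negated back up */
        assert(P2ROUNDUP(13, 8) == 16);  /* rounds up            */
        assert(P2ROUNDUP(16, 8) == 16);  /* exact multiples stay */
        assert(ROUNDDOWN(13, 8) == 8);
        return 0;
    }
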
@@ -134,7 +142,7 @@ static zone_t waitq_set_zone;
 static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip);
 #endif
 
-lck_grp_t waitq_lck_grp;
+LCK_GRP_DECLARE(waitq_lck_grp, "waitq");
 
 #if __arm64__
 
@@ -164,7 +172,7 @@ lck_grp_t waitq_lck_grp;
  * Prepost callback function for specially marked waitq sets
  * (prepost alternative)
  */
-extern void waitq_set__CALLING_PREPOST_HOOK__(void *ctx, void *memberctx, int priority);
+extern void waitq_set__CALLING_PREPOST_HOOK__(waitq_set_prepost_hook_t *ctx);
 
 #define DEFAULT_MIN_FREE_TABLE_ELEM    100
 static uint32_t g_min_free_table_elem;
@@ -744,7 +752,7 @@ wq_prepost_refill_cpu_cache(uint32_t nalloc)
        }
 
        disable_preemption();
-       cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
+       cache = PERCPU_GET(wqp_cache);
 
        /* check once more before putting these elements on the list */
        if (cache->avail >= WQP_CACHE_MAX) {
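
Every PROCESSOR_DATA -> PERCPU_GET conversion in this file follows the same shape: preemption is disabled so the thread cannot migrate off the CPU whose cache it is touching, the per-CPU slot declared with PERCPU_DATA(wqp_cache) is fetched, and preemption is re-enabled once the pointer is no longer needed. A condensed sketch of that pattern, paraphrased from the surrounding hunks (not a new function introduced by this commit):

    struct wqp_cache *cache;
    unsigned int avail;

    disable_preemption();            /* pin to the current CPU                               */
    cache = PERCPU_GET(wqp_cache);   /* was &PROCESSOR_DATA(current_processor(), wqp_cache)  */
    avail = cache->avail;            /* safe to dereference: no migration while preempt off  */
    enable_preemption();
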
@@ -776,14 +784,14 @@ wq_prepost_ensure_free_space(void)
        struct wqp_cache *cache;
 
        if (g_min_free_cache == 0) {
-               g_min_free_cache = (WQP_CACHE_MAX * ml_get_max_cpus());
+               g_min_free_cache = (WQP_CACHE_MAX * ml_wait_max_cpus());
        }
 
        /*
         * Ensure that we always have a pool of per-CPU prepost elements
         */
        disable_preemption();
-       cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
+       cache = PERCPU_GET(wqp_cache);
        free_elem = cache->avail;
        enable_preemption();
 
@@ -828,7 +836,7 @@ wq_prepost_alloc(int type, int nelem)
         * allocating RESERVED elements
         */
        disable_preemption();
-       cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
+       cache = PERCPU_GET(wqp_cache);
        if (nelem <= (int)cache->avail) {
                struct lt_elem *first, *next = NULL;
                int nalloc = nelem;
@@ -1048,7 +1056,7 @@ wq_prepost_release_rlist(struct wq_prepost *wqp)
         * if our cache is running low.
         */
        disable_preemption();
-       cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
+       cache = PERCPU_GET(wqp_cache);
        if (cache->avail < WQP_CACHE_MAX) {
                struct lt_elem *tmp = NULL;
                if (cache->head != LT_IDX_MAX) {
@@ -1121,14 +1129,14 @@ restart:
                        /* the caller wants to remove the only prepost here */
                        assert(wqp_id == wqset->wqset_prepost_id);
                        wqset->wqset_prepost_id = 0;
-               /* fall through */
+                       OS_FALLTHROUGH;
                case WQ_ITERATE_CONTINUE:
                        wq_prepost_put(wqp);
                        ret = WQ_ITERATE_SUCCESS;
                        break;
                case WQ_ITERATE_RESTART:
                        wq_prepost_put(wqp);
-               /* fall through */
+                       OS_FALLTHROUGH;
                case WQ_ITERATE_DROPPED:
                        goto restart;
                default:
@@ -1196,7 +1204,7 @@ restart:
                        goto next_prepost;
                case WQ_ITERATE_RESTART:
                        wq_prepost_put(wqp);
-               /* fall-through */
+                       OS_FALLTHROUGH;
                case WQ_ITERATE_DROPPED:
                        /* the callback dropped the ref to wqp: just restart */
                        goto restart;
@@ -1706,7 +1714,7 @@ waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip)
                skip = 0;
        }
        memset(buf, 0, (NWAITQ_BTFRAMES + skip) * sizeof(uintptr_t));
-       backtrace(buf, g_nwaitq_btframes + skip);
+       backtrace(buf, g_nwaitq_btframes + skip, NULL);
        memcpy(&bt[0], &buf[skip], NWAITQ_BTFRAMES * sizeof(uintptr_t));
 }
 #else /* no stats */
@@ -1809,19 +1817,29 @@ waitq_irq_safe(struct waitq *waitq)
        return waitq->waitq_irq;
 }
 
-struct waitq *
-waitq_get_safeq(struct waitq *waitq)
+static inline bool
+waitq_empty(struct waitq *wq)
 {
-       struct waitq *safeq;
+       if (waitq_is_turnstile_queue(wq)) {
+               return priority_queue_empty(&wq->waitq_prio_queue);
+       } else if (waitq_is_turnstile_proxy(wq)) {
+               struct turnstile *ts = wq->waitq_ts;
+               return ts == TURNSTILE_NULL ||
+                      priority_queue_empty(&ts->ts_waitq.waitq_prio_queue);
+       } else {
+               return queue_empty(&wq->waitq_queue);
+       }
+}
 
+static struct waitq *
+waitq_get_safeq(struct waitq *waitq)
+{
        /* Check if it's a port waitq */
-       if (waitq_is_port_queue(waitq)) {
-               assert(!waitq_irq_safe(waitq));
-               safeq = ipc_port_rcv_turnstile_waitq(waitq);
-       } else {
-               safeq = global_eventq(waitq);
+       if (waitq_is_turnstile_proxy(waitq)) {
+               struct turnstile *ts = waitq->waitq_ts;
+               return ts ? &ts->ts_waitq : NULL;
        }
-       return safeq;
+       return global_eventq(waitq);
 }
 
 static uint32_t
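
Note the changed contract in the hunk above: waitq_get_safeq() may now return NULL, namely for a SYNC_POLICY_TURNSTILE_PROXY waitq that has not been donated a turnstile yet, whereas the old code always resolved port waitqs through ipc_port_rcv_turnstile_waitq(). The later hunks update every caller with one of two idioms, summarized here (paraphrased from this diff, not additional new code):

    safeq = waitq_get_safeq(waitq);
    if (safeq == NULL) {
        /* select/clear paths: a proxy without a turnstile has no queue and
         * therefore no waiters, so fall through to the waitq-set walk */
        goto handle_waitq_set;
    }

    /* assert_wait/clear_wait paths instead treat a missing turnstile as fatal: */
    if (__improbable(safeq == NULL)) {
        panic("Trying to assert_wait on a turnstile proxy "
            "that hasn't been donated one (waitq: %p)", waitq);
    }
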
@@ -1850,29 +1868,8 @@ waitq_thread_insert(struct waitq *wq,
     thread_t thread, boolean_t fifo)
 {
        if (waitq_is_turnstile_queue(wq)) {
-               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-                   (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_ADDED_TO_TURNSTILE_WAITQ))) | DBG_FUNC_NONE,
-                   VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq)),
-                   thread_tid(thread),
-                   thread->base_pri, 0, 0);
-
                turnstile_stats_update(0, TSU_TURNSTILE_BLOCK_COUNT, NULL);
-
-               /*
-                * For turnstile queues (which use priority queues),
-                * insert the thread in the heap based on its current
-                * base_pri. Note that the priority queue implementation
-                * is currently not stable, so does not maintain fifo for
-                * threads at the same base_pri. Also, if the base_pri
-                * of the thread changes while its blocked in the waitq,
-                * the thread position should be updated in the priority
-                * queue by calling priority queue increase/decrease
-                * operations.
-                */
-               priority_queue_entry_init(&(thread->wait_prioq_links));
-               priority_queue_insert(&wq->waitq_prio_queue,
-                   &thread->wait_prioq_links, thread->base_pri,
-                   PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+               turnstile_waitq_add_thread_priority_queue(wq, thread);
        } else {
                turnstile_stats_update(0, TSU_REGULAR_WAITQ_BLOCK_COUNT, NULL);
                if (fifo) {
@@ -1893,8 +1890,7 @@ waitq_thread_remove(struct waitq *wq,
                    VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq)),
                    thread_tid(thread),
                    0, 0, 0);
-               priority_queue_remove(&wq->waitq_prio_queue, &thread->wait_prioq_links,
-                   PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+               priority_queue_remove(&wq->waitq_prio_queue, &thread->wait_prioq_links);
        } else {
                remqueue(&(thread->wait_links));
        }
@@ -1912,8 +1908,6 @@ waitq_bootstrap(void)
        }
        wqdbg("Minimum free table elements: %d", tmp32);
 
-       lck_grp_init(&waitq_lck_grp, "waitq", LCK_GRP_ATTR_NULL);
-
        /*
         * Determine the amount of memory we're willing to reserve for
         * the waitqueue hash table
@@ -1964,12 +1958,6 @@ waitq_bootstrap(void)
                waitq_init(&global_waitqs[i], SYNC_POLICY_FIFO | SYNC_POLICY_DISABLE_IRQ);
        }
 
-       waitq_set_zone = zinit(sizeof(struct waitq_set),
-           WAITQ_SET_MAX * sizeof(struct waitq_set),
-           sizeof(struct waitq_set),
-           "waitq sets");
-       zone_change(waitq_set_zone, Z_NOENCRYPT, TRUE);
-
        /* initialize the global waitq link table */
        wql_init();
 
@@ -2059,6 +2047,7 @@ struct waitq_select_args {
        event64_t        event;
        waitq_select_cb  select_cb;
        void            *select_ctx;
+       int             priority;
 
        uint64_t        *reserved_preposts;
 
@@ -2119,16 +2108,13 @@ waitq_select_walk_cb(struct waitq *waitq, void *ctx,
         */
        do_waitq_select_n_locked(&args);
 
-       if (*(args.nthreads) > 0 ||
-           (args.threadq && !queue_empty(args.threadq))) {
+       if (*args.nthreads > 0 || (args.threadq && !queue_empty(args.threadq))) {
                /* at least 1 thread was selected and returned: don't prepost */
-               if (args.max_threads > 0 &&
-                   *(args.nthreads) >= args.max_threads) {
+               if (args.max_threads > 0 && *args.nthreads >= args.max_threads) {
                        /* break out of the setid walk */
                        ret = WQ_ITERATE_FOUND;
                }
-               goto out_unlock;
-       } else {
+       } else if (args.event == NO_EVENT64) {
                /*
                 * No thread selected: prepost 'waitq' to 'wqset'
                 * if wqset can handle preposts and the event is set to 0.
@@ -2139,14 +2125,39 @@ waitq_select_walk_cb(struct waitq *waitq, void *ctx,
                 * callout function and pass the set's 'prepost_hook.' This
                 * could potentially release another thread to handle events.
                 */
-               if (args.event == NO_EVENT64) {
-                       if (waitq_set_can_prepost(wqset)) {
-                               wq_prepost_do_post_locked(
-                                       wqset, waitq, args.reserved_preposts);
-                       } else if (waitq_set_has_prepost_hook(wqset)) {
-                               waitq_set__CALLING_PREPOST_HOOK__(
-                                       wqset->wqset_prepost_hook, waitq, 0);
-                       }
+               if (waitq_set_can_prepost(wqset)) {
+                       wq_prepost_do_post_locked(
+                               wqset, waitq, args.reserved_preposts);
+               } else if (waitq_set_has_prepost_hook(wqset)) {
+                       waitq_set_prepost_hook_t *hook = wqset->wqset_prepost_hook;
+
+                       /*
+                        * When calling out to the prepost hook,
+                        * we drop the waitq lock, to allow for the kevent
+                        * subsystem to call into the waitq subsystem again,
+                        * without risking a deadlock.
+                        *
+                        * However, we need to guard against wqset going away,
+                        * so we increment the prepost hook use count
+                        * while the lock is dropped.
+                        *
+                        * This lets waitq_set_deinit() know to wait for the
+                        * prepost hook call to be done before it can proceed.
+                        *
+                        * Note: we need to keep preemption disabled the whole
+                        * time as waitq_set_deinit will spin on this.
+                        */
+
+                       disable_preemption();
+                       os_atomic_add(hook, (uint16_t)1, relaxed);
+                       waitq_set_unlock(wqset);
+
+                       waitq_set__CALLING_PREPOST_HOOK__(hook);
+
+                       /* Note: after this decrement, the wqset may be deallocated */
+                       os_atomic_add(hook, (uint16_t)-1, relaxed);
+                       enable_preemption();
+                       return ret;
                }
        }
 
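
The comment block added in the hunk above is the core of this change: the kevent prepost hook must be invoked with the wqset lock dropped, so the caller first takes a use on the hook's counter, and waitq_set_deinit() (see the hunk further down) spins until that counter drains before tearing the set down; preemption stays disabled so the window deinit may spin on is bounded. Below is a user-space model of just that drain handshake, with hypothetical names and pthread/stdatomic stand-ins for the kernel primitives; it is an illustration only and omits the wqset validity check the real code also performs under the lock:

    #include <pthread.h>
    #include <sched.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t wqset_lock = PTHREAD_MUTEX_INITIALIZER;
    static atomic_uint     hook_uses;       /* models the use count in wqset_prepost_hook */
    static int             wqset_alive = 1;

    static void
    prepost_hook(void)
    {
        usleep(1000);                       /* pretend kevent re-enters the waitq code here */
    }

    static void *
    poster(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&wqset_lock);
        /* take a use on the hook, then drop the lock before calling out */
        atomic_fetch_add_explicit(&hook_uses, 1, memory_order_relaxed);
        pthread_mutex_unlock(&wqset_lock);

        prepost_hook();

        /* after this decrement the set may be torn down at any moment */
        atomic_fetch_sub_explicit(&hook_uses, 1, memory_order_relaxed);
        return NULL;
    }

    static void *
    destroyer(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&wqset_lock);
        /* mirror of waitq_set_deinit(): wait for in-flight hook calls to drain */
        while (atomic_load_explicit(&hook_uses, memory_order_relaxed) != 0) {
            pthread_mutex_unlock(&wqset_lock);
            sched_yield();                  /* the kernel uses delay(1) instead */
            pthread_mutex_lock(&wqset_lock);
        }
        wqset_alive = 0;                    /* now safe to free the set */
        pthread_mutex_unlock(&wqset_lock);
        return NULL;
    }

    int
    main(void)
    {
        pthread_t t1, t2;
        pthread_create(&t1, NULL, poster, NULL);
        pthread_create(&t2, NULL, destroyer, NULL);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        printf("wqset_alive=%d\n", wqset_alive);
        return 0;
    }
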
@@ -2285,8 +2296,7 @@ waitq_prioq_iterate_locked(struct waitq *safeq, struct waitq *waitq,
 
                if (remove_op) {
                        thread = priority_queue_remove_max(&safeq->waitq_prio_queue,
-                           struct thread, wait_prioq_links,
-                           PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+                           struct thread, wait_prioq_links);
                } else {
                        /* For the peek operation, the only valid value for max_threads is 1 */
                        assert(max_threads == 1);
@@ -2324,6 +2334,13 @@ waitq_prioq_iterate_locked(struct waitq *safeq, struct waitq *waitq,
 
                if (first_thread == THREAD_NULL) {
                        first_thread = thread;
+                       /*
+                        * turnstile_kernel_update_inheritor_on_wake_locked will lock
+                        * first_thread, so call it before locking it.
+                        */
+                       if (args->priority == WAITQ_PROMOTE_ON_WAKE && first_thread != THREAD_NULL && waitq_is_turnstile_queue(safeq)) {
+                               turnstile_kernel_update_inheritor_on_wake_locked(waitq_to_turnstile(safeq), (turnstile_inheritor_t)first_thread, TURNSTILE_INHERITOR_THREAD);
+                       }
                }
 
                /* For the peek operation, break out early */
@@ -2378,6 +2395,15 @@ do_waitq_select_n_locked(struct waitq_select_args *args)
                /* JMM - add flag to waitq to avoid global lookup if no waiters */
                eventmask = _CAST_TO_EVENT_MASK(waitq);
                safeq = waitq_get_safeq(waitq);
+               if (safeq == NULL) {
+                       /*
+                        * in the WQT_TSPROXY case, if there's no turnstile,
+                        * there's no queue and no waiters, so we can move straight
+                        * to the waitq set recursion
+                        */
+                       goto handle_waitq_set;
+               }
+
                if (*nthreads == 0) {
                        spl = splsched();
                }
@@ -2431,6 +2457,7 @@ do_waitq_select_n_locked(struct waitq_select_args *args)
                /* we know this is the first (and only) thread */
                ++(*nthreads);
                *(args->spl) = (safeq != waitq) ? spl : splsched();
+
                thread_lock(first_thread);
                thread_clear_waitq_state(first_thread);
                waitq_thread_remove(safeq, first_thread);
@@ -2454,6 +2481,7 @@ do_waitq_select_n_locked(struct waitq_select_args *args)
                return;
        }
 
+handle_waitq_set:
        /*
         * wait queues that are not in any sets
         * are the bottom of the recursion
@@ -2510,7 +2538,8 @@ waitq_select_n_locked(struct waitq *waitq,
     void *select_ctx,
     uint64_t *reserved_preposts,
     queue_t threadq,
-    int max_threads, spl_t *spl)
+    int max_threads, spl_t *spl,
+    int priority)
 {
        int nthreads = 0;
 
@@ -2520,6 +2549,7 @@ waitq_select_n_locked(struct waitq *waitq,
                .event = event,
                .select_cb = select_cb,
                .select_ctx = select_ctx,
+               .priority = priority,
                .reserved_preposts = reserved_preposts,
                .threadq = threadq,
                .max_threads = max_threads,
@@ -2547,14 +2577,13 @@ waitq_select_one_locked(struct waitq *waitq, event64_t event,
     uint64_t *reserved_preposts,
     int priority, spl_t *spl)
 {
-       (void)priority;
        int nthreads;
        queue_head_t threadq;
 
        queue_init(&threadq);
 
        nthreads = waitq_select_n_locked(waitq, event, NULL, NULL,
-           reserved_preposts, &threadq, 1, spl);
+           reserved_preposts, &threadq, 1, spl, priority);
 
        /* if we selected a thread, return it (still locked) */
        if (!queue_empty(&threadq)) {
@@ -2569,96 +2598,6 @@ waitq_select_one_locked(struct waitq *waitq, event64_t event,
        return THREAD_NULL;
 }
 
-struct find_max_pri_ctx {
-       integer_t max_sched_pri;
-       integer_t max_base_pri;
-       thread_t highest_thread;
-};
-
-/**
- * callback function that finds the max priority thread
- *
- * Conditions:
- *      'waitq' is locked
- *      'thread' is not locked
- */
-static thread_t
-waitq_find_max_pri_cb(void         *ctx_in,
-    __unused struct waitq *waitq,
-    __unused int           is_global,
-    thread_t      thread)
-{
-       struct find_max_pri_ctx *ctx = (struct find_max_pri_ctx *)ctx_in;
-
-       /*
-        * thread is not locked, use pri as a hint only
-        * wake up the highest base pri, and find the highest sched pri at that base pri
-        */
-       integer_t sched_pri = *(volatile int16_t *)&thread->sched_pri;
-       integer_t base_pri  = *(volatile int16_t *)&thread->base_pri;
-
-       if (ctx->highest_thread == THREAD_NULL ||
-           (base_pri > ctx->max_base_pri) ||
-           (base_pri == ctx->max_base_pri && sched_pri > ctx->max_sched_pri)) {
-               /* don't select the thread, just update ctx */
-
-               ctx->max_sched_pri  = sched_pri;
-               ctx->max_base_pri   = base_pri;
-               ctx->highest_thread = thread;
-       }
-
-       return THREAD_NULL;
-}
-
-/**
- * select from a waitq the highest priority thread waiting for a given event
- *
- * Conditions:
- *     'waitq' is locked
- *
- * Returns:
- *     A locked thread that's been removed from the waitq, but has not
- *     yet been put on a run queue. Caller is responsible to call splx
- *     with the '*spl' value.
- */
-static thread_t
-waitq_select_max_locked(struct waitq *waitq, event64_t event,
-    uint64_t *reserved_preposts,
-    spl_t *spl)
-{
-       __assert_only int nthreads;
-       assert(!waitq->waitq_set_id); /* doesn't support recursive sets */
-
-       struct find_max_pri_ctx ctx = {
-               .max_sched_pri = 0,
-               .max_base_pri = 0,
-               .highest_thread = THREAD_NULL,
-       };
-
-       /*
-        * Scan the waitq to find the highest priority thread.
-        * This doesn't remove any thread from the queue
-        */
-       nthreads = waitq_select_n_locked(waitq, event,
-           waitq_find_max_pri_cb,
-           &ctx, reserved_preposts, NULL, 1, spl);
-
-       assert(nthreads == 0);
-
-       if (ctx.highest_thread != THREAD_NULL) {
-               __assert_only kern_return_t ret;
-
-               /* Remove only the thread we just found */
-               ret = waitq_select_thread_locked(waitq, event, ctx.highest_thread, spl);
-
-               assert(ret == KERN_SUCCESS);
-               return ctx.highest_thread;
-       }
-
-       return THREAD_NULL;
-}
-
-
 struct select_thread_ctx {
        thread_t      thread;
        event64_t     event;
 struct select_thread_ctx {
        thread_t      thread;
        event64_t     event;
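
The roughly ninety lines deleted above (find_max_pri_ctx, waitq_find_max_pri_cb and waitq_select_max_locked) implemented a separate scan-then-remove pass for max-priority wakeups. The removal appears to be possible because priority is now threaded through waitq_select_n_locked() (see the new 'priority' argument and the WAITQ_PROMOTE_ON_WAKE handling in the hunks above), and because turnstile-backed waitqs keep their waiters in a priority queue, so priority_queue_remove_max() already yields the highest-priority thread. The callers that used to branch on WAITQ_SELECT_MAX_PRI collapse to a single shape, shown here for orientation (this mirrors the waitq_wakeup64_one_locked and waitq_wakeup64_identify_locked hunks below; it is not additional new code):

    thread = waitq_select_one_locked(waitq, wake_event,
        reserved_preposts, priority, &th_spl);
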
@@ -2757,13 +2696,22 @@ waitq_select_thread_locked(struct waitq *waitq,
        kern_return_t kr;
        spl_t s;
 
-       s = splsched();
-
        /* Find and lock the interrupts disabled queue the thread is actually on */
        if (!waitq_irq_safe(waitq)) {
                safeq = waitq_get_safeq(waitq);
+               if (safeq == NULL) {
+                       /*
+                        * in the WQT_TSPROXY case, if there's no turnstile,
+                        * there's no queue and no waiters, so we can move straight
+                        * to the waitq set recursion
+                        */
+                       goto handle_waitq_set;
+               }
+
+               s = splsched();
                waitq_lock(safeq);
        } else {
+               s = splsched();
                safeq = waitq;
        }
 
@@ -2788,6 +2736,7 @@ waitq_select_thread_locked(struct waitq *waitq,
 
        splx(s);
 
+handle_waitq_set:
        if (!waitq->waitq_set_id) {
                return KERN_NOT_WAITING;
        }
@@ -2898,6 +2847,10 @@ waitq_assert_wait64_locked(struct waitq *waitq,
         */
        if (!waitq_irq_safe(waitq)) {
                safeq = waitq_get_safeq(waitq);
+               if (__improbable(safeq == NULL)) {
+                       panic("Trying to assert_wait on a turnstile proxy "
+                           "that hasn't been donated one (waitq: %p)", waitq);
+               }
                eventmask = _CAST_TO_EVENT_MASK(waitq);
                waitq_lock(safeq);
        } else {
@@ -3001,6 +2954,10 @@ waitq_pull_thread_locked(struct waitq *waitq, thread_t thread)
        /* Find the interrupts disabled queue thread is waiting on */
        if (!waitq_irq_safe(waitq)) {
                safeq = waitq_get_safeq(waitq);
+               if (__improbable(safeq == NULL)) {
+                       panic("Trying to clear_wait on a turnstile proxy "
+                           "that hasn't been donated one (waitq: %p)", waitq);
+               }
        } else {
                safeq = waitq;
        }
@@ -3051,9 +3008,6 @@ maybe_adjust_thread_pri(thread_t   thread,
                }
 
                sched_thread_promote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, trace_waitq);
-       } else if (priority > 0) {
-               /* Mutex subsystem wants to see this thread before we 'go' it */
-               lck_mtx_wakeup_adjust_pri(thread, priority);
        }
 }
 
@@ -3123,7 +3077,7 @@ waitq_wakeup64_all_locked(struct waitq *waitq,
 
        nthreads = waitq_select_n_locked(waitq, wake_event, NULL, NULL,
            reserved_preposts,
-           &wakeup_queue, -1, &th_spl);
+           &wakeup_queue, -1, &th_spl, priority);
 
        /* set each thread running */
        ret = KERN_NOT_WAITING;
@@ -3140,7 +3094,7 @@ waitq_wakeup64_all_locked(struct waitq *waitq,
                assert_thread_magic(thread);
                remqueue(&thread->wait_links);
                maybe_adjust_thread_pri(thread, priority, waitq);
-               ret = thread_go(thread, result);
+               ret = thread_go(thread, result, WQ_OPTION_NONE);
                assert(ret == KERN_SUCCESS);
                thread_unlock(thread);
        }
@@ -3168,23 +3122,17 @@ waitq_wakeup64_one_locked(struct waitq *waitq,
     wait_result_t result,
     uint64_t *reserved_preposts,
     int priority,
-    waitq_lock_state_t lock_state)
+    waitq_lock_state_t lock_state,
+    waitq_options_t option)
 {
        thread_t thread;
        spl_t th_spl;
 
        assert(waitq_held(waitq));
 
-       if (priority == WAITQ_SELECT_MAX_PRI) {
-               thread = waitq_select_max_locked(waitq, wake_event,
-                   reserved_preposts,
-                   &th_spl);
-       } else {
-               thread = waitq_select_one_locked(waitq, wake_event,
-                   reserved_preposts,
-                   priority, &th_spl);
-       }
-
+       thread = waitq_select_one_locked(waitq, wake_event,
+           reserved_preposts,
+           priority, &th_spl);
 
        if (thread != THREAD_NULL) {
                waitq_stats_count_wakeup(waitq);
@@ -3198,7 +3146,7 @@ waitq_wakeup64_one_locked(struct waitq *waitq,
 
        if (thread != THREAD_NULL) {
                maybe_adjust_thread_pri(thread, priority, waitq);
-               kern_return_t ret = thread_go(thread, result);
+               kern_return_t ret = thread_go(thread, result, option);
                assert(ret == KERN_SUCCESS);
                thread_unlock(thread);
                splx(th_spl);
@@ -3233,15 +3181,9 @@ waitq_wakeup64_identify_locked(struct waitq     *waitq,
 
        assert(waitq_held(waitq));
 
-       if (priority == WAITQ_SELECT_MAX_PRI) {
-               thread = waitq_select_max_locked(waitq, wake_event,
-                   reserved_preposts,
-                   spl);
-       } else {
-               thread = waitq_select_one_locked(waitq, wake_event,
-                   reserved_preposts,
-                   priority, spl);
-       }
+       thread = waitq_select_one_locked(waitq, wake_event,
+           reserved_preposts,
+           priority, spl);
 
        if (thread != THREAD_NULL) {
                waitq_stats_count_wakeup(waitq);
@@ -3255,7 +3197,7 @@ waitq_wakeup64_identify_locked(struct waitq     *waitq,
 
        if (thread != THREAD_NULL) {
                kern_return_t __assert_only ret;
-               ret = thread_go(thread, result);
+               ret = thread_go(thread, result, WQ_OPTION_NONE);
                assert(ret == KERN_SUCCESS);
        }
 
@@ -3309,7 +3251,7 @@ waitq_wakeup64_thread_locked(struct waitq *waitq,
                return KERN_NOT_WAITING;
        }
 
-       ret = thread_go(thread, result);
+       ret = thread_go(thread, result, WQ_OPTION_NONE);
        assert(ret == KERN_SUCCESS);
        thread_unlock(thread);
        splx(th_spl);
@@ -3341,8 +3283,12 @@ waitq_init(struct waitq *waitq, int policy)
        waitq->waitq_fifo = ((policy & SYNC_POLICY_REVERSED) == 0);
        waitq->waitq_irq = !!(policy & SYNC_POLICY_DISABLE_IRQ);
        waitq->waitq_prepost = 0;
-       waitq->waitq_type = WQT_QUEUE;
-       waitq->waitq_turnstile_or_port = !!(policy & SYNC_POLICY_TURNSTILE);
+       if (policy & SYNC_POLICY_TURNSTILE_PROXY) {
+               waitq->waitq_type = WQT_TSPROXY;
+       } else {
+               waitq->waitq_type = WQT_QUEUE;
+       }
+       waitq->waitq_turnstile = !!(policy & SYNC_POLICY_TURNSTILE);
        waitq->waitq_eventmask = 0;
 
        waitq->waitq_set_id = 0;
@@ -3351,9 +3297,11 @@ waitq_init(struct waitq *waitq, int policy)
        waitq_lock_init(waitq);
        if (waitq_is_turnstile_queue(waitq)) {
                /* For turnstile, initialize it as a priority queue */
-               priority_queue_init(&waitq->waitq_prio_queue,
-                   PRIORITY_QUEUE_BUILTIN_MAX_HEAP);
+               priority_queue_init(&waitq->waitq_prio_queue);
                assert(waitq->waitq_fifo == 0);
+       } else if (policy & SYNC_POLICY_TURNSTILE_PROXY) {
+               waitq->waitq_ts = TURNSTILE_NULL;
+               waitq->waitq_tspriv = NULL;
        } else {
                queue_init(&waitq->waitq_queue);
        }
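
With WQT_TSPROXY introduced in the hunks above, waitq_init() now sets up one of three backing representations depending on the policy bits. A hypothetical caller-side illustration (policy names and fields are taken from this diff; the calls themselves are not new API):

    struct waitq wq_fifo, wq_ts, wq_proxy;

    /* WQT_QUEUE backed by a plain queue_head_t (waitq_queue) */
    waitq_init(&wq_fifo, SYNC_POLICY_FIFO);

    /* WQT_QUEUE with waitq_turnstile set: backed by the waitq_prio_queue priority queue, never FIFO */
    waitq_init(&wq_ts, SYNC_POLICY_TURNSTILE);

    /* WQT_TSPROXY: no queue of its own; waitq_ts stays TURNSTILE_NULL until a
     * turnstile is donated, and waitq_get_safeq() returns NULL until then */
    waitq_init(&wq_proxy, SYNC_POLICY_TURNSTILE_PROXY);
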
@@ -3438,7 +3386,12 @@ waitq_deinit(struct waitq *waitq)
 {
        spl_t s;
 
-       if (!waitq || !waitq_is_queue(waitq)) {
+       assert(waitq);
+       if (!waitq_is_valid(waitq)) {
+               return;
+       }
+
+       if (!waitq_is_queue(waitq) && !waitq_is_turnstile_proxy(waitq)) {
                return;
        }
 
@@ -3446,25 +3399,33 @@ waitq_deinit(struct waitq *waitq)
                s = splsched();
        }
        waitq_lock(waitq);
-       if (!waitq_valid(waitq)) {
-               waitq_unlock(waitq);
-               if (waitq_irq_safe(waitq)) {
-                       splx(s);
+
+       if (waitq_valid(waitq)) {
+               waitq->waitq_isvalid = 0;
+               if (!waitq_irq_safe(waitq)) {
+                       waitq_unlink_all_unlock(waitq);
+                       /* waitq unlocked and set links deallocated */
+                       goto out;
                }
-               return;
        }
 
-       waitq->waitq_isvalid = 0;
-
-       if (!waitq_irq_safe(waitq)) {
-               waitq_unlink_all_unlock(waitq);
-               /* waitq unlocked and set links deallocated */
-       } else {
-               waitq_unlock(waitq);
+       waitq_unlock(waitq);
+       if (waitq_irq_safe(waitq)) {
                splx(s);
        }
 
-       assert(waitq_empty(waitq));
+out:
+#if MACH_ASSERT
+       if (waitq_is_turnstile_queue(waitq)) {
+               assert(priority_queue_empty(&waitq->waitq_prio_queue));
+       } else if (waitq_is_turnstile_proxy(waitq)) {
+               assert(waitq->waitq_ts == TURNSTILE_NULL);
+       } else {
+               assert(queue_empty(&waitq->waitq_queue));
+       }
+#else
+       (void)0;
+#endif // MACH_ASSERT
 }
 
 void
@@ -3508,7 +3469,7 @@ wqset_clear_prepost_chain_cb(struct waitq_set __unused *wqset,
  *     NULL on failure
  */
 struct waitq_set *
-waitq_set_alloc(int policy, void *prepost_hook)
+waitq_set_alloc(int policy, waitq_set_prepost_hook_t *prepost_hook)
 {
        struct waitq_set *wqset;
 
@@ -3537,7 +3498,7 @@ waitq_set_alloc(int policy, void *prepost_hook)
 kern_return_t
 waitq_set_init(struct waitq_set *wqset,
     int policy, uint64_t *reserved_link,
-    void *prepost_hook)
+    waitq_set_prepost_hook_t *prepost_hook)
 {
        struct waitq_link *link;
        kern_return_t ret;
@@ -3677,6 +3638,20 @@ waitq_set_deinit(struct waitq_set *wqset)
 
        waitq_set_lock(wqset);
 
+       if (waitq_set_has_prepost_hook(wqset)) {
+               waitq_set_prepost_hook_t *hook = wqset->wqset_prepost_hook;
+               /*
+                * If the wqset_prepost_hook value is non 0,
+                * then another core is currently posting to this waitq set
+                * and we need for it to finish what it's doing.
+                */
+               while (os_atomic_load(hook, relaxed) != 0) {
+                       waitq_set_unlock(wqset);
+                       delay(1);
+                       waitq_set_lock(wqset);
+               }
+       }
+
        set_id = wqset->wqset_id;
 
        if (waitqs_is_linked(wqset) || set_id == 0) {
@@ -4969,7 +4944,7 @@ waitq_alloc_prepost_reservation(int nalloc, struct waitq *waitq,
         */
        if (waitq) {
                disable_preemption();
-               cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
+               cache = PERCPU_GET(wqp_cache);
                if (nalloc <= (int)cache->avail) {
                        goto do_alloc;
                }
@@ -5569,7 +5544,7 @@ waitq_wakeup64_one(struct waitq *waitq, event64_t wake_event,
 
        /* waitq is locked upon return */
        kr = waitq_wakeup64_one_locked(waitq, wake_event, result,
-           &reserved_preposts, priority, WAITQ_UNLOCK);
+           &reserved_preposts, priority, WAITQ_UNLOCK, WQ_OPTION_NONE);
 
        if (waitq_irq_safe(waitq)) {
                splx(spl);
@@ -5661,7 +5636,7 @@ waitq_wakeup64_thread(struct waitq *waitq,
        waitq_unlock(waitq);
 
        if (ret == KERN_SUCCESS) {
-               ret = thread_go(thread, result);
+               ret = thread_go(thread, result, WQ_OPTION_NONE);
                assert(ret == KERN_SUCCESS);
                thread_unlock(thread);
                splx(th_spl);