diff --git a/osfmk/kern/locks.c b/osfmk/kern/locks.c
index c816715913b81fcb6cc47672d0c268eab09a7e6b..04b5dd239fb8fa5e232cb902c9a74ad8fc663736 100644
--- a/osfmk/kern/locks.c
+++ b/osfmk/kern/locks.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
  * any improvements or extensions that they make and grant Carnegie Mellon
  * the rights to redistribute these changes.
  */
+
+#define ATOMIC_PRIVATE 1
+#define LOCK_PRIVATE 1
+
 #include <mach_ldebug.h>
 #include <debug.h>
 
 #include <kern/processor.h>
 #include <kern/sched_prim.h>
 #include <kern/debug.h>
+#include <libkern/section_keywords.h>
+#include <machine/atomic.h>
+#include <machine/machine_cpu.h>
 #include <string.h>
 
-
 #include <sys/kdebug.h>
 
 #if    CONFIG_DTRACE
 #define        LCK_MTX_LCK_WAIT_CODE           2
 #define        LCK_MTX_UNLCK_WAKEUP_CODE       3
 
+#if MACH_LDEBUG
+#define ALIGN_TEST(p,t) do{if((uintptr_t)p&(sizeof(t)-1)) __builtin_trap();}while(0)
+#else
+#define ALIGN_TEST(p,t) do{}while(0)
+#endif
+
+/* Silence the volatile to _Atomic cast warning */
+#define ATOMIC_CAST(t,p) ((_Atomic t*)(uintptr_t)(p))
+
+/* Enforce program order of loads and stores. */
+#define ordered_load(target, type) \
+               __c11_atomic_load((_Atomic type *)(target), memory_order_relaxed)
+#define ordered_store(target, type, value) \
+               __c11_atomic_store((_Atomic type *)(target), value, memory_order_relaxed)
+
+#define ordered_load_hw(lock)                  ordered_load(&(lock)->lock_data, uintptr_t)
+#define ordered_store_hw(lock, value)  ordered_store(&(lock)->lock_data, uintptr_t, (value))
+
+#define NOINLINE               __attribute__((noinline))
+
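For illustration, a minimal sketch (not part of this change) of how the ordering macros above are intended to be used; example_word and example_ordered_access are hypothetical names:

	/* Hypothetical illustration only: a plain volatile word. */
	static volatile uintptr_t example_word;

	static void
	example_ordered_access(void)
	{
		/* The ordered_* macros perform relaxed C11 atomic accesses. */
		uintptr_t v = ordered_load(&example_word, uintptr_t);
		ordered_store(&example_word, uintptr_t, v + 1);

		/* ATOMIC_CAST lets the clang builtins take the volatile word without a cast warning. */
		__c11_atomic_fetch_or(ATOMIC_CAST(uintptr_t, &example_word), 1, memory_order_relaxed);
	}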
+
 static queue_head_t    lck_grp_queue;
 static unsigned int    lck_grp_cnt;
 
 decl_lck_mtx_data(static,lck_grp_lock)
 static lck_mtx_ext_t lck_grp_lock_ext;
 
+SECURITY_READ_ONLY_LATE(boolean_t) spinlock_timeout_panic = TRUE;
+
 lck_grp_attr_t LockDefaultGroupAttr;
 lck_grp_t              LockCompatGroup;
 lck_attr_t             LockDefaultLckAttr;
 
+#if CONFIG_DTRACE && __SMP__
+#if defined (__x86_64__)
+uint64_t dtrace_spin_threshold = 500; // 500ns
+#elif defined(__arm__) || defined(__arm64__)
+uint64_t dtrace_spin_threshold = LOCK_PANIC_TIMEOUT / 1000000; // 500ns
+#endif
+#endif
+
+uintptr_t
+unslide_for_kdebug(void* object) {
+       if (__improbable(kdebug_enable))
+               return VM_KERNEL_UNSLIDE_OR_PERM(object);
+       else
+               return 0;
+}
+
 /*
  * Routine:    lck_mod_init
  */
@@ -111,6 +156,12 @@ lck_mod_init(
        if (!PE_parse_boot_argn("lcks", &LcksOpts, sizeof (LcksOpts)))
                LcksOpts = 0;
 
+
+#if (DEVELOPMENT || DEBUG) && defined(__x86_64__)
+       if (!PE_parse_boot_argn("-disable_mtx_chk", &LckDisablePreemptCheck, sizeof (LckDisablePreemptCheck)))
+               LckDisablePreemptCheck = 0;
+#endif /* (DEVELOPMENT || DEBUG) && defined(__x86_64__) */
+
        queue_init(&lck_grp_queue);
        
        /* 
@@ -135,7 +186,6 @@ lck_mod_init(
        lck_attr_setdefault(&LockDefaultLckAttr);
        
        lck_mtx_init_ext(&lck_grp_lock, &lck_grp_lock_ext, &LockCompatGroup, &LockDefaultLckAttr);
-       
 }
 
 /*
@@ -218,6 +268,9 @@ lck_grp_alloc_init(
 void
 lck_grp_init(lck_grp_t * grp, const char * grp_name, lck_grp_attr_t * attr)
 {
+       /* make sure locking infrastructure has been initialized */
+       assert(lck_grp_cnt > 0);
+
        bzero((void *)grp, sizeof(lck_grp_t));
 
        (void)strlcpy(grp->lck_grp_name, grp_name, LCK_GRP_MAX_NAME);
@@ -315,6 +368,7 @@ lck_grp_lckcnt_decr(
        lck_type_t      lck_type)
 {
        unsigned int    *lckcnt;
+       int             updated;
 
        switch (lck_type) {
        case LCK_TYPE_SPIN:
@@ -327,10 +381,12 @@ lck_grp_lckcnt_decr(
                lckcnt = &grp->lck_grp_rwcnt;
                break;
        default:
-               return panic("lck_grp_lckcnt_decr(): invalid lock type: %d\n", lck_type);
+               panic("lck_grp_lckcnt_decr(): invalid lock type: %d\n", lck_type);
+               return;
        }
 
-       (void)hw_atomic_sub(lckcnt, 1);
+       updated = (int)hw_atomic_sub(lckcnt, 1);
+       assert(updated >= 0);
 }
 
 /*
@@ -358,7 +414,10 @@ void
 lck_attr_setdefault(
        lck_attr_t      *attr)
 {
-#if   __i386__ || __x86_64__
+#if __arm__ || __arm64__
+       /* <rdar://problem/4404579>: Using LCK_ATTR_DEBUG here causes panic at boot time for arm */
+       attr->lck_attr_val =  LCK_ATTR_NONE;
+#elif __i386__ || __x86_64__
 #if     !DEBUG
        if (LcksOpts & enaLkDeb)
                attr->lck_attr_val =  LCK_ATTR_DEBUG;
@@ -415,6 +474,280 @@ lck_attr_free(
        kfree(attr, sizeof(lck_attr_t));
 }
 
+/*
+ * Routine:    hw_lock_init
+ *
+ *     Initialize a hardware lock.
+ */
+void
+hw_lock_init(hw_lock_t lock)
+{
+       ordered_store_hw(lock, 0);
+}
+
+/*
+ *     Routine: hw_lock_lock_contended
+ *
+ *     Spin until lock is acquired or timeout expires.
+ *     timeout is in mach_absolute_time ticks. Called with
+ *     preemption disabled.
+ */
+
+#if    __SMP__
+static unsigned int NOINLINE
+hw_lock_lock_contended(hw_lock_t lock, uintptr_t data, uint64_t timeout, boolean_t do_panic)
+{
+       uint64_t        end = 0;
+       uintptr_t       holder = lock->lock_data;
+       int             i;
+
+       if (timeout == 0)
+               timeout = LOCK_PANIC_TIMEOUT;
+#if CONFIG_DTRACE
+       uint64_t begin;
+       boolean_t dtrace_enabled = lockstat_probemap[LS_LCK_SPIN_LOCK_SPIN] != 0;
+       if (__improbable(dtrace_enabled))
+               begin = mach_absolute_time();
+#endif
+       for ( ; ; ) {   
+               for (i = 0; i < LOCK_SNOOP_SPINS; i++) {
+                       cpu_pause();
+#if (!__ARM_ENABLE_WFE_) || (LOCK_PRETEST)
+                       holder = ordered_load_hw(lock);
+                       if (holder != 0)
+                               continue;
+#endif
+                       if (atomic_compare_exchange(&lock->lock_data, 0, data,
+                           memory_order_acquire_smp, TRUE)) {
+#if CONFIG_DTRACE
+                               if (__improbable(dtrace_enabled)) {
+                                       uint64_t spintime = mach_absolute_time() - begin;
+                                       if (spintime > dtrace_spin_threshold)
+                                               LOCKSTAT_RECORD2(LS_LCK_SPIN_LOCK_SPIN, lock, spintime, dtrace_spin_threshold);
+                               }
+#endif
+                               return 1;
+                       }
+               }
+               if (end == 0) {
+                       end = ml_get_timebase() + timeout;
+               }
+               else if (ml_get_timebase() >= end)
+                       break;
+       }
+       if (do_panic) {
+               // Capture the actual time spent blocked, which may be higher than the timeout
+               // if a misbehaving interrupt stole this thread's CPU time.
+               panic("Spinlock timeout after %llu ticks, %p = %lx",
+                       (ml_get_timebase() - end + timeout), lock, holder);
+       }
+       return 0;
+}
+#endif // __SMP__
+
+static inline void
+hw_lock_lock_internal(hw_lock_t lock, thread_t thread)
+{
+       uintptr_t       state;
+
+       state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
+#if    __SMP__
+
+#if    LOCK_PRETEST
+       if (ordered_load_hw(lock))
+               goto contended;
+#endif // LOCK_PRETEST
+       if (atomic_compare_exchange(&lock->lock_data, 0, state,
+                                       memory_order_acquire_smp, TRUE)) {
+               goto end;
+       }
+#if    LOCK_PRETEST
+contended:
+#endif // LOCK_PRETEST
+       hw_lock_lock_contended(lock, state, 0, spinlock_timeout_panic);
+end:
+#else  // __SMP__
+       if (lock->lock_data)
+               panic("Spinlock held %p", lock);
+       lock->lock_data = state;
+#endif // __SMP__
+#if CONFIG_DTRACE
+       LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, lock, 0);
+#endif
+       return;
+}
+
+/*
+ *     Routine: hw_lock_lock
+ *
+ *     Acquire lock, spinning until it becomes available,
+ *     return with preemption disabled.
+ */
+void
+hw_lock_lock(hw_lock_t lock)
+{
+       thread_t thread = current_thread();
+       disable_preemption_for_thread(thread);
+       hw_lock_lock_internal(lock, thread);
+}
+
+/*
+ *     Routine: hw_lock_lock_nopreempt
+ *
+ *     Acquire lock, spinning until it becomes available.
+ */
+void
+hw_lock_lock_nopreempt(hw_lock_t lock)
+{
+       thread_t thread = current_thread();
+       if (__improbable(!preemption_disabled_for_thread(thread)))
+               panic("Attempt to take no-preempt spinlock %p in preemptible context", lock);
+       hw_lock_lock_internal(lock, thread);
+}
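A hedged sketch of the no-preempt variant's contract (not part of this change): the caller must already have preemption disabled, and pairs the acquire with hw_lock_unlock_nopreempt(), defined later in this change, so the preemption level is left unchanged; example_np_lock is hypothetical.

	/* Hypothetical lock, assumed to have been set up with hw_lock_init(). */
	static hw_lock_data_t example_np_lock;

	static void
	example_nopreempt_section(void)
	{
		disable_preemption();			/* caller owns the preemption disable */

		hw_lock_lock_nopreempt(&example_np_lock);
		/* ... critical section ... */
		hw_lock_unlock_nopreempt(&example_np_lock);

		enable_preemption();
	}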
+
+/*
+ *     Routine: hw_lock_to
+ *
+ *     Acquire lock, spinning until it becomes available or timeout.
+ *     Timeout is in mach_absolute_time ticks, return with
+ *     preemption disabled.
+ */
+unsigned int
+hw_lock_to(hw_lock_t lock, uint64_t timeout)
+{
+       thread_t        thread;
+       uintptr_t       state;
+       unsigned int success = 0;
+
+       thread = current_thread();
+       disable_preemption_for_thread(thread);
+       state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
+#if    __SMP__
+
+#if    LOCK_PRETEST
+       if (ordered_load_hw(lock))
+               goto contended;
+#endif // LOCK_PRETEST
+       if (atomic_compare_exchange(&lock->lock_data, 0, state,
+                                       memory_order_acquire_smp, TRUE)) {
+               success = 1;
+               goto end;
+       }
+#if    LOCK_PRETEST
+contended:
+#endif // LOCK_PRETEST
+       success = hw_lock_lock_contended(lock, state, timeout, FALSE);
+end:
+#else  // __SMP__
+       (void)timeout;
+       if (ordered_load_hw(lock) == 0) {
+               ordered_store_hw(lock, state);
+               success = 1;
+       }
+#endif // __SMP__
+#if CONFIG_DTRACE
+       if (success)
+               LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, lock, 0);
+#endif
+       return success;
+}
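A hedged caller-side sketch of the timed variant above (not part of this change): the timeout is in mach_absolute_time ticks, so a nanosecond budget is converted with nanoseconds_to_absolutetime() first; example_timed_lock and the 100us budget are hypothetical.

	/* Hypothetical lock, assumed to have been set up with hw_lock_init(). */
	static hw_lock_data_t example_timed_lock;

	static boolean_t
	example_grab_with_timeout(void)
	{
		uint64_t ticks;

		nanoseconds_to_absolutetime(100 * NSEC_PER_USEC, &ticks);	/* ~100us spin budget */

		if (hw_lock_to(&example_timed_lock, ticks)) {
			/* ... critical section ... */
			hw_lock_unlock(&example_timed_lock);	/* releases lock and re-enables preemption */
			return TRUE;
		}

		/* Timed out: hw_lock_to still returned with preemption disabled, so undo that here. */
		enable_preemption();
		return FALSE;
	}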
+
+/*
+ *     Routine: hw_lock_try
+ *
+ *     returns with preemption disabled on success.
+ */
+static inline unsigned int
+hw_lock_try_internal(hw_lock_t lock, thread_t thread)
+{
+       int             success = 0;
+
+#if    __SMP__
+#if    LOCK_PRETEST
+       if (ordered_load_hw(lock))
+               goto failed;
+#endif // LOCK_PRETEST
+       success = atomic_compare_exchange(&lock->lock_data, 0, LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK,
+                                       memory_order_acquire_smp, FALSE);
+#else
+       if (lock->lock_data == 0) {
+               lock->lock_data = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
+               success = 1;
+       }
+#endif // __SMP__
+
+#if    LOCK_PRETEST
+failed:
+#endif // LOCK_PRETEST
+#if CONFIG_DTRACE
+       if (success)
+               LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, lock, 0);
+#endif
+       return success;
+}
+
+unsigned int
+hw_lock_try(hw_lock_t lock)
+{
+       thread_t thread = current_thread();
+       disable_preemption_for_thread(thread);
+       unsigned int success = hw_lock_try_internal(lock, thread);
+       if (!success)
+               enable_preemption();
+       return success;
+}
+
+unsigned int
+hw_lock_try_nopreempt(hw_lock_t lock)
+{
+       thread_t thread = current_thread();
+       if (__improbable(!preemption_disabled_for_thread(thread)))
+               panic("Attempt to test no-preempt spinlock %p in preemptible context", lock);
+       return hw_lock_try_internal(lock, thread);
+}
+
+/*
+ *     Routine: hw_lock_unlock
+ *
+ *     Unconditionally release lock, release preemption level.
+ */
+static inline void
+hw_lock_unlock_internal(hw_lock_t lock)
+{
+       __c11_atomic_store((_Atomic uintptr_t *)&lock->lock_data, 0, memory_order_release_smp);
+#if __arm__ || __arm64__
+       // ARM tests are only for open-source exclusion
+       set_event();
+#endif // __arm__ || __arm64__
+#if    CONFIG_DTRACE
+       LOCKSTAT_RECORD(LS_LCK_SPIN_UNLOCK_RELEASE, lock, 0);
+#endif /* CONFIG_DTRACE */
+}
+
+void
+hw_lock_unlock(hw_lock_t lock)
+{
+       hw_lock_unlock_internal(lock);
+       enable_preemption();
+}
+
+void
+hw_lock_unlock_nopreempt(hw_lock_t lock)
+{
+       if (__improbable(!preemption_disabled_for_thread(current_thread())))
+               panic("Attempt to release no-preempt spinlock %p in preemptible context", lock);
+       hw_lock_unlock_internal(lock);
+}
+
+/*
+ *     Routine hw_lock_held, doesn't change preemption state.
+ *     N.B.  Racy, of course.
+ */
+unsigned int
+hw_lock_held(hw_lock_t lock)
+{
+       return (ordered_load_hw(lock) != 0);
+}
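A hedged sketch of the try/unlock pairing above (not part of this change): hw_lock_try() leaves preemption disabled only when it returns success, so the failure path needs no cleanup; example_stats_lock and example_stats_count are hypothetical.

	static hw_lock_data_t	example_stats_lock;
	static uint64_t		example_stats_count;

	static boolean_t
	example_bump_if_uncontended(void)
	{
		if (!hw_lock_try(&example_stats_lock))
			return FALSE;		/* lock busy; hw_lock_try already re-enabled preemption */

		example_stats_count++;			/* protected by the spinlock */
		hw_lock_unlock(&example_stats_lock);	/* release + enable_preemption() */
		return TRUE;
	}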
 
 /*
  * Routine:    lck_spin_sleep
@@ -476,40 +809,6 @@ lck_spin_sleep_deadline(
        return res;
 }
 
-
-/*
- * Routine:    lck_mtx_clear_promoted
- *
- * Handle clearing of TH_SFLAG_PROMOTED,
- * adjusting thread priority as needed.
- *
- * Called with thread lock held
- */
-static void
-lck_mtx_clear_promoted (
-       thread_t                        thread,
-       __kdebug_only uintptr_t         trace_lck)
-{
-       thread->sched_flags &= ~TH_SFLAG_PROMOTED;
-
-       if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
-               /* Thread still has a RW lock promotion */
-       } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
-               KERNEL_DEBUG_CONSTANT(
-                       MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE,
-                               thread->sched_pri, DEPRESSPRI, 0, trace_lck, 0);
-               set_sched_pri(thread, DEPRESSPRI);
-       } else {
-               if (thread->base_pri < thread->sched_pri) {
-                       KERNEL_DEBUG_CONSTANT(
-                               MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE,
-                                       thread->sched_pri, thread->base_pri, 0, trace_lck, 0);
-               }
-               thread_recompute_sched_pri(thread, FALSE);
-       }
-}
-
-
 /*
  * Routine:    lck_mtx_sleep
  */
@@ -546,6 +845,8 @@ lck_mtx_sleep(
                if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
                        if ((lck_sleep_action & LCK_SLEEP_SPIN))
                                lck_mtx_lock_spin(lck);
+                       else if ((lck_sleep_action & LCK_SLEEP_SPIN_ALWAYS))
+                               lck_mtx_lock_spin_always(lck);
                        else
                                lck_mtx_lock(lck);
                }
@@ -557,7 +858,7 @@ lck_mtx_sleep(
        if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
                if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
                        /* sched_flags checked without lock, but will be rechecked while clearing */
-                       lck_rw_clear_promotion(thread);
+                       lck_rw_clear_promotion(thread, unslide_for_kdebug(event));
                }
        }
 
@@ -612,7 +913,7 @@ lck_mtx_sleep_deadline(
        if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
                if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
                        /* sched_flags checked without lock, but will be rechecked while clearing */
-                       lck_rw_clear_promotion(thread);
+                       lck_rw_clear_promotion(thread, unslide_for_kdebug(event));
                }
        }
 
@@ -622,12 +923,58 @@ lck_mtx_sleep_deadline(
 }
 
 /*
- * Routine:    lck_mtx_lock_wait
+ * Lock Boosting Invariants:
+ *
+ * The lock owner is always promoted to the max priority of all its waiters.
+ * Max priority is capped at MAXPRI_PROMOTE.
+ *
+ * lck_mtx_pri being set implies that the lock owner is promoted to at least lck_mtx_pri
+ *      This prevents the thread from dropping in priority while holding a mutex
+ *      (note: Intel locks currently don't do this, to avoid thread lock churn)
+ *
+ * thread->promotions has a +1 for every mutex currently promoting the thread
+ * and 1 for was_promoted_on_wakeup being set.
+ * TH_SFLAG_PROMOTED is set on a thread whenever it has any promotions
+ * from any mutex (i.e. thread->promotions != 0)
+ *
+ * was_promoted_on_wakeup is set on a thread which is woken up by a mutex when
+ * it raises the priority of the woken thread to match lck_mtx_pri.
+ * It can be set for multiple iterations of wait, fail to acquire, re-wait, etc
+ * was_promoted_on_wakeup being set always implies a +1 promotions count.
+ *
+ * The last waiter is not given a promotion when it wakes up or acquires the lock.
+ * When the last waiter is waking up, a new contender can always come in and
+ * steal the lock without having to wait for the last waiter to make forward progress.
+ *
+ * lck_mtx_waiters has a +1 for every waiter currently between wait and acquire
+ * This prevents us from asserting that every wakeup wakes up a thread.
+ * This also causes excess thread_wakeup calls in the unlock path.
+ * It can only be fooled into thinking there are more waiters than are
+ * actually blocked, not less.
+ * It does allow us to reduce the complexity of the lock state.
+ *
+ * This also means that a starved bg thread as the last waiter could end up
+ * keeping the lock in the contended state for a long period of time, which
+ * may keep lck_mtx_pri artificially high for a very long time even though
+ * it is not participating or blocking anyone else.
+ * Intel locks don't have this problem because they can go uncontended
+ * as soon as there are no blocked threads involved.
+ */
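As a hedged aid to reading the invariants above, a sketch of the kind of check the assert_promotions_invariant() calls used later in this change might perform; this is inferred from the comment only, not from the actual implementation:

	static inline void
	example_promotions_invariant(thread_t thread)
	{
		/* TH_SFLAG_PROMOTED tracks whether the thread holds any mutex promotions. */
		if (thread->promotions > 0)
			assert(thread->sched_flags & TH_SFLAG_PROMOTED);
		else
			assert((thread->sched_flags & TH_SFLAG_PROMOTED) == 0);

		/* was_promoted_on_wakeup always contributes one of those promotions. */
		if (thread->was_promoted_on_wakeup)
			assert(thread->promotions > 0);
	}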
+
+/*
+ * Routine: lck_mtx_lock_wait
  *
  * Invoked in order to wait on contention.
  *
  * Called with the interlock locked and
  * returns it unlocked.
+ *
+ * Always aggressively sets the owning thread to promoted,
+ * even if it's the same or higher priority
+ * This prevents it from lowering its own priority while holding a lock
+ *
+ * TODO: Come up with a more efficient way to handle same-priority promotions
+ *      <rdar://problem/30737670> ARM mutex contention logic could avoid taking the thread lock
  */
 void
 lck_mtx_lock_wait (
@@ -636,10 +983,8 @@ lck_mtx_lock_wait (
 {
        thread_t                self = current_thread();
        lck_mtx_t               *mutex;
-       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
-       __kdebug_only uintptr_t trace_holder = VM_KERNEL_UNSLIDE_OR_PERM(holder);
-       integer_t               priority;
-       spl_t                   s = splsched();
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
+
 #if    CONFIG_DTRACE
        uint64_t                sleep_start = 0;
 
@@ -653,51 +998,69 @@ lck_mtx_lock_wait (
        else
                mutex = &lck->lck_mtx_ptr->lck_mtx;
 
-       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, trace_lck, trace_holder, 0, 0, 0);
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
+                    trace_lck, (uintptr_t)thread_tid(thread), 0, 0, 0);
+
+       spl_t s = splsched();
+       thread_lock(holder);
+
+       assert_promotions_invariant(holder);
 
-       priority = self->sched_pri;
-       if (priority < self->base_pri)
-               priority = self->base_pri;
-       if (priority < BASEPRI_DEFAULT)
-               priority = BASEPRI_DEFAULT;
+       if ((holder->sched_flags & TH_SFLAG_DEPRESS) == 0)
+               assert(holder->sched_pri >= mutex->lck_mtx_pri);
 
-       /* Do not promote past promotion ceiling */
+       integer_t priority = self->sched_pri;
+       priority = MAX(priority, self->base_pri);
+       priority = MAX(priority, BASEPRI_DEFAULT);
        priority = MIN(priority, MAXPRI_PROMOTE);
 
-       thread_lock(holder);
-       if (mutex->lck_mtx_pri == 0)
-               holder->promotions++;
-       holder->sched_flags |= TH_SFLAG_PROMOTED;
-       if (mutex->lck_mtx_pri < priority && holder->sched_pri < priority) {
-               KERNEL_DEBUG_CONSTANT(
-                       MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE,
-                                       holder->sched_pri, priority, trace_holder, trace_lck, 0);
-               set_sched_pri(holder, priority);
+       if (mutex->lck_mtx_pri == 0) {
+               /* This is the first promotion for this mutex */
+               if (holder->promotions++ == 0) {
+                       /* This is the first promotion for holder */
+                       sched_thread_promote_to_pri(holder, priority, trace_lck);
+               } else {
+                       /* Holder was previously promoted due to a different mutex, raise to match this one */
+                       sched_thread_update_promotion_to_pri(holder, priority, trace_lck);
+               }
+       } else {
+               /* Holder was previously promoted due to this mutex, check if the pri needs to go up */
+               sched_thread_update_promotion_to_pri(holder, priority, trace_lck);
        }
+
+       assert(holder->promotions > 0);
+       assert(holder->promotion_priority >= priority);
+
+       if ((holder->sched_flags & TH_SFLAG_DEPRESS) == 0)
+               assert(holder->sched_pri >= mutex->lck_mtx_pri);
+
+       assert_promotions_invariant(holder);
+
        thread_unlock(holder);
        splx(s);
 
        if (mutex->lck_mtx_pri < priority)
                mutex->lck_mtx_pri = priority;
-       if (self->pending_promoter[self->pending_promoter_index] == NULL) {
-               self->pending_promoter[self->pending_promoter_index] = mutex;
-               mutex->lck_mtx_waiters++;
-       }
-       else
-       if (self->pending_promoter[self->pending_promoter_index] != mutex) {
-               self->pending_promoter[++self->pending_promoter_index] = mutex;
+
+       if (self->waiting_for_mutex == NULL) {
+               self->waiting_for_mutex = mutex;
                mutex->lck_mtx_waiters++;
        }
 
-       assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT);
+       assert(self->waiting_for_mutex == mutex);
+
+       thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
+       assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
        lck_mtx_ilk_unlock(mutex);
 
        thread_block(THREAD_CONTINUE_NULL);
 
+       assert(mutex->lck_mtx_waiters > 0);
+
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
 #if    CONFIG_DTRACE
        /*
-        * Record the Dtrace lockstat probe for blocking, block time
+        * Record the DTrace lockstat probe for blocking, block time
         * measured from when we were entered.
         */
        if (sleep_start) {
@@ -729,50 +1092,80 @@ lck_mtx_lock_acquire(
        thread_t                thread = current_thread();
        lck_mtx_t               *mutex;
        integer_t               priority;
-       spl_t                   s;
-       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
 
        if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)
                mutex = lck;
        else
                mutex = &lck->lck_mtx_ptr->lck_mtx;
 
-       if (thread->pending_promoter[thread->pending_promoter_index] == mutex) {
-               thread->pending_promoter[thread->pending_promoter_index] = NULL;
-               if (thread->pending_promoter_index > 0)
-                       thread->pending_promoter_index--;
+       /*
+        * If waiting_for_mutex is set, then this thread was previously blocked waiting on this lock
+        * If it's un-set, then this thread stole the lock from another waiter.
+        */
+       if (thread->waiting_for_mutex == mutex) {
+               assert(mutex->lck_mtx_waiters > 0);
+
+               thread->waiting_for_mutex = NULL;
                mutex->lck_mtx_waiters--;
        }
 
-       if (mutex->lck_mtx_waiters)
+       assert(thread->waiting_for_mutex == NULL);
+
+       if (mutex->lck_mtx_waiters > 0) {
                priority = mutex->lck_mtx_pri;
-       else {
+       } else {
+               /* I was the last waiter, so the mutex is no longer promoted or contended */
                mutex->lck_mtx_pri = 0;
                priority = 0;
        }
 
        if (priority || thread->was_promoted_on_wakeup) {
-               s = splsched();
+               __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
+
+               /*
+                * Note: was_promoted_on_wakeup can happen for multiple wakeups in a row without
+                * an intervening acquire if a thread keeps failing to acquire the lock
+                *
+                * If priority is true but not promoted on wakeup,
+                * then this is a lock steal of a promoted mutex, so it needs a ++ of promotions.
+                *
+                * If promoted on wakeup is true, but priority is not,
+                * then this is the last owner, and the last owner does not need a promotion.
+                */
+
+               spl_t s = splsched();
                thread_lock(thread);
 
+               assert_promotions_invariant(thread);
+
+               if (thread->was_promoted_on_wakeup)
+                       assert(thread->promotions > 0);
+
                if (priority) {
-                       thread->promotions++;
-                       thread->sched_flags |= TH_SFLAG_PROMOTED;
-                       if (thread->sched_pri < priority) {
-                               KERNEL_DEBUG_CONSTANT(
-                                       MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE,
-                                                       thread->sched_pri, priority, 0, trace_lck, 0);
-                               /* Do not promote past promotion ceiling */
-                               assert(priority <= MAXPRI_PROMOTE);
-                               set_sched_pri(thread, priority);
+                       if (thread->promotions++ == 0) {
+                               /* This is the first promotion for holder */
+                               sched_thread_promote_to_pri(thread, priority, trace_lck);
+                       } else {
+                               /*
+                                * Holder was previously promoted due to a different mutex, raise to match this one
+                                * Or, this thread was promoted on wakeup but someone else later contended on mutex
+                                * at higher priority before we got here
+                                */
+                               sched_thread_update_promotion_to_pri(thread, priority, trace_lck);
                        }
                }
+
                if (thread->was_promoted_on_wakeup) {
                        thread->was_promoted_on_wakeup = 0;
-                       if (thread->promotions == 0)
-                               lck_mtx_clear_promoted(thread, trace_lck);
+                       if (--thread->promotions == 0)
+                               sched_thread_unpromote(thread, trace_lck);
                }
 
+               assert_promotions_invariant(thread);
+
+               if (priority && (thread->sched_flags & TH_SFLAG_DEPRESS) == 0)
+                       assert(thread->sched_pri >= priority);
+
                thread_unlock(thread);
                splx(s);
        }
@@ -795,6 +1188,10 @@ lck_mtx_lock_acquire(
  * Invoked on unlock when there is contention.
  *
  * Called with the interlock locked.
+ *
+ * TODO: the 'waiters' flag does not indicate waiters exist on the waitqueue,
+ * it indicates waiters exist between wait and acquire.
+ * This means that here we may do extra unneeded wakeups.
  */
 void
 lck_mtx_unlock_wakeup (
@@ -803,7 +1200,7 @@ lck_mtx_unlock_wakeup (
 {
        thread_t                thread = current_thread();
        lck_mtx_t               *mutex;
-       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
 
        if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)
                mutex = lck;
@@ -813,20 +1210,36 @@ lck_mtx_unlock_wakeup (
        if (thread != holder)
                panic("lck_mtx_unlock_wakeup: mutex %p holder %p\n", mutex, holder);
 
-       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START, trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(holder), 0, 0, 0);
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START,
+                    trace_lck, (uintptr_t)thread_tid(thread), 0, 0, 0);
 
        assert(mutex->lck_mtx_waiters > 0);
+       assert(thread->was_promoted_on_wakeup == 0);
+       assert(thread->waiting_for_mutex == NULL);
+
+       /*
+        * The waiters count does not precisely match the number of threads on the waitqueue,
+        * therefore we cannot assert that we actually wake up a thread here
+        */
        if (mutex->lck_mtx_waiters > 1)
                thread_wakeup_one_with_pri(LCK_MTX_EVENT(lck), lck->lck_mtx_pri);
        else
                thread_wakeup_one(LCK_MTX_EVENT(lck));
 
-       if (thread->promotions > 0) {
-               spl_t           s = splsched();
-
+	/* When mutex->lck_mtx_pri is set, it means I as the owner have a promotion. */
+       if (mutex->lck_mtx_pri) {
+               spl_t s = splsched();
                thread_lock(thread);
-               if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED))
-                       lck_mtx_clear_promoted(thread, trace_lck);
+
+               assert(thread->promotions > 0);
+
+               assert_promotions_invariant(thread);
+
+               if (--thread->promotions == 0)
+                       sched_thread_unpromote(thread, trace_lck);
+
+               assert_promotions_invariant(thread);
+
                thread_unlock(thread);
                splx(s);
        }
@@ -834,21 +1247,50 @@ lck_mtx_unlock_wakeup (
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
 }
 
+/*
+ * Callout from the waitqueue code from inside thread_wakeup_one_with_pri
+ * At splsched, thread is pulled from waitq, still locked, not on runqueue yet
+ *
+ * We always make sure to set the promotion flag, even if the thread is already at this priority,
+ * so that it doesn't go down.
+ */
 void
-lck_mtx_unlockspin_wakeup (
-       lck_mtx_t                       *lck)
+lck_mtx_wakeup_adjust_pri(thread_t thread, integer_t priority)
 {
-       assert(lck->lck_mtx_waiters > 0);
-       thread_wakeup_one(LCK_MTX_EVENT(lck));
+       assert(priority <= MAXPRI_PROMOTE);
+       assert(thread->waiting_for_mutex != NULL);
 
-       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_NONE, VM_KERNEL_UNSLIDE_OR_PERM(lck), 0, 0, 1, 0);
-#if CONFIG_DTRACE
-       /*
-        * When there are waiters, we skip the hot-patch spot in the
-        * fastpath, so we record it here.
-        */
-       LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lck, 0);
-#endif
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(thread->waiting_for_mutex);
+
+       assert_promotions_invariant(thread);
+
+       if (thread->was_promoted_on_wakeup) {
+               /* Thread was previously promoted, but contended again */
+               sched_thread_update_promotion_to_pri(thread, priority, trace_lck);
+               return;
+       }
+
+       if (thread->promotions > 0 && priority <= thread->promotion_priority) {
+               /*
+                * Thread is already promoted to the right level, no need to do more
+                * I can draft off of another promotion here, which is OK
+                * because I know the thread will soon run acquire to get its own promotion
+                */
+               assert((thread->sched_flags & TH_SFLAG_PROMOTED) == TH_SFLAG_PROMOTED);
+               return;
+       }
+
+       thread->was_promoted_on_wakeup = 1;
+
+       if (thread->promotions++ == 0) {
+               /* This is the first promotion for this thread */
+               sched_thread_promote_to_pri(thread, priority, trace_lck);
+       } else {
+               /* Holder was previously promoted due to a different mutex, raise to match this one */
+               sched_thread_update_promotion_to_pri(thread, priority, trace_lck);
+       }
+
+       assert_promotions_invariant(thread);
 }
 
 
@@ -971,7 +1413,7 @@ lck_rw_sleep(
                        /* Only if the caller wanted the lck_rw_t returned unlocked should we drop to 0 */
                        assert(lck_sleep_action & LCK_SLEEP_UNLOCK);
 
-                       lck_rw_clear_promotion(thread);
+                       lck_rw_clear_promotion(thread, unslide_for_kdebug(event));
                }
        }
 
@@ -1025,7 +1467,7 @@ lck_rw_sleep_deadline(
                        /* Only if the caller wanted the lck_rw_t returned unlocked should we drop to 0 */
                        assert(lck_sleep_action & LCK_SLEEP_UNLOCK);
 
-                       lck_rw_clear_promotion(thread);
+                       lck_rw_clear_promotion(thread, unslide_for_kdebug(event));
                }
        }
 
@@ -1037,11 +1479,11 @@ lck_rw_sleep_deadline(
  *
  * We support a limited form of reader-writer
  * lock promotion whose effects are:
- * 
+ *
  *   * Qualifying threads have decay disabled
  *   * Scheduler priority is reset to a floor of
  *     of their statically assigned priority
- *     or BASEPRI_BACKGROUND
+ *     or MINPRI_RWLOCK
  *
  * The rationale is that lck_rw_ts do not have
  * a single owner, so we cannot apply a directed
@@ -1087,35 +1529,37 @@ lck_rw_sleep_deadline(
  * lck_rw_clear_promotion: Undo priority promotions when the last RW
  * lock is released by a thread (if a promotion was active)
  */
-void lck_rw_clear_promotion(thread_t thread)
+void lck_rw_clear_promotion(thread_t thread, uintptr_t trace_obj)
 {
        assert(thread->rwlock_count == 0);
 
        /* Cancel any promotions if the thread had actually blocked while holding a RW lock */
        spl_t s = splsched();
-
        thread_lock(thread);
 
-       if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
-               thread->sched_flags &= ~TH_SFLAG_RW_PROMOTED;
+       if (thread->sched_flags & TH_SFLAG_RW_PROMOTED)
+               sched_thread_unpromote_reason(thread, TH_SFLAG_RW_PROMOTED, trace_obj);
 
-               if (thread->sched_flags & TH_SFLAG_PROMOTED) {
-                       /* Thread still has a mutex promotion */
-               } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
-                       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE) | DBG_FUNC_NONE,
-                                             (uintptr_t)thread_tid(thread), thread->sched_pri, DEPRESSPRI, 0, 0);
+       thread_unlock(thread);
+       splx(s);
+}
 
-                       set_sched_pri(thread, DEPRESSPRI);
-               } else {
-                       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE) | DBG_FUNC_NONE,
-                                             (uintptr_t)thread_tid(thread), thread->sched_pri, thread->base_pri, 0, 0);
+/*
+ * Callout from context switch if the thread goes
+ * off core with a positive rwlock_count
+ *
+ * Called at splsched with the thread locked
+ */
+void
+lck_rw_set_promotion_locked(thread_t thread)
+{
+       if (LcksOpts & disLkRWPrio)
+               return;
 
-                       thread_recompute_sched_pri(thread, FALSE);
-               }
-       }
+       assert(thread->rwlock_count > 0);
 
-       thread_unlock(thread);
-       splx(s);
+       if (!(thread->sched_flags & TH_SFLAG_RW_PROMOTED))
+               sched_thread_promote_reason(thread, TH_SFLAG_RW_PROMOTED, 0);
 }
 
 kern_return_t
@@ -1202,3 +1646,58 @@ host_lockgroup_info(
        return(KERN_SUCCESS);
 }
 
+/*
+ * Atomic primitives, prototyped in kern/simple_lock.h
+ * Noret versions are more efficient on some architectures
+ */
+       
+uint32_t
+hw_atomic_add(volatile uint32_t *dest, uint32_t delt)
+{
+       ALIGN_TEST(dest,uint32_t);
+       return __c11_atomic_fetch_add(ATOMIC_CAST(uint32_t,dest), delt, memory_order_relaxed) + delt;
+}
+
+uint32_t
+hw_atomic_sub(volatile uint32_t *dest, uint32_t delt)
+{
+       ALIGN_TEST(dest,uint32_t);
+       return __c11_atomic_fetch_sub(ATOMIC_CAST(uint32_t,dest), delt, memory_order_relaxed) - delt;
+}
+
+uint32_t
+hw_atomic_or(volatile uint32_t *dest, uint32_t mask)
+{
+       ALIGN_TEST(dest,uint32_t);
+       return __c11_atomic_fetch_or(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed) | mask;
+}
+
+void
+hw_atomic_or_noret(volatile uint32_t *dest, uint32_t mask)
+{
+       ALIGN_TEST(dest,uint32_t);
+       __c11_atomic_fetch_or(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed);
+}
+
+uint32_t
+hw_atomic_and(volatile uint32_t *dest, uint32_t mask)
+{
+       ALIGN_TEST(dest,uint32_t);
+       return __c11_atomic_fetch_and(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed) & mask;
+}
+
+void
+hw_atomic_and_noret(volatile uint32_t *dest, uint32_t mask)
+{
+       ALIGN_TEST(dest,uint32_t);
+       __c11_atomic_fetch_and(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed);
+}
+
+uint32_t
+hw_compare_and_store(uint32_t oldval, uint32_t newval, volatile uint32_t *dest)
+{
+       ALIGN_TEST(dest,uint32_t);
+       return __c11_atomic_compare_exchange_strong(ATOMIC_CAST(uint32_t,dest), &oldval, newval,
+                       memory_order_acq_rel_smp, memory_order_relaxed);
+}
+
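A hedged caller-side sketch of the primitives above (not part of this change): the add/sub/or/and variants return the post-operation value, and hw_compare_and_store() returns nonzero only when the swap happened; example_refcnt and example_flags are hypothetical.

	static volatile uint32_t example_refcnt;
	static volatile uint32_t example_flags;

	static void
	example_atomics(void)
	{
		uint32_t after = hw_atomic_add(&example_refcnt, 1);	/* value after the increment */

		if (hw_atomic_sub(&example_refcnt, 1) == 0) {
			/* last reference dropped */
		}

		hw_atomic_or_noret(&example_flags, 0x1);		/* set a bit, result not needed */

		if (hw_compare_and_store(0x1, 0x3, &example_flags)) {
			/* example_flags held 0x1 and now holds 0x3 */
		}

		(void)after;
	}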