]> git.saurik.com Git - apple/xnu.git/blobdiff - osfmk/i386/locks_i386.c
xnu-4570.51.1.tar.gz
[apple/xnu.git] / osfmk / i386 / locks_i386.c
index 048dc704dd89fb0cbf1439a884221c5a114cab87..039584749b070f6c7c6d286f792fb484cef84892 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
  *     Locking primitives implementation
  */
 
-#include <mach_kdb.h>
 #include <mach_ldebug.h>
 
-#include <kern/lock.h>
 #include <kern/locks.h>
 #include <kern/kalloc.h>
 #include <kern/misc_protos.h>
 #include <kern/debug.h>
 #include <string.h>
 
-#if    MACH_KDB
-#include <ddb/db_command.h>
-#include <ddb/db_output.h>
-#include <ddb/db_sym.h>
-#include <ddb/db_print.h>
-#endif /* MACH_KDB */
 #include <i386/machine_routines.h> /* machine_timeout_suspended() */
+#include <machine/atomic.h>
 #include <machine/machine_cpu.h>
 #include <i386/mp.h>
 
 #if    CONFIG_DTRACE
 #define NEED_DTRACE_DEFS
 #include <../bsd/sys/lockstat.h>
+
+#define DTRACE_RW_SHARED       0x0     //reader
+#define DTRACE_RW_EXCL         0x1     //writer
+#define DTRACE_NO_FLAG         0x0     //not applicable
+
 #endif
 
 #define        LCK_RW_LCK_EXCLUSIVE_CODE       0x100
 
 unsigned int LcksOpts=0;
 
-/* Forwards */
-
-#if    MACH_KDB
-void   db_print_simple_lock(
-                       simple_lock_t   addr);
-#endif /* MACH_KDB */
+#if DEVELOPMENT || DEBUG
+unsigned int LckDisablePreemptCheck = 0;
+#endif
 
+/* Forwards */
 
 #if    USLOCK_DEBUG
 /*
@@ -139,6 +135,7 @@ decl_simple_lock_data(extern , printf_lock)
 decl_simple_lock_data(extern , panic_lock)
 #endif /* USLOCK_DEBUG */
 
+extern unsigned int not_in_kdp;
 
 /*
  *     We often want to know the addresses of the callers
@@ -163,6 +160,63 @@ typedef void       *pc_t;
 #endif /* lint */
 #endif /* USLOCK_DEBUG */
 
+// Enforce program order of loads and stores: relaxed atomic access, with _Generic selecting the uint32_t or uintptr_t form.
+#define ordered_load(target) _Generic( (target),\
+               uint32_t* : __c11_atomic_load((_Atomic uint32_t* )(target), memory_order_relaxed), \
+               uintptr_t*: __c11_atomic_load((_Atomic uintptr_t*)(target), memory_order_relaxed) )
+#define ordered_store(target, value) _Generic( (target),\
+               uint32_t* : __c11_atomic_store((_Atomic uint32_t* )(target), (value), memory_order_relaxed), \
+               uintptr_t*: __c11_atomic_store((_Atomic uintptr_t*)(target), (value), memory_order_relaxed) )
+
+/*
+ * The atomic exchange API is a low-level abstraction of the operations
+ * to atomically read, modify, and write a pointer.  This abstraction works
+ * for both Intel and ARMv8.1 compare-and-exchange atomic instructions as
+ * well as the ARM exclusive instructions.
+ *
+ * atomic_exchange_begin32() - begin exchange and retrieve current value
+ * atomic_exchange_complete32() - conclude an exchange
+ * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin32()
+ */
+static uint32_t        /* Begin an exchange: relaxed-load *target, copy it to *previous, and return it. */
+atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
+{
+       uint32_t        val;
+
+       (void)ord;                      // Memory order not used
+       val = __c11_atomic_load((_Atomic uint32_t *)target, memory_order_relaxed);
+       *previous = val;                /* snapshot that callers hand to atomic_exchange_complete32() as the expected value */
+       return val;
+}
+
+static boolean_t       /* Conclude an exchange; returns the result of the strong compare-exchange below. */
+atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
+{      /* newval is stored only if *target still equals previous; ord is the success order, relaxed the failure order */
+       return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
+}
+
+static void    /* Cancel an exchange begun with atomic_exchange_begin32(); the body is empty in this file. */
+atomic_exchange_abort(void) { }
+
+static boolean_t       /* OR set_mask into *target unless a test_mask bit is set; TRUE if the bits were stored, FALSE otherwise. */
+atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
+{
+       uint32_t        value, prev;
+
+       for ( ; ; ) {   /* loops only when the compare-exchange fails */
+               value = atomic_exchange_begin32(target, &prev, ord);
+               if (value & test_mask) {        /* a tested bit is set: give up */
+                       if (wait)
+                               cpu_pause();    /* pauses once; the function still returns FALSE below instead of retrying */
+                       else
+                               atomic_exchange_abort();
+                       return FALSE;
+               }
+               value |= set_mask;
+               if (atomic_exchange_complete32(target, prev, value, ord))
+                       return TRUE;
+       }
+}
 
 /*
  *     Portable lock package implementation of usimple_locks.
@@ -182,35 +236,20 @@ int               usld_lock_common_checks(usimple_lock_t, char *);
 #endif /* USLOCK_DEBUG */
 
 
-extern int lck_rw_grab_want(lck_rw_t *lck);
-extern int lck_rw_grab_shared(lck_rw_t *lck);
-extern int lck_rw_held_read_or_upgrade(lck_rw_t *lck);
-
-
 /*
  * Forward definitions
  */
 
-void lck_rw_lock_shared_gen(
-       lck_rw_t        *lck);
-
-void lck_rw_lock_exclusive_gen(
-       lck_rw_t        *lck);
-
-boolean_t lck_rw_lock_shared_to_exclusive_success(
-       lck_rw_t        *lck);
-
-boolean_t lck_rw_lock_shared_to_exclusive_failure(
-       lck_rw_t        *lck,
-       int             prior_lock_state);
-
-void lck_rw_lock_exclusive_to_shared_gen(
-       lck_rw_t        *lck,
-       int             prior_lock_state);
-
-lck_rw_type_t lck_rw_done_gen(
-       lck_rw_t        *lck,
-       int             prior_lock_state);
+static void lck_rw_lock_shared_gen(lck_rw_t *lck);
+static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
+static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
+static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
+static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
+static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
+void lck_rw_clear_promotions_x86(thread_t thread);
+static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
+static boolean_t lck_rw_grab_want(lck_rw_t *lock);
+static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
 
 /*
  *      Routine:        lck_spin_alloc_init
@@ -298,7 +337,60 @@ boolean_t
 lck_spin_try_lock(
        lck_spin_t      *lck)
 {
-       return((boolean_t)usimple_lock_try((usimple_lock_t) lck));
+       boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck);
+#if    DEVELOPMENT || DEBUG
+       if (lrval) {
+               pltrace(FALSE);
+       }
+#endif
+       return(lrval);
+}
+
+/*
+ *     Routine:        lck_spin_assert
+ */
+void   /* Panic unless the lock's holder matches the requested assertion type. */
+lck_spin_assert(lck_spin_t *lock, unsigned int type)
+{
+       thread_t thread, holder;
+       uintptr_t state;
+
+       if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {    /* only these two types are accepted */
+               panic("lck_spin_assert(): invalid arg (%u)", type);
+       }
+
+       state = lock->interlock;
+       holder = (thread_t)state;       /* the interlock word is read as the holding thread */
+       thread = current_thread();
+       if (type == LCK_ASSERT_OWNED) {
+               if (__improbable(holder == THREAD_NULL)) {      /* nobody holds the lock */
+                       panic("Lock not owned %p = %lx", lock, state);
+               }
+               if (__improbable(holder != thread)) {           /* held, but by a different thread */
+                       panic("Lock not owned by current thread %p = %lx", lock, state);
+               }
+       } else if (type == LCK_ASSERT_NOTOWNED) {
+               if (__improbable(holder != THREAD_NULL)) {      /* any holder fails NOTOWNED; only the message differs */
+                       if (holder == thread) {
+                               panic("Lock owned by current thread %p = %lx", lock, state);
+                       } else {
+                               panic("Lock %p owned by thread %p", lock, holder);
+                       }
+               }
+       }
+}
+
+/*
+ *      Routine: kdp_lck_spin_is_acquired
+ *      NOT SAFE: To be used only by kernel debugger to avoid deadlock.
+ *      Returns: TRUE if lock is acquired.
+ */
+boolean_t
+kdp_lck_spin_is_acquired(lck_spin_t *lck) {
+       if (not_in_kdp) {       /* panics whenever not_in_kdp is nonzero */
+               panic("panic: spinlock acquired check done outside of kernel debugger");
+       }
+       return (lck->interlock != 0)? TRUE : FALSE;     /* a nonzero interlock word is reported as acquired */
 }
 
 /*
@@ -322,20 +414,16 @@ usimple_lock_init(
 volatile uint32_t spinlock_owner_cpu = ~0;
 volatile usimple_lock_t spinlock_timed_out;
 
-static uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
-       uint64_t deadline;
+uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
        uint32_t i;
 
        for (i = 0; i < real_ncpus; i++) {
-               if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) {
+               if ((cpu_data_ptr[i] != NULL) && ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr)) {
                        spinlock_owner_cpu = i;
-                       if ((uint32_t) cpu_number() == i)
-                               break;
-                       cpu_datap(i)->cpu_NMI_acknowledged = FALSE;
-                       cpu_NMI_interrupt(i);
-                       deadline = mach_absolute_time() + (LockTimeOut * 2);
-                       while (mach_absolute_time() < deadline && cpu_datap(i)->cpu_NMI_acknowledged == FALSE)
-                               cpu_pause();
+                       if ((uint32_t) cpu_number() != i) {
+                               /* Cause NMI and panic on the owner's cpu */
+                               NMIPI_panic(cpu_to_cpumask(i), SPINLOCK_TIMEOUT);
+                       }
                        break;
                }
        }
@@ -373,13 +461,21 @@ usimple_lock(
                        uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
                        spinlock_timed_out = l;
                        lock_cpu = spinlock_timeout_NMI(lowner);
-                       panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner,  current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data);
+                       panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
+                             l, lowner,  current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time());
                }
        }
+#if DEVELOPMENT || DEBUG
+               pltrace(FALSE);
+#endif
+
        USLDBG(usld_lock_post(l, pc));
 #else
        simple_lock((simple_lock_t)l);
 #endif
+#if CONFIG_DTRACE
+       LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0);
+#endif
 }
 
 
@@ -399,6 +495,9 @@ usimple_unlock(
 
        OBTAIN_PC(pc);
        USLDBG(usld_unlock(l, pc));
+#if DEVELOPMENT || DEBUG
+               pltrace(TRUE);
+#endif
        hw_lock_unlock(&l->interlock);
 #else
        simple_unlock_rwmb((simple_lock_t)l);
@@ -429,7 +528,10 @@ usimple_lock_try(
        OBTAIN_PC(pc);
        USLDBG(usld_lock_try_pre(l, pc));
        if ((success = hw_lock_try(&l->interlock))) {
-               USLDBG(usld_lock_try_post(l, pc));
+#if DEVELOPMENT || DEBUG
+               pltrace(FALSE);
+#endif
+       USLDBG(usld_lock_try_post(l, pc));
        }
        return success;
 #else
@@ -437,6 +539,22 @@ usimple_lock_try(
 #endif
 }
 
+/*
+ * Acquire a usimple_lock while polling for pending TLB flushes
+ * and spinning on a lock.
+ *
+ */
+void
+usimple_lock_try_lock_loop(usimple_lock_t l)
+{
+       boolean_t istate = ml_get_interrupts_enabled(); /* sampled once, before the loop */
+       while (!simple_lock_try((l))) { /* retry until the try-lock succeeds */
+               if (!istate)    /* interrupts were reported disabled at entry */
+                       handle_pending_TLB_flushes();
+               cpu_pause();
+       }
+}
+
 #if    USLOCK_DEBUG
 /*
  *     States of a usimple_lock.  The default when initializing
@@ -546,7 +664,7 @@ usld_lock_post(
        usimple_lock_t  l,
        pc_t            pc)
 {
-       register int    mycpu;
+       int     mycpu;
        char    caller[] = "successful usimple_lock";
 
 
@@ -583,7 +701,7 @@ usld_unlock(
        usimple_lock_t  l,
        pc_t            pc)
 {
-       register int    mycpu;
+       int     mycpu;
        char    caller[] = "usimple_unlock";
 
 
@@ -648,7 +766,7 @@ usld_lock_try_post(
        usimple_lock_t  l,
        pc_t            pc)
 {
-       register int    mycpu;
+       int     mycpu;
        char    caller[] = "successful usimple_lock_try";
 
        if (!usld_lock_common_checks(l, caller))
@@ -699,126 +817,6 @@ usl_trace(
 
 #endif /* USLOCK_DEBUG */
 
-/*
- *     Routine:        lock_alloc
- *     Function:
- *             Allocate a lock for external users who cannot
- *             hard-code the structure definition into their
- *             objects.
- *             For now just use kalloc, but a zone is probably
- *             warranted.
- */
-lock_t *
-lock_alloc(
-       boolean_t       can_sleep,
-       unsigned short  tag,
-       unsigned short  tag1)
-{
-       lock_t          *l;
-
-       if ((l = (lock_t *)kalloc(sizeof(lock_t))) != 0)
-         lock_init(l, can_sleep, tag, tag1);
-       return(l);
-}
-
-/*
- *     Routine:        lock_free
- *     Function:
- *             Free a lock allocated for external users.
- *             For now just use kfree, but a zone is probably
- *             warranted.
- */
-void
-lock_free(
-       lock_t          *l)
-{
-       kfree(l, sizeof(lock_t));
-}
-
-         
-/*
- *     Routine:        lock_init
- *     Function:
- *             Initialize a lock; required before use.
- *             Note that clients declare the "struct lock"
- *             variables and then initialize them, rather
- *             than getting a new one from this module.
- */
-void
-lock_init(
-       lock_t          *l,
-       boolean_t       can_sleep,
-       __unused unsigned short tag,
-       __unused unsigned short tag1)
-{
-       hw_lock_byte_init(&l->lck_rw_interlock);
-       l->lck_rw_want_write = FALSE;
-       l->lck_rw_want_upgrade = FALSE;
-       l->lck_rw_shared_count = 0;
-       l->lck_rw_can_sleep = can_sleep;
-       l->lck_rw_tag = tag;
-       l->lck_rw_priv_excl = 1;
-       l->lck_r_waiting = l->lck_w_waiting = 0;
-}
-
-
-/*
- *     Sleep locks.  These use the same data structure and algorithm
- *     as the spin locks, but the process sleeps while it is waiting
- *     for the lock.  These work on uniprocessor systems.
- */
-
-#define DECREMENTER_TIMEOUT 1000000
-
-void
-lock_write(
-       register lock_t * l)
-{
-       lck_rw_lock_exclusive(l);
-}
-
-void
-lock_done(
-       register lock_t * l)
-{
-       (void) lck_rw_done(l);
-}
-
-void
-lock_read(
-       register lock_t * l)
-{
-       lck_rw_lock_shared(l);
-}
-
-
-/*
- *     Routine:        lock_read_to_write
- *     Function:
- *             Improves a read-only lock to one with
- *             write permission.  If another reader has
- *             already requested an upgrade to a write lock,
- *             no lock is held upon return.
- *
- *             Returns FALSE if the upgrade *failed*.
- */
-
-boolean_t
-lock_read_to_write(
-       register lock_t * l)
-{
-       return lck_rw_lock_shared_to_exclusive(l);
-}
-
-void
-lock_write_to_read(
-       register lock_t * l)
-{
-       lck_rw_lock_exclusive_to_shared(l);
-}
-
-
-
 /*
  *      Routine:        lck_rw_alloc_init
  */
@@ -883,6 +881,9 @@ lck_rw_destroy(
 {
        if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
                return;
+#if MACH_LDEBUG
+       lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
+#endif
        lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
        lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
        lck_grp_deallocate(grp);
@@ -897,29 +898,22 @@ lck_rw_destroy(
 
 #define DECREMENTER_TIMEOUT 1000000
 
-#define RW_LOCK_READER_EVENT(x)                \
-               ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_tag))))
-
-#define RW_LOCK_WRITER_EVENT(x)                \
-               ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8))))
-
 /*
  * We disable interrupts while holding the RW interlock to prevent an
  * interrupt from exacerbating hold time.
  * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
  */
-static boolean_t
+static inline boolean_t
 lck_interlock_lock(lck_rw_t *lck)
 {
        boolean_t       istate;
 
        istate = ml_set_interrupts_enabled(FALSE);      
        hw_lock_byte_lock(&lck->lck_rw_interlock);
-
        return istate;
 }
 
-static void
+static inline void
 lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
 {               
        hw_lock_byte_unlock(&lck->lck_rw_interlock);
@@ -940,6 +934,13 @@ lck_rw_lock_pause(boolean_t interrupts_enabled)
        cpu_pause();
 }
 
+static inline boolean_t        /* TRUE if any SHARED_MASK, INTERLOCK, or WANT_UPGRADE bit is set in the lock word. */
+lck_rw_held_read_or_upgrade(lck_rw_t *lock)
+{
+       if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE))
+               return TRUE;
+       return FALSE;
+}
 
 /*
  * compute the deadline to spin against when
@@ -968,13 +969,68 @@ lck_rw_deadline_for_spin(lck_rw_t *lck)
 }
 
 
+/*
+ * Spin while interlock is held.
+ */
+
+static inline void
+lck_rw_interlock_spin(lck_rw_t *lock)
+{
+       while (ordered_load(&lock->data) & LCK_RW_INTERLOCK) { /* the lock word is re-read on every iteration */
+               cpu_pause();
+       }
+}
+
+static boolean_t       /* Try once to set LCK_RW_WANT_WRITE; FALSE if it was already set or the compare-exchange failed. */
+lck_rw_grab_want(lck_rw_t *lock)
+{
+       uint32_t        data, prev;
+
+       for ( ; ; ) {   /* repeat until a snapshot is taken with the interlock bit clear */
+               data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed);
+               if ((data & LCK_RW_INTERLOCK) == 0)
+                       break;
+               atomic_exchange_abort();
+               lck_rw_interlock_spin(lock);
+       }
+       if (data & LCK_RW_WANT_WRITE) { /* the bit is already set in the snapshot */
+               atomic_exchange_abort();
+               return FALSE;
+       }
+       data |= LCK_RW_WANT_WRITE;
+       return atomic_exchange_complete32(&lock->data, prev, data, memory_order_relaxed);       /* not retried here on failure */
+}
+
+static boolean_t       /* Try once to add a reader to the lock word; FALSE if refused or the compare-exchange failed. */
+lck_rw_grab_shared(lck_rw_t *lock)
+{
+       uint32_t        data, prev;
+
+       for ( ; ; ) {   /* repeat until a snapshot is taken with the interlock bit clear */
+               data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
+               if ((data & LCK_RW_INTERLOCK) == 0)
+                       break;
+               atomic_exchange_abort();
+               lck_rw_interlock_spin(lock);
+       }
+       if (data & (LCK_RW_WANT_WRITE | LCK_RW_WANT_UPGRADE)) {        /* a writer or upgrader has claimed the lock */
+               if (((data & LCK_RW_SHARED_MASK) == 0) || (data & LCK_RW_PRIV_EXCL)) { /* refuse when the reader count is 0 or PRIV_EXCL is set */
+                       atomic_exchange_abort();
+                       return FALSE;
+               }
+       }
+       data += LCK_RW_SHARED_READER;   /* one more reader */
+       return atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp);  /* not retried here on failure */
+}
+
 /*
  *      Routine:        lck_rw_lock_exclusive
  */
-void
+static void
 lck_rw_lock_exclusive_gen(
        lck_rw_t        *lck)
 {
+       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
        uint64_t        deadline = 0;
        int             slept = 0;
        int             gotlock = 0;
@@ -1015,12 +1071,12 @@ lck_rw_lock_exclusive_gen(
 
                deadline = lck_rw_deadline_for_spin(lck);
 
-               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
+               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
                
                while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
                        lck_rw_lock_pause(istate);
 
-               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, gotlock, 0);
+               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
 
                if (gotlock)
                        break;
@@ -1035,10 +1091,11 @@ lck_rw_lock_exclusive_gen(
 
                        if (lck->lck_rw_want_write) {
 
-                               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
+                               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
 
                                lck->lck_w_waiting = TRUE;
 
+                               thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
                                res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
                                lck_interlock_unlock(lck, istate);
 
@@ -1046,7 +1103,7 @@ lck_rw_lock_exclusive_gen(
                                        res = thread_block(THREAD_CONTINUE_NULL);
                                        slept++;
                                }
-                               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0);
+                               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
                        } else {
                                lck->lck_rw_want_write = TRUE;
                                lck_interlock_unlock(lck, istate);
@@ -1094,12 +1151,12 @@ lck_rw_lock_exclusive_gen(
 
                deadline = lck_rw_deadline_for_spin(lck);
 
-               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
+               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
 
                while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
                        lck_rw_lock_pause(istate);
 
-               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, lockheld, 0);
+               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);
 
                if ( !lockheld)
                        break;
@@ -1113,10 +1170,11 @@ lck_rw_lock_exclusive_gen(
                        istate = lck_interlock_lock(lck);
 
                        if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
-                               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
+                               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
 
                                lck->lck_w_waiting = TRUE;
 
+                               thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
                                res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
                                lck_interlock_unlock(lck, istate);
 
@@ -1124,7 +1182,7 @@ lck_rw_lock_exclusive_gen(
                                        res = thread_block(THREAD_CONTINUE_NULL);
                                        slept++;
                                }
-                               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0);
+                               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
                        } else {
                                lck_interlock_unlock(lck, istate);
                                /*
@@ -1167,11 +1225,53 @@ lck_rw_lock_exclusive_gen(
 #endif
 }
 
+/*
+ *      Routine:        lck_rw_done
+ */
+
+lck_rw_type_t lck_rw_done(lck_rw_t *lock)      /* Drop one hold from the lock word, then pass the prior word to lck_rw_done_gen(). */
+{
+       uint32_t        data, prev;
+
+       for ( ; ; ) {   /* retried whenever the interlock is set or the compare-exchange fails */
+               data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
+               if (data & LCK_RW_INTERLOCK) {          /* wait for interlock to clear */
+                       atomic_exchange_abort();
+                       lck_rw_interlock_spin(lock);
+                       continue;
+               }
+               if (data & LCK_RW_SHARED_MASK) {
+                       data -= LCK_RW_SHARED_READER;   /* drop one reader */
+                       if ((data & LCK_RW_SHARED_MASK) == 0)   /* if reader count has now gone to 0, check for waiters */
+                               goto check_waiters;
+               } else {                                        /* if reader count == 0, must be exclusive lock */
+                       if (data & LCK_RW_WANT_UPGRADE) {
+                               data &= ~(LCK_RW_WANT_UPGRADE);
+                       } else {
+                               if (data & LCK_RW_WANT_WRITE)
+                                       data &= ~(LCK_RW_WANT_EXCL);
+                               else                                    /* lock is not 'owned', panic */
+                                       panic("Releasing non-exclusive RW lock without a reader refcount!");
+                       }
+check_waiters: /* reached by fall-through from the exclusive path, or by goto when the last reader leaves */
+                       if (prev & LCK_RW_W_WAITING) {
+                               data &= ~(LCK_RW_W_WAITING);
+                               if ((prev & LCK_RW_PRIV_EXCL) == 0)     /* R_WAITING is kept when PRIV_EXCL is set */
+                                       data &= ~(LCK_RW_R_WAITING);
+                       } else
+                               data &= ~(LCK_RW_R_WAITING);
+               }
+               if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp))
+                       break;
+               cpu_pause();    /* compare-exchange failed; pause before retrying */
+       }
+       return lck_rw_done_gen(lock, prev);
+}
 
 /*
  *      Routine:        lck_rw_done_gen
  *
- *     called from the assembly language wrapper...
+ *     called from lck_rw_done()
  *     prior_lock_state is the value in the 1st
  *     word of the lock at the time of a successful
  *     atomic compare and exchange with the new value...
@@ -1185,13 +1285,15 @@ lck_rw_lock_exclusive_gen(
  *     this by examining the state of the lock before
  *     we changed it
  */
-lck_rw_type_t
+static lck_rw_type_t
 lck_rw_done_gen(
        lck_rw_t        *lck,
-       int             prior_lock_state)
+       uint32_t        prior_lock_state)
 {
        lck_rw_t        *fake_lck;
        lck_rw_type_t   lock_type;
+       thread_t        thread;
+       uint32_t        rwlock_count;
 
        /*
         * prior_lock state is a snapshot of the 1st word of the
@@ -1213,6 +1315,19 @@ lck_rw_done_gen(
        else
                lock_type = LCK_RW_TYPE_EXCLUSIVE;
 
+       /* Check if dropping the lock means that we need to unpromote */
+       thread = current_thread();
+       rwlock_count = thread->rwlock_count--;
+#if MACH_LDEBUG
+       if (rwlock_count == 0) {
+               panic("rw lock count underflow for thread %p", thread);
+       }
+#endif
+       if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
+               /* sched_flags checked without lock, but will be rechecked while clearing */
+               lck_rw_clear_promotion(thread);
+       }
+
 #if CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
 #endif
@@ -1247,10 +1362,11 @@ lck_rw_unlock_shared(
 {
        lck_rw_type_t   ret;
 
+       assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count);
        ret = lck_rw_done(lck);
 
        if (ret != LCK_RW_TYPE_SHARED)
-               panic("lck_rw_unlock(): lock held in mode: %d\n", ret);
+               panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
 }
 
 
@@ -1286,6 +1402,32 @@ lck_rw_lock(
                panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
 }
 
+/*
+ *     Routine:        lck_rw_lock_shared
+ */
+void   /* Take the lock shared: add a reader on the fast path, else defer to lck_rw_lock_shared_gen(). */
+lck_rw_lock_shared(lck_rw_t *lock)
+{
+       uint32_t        data, prev;
+
+       current_thread()->rwlock_count++;       /* incremented before the acquire attempt */
+       for ( ; ; ) {
+               data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
+               if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {      /* any of these bits sends us to the slow path */
+                       atomic_exchange_abort();
+                       lck_rw_lock_shared_gen(lock);
+                       break;
+               }
+               data += LCK_RW_SHARED_READER;   /* one more reader */
+               if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
+                       break;
+               cpu_pause();    /* compare-exchange failed; pause before retrying */
+       }
+#if    CONFIG_DTRACE
+       LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
+#endif /* CONFIG_DTRACE */
+       return;
+}
 
 /*
  *     Routine:        lck_rw_lock_shared_gen
@@ -1294,16 +1436,17 @@ lck_rw_lock(
  *             is held exclusively... this is where we spin/block
  *             until we can acquire the lock in the shared mode
  */
-void
+static void
 lck_rw_lock_shared_gen(
        lck_rw_t        *lck)
 {
+       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
        uint64_t        deadline = 0;
        int             gotlock = 0;
        int             slept = 0;
        wait_result_t   res = 0;
        boolean_t       istate = -1;
-       
+
 #if    CONFIG_DTRACE
        uint64_t wait_interval = 0;
        int readers_at_sleep = 0;
@@ -1335,13 +1478,13 @@ lck_rw_lock_shared_gen(
                deadline = lck_rw_deadline_for_spin(lck);
 
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
-                            (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
+                            trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
 
                while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
                        lck_rw_lock_pause(istate);
 
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
-                            (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
+                            trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
 
                if (gotlock)
                        break;
@@ -1358,10 +1501,11 @@ lck_rw_lock_shared_gen(
                            ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
 
                                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
-                                            (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
+                                            trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
 
                                lck->lck_r_waiting = TRUE;
 
+                               thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
                                res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
                                lck_interlock_unlock(lck, istate);
 
@@ -1370,7 +1514,7 @@ lck_rw_lock_shared_gen(
                                        slept++;
                                }
                                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
-                                            (int)lck, res, slept, 0, 0);
+                                            trace_lck, res, slept, 0, 0);
                        } else {
                                lck->lck_rw_shared_count++;
                                lck_interlock_unlock(lck, istate);
@@ -1394,6 +1538,65 @@ lck_rw_lock_shared_gen(
 }
 
 
+/*
+ *     Routine:        lck_rw_lock_exclusive
+ *     Function:       take 'lock' exclusively: one atomic test-and-set attempt, otherwise lck_rw_lock_exclusive_gen() */
+
+void
+lck_rw_lock_exclusive(lck_rw_t *lock)
+{
+       current_thread()->rwlock_count++;       /* counted up front, before the acquire is attempted */
+       if (atomic_test_and_set32(&lock->data,
+               (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK),       /* NOTE(review): presumably the bits that must all be clear -- confirm in atomic_test_and_set32() */
+               LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) {   /* NOTE(review): presumably the bit set on success -- confirm */
+#if    CONFIG_DTRACE
+               LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
+#endif /* CONFIG_DTRACE */
+       } else                                  /* test-and-set returned 0: let lck_rw_lock_exclusive_gen() acquire the lock */
+               lck_rw_lock_exclusive_gen(lock);
+}
+
+
+/*
+ *     Routine:        lck_rw_lock_shared_to_exclusive
+ *     Function:       trade the caller's read count for WANT_UPGRADE and return TRUE; if WANT_UPGRADE is already set, drop the read count and return the result of lck_rw_lock_shared_to_exclusive_failure() */
+
+boolean_t
+lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
+{
+       uint32_t        data, prev;
+
+       for ( ; ; ) {
+               data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);   /* 'prev' is handed back to atomic_exchange_complete32() below */
+               if (data & LCK_RW_INTERLOCK) {          /* interlock bit set: abort, call lck_rw_interlock_spin(), start over */
+                       atomic_exchange_abort();
+                       lck_rw_interlock_spin(lock);
+                       continue;
+               }
+               if (data & LCK_RW_WANT_UPGRADE) {       /* WANT_UPGRADE already set: give up our read count instead of upgrading */
+                       data -= LCK_RW_SHARED_READER;
+                       if ((data & LCK_RW_SHARED_MASK) == 0)           /* we were the last reader */
+                               data &= ~(LCK_RW_W_WAITING);            /* so clear the wait indicator */
+                       if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
+                               return lck_rw_lock_shared_to_exclusive_failure(lock, prev);     /* gets the pre-update lock word as prior_lock_state */
+               } else {
+                       data |= LCK_RW_WANT_UPGRADE;            /* ask for WANT_UPGRADE */
+                       data -= LCK_RW_SHARED_READER;           /* and shed our read count */
+                       if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
+                               break;
+               }
+               cpu_pause();                            /* exchange did not complete: pause, then retry */
+       }
+                                               /* we now own the WANT_UPGRADE */
+       if (data & LCK_RW_SHARED_MASK)          /* check to see if all of the readers are drained */
+               lck_rw_lock_shared_to_exclusive_success(lock);  /* if not, we need to go wait */
+#if    CONFIG_DTRACE
+       LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
+#endif
+       return TRUE;
+}
+
+
 /*
  *     Routine:        lck_rw_lock_shared_to_exclusive_failure
  *     Function:
@@ -1402,19 +1605,22 @@ lck_rw_lock_shared_gen(
  *             if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
  *             all we need to do here is determine if a wakeup is needed
  */
-boolean_t
+static boolean_t
 lck_rw_lock_shared_to_exclusive_failure(
        lck_rw_t        *lck,
-       int             prior_lock_state)
+       uint32_t        prior_lock_state)
 {
        lck_rw_t        *fake_lck;
-
-       /*
-        * prior_lock state is a snapshot of the 1st word of the
-        * lock in question... we'll fake up a pointer to it
-        * and carefully not access anything beyond whats defined
-        * in the first word of a lck_rw_t
-        */
+       thread_t        thread = current_thread();
+       uint32_t        rwlock_count;
+
+       /* Check if dropping the lock means that we need to unpromote */
+       rwlock_count = thread->rwlock_count--;
+#if MACH_LDEBUG
+       if (rwlock_count == 0) {
+               panic("rw lock count underflow for thread %p", thread);
+       }
+#endif
        fake_lck = (lck_rw_t *)&prior_lock_state;
 
        if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
@@ -1425,8 +1631,14 @@ lck_rw_lock_shared_to_exclusive_failure(
                 */
                thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
        }
+
+       if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
+               /* sched_flags checked without lock, but will be rechecked while clearing */
+               lck_rw_clear_promotion(thread);
+       }
+
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
-                    (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
+                    VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
 
        return (FALSE);
 }
@@ -1440,10 +1652,11 @@ lck_rw_lock_shared_to_exclusive_failure(
  *             we just need to wait for the rest of the readers to drain
  *             and then we can return as the exclusive holder of this lock
  */
-boolean_t
+static boolean_t
 lck_rw_lock_shared_to_exclusive_success(
        lck_rw_t        *lck)
 {
+       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
        uint64_t        deadline = 0;
        int             slept = 0;
        int             still_shared = 0;
@@ -1481,13 +1694,13 @@ lck_rw_lock_shared_to_exclusive_success(
                deadline = lck_rw_deadline_for_spin(lck);
 
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
-                            (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
+                            trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
 
                while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
                        lck_rw_lock_pause(istate);
 
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
-                            (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
+                            trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
 
                if ( !still_shared)
                        break;
@@ -1502,10 +1715,11 @@ lck_rw_lock_shared_to_exclusive_success(
                        
                        if (lck->lck_rw_shared_count != 0) {
                                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
-                                            (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
+                                            trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
 
                                lck->lck_w_waiting = TRUE;
 
+                               thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
                                res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
                                lck_interlock_unlock(lck, istate);
 
@@ -1514,7 +1728,7 @@ lck_rw_lock_shared_to_exclusive_success(
                                        slept++;
                                }
                                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
-                                            (int)lck, res, slept, 0, 0);
+                                            trace_lck, res, slept, 0, 0);
                        } else {
                                lck_interlock_unlock(lck, istate);
                                break;
@@ -1539,32 +1753,56 @@ lck_rw_lock_shared_to_exclusive_success(
        return (TRUE);
 }
 
+/*
+ *     Routine:        lck_rw_lock_exclusive_to_shared
+ *     Function:       convert an exclusive hold on 'lock' into a shared one, then call lck_rw_lock_exclusive_to_shared_gen() with the pre-update lock word */
+
+void lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
+{
+       uint32_t        data, prev;
+
+       for ( ; ; ) {
+               data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
+               if (data & LCK_RW_INTERLOCK) {
+                       atomic_exchange_abort();
+                       lck_rw_interlock_spin(lock);    /* wait for interlock to clear */
+                       continue;
+               }
+               data += LCK_RW_SHARED_READER;           /* count ourselves as a reader */
+               if (data & LCK_RW_WANT_UPGRADE)         /* clear WANT_UPGRADE if it is set, otherwise WANT_EXCL */
+                       data &= ~(LCK_RW_WANT_UPGRADE);
+               else
+                       data &= ~(LCK_RW_WANT_EXCL);
+               if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL)))  /* W_WAITING is kept only if PRIV_EXCL was set with it */
+                       data &= ~(LCK_RW_W_WAITING);
+               if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp))
+                       break;
+               cpu_pause();                            /* exchange did not complete: pause, then retry */
+       }
+       lck_rw_lock_exclusive_to_shared_gen(lock, prev);        /* plain call: a void function must not 'return' an expression */
+}
+
 
 /*
- *      Routine:        lck_rw_lock_exclusive_to_shared
+ *      Routine:        lck_rw_lock_exclusive_to_shared_gen
  *     Function:
  *             assembly fast path has already dropped
  *             our exclusive state and bumped lck_rw_shared_count
  *             all we need to do here is determine if anyone
  *             needs to be awakened.
  */
-void
+static void
 lck_rw_lock_exclusive_to_shared_gen(
        lck_rw_t        *lck,
-       int             prior_lock_state)
+       uint32_t        prior_lock_state)
 {
-       lck_rw_t        *fake_lck;
+       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
+       lck_rw_t                *fake_lck;
 
-       /*
-        * prior_lock state is a snapshot of the 1st word of the
-        * lock in question... we'll fake up a pointer to it
-        * and carefully not access anything beyond whats defined
-        * in the first word of a lck_rw_t
-        */
        fake_lck = (lck_rw_t *)&prior_lock_state;
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
-                            (int)lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
+                            trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
 
        /*
         * don't wake up anyone waiting to take the lock exclusively
@@ -1578,7 +1816,7 @@ lck_rw_lock_exclusive_to_shared_gen(
                thread_wakeup(RW_LOCK_READER_EVENT(lck));
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
-                            (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
+                            trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
 
 #if CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
@@ -1603,6 +1841,71 @@ lck_rw_try_lock(
        return(FALSE);
 }
 
+/*
+ *     Routine:        lck_rw_try_lock_shared
+ *     Function:       add a reader count unless WANT_EXCL or WANT_UPGRADE is set; returns TRUE if the shared hold was taken, FALSE if not */
+
+boolean_t lck_rw_try_lock_shared(lck_rw_t *lock)
+{
+       uint32_t        data, prev;
+
+       for ( ; ; ) {
+               data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
+               if (data & LCK_RW_INTERLOCK) {          /* interlock bit set: abort, call lck_rw_interlock_spin(), start over */
+                       atomic_exchange_abort();
+                       lck_rw_interlock_spin(lock);
+                       continue;
+               }
+               if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {  /* the reader count itself is not tested here */
+                       atomic_exchange_abort();
+                       return FALSE;                   /* lock is busy */
+               }
+               data += LCK_RW_SHARED_READER;           /* Increment reader refcount */
+               if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
+                       break;
+               cpu_pause();                            /* exchange did not complete: pause, then retry */
+       }
+       current_thread()->rwlock_count++;       /* counted only once the hold has been taken */
+       /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */
+#if    CONFIG_DTRACE
+       LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
+#endif /* CONFIG_DTRACE */
+       return TRUE;
+}
+
+
+/*
+ *     Routine:        lck_rw_try_lock_exclusive
+ *     Function:       set WANT_EXCL if there are no readers and neither WANT_EXCL nor WANT_UPGRADE is set; returns TRUE if the exclusive hold was taken, FALSE if not */
+
+boolean_t lck_rw_try_lock_exclusive(lck_rw_t *lock)
+{
+       uint32_t        data, prev;
+
+       for ( ; ; ) {
+               data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
+               if (data & LCK_RW_INTERLOCK) {          /* interlock bit set: abort, call lck_rw_interlock_spin(), start over */
+                       atomic_exchange_abort();
+                       lck_rw_interlock_spin(lock);
+                       continue;
+               }
+               if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {     /* any reader, writer or upgrader blocks the attempt */
+                       atomic_exchange_abort();
+                       return FALSE;                           /* can't get it */
+               }
+               data |= LCK_RW_WANT_EXCL;               /* claim the lock for writing */
+               if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
+                       break;
+               cpu_pause();                            /* exchange did not complete: pause, then retry */
+       }
+
+       current_thread()->rwlock_count++;       /* counted only once the hold has been taken */
+#if    CONFIG_DTRACE
+       LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
+#endif /* CONFIG_DTRACE */
+       return TRUE;
+}
+
 
 void
 lck_rw_assert(
@@ -1629,13 +1932,62 @@ lck_rw_assert(
                        return;
                }
                break;
+       case LCK_RW_ASSERT_NOTHELD:
+               if (!(lck->lck_rw_want_write ||
+                         lck->lck_rw_want_upgrade ||
+                         lck->lck_rw_shared_count != 0)) {
+                       return;
+               }
+               break;
        default:
                break;
        }
 
-       panic("rw lock (%p) not held (mode=%u), first word %08x\n", lck, type, *(uint32_t *)lck);
+       panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
+}
+
+/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced: MACH_LDEBUG builds panic, other builds zero the count and call lck_rw_clear_promotion() */
+void
+lck_rw_clear_promotions_x86(thread_t thread)
+{
+#if MACH_LDEBUG
+       /* It's fatal to leave a RW lock locked and return to userspace */
+       panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
+#else
+       /* Paper over the issue: forget the outstanding count, then clear the thread's promotion */
+       thread->rwlock_count = 0;
+       lck_rw_clear_promotion(thread);
+#endif
+}
+
+boolean_t      /* TRUE if the shared hold was dropped and retaken, FALSE if the lock was left untouched */
+lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
+{
+       lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);       /* caller must hold 'lck' shared on entry */
+       /* yield when want_write or want_upgrade is set on the lock, or unconditionally when force_yield is set */
+       if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || force_yield) {
+               lck_rw_unlock_shared(lck);
+               mutex_pause(2);                         /* NOTE(review): presumably a short back-off before retaking the lock -- confirm what the argument 2 selects */
+               lck_rw_lock_shared(lck);                /* the hold is shared again before returning */
+               return TRUE;
+       }
+
+       return FALSE;
+}
+
+/*
+ * Routine: kdp_lck_rw_lock_is_acquired_exclusive
+ * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
+ * Returns: TRUE if want_upgrade or want_write is set and the shared count is zero, else FALSE; the fields are read directly, no interlock is taken here */
+boolean_t
+kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) {
+       if (not_in_kdp) {                       /* refuse to run when not_in_kdp is non-zero */
+               panic("panic: rw lock exclusive check done outside of kernel debugger");
+       }
+       return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
 }
 
+
 #ifdef MUTEX_ZONE
 extern zone_t lck_mtx_zone;
 #endif
@@ -1696,9 +2048,7 @@ lck_mtx_ext_init(
                lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
 
        lck->lck_mtx.lck_mtx_is_ext = 1;
-#if    defined(__x86_64__)
-       lck->lck_mtx.lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
-#endif
+       lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
 }
 
 /*
@@ -1728,9 +2078,7 @@ lck_mtx_init(
                lck->lck_mtx_owner = 0;
                lck->lck_mtx_state = 0;
        }
-#if    defined(__x86_64__)
-       lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
-#endif
+       lck->lck_mtx_pad32 = 0xFFFFFFFF;
        lck_grp_reference(grp);
        lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
 }
@@ -1760,9 +2108,7 @@ lck_mtx_init_ext(
                lck->lck_mtx_owner = 0;
                lck->lck_mtx_state = 0;
        }
-#if    defined(__x86_64__)
-       lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
-#endif
+       lck->lck_mtx_pad32 = 0xFFFFFFFF;
 
        lck_grp_reference(grp);
        lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
@@ -1780,6 +2126,9 @@ lck_mtx_destroy(
        
        if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
                return;
+#if MACH_LDEBUG
+       lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
+#endif
        lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
 
        lck_mtx_lock_mark_destroyed(lck);
@@ -1814,7 +2163,8 @@ lck_mtx_unlock_wakeup_x86 (
        lck_mtx_t       *mutex,
        int             prior_lock_state)
 {
-       lck_mtx_t       fake_lck;
+       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
+       lck_mtx_t               fake_lck;
 
        /*
         * prior_lock state is a snapshot of the 2nd word of the
@@ -1825,14 +2175,13 @@ lck_mtx_unlock_wakeup_x86 (
        fake_lck.lck_mtx_state = prior_lock_state;
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
-                    mutex, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
+                    trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
 
        if (__probable(fake_lck.lck_mtx_waiters)) {
-
                if (fake_lck.lck_mtx_waiters > 1)
-                       thread_wakeup_one_with_pri((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)), fake_lck.lck_mtx_pri);
+                       thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri);
                else
-                       thread_wakeup_one((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)));
+                       thread_wakeup_one(LCK_MTX_EVENT(mutex));
        }
 
        if (__improbable(fake_lck.lck_mtx_promoted)) {
@@ -1851,18 +2200,20 @@ lck_mtx_unlock_wakeup_x86 (
 
                                thread->sched_flags &= ~TH_SFLAG_PROMOTED;
 
-                               if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
+                               if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
+                                       /* Thread still has a RW lock promotion */
+                               } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
                                        KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
-                                                             thread->sched_pri, DEPRESSPRI, 0, mutex, 0);
+                                                             thread->sched_pri, DEPRESSPRI, 0, trace_lck, 0);
 
                                        set_sched_pri(thread, DEPRESSPRI);
                                }
                                else {
-                                       if (thread->priority < thread->sched_pri) {
+                                       if (thread->base_pri < thread->sched_pri) {
                                                KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
-                                                                     thread->sched_pri, thread->priority, 0, mutex, 0);
+                                                                     thread->sched_pri, thread->base_pri, 0, trace_lck, 0);
 
-                                               SCHED(compute_priority)(thread, FALSE);
+                                               thread_recompute_sched_pri(thread, FALSE);
                                        }
                                }
                        }
@@ -1871,7 +2222,7 @@ lck_mtx_unlock_wakeup_x86 (
                }
        }
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
-                    mutex, 0, mutex->lck_mtx_waiters, 0, 0);
+                    trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
 }
 
 
@@ -1889,12 +2240,13 @@ void
 lck_mtx_lock_acquire_x86(
        lck_mtx_t       *mutex)
 {
-       thread_t        thread;
-       integer_t       priority;
-       spl_t           s;
+       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
+       thread_t                thread;
+       integer_t               priority;
+       spl_t                   s;
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
-                    mutex, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
+                    trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
 
        if (mutex->lck_mtx_waiters)
                priority = mutex->lck_mtx_pri;
@@ -1906,14 +2258,16 @@ lck_mtx_lock_acquire_x86(
        if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
 
                KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
-                                     thread->sched_pri, priority, thread->was_promoted_on_wakeup, mutex, 0);
+                                     thread->sched_pri, priority, thread->was_promoted_on_wakeup, trace_lck, 0);
 
                s = splsched();
                thread_lock(thread);
 
-               if (thread->sched_pri < priority)
+               if (thread->sched_pri < priority) {
+                       /* Do not promote past promotion ceiling */
+                       assert(priority <= MAXPRI_PROMOTE);
                        set_sched_pri(thread, priority);
-
+               }
                if (mutex->lck_mtx_promoted == 0) {
                        mutex->lck_mtx_promoted = 1;
                        
@@ -1926,9 +2280,30 @@ lck_mtx_lock_acquire_x86(
                splx(s);
        }
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
-                    mutex, 0, mutex->lck_mtx_waiters, 0, 0);
+                    trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
+}
+
+
+static int     /* returns lck_mtx_ilk_try_lock()'s result; when it is 0 the saved interrupt setting has already been put back */
+lck_mtx_interlock_try_lock(lck_mtx_t *mutex, boolean_t *istate)
+{
+       int             retval;
+
+       *istate = ml_set_interrupts_enabled(FALSE);     /* NOTE(review): presumably returns the previous interrupt state -- it is handed back below and by lck_mtx_interlock_unlock() */
+       retval = lck_mtx_ilk_try_lock(mutex);
+       /* a zero result is treated as "interlock not taken": undo the interrupt change before returning */
+       if (retval == 0)
+               ml_set_interrupts_enabled(*istate);
+
+       return retval;
 }
 
+static void    /* counterpart of lck_mtx_interlock_try_lock(): 'istate' is the value that call stored for the caller */
+lck_mtx_interlock_unlock(lck_mtx_t *mutex, boolean_t istate)
+{               
+       lck_mtx_ilk_unlock(mutex);              /* drop the interlock first ... */
+       ml_set_interrupts_enabled(istate);      /* ... then hand the saved interrupt setting back */
+}
 
 
 /*
@@ -1947,16 +2322,20 @@ int
 lck_mtx_lock_spinwait_x86(
        lck_mtx_t       *mutex)
 {
+       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
        thread_t        holder;
-       uint64_t        deadline;
+       uint64_t        overall_deadline;
+       uint64_t        check_owner_deadline;
+       uint64_t        cur_time;
        int             retval = 1;
        int             loopcount = 0;
 
-
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
-                    mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0);
+                    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
 
-       deadline = mach_absolute_time() + MutexSpin;
+       cur_time = mach_absolute_time();
+       overall_deadline = cur_time + MutexSpin;
+       check_owner_deadline = cur_time;
 
        /*
         * Spin while:
@@ -1971,25 +2350,42 @@ lck_mtx_lock_spinwait_x86(
                        retval = 0;
                        break;
                }
-               if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
+               cur_time = mach_absolute_time();
 
-                       if ( !(holder->machine.specFlags & OnProc) ||
-                            (holder->state & TH_IDLE)) {
-                               if (loopcount == 0)
-                                       retval = 2;
-                               break;
+               if (cur_time >= overall_deadline)
+                       break;
+
+               if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) {
+                       boolean_t       istate;
+
+                       if (lck_mtx_interlock_try_lock(mutex, &istate)) {
+
+                               if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
+
+                                       if ( !(holder->machine.specFlags & OnProc) ||
+                                            (holder->state & TH_IDLE)) {
+
+                                               lck_mtx_interlock_unlock(mutex, istate);
+
+                                               if (loopcount == 0)
+                                                       retval = 2;
+                                               break;
+                                       }
+                               }
+                               lck_mtx_interlock_unlock(mutex, istate);
+
+                               check_owner_deadline = cur_time + (MutexSpin / 4);
                        }
                }
                cpu_pause();
 
                loopcount++;
 
-       } while (mach_absolute_time() < deadline);
-
+       } while (TRUE);
 
 #if    CONFIG_DTRACE
        /*
-        * We've already kept a count via deadline of how long we spun.
+        * We've already kept a count via overall_deadline of how long we spun.
         * If dtrace is active, then we compute backwards to decide how
         * long we spun.
         *
@@ -2000,16 +2396,16 @@ lck_mtx_lock_spinwait_x86(
         */
        if (__probable(mutex->lck_mtx_is_ext == 0)) {
                LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
-                   mach_absolute_time() - (deadline - MutexSpin));
+                       mach_absolute_time() - (overall_deadline - MutexSpin));
        } else {
                LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
-                   mach_absolute_time() - (deadline - MutexSpin));
+                       mach_absolute_time() - (overall_deadline - MutexSpin));
        }
        /* The lockstat acquire event is recorded by the assembly code beneath us. */
 #endif
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
-                    mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, retval, 0);
+                    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
 
        return retval;
 }
@@ -2029,6 +2425,7 @@ void
 lck_mtx_lock_wait_x86 (
        lck_mtx_t       *mutex)
 {
+       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
        thread_t        self = current_thread();
        thread_t        holder;
        integer_t       priority;
@@ -2041,30 +2438,38 @@ lck_mtx_lock_wait_x86 (
        }
 #endif
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
-                    mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
+                    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
 
        priority = self->sched_pri;
 
-       if (priority < self->priority)
-               priority = self->priority;
+       if (priority < self->base_pri)
+               priority = self->base_pri;
        if (priority < BASEPRI_DEFAULT)
                priority = BASEPRI_DEFAULT;
 
+       /* Do not promote past promotion ceiling */
+       priority = MIN(priority, MAXPRI_PROMOTE);
+
        if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri)
                mutex->lck_mtx_pri = priority;
        mutex->lck_mtx_waiters++;
 
        if ( (holder = (thread_t)mutex->lck_mtx_owner) &&
             holder->sched_pri < mutex->lck_mtx_pri ) {
-
                s = splsched();
                thread_lock(holder);
 
+               /* holder priority may have been bumped by another thread
+                * before thread_lock was taken
+                */
                if (holder->sched_pri < mutex->lck_mtx_pri) {
                        KERNEL_DEBUG_CONSTANT(
                                MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
-                               holder->sched_pri, priority, thread_tid(holder), mutex, 0);
-
+                               holder->sched_pri, priority, thread_tid(holder), trace_lck, 0);
+                       /* Assert that we're not altering the priority of a
+                        * thread above the MAXPRI_PROMOTE band
+                        */
+                       assert(holder->sched_pri < MAXPRI_PROMOTE);
                        set_sched_pri(holder, priority);
                        
                        if (mutex->lck_mtx_promoted == 0) {
@@ -2077,14 +2482,15 @@ lck_mtx_lock_wait_x86 (
                thread_unlock(holder);
                splx(s);
        }
-       assert_wait((event_t)(((unsigned int*)mutex)+((sizeof(lck_mtx_t)-1)/sizeof(unsigned int))), THREAD_UNINT);
+       thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
+       assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT);
 
        lck_mtx_ilk_unlock(mutex);
 
        thread_block(THREAD_CONTINUE_NULL);
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
-                    mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
+                    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
 
 #if    CONFIG_DTRACE
        /*
@@ -2103,65 +2509,50 @@ lck_mtx_lock_wait_x86 (
 #endif
 }
 
-
-#if    MACH_KDB
-
-void
-db_show_one_lock(
-       lock_t  *lock)
-{
-       db_printf("Read_count = 0x%x, %swant_upgrade, %swant_write, ",
-                 lock->lck_rw_shared_count,
-                 lock->lck_rw_want_upgrade ? "" : "!",
-                 lock->lck_rw_want_write ? "" : "!");
-       db_printf("%swaiting, %scan_sleep\n", 
-                 (lock->lck_r_waiting || lock->lck_w_waiting) ? "" : "!", 
-                 lock->lck_rw_can_sleep ? "" : "!");
-       db_printf("Interlock:\n");
-       db_show_one_simple_lock((db_expr_t) ((vm_offset_t)simple_lock_addr(lock->lck_rw_interlock)),
-                       TRUE, (db_expr_t)0, (char *)0);
-}
-
 /*
- * Routines to print out simple_locks and mutexes in a nicely-formatted
- * fashion.
+ *      Routine: kdp_lck_mtx_lock_spin_is_acquired
+ *      NOT SAFE: To be used only by kernel debugger to avoid deadlock; panics when not_in_kdp is set.
+ *      Returns: TRUE if lck_mtx_ilocked or lck_mtx_mlocked is set in the given mutex, FALSE otherwise.
  */
-
-const char *simple_lock_labels =       "ENTRY    ILK THREAD   DURATION CALLER";
-
-void
-db_show_one_simple_lock (
-       db_expr_t       addr,
-       boolean_t       have_addr,
-       __unused db_expr_t      count,
-       __unused char           * modif)
+boolean_t
+kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t    *lck)
 {
-       simple_lock_t   saddr = (simple_lock_t) ((vm_offset_t) addr);
+       if (not_in_kdp) {       /* refuse to run outside the kernel debugger */
+               panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
+       }
 
-       if (saddr == (simple_lock_t)0 || !have_addr) {
-               db_error ("No simple_lock\n");
+       if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {     /* state fields are read directly; this routine takes no lock */
+               return TRUE;
         }
-#if    USLOCK_DEBUG
-       else if (saddr->lock_type != USLOCK_TAG)
-               db_error ("Not a simple_lock\n");
-#endif /* USLOCK_DEBUG */
 
-       db_printf ("%s\n", simple_lock_labels);
-       db_print_simple_lock (saddr);
+       return FALSE;
 }
 
 void
-db_print_simple_lock (
-       simple_lock_t   addr)
+kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo) /* fills waitinfo->context and waitinfo->owner from a mutex wait event */
 {
-
-       db_printf ("%08x %3d", addr, *hw_lock_addr(addr->interlock));
-#if    USLOCK_DEBUG
-       db_printf (" %08x", addr->debug.lock_thread);
-       db_printf (" %08x ", addr->debug.duration[1]);
-       db_printsym ((int)addr->debug.lock_pc, DB_STGY_ANY);
-#endif /* USLOCK_DEBUG */
-       db_printf ("\n");
+       lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);  /* presumably the inverse of LCK_MTX_EVENT() used at assert_wait time -- TODO confirm */
+       waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);   /* context is the mutex address passed through VM_KERNEL_UNSLIDE_OR_PERM */
+       thread_t holder   = (thread_t)mutex->lck_mtx_owner;     /* owner field is read directly; this routine takes no lock */
+       waitinfo->owner   = thread_tid(holder); /* owner is reported as the holder's thread id */
 }
 
-#endif /* MACH_KDB */
+void
+kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo) /* fills waitinfo->context and waitinfo->owner from a rw-lock wait event */
+{
+       lck_rw_t *rwlck = NULL;
+       switch(waitinfo->wait_type) {   /* the event-to-lock conversion depends on the kind of wait */
+               case kThreadWaitKernelRWLockRead:
+                       rwlck = READ_EVENT_TO_RWLOCK(event);
+                       break;
+               case kThreadWaitKernelRWLockWrite:
+               case kThreadWaitKernelRWLockUpgrade:    /* write and upgrade waits share the same conversion */
+                       rwlck = WRITE_EVENT_TO_RWLOCK(event);
+                       break;
+               default:        /* any other wait_type is treated as a caller error */
+                       panic("%s was called with an invalid blocking type", __FUNCTION__);
+                       break;
+       }
+       waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);   /* context is the rw-lock address passed through VM_KERNEL_UNSLIDE_OR_PERM */
+       waitinfo->owner = 0;    /* this routine always reports owner as 0 */
+}