]> git.saurik.com Git - apple/xnu.git/blobdiff - osfmk/i386/locks_i386.c
xnu-2782.1.97.tar.gz
[apple/xnu.git] / osfmk / i386 / locks_i386.c
index 8c715d086afaaa46d6e520c20f724f0893e627d2..4dd253e01dea0f8226683c58b155f83bef11fb97 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
  *     Locking primitives implementation
  */
 
-#include <mach_kdb.h>
 #include <mach_ldebug.h>
 
-#include <kern/lock.h>
 #include <kern/locks.h>
 #include <kern/kalloc.h>
 #include <kern/misc_protos.h>
 #include <kern/debug.h>
 #include <string.h>
 
-#if    MACH_KDB
-#include <ddb/db_command.h>
-#include <ddb/db_output.h>
-#include <ddb/db_sym.h>
-#include <ddb/db_print.h>
-#endif /* MACH_KDB */
 #include <i386/machine_routines.h> /* machine_timeout_suspended() */
 #include <machine/machine_cpu.h>
 #include <i386/mp.h>
 
 #include <sys/kdebug.h>
+#include <mach/branch_predicates.h>
 
 /*
  * We need only enough declarations from the BSD-side to be able to
@@ -122,12 +115,6 @@ unsigned int LcksOpts=0;
 
 /* Forwards */
 
-#if    MACH_KDB
-void   db_print_simple_lock(
-                       simple_lock_t   addr);
-#endif /* MACH_KDB */
-
-
 #if    USLOCK_DEBUG
 /*
  *     Perform simple lock checks.
@@ -138,6 +125,7 @@ decl_simple_lock_data(extern , printf_lock)
 decl_simple_lock_data(extern , panic_lock)
 #endif /* USLOCK_DEBUG */
 
+extern unsigned int not_in_kdp;
 
 /*
  *     We often want to know the addresses of the callers
@@ -211,6 +199,7 @@ lck_rw_type_t lck_rw_done_gen(
        lck_rw_t        *lck,
        int             prior_lock_state);
 
+void lck_rw_clear_promotions_x86(thread_t thread);
 
 /*
  *      Routine:        lck_spin_alloc_init
@@ -301,6 +290,19 @@ lck_spin_try_lock(
        return((boolean_t)usimple_lock_try((usimple_lock_t) lck));
 }
 
+/*
+ *      Routine: lck_spin_is_acquired
+ *      NOT SAFE: For kernel debugger use only (avoids deadlock); panics when not_in_kdp is nonzero.
+ *      Returns: TRUE if lck->interlock is nonzero (lock is acquired), FALSE otherwise.
+ */
+boolean_t
+lck_spin_is_acquired(lck_spin_t *lck) {
+       if (not_in_kdp) {
+               panic("panic: spinlock acquired check done outside of kernel debugger");
+       }
+       return (lck->interlock != 0)? TRUE : FALSE;
+}
+
 /*
  *     Initialize a usimple_lock.
  *
@@ -322,14 +324,14 @@ usimple_lock_init(
 volatile uint32_t spinlock_owner_cpu = ~0;
 volatile usimple_lock_t spinlock_timed_out;
 
-static uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
+uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
        uint64_t deadline;
        uint32_t i;
 
        for (i = 0; i < real_ncpus; i++) {
                if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) {
                        spinlock_owner_cpu = i;
-                       if ((uint32_t)cpu_number() == i)
+                       if ((uint32_t) cpu_number() == i)
                                break;
                        cpu_datap(i)->cpu_NMI_acknowledged = FALSE;
                        cpu_NMI_interrupt(i);
@@ -359,19 +361,21 @@ usimple_lock(
 
        OBTAIN_PC(pc);
        USLDBG(usld_lock_pre(l, pc));
-/* Try to get the lock with a timeout */
-       if(!hw_lock_to(&l->interlock, LockTimeOutTSC))  {
+
+       if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0))        {
                boolean_t uslock_acquired = FALSE;
                while (machine_timeout_suspended()) {
                        enable_preemption();
                        if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)))
                                break;
-       }
+               }
+
                if (uslock_acquired == FALSE) {
                        uint32_t lock_cpu;
+                       uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
                        spinlock_timed_out = l;
-                       lock_cpu = spinlock_timeout_NMI((uintptr_t)l->interlock.lock_data);
-                       panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x", l, (uintptr_t)l->interlock.lock_data, current_thread(), lock_cpu);
+                       lock_cpu = spinlock_timeout_NMI(lowner);
+                       panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner,  current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data);
                }
        }
        USLDBG(usld_lock_post(l, pc));
@@ -488,10 +492,9 @@ usld_lock_common_checks(
        if (l == USIMPLE_LOCK_NULL)
                panic("%s:  null lock pointer", caller);
        if (l->lock_type != USLOCK_TAG)
-               panic("%s:  0x%p is not a usimple lock", caller, l);
+               panic("%s:  %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
        if (!(l->debug.state & USLOCK_INIT))
-               panic("%s:  %p is not an initialized lock",
-                     caller, l);
+               panic("%s:  %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
        return USLOCK_CHECKING(l);
 }
 
@@ -698,126 +701,6 @@ usl_trace(
 
 #endif /* USLOCK_DEBUG */
 
-/*
- *     Routine:        lock_alloc
- *     Function:
- *             Allocate a lock for external users who cannot
- *             hard-code the structure definition into their
- *             objects.
- *             For now just use kalloc, but a zone is probably
- *             warranted.
- */
-lock_t *
-lock_alloc(
-       boolean_t       can_sleep,
-       unsigned short  tag,
-       unsigned short  tag1)
-{
-       lock_t          *l;
-
-       if ((l = (lock_t *)kalloc(sizeof(lock_t))) != 0)
-         lock_init(l, can_sleep, tag, tag1);
-       return(l);
-}
-
-/*
- *     Routine:        lock_free
- *     Function:
- *             Free a lock allocated for external users.
- *             For now just use kfree, but a zone is probably
- *             warranted.
- */
-void
-lock_free(
-       lock_t          *l)
-{
-       kfree(l, sizeof(lock_t));
-}
-
-         
-/*
- *     Routine:        lock_init
- *     Function:
- *             Initialize a lock; required before use.
- *             Note that clients declare the "struct lock"
- *             variables and then initialize them, rather
- *             than getting a new one from this module.
- */
-void
-lock_init(
-       lock_t          *l,
-       boolean_t       can_sleep,
-       __unused unsigned short tag,
-       __unused unsigned short tag1)
-{
-       hw_lock_byte_init(&l->lck_rw_interlock);
-       l->lck_rw_want_write = FALSE;
-       l->lck_rw_want_upgrade = FALSE;
-       l->lck_rw_shared_count = 0;
-       l->lck_rw_can_sleep = can_sleep;
-       l->lck_rw_tag = tag;
-       l->lck_rw_priv_excl = 1;
-       l->lck_r_waiting = l->lck_w_waiting = 0;
-}
-
-
-/*
- *     Sleep locks.  These use the same data structure and algorithm
- *     as the spin locks, but the process sleeps while it is waiting
- *     for the lock.  These work on uniprocessor systems.
- */
-
-#define DECREMENTER_TIMEOUT 1000000
-
-void
-lock_write(
-       register lock_t * l)
-{
-       lck_rw_lock_exclusive(l);
-}
-
-void
-lock_done(
-       register lock_t * l)
-{
-       (void) lck_rw_done(l);
-}
-
-void
-lock_read(
-       register lock_t * l)
-{
-       lck_rw_lock_shared(l);
-}
-
-
-/*
- *     Routine:        lock_read_to_write
- *     Function:
- *             Improves a read-only lock to one with
- *             write permission.  If another reader has
- *             already requested an upgrade to a write lock,
- *             no lock is held upon return.
- *
- *             Returns FALSE if the upgrade *failed*.
- */
-
-boolean_t
-lock_read_to_write(
-       register lock_t * l)
-{
-       return lck_rw_lock_shared_to_exclusive(l);
-}
-
-void
-lock_write_to_read(
-       register lock_t * l)
-{
-       lck_rw_lock_exclusive_to_shared(l);
-}
-
-
-
 /*
  *      Routine:        lck_rw_alloc_init
  */
@@ -882,6 +765,9 @@ lck_rw_destroy(
 {
        if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
                return;
+#if MACH_LDEBUG
+       lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
+#endif
        lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
        lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
        lck_grp_deallocate(grp);
@@ -903,8 +789,8 @@ lck_rw_destroy(
                ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8))))
 
 /*
- * We need to disable interrupts while holding the mutex interlock
- * to prevent an IPI intervening.
+ * We disable interrupts while holding the RW interlock to prevent an
+ * interrupt from exacerbating hold time.
  * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
  */
 static boolean_t
@@ -1191,6 +1077,8 @@ lck_rw_done_gen(
 {
        lck_rw_t        *fake_lck;
        lck_rw_type_t   lock_type;
+       thread_t        thread;
+       uint32_t        rwlock_count;
 
        /*
         * prior_lock state is a snapshot of the 1st word of the
@@ -1212,6 +1100,19 @@ lck_rw_done_gen(
        else
                lock_type = LCK_RW_TYPE_EXCLUSIVE;
 
+       /* Check if dropping the lock means that we need to unpromote */
+       thread = current_thread();
+       rwlock_count = thread->rwlock_count--;
+#if MACH_LDEBUG
+       if (rwlock_count == 0) {
+               panic("rw lock count underflow for thread %p", thread);
+       }
+#endif
+       if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
+               /* sched_flags checked without lock, but will be rechecked while clearing */
+               lck_rw_clear_promotion(thread);
+       }
+
 #if CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
 #endif
@@ -1407,6 +1308,20 @@ lck_rw_lock_shared_to_exclusive_failure(
        int             prior_lock_state)
 {
        lck_rw_t        *fake_lck;
+       thread_t        thread = current_thread();
+       uint32_t        rwlock_count;
+
+       /* Check if dropping the lock means that we need to unpromote */
+       rwlock_count = thread->rwlock_count--;
+#if MACH_LDEBUG
+       if (rwlock_count == 0) {
+               panic("rw lock count underflow for thread %p", thread);
+       }
+#endif
+       if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
+               /* sched_flags checked without lock, but will be rechecked while clearing */
+               lck_rw_clear_promotion(thread);
+       }
 
        /*
         * prior_lock state is a snapshot of the 1st word of the
@@ -1628,13 +1543,38 @@ lck_rw_assert(
                        return;
                }
                break;
+       case LCK_RW_ASSERT_NOTHELD:
+               if (!(lck->lck_rw_want_write ||
+                         lck->lck_rw_want_upgrade ||
+                         lck->lck_rw_shared_count != 0)) {
+                       return;
+               }
+               break;
        default:
                break;
        }
 
-       panic("rw lock (%p) not held (mode=%u), first word %08x\n", lck, type, *(uint32_t *)lck);
+       panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
+}
+
+/* On return to userspace, this routine is called if the thread's rwlock_count is somehow imbalanced */
+void
+lck_rw_clear_promotions_x86(thread_t thread)
+{
+#if MACH_LDEBUG
+       /* With MACH_LDEBUG: it's fatal to leave a RW lock locked and return to userspace */
+       panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
+#else
+       /* Without MACH_LDEBUG: paper over the issue by zeroing the count and calling lck_rw_clear_promotion() */
+       thread->rwlock_count = 0;
+       lck_rw_clear_promotion(thread);
+#endif
 }
 
+
+#ifdef MUTEX_ZONE
+extern zone_t lck_mtx_zone;
+#endif
 /*
  *      Routine:        lck_mtx_alloc_init
  */
@@ -1644,10 +1584,13 @@ lck_mtx_alloc_init(
        lck_attr_t      *attr)
 {
        lck_mtx_t       *lck;
-
+#ifdef MUTEX_ZONE
+       if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
+               lck_mtx_init(lck, grp, attr);
+#else
        if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
                lck_mtx_init(lck, grp, attr);
-               
+#endif         
        return(lck);
 }
 
@@ -1660,7 +1603,11 @@ lck_mtx_free(
        lck_grp_t       *grp)
 {
        lck_mtx_destroy(lck, grp);
+#ifdef MUTEX_ZONE
+       zfree(lck_mtx_zone, lck);
+#else
        kfree(lck, sizeof(lck_mtx_t));
+#endif
 }
 
 /*
@@ -1682,9 +1629,10 @@ lck_mtx_ext_init(
        lck->lck_mtx_grp = grp;
 
        if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
-                lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
+               lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
 
-       lck->lck_mtx.lck_mtx_ptr = (void *)LCK_MTX_PTR_EXTENDED;
+       lck->lck_mtx.lck_mtx_is_ext = 1;
+       lck->lck_mtx.lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
 }
 
 /*
@@ -1709,18 +1657,12 @@ lck_mtx_init(
                        lck_mtx_ext_init(lck_ext, grp, lck_attr);       
                        lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
                        lck->lck_mtx_ptr = lck_ext;
-                       lck->lck_mtx_ilocked = 1;
                }
        } else {
                lck->lck_mtx_owner = 0;
-               lck->lck_mtx_ptr = 0;
-               lck->lck_mtx_waiters = 0;
-               lck->lck_mtx_pri = 0;
-               lck->lck_mtx_ilocked = 0;
-               lck->lck_mtx_mlocked = 0;
-               lck->lck_mtx_promoted = 0;
-               lck->lck_mtx_spin = 0;
+               lck->lck_mtx_state = 0;
        }
+       lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
        lck_grp_reference(grp);
        lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
 }
@@ -1746,17 +1688,12 @@ lck_mtx_init_ext(
                lck_mtx_ext_init(lck_ext, grp, lck_attr);
                lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
                lck->lck_mtx_ptr = lck_ext;
-               lck->lck_mtx_ilocked = 1;
        } else {
                lck->lck_mtx_owner = 0;
-               lck->lck_mtx_ptr = 0;
-               lck->lck_mtx_waiters = 0;
-               lck->lck_mtx_pri = 0;
-               lck->lck_mtx_ilocked = 0;
-               lck->lck_mtx_mlocked = 0;
-               lck->lck_mtx_promoted = 0;
-               lck->lck_mtx_spin = 0;
+               lck->lck_mtx_state = 0;
        }
+       lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
+
        lck_grp_reference(grp);
        lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
 }
@@ -1773,6 +1710,9 @@ lck_mtx_destroy(
        
        if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
                return;
+#if MACH_LDEBUG
+       lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
+#endif
        lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
 
        lck_mtx_lock_mark_destroyed(lck);
@@ -1795,50 +1735,68 @@ lck_mtx_destroy(
 /*
  * Routine:    lck_mtx_unlock_wakeup_x86
  *
- * Invoked on unlock when there is contention.
+ * Invoked on unlock when there is
+ * contention (i.e. the assembly routine sees
+ * that mutex->lck_mtx_waiters != 0 or
+ * that mutex->lck_mtx_promoted != 0)...
  *
+ * neither the mutex nor the interlock is held
  */
 void
 lck_mtx_unlock_wakeup_x86 (
        lck_mtx_t       *mutex,
-       int             owner_was_promoted)
+       int             prior_lock_state)
 {
+       lck_mtx_t       fake_lck;
+
+       /*
+        * prior_lock_state is a snapshot of the 2nd word of the
+        * lock in question... we'll fake up a lock with the bits
+        * copied into place and carefully not access anything
+        * beyond what's defined in the second word of a lck_mtx_t
+        */
+       fake_lck.lck_mtx_state = prior_lock_state;
 
-       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START, (int)mutex, owner_was_promoted, mutex->lck_mtx_waiters, 0, 0);
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
+                    mutex, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
 
-       if (lck_mtx_lock_decr_waiter(mutex))
-               thread_wakeup_one((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)));
+       if (__probable(fake_lck.lck_mtx_waiters)) {
+               if (fake_lck.lck_mtx_waiters > 1)
+                       thread_wakeup_one_with_pri((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)), fake_lck.lck_mtx_pri);
+               else
+                       thread_wakeup_one((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)));
+       }
 
-       if (owner_was_promoted) {
+       if (__improbable(fake_lck.lck_mtx_promoted)) {
                thread_t        thread = current_thread();
 
 
-               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), thread->promotions,
-                            thread->sched_mode & TH_MODE_PROMOTED, 0, 0);
+               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
+                            thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
 
                if (thread->promotions > 0) {
                        spl_t   s = splsched();
 
                        thread_lock(thread);
 
-                       if (--thread->promotions == 0 && (thread->sched_mode & TH_MODE_PROMOTED)) {
+                       if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) {
 
-                               thread->sched_mode &= ~TH_MODE_PROMOTED;
+                               thread->sched_flags &= ~TH_SFLAG_PROMOTED;
 
-                               if (thread->sched_mode & TH_MODE_ISDEPRESSED) {
-                                       KERNEL_DEBUG_CONSTANT(
-                                               MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE,
-                                               thread->sched_pri, DEPRESSPRI, 0, mutex, 0);
+                               if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
+                                       /* Thread still has a RW lock promotion */
+                               } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
+                                       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
+                                                             thread->sched_pri, DEPRESSPRI, 0, mutex, 0);
 
                                        set_sched_pri(thread, DEPRESSPRI);
                                }
                                else {
                                        if (thread->priority < thread->sched_pri) {
-                                               KERNEL_DEBUG_CONSTANT(
-                                                       MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE,
-                                                       thread->sched_pri, thread->priority, 0, mutex, 0);
+                                               KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
+                                                                     thread->sched_pri, thread->priority, 0, mutex, 0);
 
-                                               compute_priority(thread, FALSE);
+                                               SCHED(compute_priority)(thread, FALSE);
                                        }
                                }
                        }
@@ -1846,7 +1804,8 @@ lck_mtx_unlock_wakeup_x86 (
                        splx(s);
                }
        }
-       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END, (int)mutex, 0, mutex->lck_mtx_waiters, 0, 0);
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
+                    mutex, 0, mutex->lck_mtx_waiters, 0, 0);
 }
 
 
@@ -1854,43 +1813,56 @@ lck_mtx_unlock_wakeup_x86 (
  * Routine:    lck_mtx_lock_acquire_x86
  *
  * Invoked on acquiring the mutex when there is
- * contention.
- * mutex is owned...  interlock is not held
+ * contention (i.e. the assembly routine sees
+ * that mutex->lck_mtx_waiters != 0 or
+ * thread->was_promoted_on_wakeup != 0)...
+ *
+ * mutex is owned...  interlock is held... preemption is disabled
  */
 void
 lck_mtx_lock_acquire_x86(
        lck_mtx_t       *mutex)
 {
-       thread_t        thread = current_thread();
+       thread_t        thread;
        integer_t       priority;
+       spl_t           s;
 
-       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START, (int)mutex, 0, mutex->lck_mtx_waiters, 0, 0);
-
-       priority = lck_mtx_lock_get_pri(mutex);
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
+                    mutex, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
 
-       if (thread->sched_pri < priority) {
+       if (mutex->lck_mtx_waiters)
+               priority = mutex->lck_mtx_pri;
+       else
+               priority = 0;
 
-               if (lck_mtx_lock_mark_promoted(mutex)) {
-                       spl_t   s = splsched();
+       thread = (thread_t)mutex->lck_mtx_owner;        /* faster then current_thread() */
 
-                       thread_lock(thread);
+       if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
 
-                       if (thread->sched_pri < priority) {
+               KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
+                                     thread->sched_pri, priority, thread->was_promoted_on_wakeup, mutex, 0);
 
-                               KERNEL_DEBUG_CONSTANT(
-                                       MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE,
-                                       thread->sched_pri, priority, 0, mutex, 0);
+               s = splsched();
+               thread_lock(thread);
 
-                               set_sched_pri(thread, priority);
-                       }
+               if (thread->sched_pri < priority) {
+                       /* Do not promote past promotion ceiling */
+                       assert(priority <= MAXPRI_PROMOTE);
+                       set_sched_pri(thread, priority);
+               }
+               if (mutex->lck_mtx_promoted == 0) {
+                       mutex->lck_mtx_promoted = 1;
+                       
                        thread->promotions++;
-                       thread->sched_mode |= TH_MODE_PROMOTED;
-
-                       thread_unlock(thread);
-                       splx(s);
+                       thread->sched_flags |= TH_SFLAG_PROMOTED;
                }
+               thread->was_promoted_on_wakeup = 0;
+               
+               thread_unlock(thread);
+               splx(s);
        }
-       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END, (int)mutex, 0, mutex->lck_mtx_waiters, 0, 0);
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
+                    mutex, 0, mutex->lck_mtx_waiters, 0, 0);
 }
 
 
@@ -1903,6 +1875,9 @@ lck_mtx_lock_acquire_x86(
  * time waiting for the lock to be released.
  *
  * Called with the interlock unlocked.
+ * returns 0 if mutex acquired
+ * returns 1 if we spun
+ * returns 2 if we didn't spin due to the holder not running
  */
 int
 lck_mtx_lock_spinwait_x86(
@@ -1913,9 +1888,9 @@ lck_mtx_lock_spinwait_x86(
        int             retval = 1;
        int             loopcount = 0;
 
-       KERNEL_DEBUG(
-               MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
-               (int)mutex, (int)mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0);
+
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
+                    mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0);
 
        deadline = mach_absolute_time() + MutexSpin;
 
@@ -1928,7 +1903,7 @@ lck_mtx_lock_spinwait_x86(
         *   - we haven't spun for long enough.
         */
        do {
-               if (lck_mtx_lock_grab_mutex(mutex)) {
+               if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
                        retval = 0;
                        break;
                }
@@ -1959,7 +1934,7 @@ lck_mtx_lock_spinwait_x86(
         * penalize only lock groups that have debug/stats enabled
         * with dtrace processing if desired.
         */
-       if (mutex->lck_mtx_ptr != (void *)LCK_MTX_PTR_EXTENDED) {
+       if (__probable(mutex->lck_mtx_is_ext == 0)) {
                LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
                    mach_absolute_time() - (deadline - MutexSpin));
        } else {
@@ -1969,9 +1944,8 @@ lck_mtx_lock_spinwait_x86(
        /* The lockstat acquire event is recorded by the assembly code beneath us. */
 #endif
 
-       KERNEL_DEBUG(
-               MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
-               (int)mutex, (int)mutex->lck_mtx_owner, mutex->lck_mtx_waiters, retval, 0);
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
+                    mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, retval, 0);
 
        return retval;
 }
@@ -1984,7 +1958,8 @@ lck_mtx_lock_spinwait_x86(
  * Invoked in order to wait on contention.
  *
  * Called with the interlock locked and
- * returns it unlocked.
+ * preemption disabled...  
+ * returns it unlocked and with preemption enabled
  */
 void
 lck_mtx_lock_wait_x86 (
@@ -1993,7 +1968,6 @@ lck_mtx_lock_wait_x86 (
        thread_t        self = current_thread();
        thread_t        holder;
        integer_t       priority;
-       integer_t       old_lck_mtx_pri;
        spl_t           s;
 #if    CONFIG_DTRACE
        uint64_t        sleep_start = 0;
@@ -2002,7 +1976,8 @@ lck_mtx_lock_wait_x86 (
                sleep_start = mach_absolute_time();
        }
 #endif
-       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, (int)mutex, (int)mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0);
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
+                    mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
 
        priority = self->sched_pri;
 
@@ -2011,45 +1986,49 @@ lck_mtx_lock_wait_x86 (
        if (priority < BASEPRI_DEFAULT)
                priority = BASEPRI_DEFAULT;
 
-       if (mutex->lck_mtx_waiters == 0)
-               old_lck_mtx_pri = 0;
-       else
-               old_lck_mtx_pri = mutex->lck_mtx_pri;
+       /* Do not promote past promotion ceiling */
+       priority = MIN(priority, MAXPRI_PROMOTE);
 
-       if (old_lck_mtx_pri < priority)
+       if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri)
                mutex->lck_mtx_pri = priority;
+       mutex->lck_mtx_waiters++;
 
-       if ( (holder = (thread_t)mutex->lck_mtx_owner) ) {
-
+       if ( (holder = (thread_t)mutex->lck_mtx_owner) &&
+            holder->sched_pri < mutex->lck_mtx_pri ) {
                s = splsched();
                thread_lock(holder);
 
-               if (holder->sched_pri < priority) {
+               /* holder priority may have been bumped by another thread
+                * before thread_lock was taken
+                */
+               if (holder->sched_pri < mutex->lck_mtx_pri) {
                        KERNEL_DEBUG_CONSTANT(
                                MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
-                               holder->sched_pri, priority, holder, mutex, 0);
-
+                               holder->sched_pri, priority, thread_tid(holder), mutex, 0);
+                       /* Assert that we're not altering the priority of a
+                        * thread above the MAXPRI_PROMOTE band
+                        */
+                       assert(holder->sched_pri < MAXPRI_PROMOTE);
                        set_sched_pri(holder, priority);
                        
                        if (mutex->lck_mtx_promoted == 0) {
                                holder->promotions++;
-                               holder->sched_mode |= TH_MODE_PROMOTED;
-
+                               holder->sched_flags |= TH_SFLAG_PROMOTED;
+                               
                                mutex->lck_mtx_promoted = 1;
                        }
                }
                thread_unlock(holder);
                splx(s);
        }
-       mutex->lck_mtx_waiters++;
-
        assert_wait((event_t)(((unsigned int*)mutex)+((sizeof(lck_mtx_t)-1)/sizeof(unsigned int))), THREAD_UNINT);
 
        lck_mtx_ilk_unlock(mutex);
 
        thread_block(THREAD_CONTINUE_NULL);
 
-       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, (int)mutex, (int)mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0);
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
+                    mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
 
 #if    CONFIG_DTRACE
        /*
@@ -2057,7 +2036,7 @@ lck_mtx_lock_wait_x86 (
         * measured from when we were entered.
         */
        if (sleep_start) {
-               if (mutex->lck_mtx_ptr != (void *)LCK_MTX_PTR_EXTENDED) {
+               if (mutex->lck_mtx_is_ext == 0) {
                        LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
                            mach_absolute_time() - sleep_start);
                } else {
@@ -2067,66 +2046,3 @@ lck_mtx_lock_wait_x86 (
        }
 #endif
 }
-
-
-#if    MACH_KDB
-
-void
-db_show_one_lock(
-       lock_t  *lock)
-{
-       db_printf("Read_count = 0x%x, %swant_upgrade, %swant_write, ",
-                 lock->lck_rw_shared_count,
-                 lock->lck_rw_want_upgrade ? "" : "!",
-                 lock->lck_rw_want_write ? "" : "!");
-       db_printf("%swaiting, %scan_sleep\n", 
-                 (lock->lck_r_waiting || lock->lck_w_waiting) ? "" : "!", 
-                 lock->lck_rw_can_sleep ? "" : "!");
-       db_printf("Interlock:\n");
-       db_show_one_simple_lock((db_expr_t) ((vm_offset_t)simple_lock_addr(lock->lck_rw_interlock)),
-                       TRUE, (db_expr_t)0, (char *)0);
-}
-
-/*
- * Routines to print out simple_locks and mutexes in a nicely-formatted
- * fashion.
- */
-
-const char *simple_lock_labels =       "ENTRY    ILK THREAD   DURATION CALLER";
-
-void
-db_show_one_simple_lock (
-       db_expr_t       addr,
-       boolean_t       have_addr,
-       __unused db_expr_t      count,
-       __unused char           * modif)
-{
-       simple_lock_t   saddr = (simple_lock_t) ((vm_offset_t) addr);
-
-       if (saddr == (simple_lock_t)0 || !have_addr) {
-               db_error ("No simple_lock\n");
-       }
-#if    USLOCK_DEBUG
-       else if (saddr->lock_type != USLOCK_TAG)
-               db_error ("Not a simple_lock\n");
-#endif /* USLOCK_DEBUG */
-
-       db_printf ("%s\n", simple_lock_labels);
-       db_print_simple_lock (saddr);
-}
-
-void
-db_print_simple_lock (
-       simple_lock_t   addr)
-{
-
-       db_printf ("%08x %3d", addr, *hw_lock_addr(addr->interlock));
-#if    USLOCK_DEBUG
-       db_printf (" %08x", addr->debug.lock_thread);
-       db_printf (" %08x ", addr->debug.duration[1]);
-       db_printsym ((int)addr->debug.lock_pc, DB_STGY_ANY);
-#endif /* USLOCK_DEBUG */
-       db_printf ("\n");
-}
-
-#endif /* MACH_KDB */