X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/2d21ac55c334faf3a56e5634905ed6987fc787d4..fe8ab488e9161c46dd9885d58fc52996dc0249ff:/osfmk/i386/locks_i386.c?ds=sidebyside diff --git a/osfmk/i386/locks_i386.c b/osfmk/i386/locks_i386.c index 38d332b00..4dd253e01 100644 --- a/osfmk/i386/locks_i386.c +++ b/osfmk/i386/locks_i386.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -61,10 +61,8 @@ * Locking primitives implementation */ -#include #include -#include #include #include #include @@ -77,16 +75,12 @@ #include #include -#if MACH_KDB -#include -#include -#include -#include -#endif /* MACH_KDB */ - -#include +#include /* machine_timeout_suspended() */ +#include +#include #include +#include /* * We need only enough declarations from the BSD-side to be able to @@ -105,24 +99,22 @@ #define LCK_RW_LCK_SH_TO_EX1_CODE 0x104 #define LCK_RW_LCK_EX_TO_SH_CODE 0x105 -#define LCK_MTX_LCK_SPIN 0x200 +#define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106 +#define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107 +#define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108 +#define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109 +#define LCK_RW_LCK_SHARED_SPIN_CODE 0x110 +#define LCK_RW_LCK_SHARED_WAIT_CODE 0x111 +#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112 +#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113 + #define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG) unsigned int LcksOpts=0; -unsigned int lock_wait_time[2] = { (unsigned int)-1, 0 } ; /* Forwards */ -#if MACH_KDB -void db_print_simple_lock( - simple_lock_t addr); - -void db_print_mutex( - mutex_t * addr); -#endif /* MACH_KDB */ - - #if USLOCK_DEBUG /* * Perform simple lock checks. @@ -131,11 +123,9 @@ int uslock_check = 1; int max_lock_loops = 100000000; decl_simple_lock_data(extern , printf_lock) decl_simple_lock_data(extern , panic_lock) -#if MACH_KDB -decl_simple_lock_data(extern , kdb_lock) -#endif /* MACH_KDB */ #endif /* USLOCK_DEBUG */ +extern unsigned int not_in_kdp; /* * We often want to know the addresses of the callers @@ -146,7 +136,7 @@ typedef void *pc_t; #define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS) #define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS) #if ANY_LOCK_DEBUG -#define OBTAIN_PC(pc,l) ((pc) = (void *) GET_RETURN_PC(&(l))) +#define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC()) #define DECL_PC(pc) pc_t pc; #else /* ANY_LOCK_DEBUG */ #define DECL_PC(pc) @@ -154,9 +144,9 @@ typedef void *pc_t; /* * Eliminate lint complaints about unused local pc variables. 
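lck_spin_destroy() above poisons a torn-down spin lock by writing LCK_SPIN_TAG_DESTROYED into its interlock word, and lck_mtx_destroy() later in this file does the analogous thing through lck_mtx_lock_mark_destroyed(), so a later lock or unlock of freed memory is caught instead of silently corrupting state. The fragment below is an illustrative user-space analog of that poisoning idiom, not XNU code; the type, sentinel value, and names are hypothetical.

/* Illustrative analog of the LCK_*_TAG_DESTROYED poisoning idiom (not XNU code). */
#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

#define DEMO_LOCK_DESTROYED 0xDEADu          /* hypothetical sentinel value */

typedef struct { _Atomic uint32_t word; } demo_lock_t;

static void demo_lock_destroy(demo_lock_t *l)
{
    /* Idempotent: destroying an already-poisoned lock is a no-op. */
    if (atomic_load(&l->word) == DEMO_LOCK_DESTROYED)
        return;
    atomic_store(&l->word, DEMO_LOCK_DESTROYED);
}

static void demo_lock_acquire(demo_lock_t *l)
{
    uint32_t expected = 0;

    /* Using a destroyed lock is a programming error; fail loudly. */
    assert(atomic_load(&l->word) != DEMO_LOCK_DESTROYED);

    while (!atomic_compare_exchange_weak(&l->word, &expected, 1u))
        expected = 0;
}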
*/ -#define OBTAIN_PC(pc,l) ++pc +#define OBTAIN_PC(pc) ++pc #else /* lint */ -#define OBTAIN_PC(pc,l) +#define OBTAIN_PC(pc) #endif /* lint */ #endif /* USLOCK_DEBUG */ @@ -178,6 +168,12 @@ int usld_lock_common_checks(usimple_lock_t, char *); #define USLDBG(stmt) #endif /* USLOCK_DEBUG */ + +extern int lck_rw_grab_want(lck_rw_t *lck); +extern int lck_rw_grab_shared(lck_rw_t *lck); +extern int lck_rw_held_read_or_upgrade(lck_rw_t *lck); + + /* * Forward definitions */ @@ -185,9 +181,26 @@ int usld_lock_common_checks(usimple_lock_t, char *); void lck_rw_lock_shared_gen( lck_rw_t *lck); -lck_rw_type_t lck_rw_done_gen( +void lck_rw_lock_exclusive_gen( + lck_rw_t *lck); + +boolean_t lck_rw_lock_shared_to_exclusive_success( lck_rw_t *lck); +boolean_t lck_rw_lock_shared_to_exclusive_failure( + lck_rw_t *lck, + int prior_lock_state); + +void lck_rw_lock_exclusive_to_shared_gen( + lck_rw_t *lck, + int prior_lock_state); + +lck_rw_type_t lck_rw_done_gen( + lck_rw_t *lck, + int prior_lock_state); + +void lck_rw_clear_promotions_x86(thread_t thread); + /* * Routine: lck_spin_alloc_init */ @@ -238,9 +251,9 @@ lck_spin_destroy( lck_spin_t *lck, lck_grp_t *grp) { - if (lck->lck_spin_data[0] == LCK_SPIN_TAG_DESTROYED) + if (lck->interlock == LCK_SPIN_TAG_DESTROYED) return; - lck->lck_spin_data[0] = LCK_SPIN_TAG_DESTROYED; + lck->interlock = LCK_SPIN_TAG_DESTROYED; lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN); lck_grp_deallocate(grp); return; @@ -277,6 +290,19 @@ lck_spin_try_lock( return((boolean_t)usimple_lock_try((usimple_lock_t) lck)); } +/* + * Routine: lck_spin_is_acquired + * NOT SAFE: To be used only by kernel debugger to avoid deadlock. + * Returns: TRUE if lock is acquired. + */ +boolean_t +lck_spin_is_acquired(lck_spin_t *lck) { + if (not_in_kdp) { + panic("panic: spinlock acquired check done outside of kernel debugger"); + } + return (lck->interlock != 0)? TRUE : FALSE; +} + /* * Initialize a usimple_lock. * @@ -295,6 +321,29 @@ usimple_lock_init( #endif } +volatile uint32_t spinlock_owner_cpu = ~0; +volatile usimple_lock_t spinlock_timed_out; + +uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) { + uint64_t deadline; + uint32_t i; + + for (i = 0; i < real_ncpus; i++) { + if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) { + spinlock_owner_cpu = i; + if ((uint32_t) cpu_number() == i) + break; + cpu_datap(i)->cpu_NMI_acknowledged = FALSE; + cpu_NMI_interrupt(i); + deadline = mach_absolute_time() + (LockTimeOut * 2); + while (mach_absolute_time() < deadline && cpu_datap(i)->cpu_NMI_acknowledged == FALSE) + cpu_pause(); + break; + } + } + + return spinlock_owner_cpu; +} /* * Acquire a usimple_lock. 
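The new spinlock_timeout_NMI() above, together with the timeout path added to usimple_lock() just below, turns an unbounded spin into a bounded one: spin against a TSC deadline, and if it expires, record the apparent owner, NMI the CPU it is running on, and panic with the lock and owner state. The following is a minimal user-space analog of that "spin with a deadline, then report the recorded owner" shape; the names are hypothetical, it is not the kernel implementation, and it cannot deliver an NMI, so it simply prints and aborts.

/* Illustrative analog (not XNU code) of the timed spin-acquire path. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

typedef struct {
    atomic_uintptr_t owner;          /* 0 when free, owning thread id when held */
} demo_spinlock_t;

static uint64_t demo_now_ns(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

static void demo_spin_lock(demo_spinlock_t *l, uintptr_t self, uint64_t timeout_ns)
{
    uint64_t deadline = demo_now_ns() + timeout_ns;
    uintptr_t expected = 0;

    while (!atomic_compare_exchange_weak(&l->owner, &expected, self)) {
        expected = 0;
        if (demo_now_ns() > deadline) {
            /* This is where usimple_lock() would call spinlock_timeout_NMI()
             * to interrupt the owner's CPU, then panic with the state. */
            fprintf(stderr, "spinlock timeout: lock=%p owner=0x%lx\n",
                    (void *)l, (unsigned long)atomic_load(&l->owner));
            abort();
        }
    }
}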
@@ -310,12 +359,25 @@ usimple_lock( #ifndef MACHINE_SIMPLE_LOCK DECL_PC(pc); - OBTAIN_PC(pc, l); + OBTAIN_PC(pc); USLDBG(usld_lock_pre(l, pc)); - if(!hw_lock_to(&l->interlock, LockTimeOutTSC)) /* Try to get the lock with a timeout */ - panic("simple lock deadlock detection: lock=%p, cpu=%d, owning thread=0x%x", l, cpu_number(), l->interlock.lock_data); + if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0)) { + boolean_t uslock_acquired = FALSE; + while (machine_timeout_suspended()) { + enable_preemption(); + if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC))) + break; + } + if (uslock_acquired == FALSE) { + uint32_t lock_cpu; + uintptr_t lowner = (uintptr_t)l->interlock.lock_data; + spinlock_timed_out = l; + lock_cpu = spinlock_timeout_NMI(lowner); + panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data); + } + } USLDBG(usld_lock_post(l, pc)); #else simple_lock((simple_lock_t)l); @@ -337,7 +399,7 @@ usimple_unlock( #ifndef MACHINE_SIMPLE_LOCK DECL_PC(pc); - OBTAIN_PC(pc, l); + OBTAIN_PC(pc); USLDBG(usld_unlock(l, pc)); hw_lock_unlock(&l->interlock); #else @@ -366,7 +428,7 @@ usimple_lock_try( unsigned int success; DECL_PC(pc); - OBTAIN_PC(pc, l); + OBTAIN_PC(pc); USLDBG(usld_lock_try_pre(l, pc)); if ((success = hw_lock_try(&l->interlock))) { USLDBG(usld_lock_try_post(l, pc)); @@ -430,10 +492,9 @@ usld_lock_common_checks( if (l == USIMPLE_LOCK_NULL) panic("%s: null lock pointer", caller); if (l->lock_type != USLOCK_TAG) - panic("%s: 0x%x is not a usimple lock", caller, (integer_t) l); + panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type); if (!(l->debug.state & USLOCK_INIT)) - panic("%s: 0x%x is not an initialized lock", - caller, (integer_t) l); + panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state); return USLOCK_CHECKING(l); } @@ -495,11 +556,11 @@ usld_lock_post( return; if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) - panic("%s: lock 0x%x became uninitialized", - caller, (integer_t) l); + panic("%s: lock %p became uninitialized", + caller, l); if ((l->debug.state & USLOCK_TAKEN)) - panic("%s: lock 0x%x became TAKEN by someone else", - caller, (integer_t) l); + panic("%s: lock 0x%p became TAKEN by someone else", + caller, l); mycpu = cpu_number(); l->debug.lock_thread = (void *)current_thread(); @@ -534,14 +595,14 @@ usld_unlock( mycpu = cpu_number(); if (!(l->debug.state & USLOCK_TAKEN)) - panic("%s: lock 0x%x hasn't been taken", - caller, (integer_t) l); + panic("%s: lock 0x%p hasn't been taken", + caller, l); if (l->debug.lock_thread != (void *) current_thread()) - panic("%s: unlocking lock 0x%x, owned by thread %p", - caller, (integer_t) l, l->debug.lock_thread); + panic("%s: unlocking lock 0x%p, owned by thread %p", + caller, l, l->debug.lock_thread); if (l->debug.lock_cpu != mycpu) { - printf("%s: unlocking lock 0x%x on cpu 0x%x", - caller, (integer_t) l, mycpu); + printf("%s: unlocking lock 0x%p on cpu 0x%x", + caller, l, mycpu); printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu); panic("%s", caller); } @@ -596,11 +657,11 @@ usld_lock_try_post( return; if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) - panic("%s: lock 0x%x became uninitialized", - caller, (integer_t) l); + panic("%s: lock 0x%p became uninitialized", + caller, l); if ((l->debug.state & USLOCK_TAKEN)) - panic("%s: lock 0x%x became TAKEN by someone 
else", - caller, (integer_t) l); + panic("%s: lock 0x%p became TAKEN by someone else", + caller, l); mycpu = cpu_number(); l->debug.lock_thread = (void *) current_thread(); @@ -631,8 +692,8 @@ usl_trace( if (traced_lock == l) { XPR(XPR_SLOCK, "seq %d, cpu %d, %s @ %x\n", - (integer_t) lock_seq, (integer_t) mycpu, - (integer_t) op_name, (integer_t) pc, 0); + (uintptr_t) lock_seq, (uintptr_t) mycpu, + (uintptr_t) op_name, (uintptr_t) pc, 0); lock_seq++; } } @@ -640,125 +701,6 @@ usl_trace( #endif /* USLOCK_DEBUG */ -/* - * Routine: lock_alloc - * Function: - * Allocate a lock for external users who cannot - * hard-code the structure definition into their - * objects. - * For now just use kalloc, but a zone is probably - * warranted. - */ -lock_t * -lock_alloc( - boolean_t can_sleep, - unsigned short tag, - unsigned short tag1) -{ - lock_t *l; - - if ((l = (lock_t *)kalloc(sizeof(lock_t))) != 0) - lock_init(l, can_sleep, tag, tag1); - return(l); -} - -/* - * Routine: lock_free - * Function: - * Free a lock allocated for external users. - * For now just use kfree, but a zone is probably - * warranted. - */ -void -lock_free( - lock_t *l) -{ - kfree(l, sizeof(lock_t)); -} - - -/* - * Routine: lock_init - * Function: - * Initialize a lock; required before use. - * Note that clients declare the "struct lock" - * variables and then initialize them, rather - * than getting a new one from this module. - */ -void -lock_init( - lock_t *l, - boolean_t can_sleep, - __unused unsigned short tag, - __unused unsigned short tag1) -{ - hw_lock_byte_init(&l->lck_rw_interlock); - l->lck_rw_want_write = FALSE; - l->lck_rw_want_upgrade = FALSE; - l->lck_rw_shared_count = 0; - l->lck_rw_can_sleep = can_sleep; - l->lck_rw_tag = tag; - l->lck_rw_priv_excl = 1; -} - - -/* - * Sleep locks. These use the same data structure and algorithm - * as the spin locks, but the process sleeps while it is waiting - * for the lock. These work on uniprocessor systems. - */ - -#define DECREMENTER_TIMEOUT 1000000 - -void -lock_write( - register lock_t * l) -{ - lck_rw_lock_exclusive(l); -} - -void -lock_done( - register lock_t * l) -{ - (void) lck_rw_done(l); -} - -void -lock_read( - register lock_t * l) -{ - lck_rw_lock_shared(l); -} - - -/* - * Routine: lock_read_to_write - * Function: - * Improves a read-only lock to one with - * write permission. If another reader has - * already requested an upgrade to a write lock, - * no lock is held upon return. - * - * Returns FALSE if the upgrade *failed*. 
- */ - -boolean_t -lock_read_to_write( - register lock_t * l) -{ - return lck_rw_lock_shared_to_exclusive(l); -} - -void -lock_write_to_read( - register lock_t * l) -{ - lck_rw_lock_exclusive_to_shared(l); -} - - - /* * Routine: lck_rw_alloc_init */ @@ -768,9 +710,11 @@ lck_rw_alloc_init( lck_attr_t *attr) { lck_rw_t *lck; - if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) + if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) { + bzero(lck, sizeof(lck_rw_t)); lck_rw_init(lck, grp, attr); - + } + return(lck); } @@ -802,6 +746,7 @@ lck_rw_init( lck->lck_rw_want_upgrade = FALSE; lck->lck_rw_shared_count = 0; lck->lck_rw_can_sleep = TRUE; + lck->lck_r_waiting = lck->lck_w_waiting = 0; lck->lck_rw_tag = 0; lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val & LCK_ATTR_RW_SHARED_PRIORITY) == 0); @@ -816,9 +761,13 @@ lck_rw_init( void lck_rw_destroy( lck_rw_t *lck, - lck_grp_t *grp) { + lck_grp_t *grp) +{ if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) return; +#if MACH_LDEBUG + lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD); +#endif lck->lck_rw_tag = LCK_RW_TAG_DESTROYED; lck_grp_lckcnt_decr(grp, LCK_TYPE_RW); lck_grp_deallocate(grp); @@ -840,8 +789,8 @@ lck_rw_destroy( ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8)))) /* - * We need to disable interrupts while holding the mutex interlock - * to prevent an IPI intervening. + * We disable interrupts while holding the RW interlock to prevent an + * interrupt from exacerbating hold time. * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock(). */ static boolean_t @@ -876,92 +825,132 @@ lck_rw_lock_pause(boolean_t interrupts_enabled) cpu_pause(); } + +/* + * compute the deadline to spin against when + * waiting for a change of state on a lck_rw_t + */ +static inline uint64_t +lck_rw_deadline_for_spin(lck_rw_t *lck) +{ + if (lck->lck_rw_can_sleep) { + if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) { + /* + * there are already threads waiting on this lock... this + * implies that they have spun beyond their deadlines waiting for + * the desired state to show up so we will not bother spinning at this time... + * or + * the current number of threads sharing this lock exceeds our capacity to run them + * concurrently and since all states we're going to spin for require the rw_shared_count + * to be at 0, we'll not bother spinning since the latency for this to happen is + * unpredictable... + */ + return (mach_absolute_time()); + } + return (mach_absolute_time() + MutexSpin); + } else + return (mach_absolute_time() + (100000LL * 1000000000LL)); +} + + /* * Routine: lck_rw_lock_exclusive */ void -lck_rw_lock_exclusive( +lck_rw_lock_exclusive_gen( lck_rw_t *lck) { - int i; - wait_result_t res; -#if MACH_LDEBUG - int decrementer; -#endif /* MACH_LDEBUG */ - boolean_t istate; -#if CONFIG_DTRACE - uint64_t wait_interval = 0; - int slept = 0; - int readers_at_sleep; -#endif + uint64_t deadline = 0; + int slept = 0; + int gotlock = 0; + int lockheld = 0; + wait_result_t res = 0; + boolean_t istate = -1; - istate = lck_interlock_lock(lck); #if CONFIG_DTRACE - readers_at_sleep = lck->lck_rw_shared_count; + boolean_t dtrace_ls_initialized = FALSE; + boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE; + uint64_t wait_interval = 0; + int readers_at_sleep = 0; #endif -#if MACH_LDEBUG - decrementer = DECREMENTER_TIMEOUT; -#endif /* MACH_LDEBUG */ - /* * Try to acquire the lck_rw_want_write bit. 
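lck_rw_lock_exclusive_gen() (continuing below) and the other _gen slow paths all follow the same spin-then-block discipline: spin outside the interlock until the deadline from lck_rw_deadline_for_spin() expires, then re-check the same condition under the interlock before assert_wait()/thread_block(), so a wakeup posted between the check and the sleep cannot be lost. Here is a compact user-space analog of that shape, with a pthread mutex/condvar standing in for the interlock and wait event; the names are hypothetical and this is not the kernel code. Only the acquire side is shown; a release path would clear want_write and signal writer_cv under the interlock.

/* Illustrative analog (not XNU code) of the spin-then-block acquire shape. */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

struct demo_rw {
    pthread_mutex_t interlock;    /* stands in for lck_interlock_lock() */
    pthread_cond_t  writer_cv;    /* stands in for RW_LOCK_WRITER_EVENT() */
    _Atomic bool    want_write;   /* stands in for lck_rw_want_write */
};

/* now() and spin_ns play the role of mach_absolute_time() and the
 * deadline computed by lck_rw_deadline_for_spin(). */
static void demo_grab_want(struct demo_rw *l, uint64_t (*now)(void), uint64_t spin_ns)
{
    for (;;) {
        uint64_t deadline = now() + spin_ns;

        /* Spin phase: cheap polling, no interlock held. */
        while (l->want_write && now() < deadline)
            ;                                    /* cpu_pause() in the kernel */

        /* Block phase: re-check under the interlock before sleeping. */
        pthread_mutex_lock(&l->interlock);
        if (!l->want_write) {
            l->want_write = true;                /* acquired the want bit */
            pthread_mutex_unlock(&l->interlock);
            return;
        }
        pthread_cond_wait(&l->writer_cv, &l->interlock);  /* release path signals */
        pthread_mutex_unlock(&l->interlock);
    }
}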
*/ - while (lck->lck_rw_want_write) { + while ( !lck_rw_grab_want(lck)) { - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0); - /* - * Either sleeping or spinning is happening, start - * a timing of our delay interval now. - */ #if CONFIG_DTRACE - if ((lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK]) && wait_interval == 0) { - wait_interval = mach_absolute_time(); - } else { - wait_interval = -1; + if (dtrace_ls_initialized == FALSE) { + dtrace_ls_initialized = TRUE; + dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0); + dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0); + dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block; + if (dtrace_ls_enabled) { + /* + * Either sleeping or spinning is happening, + * start a timing of our delay interval now. + */ + readers_at_sleep = lck->lck_rw_shared_count; + wait_interval = mach_absolute_time(); + } } #endif + if (istate == -1) + istate = ml_get_interrupts_enabled(); + + deadline = lck_rw_deadline_for_spin(lck); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0); + + while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline) + lck_rw_lock_pause(istate); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, gotlock, 0); + + if (gotlock) + break; + /* + * if we get here, the deadline has expired w/o us + * being able to grab the lock exclusively + * check to see if we're allowed to do a thread_block + */ + if (lck->lck_rw_can_sleep) { - i = lock_wait_time[lck->lck_rw_can_sleep ? 1 : 0]; - if (i != 0) { - lck_interlock_unlock(lck, istate); -#if MACH_LDEBUG - if (!--decrementer) - Debugger("timeout - lck_rw_want_write"); -#endif /* MACH_LDEBUG */ - while (--i != 0 && lck->lck_rw_want_write) - lck_rw_lock_pause(istate); istate = lck_interlock_lock(lck); - } - if (lck->lck_rw_can_sleep && lck->lck_rw_want_write) { - lck->lck_w_waiting = TRUE; - res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); - if (res == THREAD_WAITING) { - lck_interlock_unlock(lck, istate); - res = thread_block(THREAD_CONTINUE_NULL); -#if CONFIG_DTRACE - slept = 1; -#endif - istate = lck_interlock_lock(lck); - } - } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_END, (int)lck, res, 0, 0, 0); - } - lck->lck_rw_want_write = TRUE; + if (lck->lck_rw_want_write) { - /* Wait for readers (and upgrades) to finish */ + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0); -#if MACH_LDEBUG - decrementer = DECREMENTER_TIMEOUT; -#endif /* MACH_LDEBUG */ - while ((lck->lck_rw_shared_count != 0) || lck->lck_rw_want_upgrade) { + lck->lck_w_waiting = TRUE; - i = lock_wait_time[lck->lck_rw_can_sleep ? 
1 : 0]; + res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); + lck_interlock_unlock(lck, istate); - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_START, - (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, i, 0); + if (res == THREAD_WAITING) { + res = thread_block(THREAD_CONTINUE_NULL); + slept++; + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0); + } else { + lck->lck_rw_want_write = TRUE; + lck_interlock_unlock(lck, istate); + break; + } + } + } + /* + * Wait for readers (and upgrades) to finish... + * the test for these conditions must be done simultaneously with + * a check of the interlock not being held since + * the rw_shared_count will drop to 0 first and then want_upgrade + * will be set to 1 in the shared_to_exclusive scenario... those + * adjustments are done behind the interlock and represent an + * atomic change in state and must be considered as such + * however, once we see the read count at 0, the want_upgrade not set + * and the interlock not held, we are safe to proceed + */ + while (lck_rw_held_read_or_upgrade(lck)) { #if CONFIG_DTRACE /* @@ -970,42 +959,69 @@ lck_rw_lock_exclusive( * to -1 we don't have accurate data so we cannot later * decide to record a dtrace spin or sleep event. */ - if ((lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK]) && wait_interval == 0) { - wait_interval = mach_absolute_time(); - } else { - wait_interval = (unsigned) -1; + if (dtrace_ls_initialized == FALSE) { + dtrace_ls_initialized = TRUE; + dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0); + dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0); + dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block; + if (dtrace_ls_enabled) { + /* + * Either sleeping or spinning is happening, + * start a timing of our delay interval now. 
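The CONFIG_DTRACE blocks being added throughout these contention loops share one lazy pattern: on the first contended pass they check whether the relevant lockstat probes are enabled and only then snapshot readers_at_sleep and a start timestamp, so probe-disabled and uncontended acquisitions pay nothing. A stripped-down analog of that gating follows; the demo_* names are hypothetical stand-ins, and the real code additionally distinguishes the spin probe from the block probe with a slept counter.

/* Illustrative analog (not XNU code) of lazily-enabled lockstat timing. */
#include <stdbool.h>
#include <stdint.h>

enum { DEMO_SPIN_PROBE = 1, DEMO_BLOCK_PROBE = 2 };

/* Trivial stand-ins for lockstat_probemap[] and LOCKSTAT_RECORD(). */
static bool demo_probe_enabled(int probe_id) { (void)probe_id; return true; }
static void demo_record(int probe_id, uint64_t elapsed) { (void)probe_id; (void)elapsed; }
static uint64_t demo_now(void) { static uint64_t fake_clock; return ++fake_clock; }

static void demo_contended_acquire(bool (*try_acquire)(void), void (*wait_once)(void))
{
    bool probes_checked = false, probes_on = false;
    uint64_t start = 0;

    while (!try_acquire()) {
        if (!probes_checked) {              /* first contended pass only */
            probes_checked = true;
            probes_on = demo_probe_enabled(DEMO_SPIN_PROBE) ||
                        demo_probe_enabled(DEMO_BLOCK_PROBE);
            if (probes_on)
                start = demo_now();         /* start timing only if someone listens */
        }
        wait_once();                        /* one round of spinning or blocking */
    }
    if (probes_on)
        demo_record(DEMO_SPIN_PROBE, demo_now() - start);
}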
+ */ + readers_at_sleep = lck->lck_rw_shared_count; + wait_interval = mach_absolute_time(); + } } #endif + if (istate == -1) + istate = ml_get_interrupts_enabled(); + + deadline = lck_rw_deadline_for_spin(lck); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0); + + while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline) + lck_rw_lock_pause(istate); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, lockheld, 0); + + if ( !lockheld) + break; + /* + * if we get here, the deadline has expired w/o us + * being able to grab the lock exclusively + * check to see if we're allowed to do a thread_block + */ + if (lck->lck_rw_can_sleep) { - if (i != 0) { - lck_interlock_unlock(lck, istate); -#if MACH_LDEBUG - if (!--decrementer) - Debugger("timeout - wait for readers"); -#endif /* MACH_LDEBUG */ - while (--i != 0 && (lck->lck_rw_shared_count != 0 || - lck->lck_rw_want_upgrade)) - lck_rw_lock_pause(istate); istate = lck_interlock_lock(lck); - } - if (lck->lck_rw_can_sleep && (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade)) { - lck->lck_w_waiting = TRUE; - res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); - if (res == THREAD_WAITING) { + if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) { + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0); + + lck->lck_w_waiting = TRUE; + + res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); lck_interlock_unlock(lck, istate); - res = thread_block(THREAD_CONTINUE_NULL); -#if CONFIG_DTRACE - slept = 1; -#endif - istate = lck_interlock_lock(lck); + + if (res == THREAD_WAITING) { + res = thread_block(THREAD_CONTINUE_NULL); + slept++; + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0); + } else { + lck_interlock_unlock(lck, istate); + /* + * must own the lock now, since we checked for + * readers or upgrade owner behind the interlock + * no need for a call to 'lck_rw_held_read_or_upgrade' + */ + break; } } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_END, - (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, res, 0); } - lck_interlock_unlock(lck, istate); #if CONFIG_DTRACE /* * Decide what latencies we suffered that are Dtrace events. @@ -1016,7 +1032,7 @@ lck_rw_lock_exclusive( * If we have set wait_interval to -1, then dtrace was not enabled when we * started sleeping/spinning so we don't record this event. */ - if (wait_interval != 0 && wait_interval != (unsigned) -1) { + if (dtrace_ls_enabled == TRUE) { if (slept == 0) { LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 1); @@ -1039,67 +1055,72 @@ lck_rw_lock_exclusive( /* * Routine: lck_rw_done_gen + * + * called from the assembly language wrapper... + * prior_lock_state is the value in the 1st + * word of the lock at the time of a successful + * atomic compare and exchange with the new value... + * it represents the state of the lock before we + * decremented the rw_shared_count or cleared either + * rw_want_upgrade or rw_want_write and + * the lck_x_waiting bits... since the wrapper + * routine has already changed the state atomically, + * we just need to decide if we should + * wake up anyone and what value to return... 
we do + * this by examining the state of the lock before + * we changed it */ lck_rw_type_t lck_rw_done_gen( - lck_rw_t *lck) + lck_rw_t *lck, + int prior_lock_state) { - boolean_t wakeup_readers = FALSE; - boolean_t wakeup_writers = FALSE; - lck_rw_type_t lck_rw_type; - boolean_t istate; - - istate = lck_interlock_lock(lck); - - if (lck->lck_rw_shared_count != 0) { - lck_rw_type = LCK_RW_TYPE_SHARED; - lck->lck_rw_shared_count--; - } - else { - lck_rw_type = LCK_RW_TYPE_EXCLUSIVE; - if (lck->lck_rw_want_upgrade) - lck->lck_rw_want_upgrade = FALSE; - else - lck->lck_rw_want_write = FALSE; - } + lck_rw_t *fake_lck; + lck_rw_type_t lock_type; + thread_t thread; + uint32_t rwlock_count; /* - * There is no reason to wakeup a waiting thread - * if the read-count is non-zero. Consider: - * we must be dropping a read lock - * threads are waiting only if one wants a write lock - * if there are still readers, they can't proceed + * prior_lock state is a snapshot of the 1st word of the + * lock in question... we'll fake up a pointer to it + * and carefully not access anything beyond whats defined + * in the first word of a lck_rw_t */ + fake_lck = (lck_rw_t *)&prior_lock_state; - if (lck->lck_rw_shared_count == 0) { - if (lck->lck_w_waiting) { - lck->lck_w_waiting = FALSE; - wakeup_writers = TRUE; - } - if (!(lck->lck_rw_priv_excl && wakeup_writers == TRUE) && - lck->lck_r_waiting) { - lck->lck_r_waiting = FALSE; - wakeup_readers = TRUE; - } - } - - lck_interlock_unlock(lck, istate); + if (fake_lck->lck_rw_shared_count <= 1) { + if (fake_lck->lck_w_waiting) + thread_wakeup(RW_LOCK_WRITER_EVENT(lck)); - if (wakeup_readers) - thread_wakeup(RW_LOCK_READER_EVENT(lck)); - if (wakeup_writers) - thread_wakeup(RW_LOCK_WRITER_EVENT(lck)); + if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) + thread_wakeup(RW_LOCK_READER_EVENT(lck)); + } + if (fake_lck->lck_rw_shared_count) + lock_type = LCK_RW_TYPE_SHARED; + else + lock_type = LCK_RW_TYPE_EXCLUSIVE; + + /* Check if dropping the lock means that we need to unpromote */ + thread = current_thread(); + rwlock_count = thread->rwlock_count--; +#if MACH_LDEBUG + if (rwlock_count == 0) { + panic("rw lock count underflow for thread %p", thread); + } +#endif + if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { + /* sched_flags checked without lock, but will be rechecked while clearing */ + lck_rw_clear_promotion(thread); + } #if CONFIG_DTRACE - LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE ? 1 : 0)); + LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1); #endif - return(lck_rw_type); + return(lock_type); } - - /* * Routine: lck_rw_unlock */ @@ -1168,82 +1189,98 @@ lck_rw_lock( /* * Routine: lck_rw_lock_shared_gen + * Function: + * assembly fast path code has determined that this lock + * is held exclusively... 
this is where we spin/block + * until we can acquire the lock in the shared mode */ void lck_rw_lock_shared_gen( lck_rw_t *lck) { - int i; - wait_result_t res; -#if MACH_LDEBUG - int decrementer; -#endif /* MACH_LDEBUG */ - boolean_t istate; + uint64_t deadline = 0; + int gotlock = 0; + int slept = 0; + wait_result_t res = 0; + boolean_t istate = -1; + #if CONFIG_DTRACE uint64_t wait_interval = 0; - int slept = 0; - int readers_at_sleep; + int readers_at_sleep = 0; + boolean_t dtrace_ls_initialized = FALSE; + boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE; #endif - istate = lck_interlock_lock(lck); + while ( !lck_rw_grab_shared(lck)) { + #if CONFIG_DTRACE - readers_at_sleep = lck->lck_rw_shared_count; + if (dtrace_ls_initialized == FALSE) { + dtrace_ls_initialized = TRUE; + dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0); + dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0); + dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block; + if (dtrace_ls_enabled) { + /* + * Either sleeping or spinning is happening, + * start a timing of our delay interval now. + */ + readers_at_sleep = lck->lck_rw_shared_count; + wait_interval = mach_absolute_time(); + } + } #endif + if (istate == -1) + istate = ml_get_interrupts_enabled(); -#if MACH_LDEBUG - decrementer = DECREMENTER_TIMEOUT; -#endif /* MACH_LDEBUG */ - while ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) && - ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) { + deadline = lck_rw_deadline_for_spin(lck); - i = lock_wait_time[lck->lck_rw_can_sleep ? 1 : 0]; + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START, + (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_START, - (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, i, 0); -#if CONFIG_DTRACE - if ((lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK]) && wait_interval == 0) { - wait_interval = mach_absolute_time(); - } else { - wait_interval = -1; - } -#endif + while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline) + lck_rw_lock_pause(istate); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END, + (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0); + + if (gotlock) + break; + /* + * if we get here, the deadline has expired w/o us + * being able to grab the lock for read + * check to see if we're allowed to do a thread_block + */ + if (lck->lck_rw_can_sleep) { - if (i != 0) { - lck_interlock_unlock(lck, istate); -#if MACH_LDEBUG - if (!--decrementer) - Debugger("timeout - wait no writers"); -#endif /* MACH_LDEBUG */ - while (--i != 0 && - (lck->lck_rw_want_write || lck->lck_rw_want_upgrade) && - ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) - lck_rw_lock_pause(istate); istate = lck_interlock_lock(lck); - } - if (lck->lck_rw_can_sleep && - (lck->lck_rw_want_write || lck->lck_rw_want_upgrade) && - ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) { - lck->lck_r_waiting = TRUE; - res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT); - if (res == THREAD_WAITING) { + if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) && + ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) { + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | 
DBG_FUNC_START, + (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); + + lck->lck_r_waiting = TRUE; + + res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT); lck_interlock_unlock(lck, istate); - res = thread_block(THREAD_CONTINUE_NULL); -#if CONFIG_DTRACE - slept = 1; -#endif - istate = lck_interlock_lock(lck); + + if (res == THREAD_WAITING) { + res = thread_block(THREAD_CONTINUE_NULL); + slept++; + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END, + (int)lck, res, slept, 0, 0); + } else { + lck->lck_rw_shared_count++; + lck_interlock_unlock(lck, istate); + break; } } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_END, - (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, res, 0); } - lck->lck_rw_shared_count++; - - lck_interlock_unlock(lck, istate); #if CONFIG_DTRACE - if (wait_interval != 0 && wait_interval != (unsigned) -1) { + if (dtrace_ls_enabled == TRUE) { if (slept == 0) { LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0); } else { @@ -1258,114 +1295,151 @@ lck_rw_lock_shared_gen( /* - * Routine: lck_rw_lock_shared_to_exclusive + * Routine: lck_rw_lock_shared_to_exclusive_failure * Function: - * Improves a read-only lock to one with - * write permission. If another reader has - * already requested an upgrade to a write lock, - * no lock is held upon return. - * - * Returns FALSE if the upgrade *failed*. + * assembly fast path code has already dropped our read + * count and determined that someone else owns 'lck_rw_want_upgrade' + * if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting' + * all we need to do here is determine if a wakeup is needed */ - boolean_t -lck_rw_lock_shared_to_exclusive( - lck_rw_t *lck) +lck_rw_lock_shared_to_exclusive_failure( + lck_rw_t *lck, + int prior_lock_state) { - int i; - boolean_t do_wakeup = FALSE; - wait_result_t res; -#if MACH_LDEBUG - int decrementer; -#endif /* MACH_LDEBUG */ - boolean_t istate; -#if CONFIG_DTRACE - uint64_t wait_interval = 0; - int slept = 0; - int readers_at_sleep = 0; + lck_rw_t *fake_lck; + thread_t thread = current_thread(); + uint32_t rwlock_count; + + /* Check if dropping the lock means that we need to unpromote */ + rwlock_count = thread->rwlock_count--; +#if MACH_LDEBUG + if (rwlock_count == 0) { + panic("rw lock count underflow for thread %p", thread); + } #endif + if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { + /* sched_flags checked without lock, but will be rechecked while clearing */ + lck_rw_clear_promotion(thread); + } - istate = lck_interlock_lock(lck); - - lck->lck_rw_shared_count--; - - if (lck->lck_rw_want_upgrade) { - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_START, - (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0); + /* + * prior_lock state is a snapshot of the 1st word of the + * lock in question... we'll fake up a pointer to it + * and carefully not access anything beyond whats defined + * in the first word of a lck_rw_t + */ + fake_lck = (lck_rw_t *)&prior_lock_state; + if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) { /* * Someone else has requested upgrade. - * Since we've released a read lock, wake - * him up. 
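lck_rw_done_gen() and lck_rw_lock_shared_to_exclusive_failure() above both receive prior_lock_state from the assembly fast path, which has already performed the atomic update, and overlay a lck_rw_t pointer on that saved word (the "fake_lck") to read the shared count and waiter bits as they were at the instant of release. The sketch below is an illustrative analog of that snapshot-then-decide idiom using C11 atomics; the bitfield packing and names are hypothetical, not the real lck_rw_t layout.

/* Illustrative analog (not XNU code) of the prior_lock_state snapshot trick. */
#include <stdatomic.h>
#include <stdint.h>

typedef union {
    uint32_t word;
    struct {
        uint32_t shared_count : 16;
        uint32_t want_write   : 1;
        uint32_t want_upgrade : 1;
        uint32_t w_waiting    : 1;
        uint32_t r_waiting    : 1;
    } bits;                        /* hypothetical packing, for illustration only */
} demo_rw_state_t;

static void demo_rw_done(_Atomic uint32_t *state_word)
{
    demo_rw_state_t prior, next;

    prior.word = atomic_load(state_word);
    do {
        next = prior;
        if (next.bits.shared_count)
            next.bits.shared_count--;      /* dropping a read hold */
        else if (next.bits.want_upgrade)
            next.bits.want_upgrade = 0;    /* dropping an upgraded hold */
        else
            next.bits.want_write = 0;      /* dropping a write hold */
        next.bits.w_waiting = 0;
        next.bits.r_waiting = 0;
    } while (!atomic_compare_exchange_weak(state_word, &prior.word, next.word));

    /*
     * 'prior' is the snapshot the real code calls prior_lock_state: the
     * wakeup decision looks at the bits as they were before the swap,
     * e.g. wake a writer only if this was the last (or only) holder.
     */
    if (prior.bits.shared_count <= 1 && prior.bits.w_waiting) {
        /* thread_wakeup(RW_LOCK_WRITER_EVENT(lck)) in the kernel */
    }
}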
+ * Since we've released the read lock, wake + * him up if he's blocked waiting */ - if (lck->lck_w_waiting && (lck->lck_rw_shared_count == 0)) { - lck->lck_w_waiting = FALSE; - do_wakeup = TRUE; - } - - lck_interlock_unlock(lck, istate); + thread_wakeup(RW_LOCK_WRITER_EVENT(lck)); + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE, + (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0); - if (do_wakeup) - thread_wakeup(RW_LOCK_WRITER_EVENT(lck)); + return (FALSE); +} - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_END, - (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0); - return (FALSE); - } +/* + * Routine: lck_rw_lock_shared_to_exclusive_failure + * Function: + * assembly fast path code has already dropped our read + * count and successfully acquired 'lck_rw_want_upgrade' + * we just need to wait for the rest of the readers to drain + * and then we can return as the exclusive holder of this lock + */ +boolean_t +lck_rw_lock_shared_to_exclusive_success( + lck_rw_t *lck) +{ + uint64_t deadline = 0; + int slept = 0; + int still_shared = 0; + wait_result_t res; + boolean_t istate = -1; - lck->lck_rw_want_upgrade = TRUE; +#if CONFIG_DTRACE + uint64_t wait_interval = 0; + int readers_at_sleep = 0; + boolean_t dtrace_ls_initialized = FALSE; + boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE; +#endif -#if MACH_LDEBUG - decrementer = DECREMENTER_TIMEOUT; -#endif /* MACH_LDEBUG */ while (lck->lck_rw_shared_count != 0) { + #if CONFIG_DTRACE - if (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] && wait_interval == 0) { - wait_interval = mach_absolute_time(); - readers_at_sleep = lck->lck_rw_shared_count; - } else { - wait_interval = -1; + if (dtrace_ls_initialized == FALSE) { + dtrace_ls_initialized = TRUE; + dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0); + dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0); + dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block; + if (dtrace_ls_enabled) { + /* + * Either sleeping or spinning is happening, + * start a timing of our delay interval now. + */ + readers_at_sleep = lck->lck_rw_shared_count; + wait_interval = mach_absolute_time(); + } } #endif - i = lock_wait_time[lck->lck_rw_can_sleep ? 
1 : 0]; - - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_START, - (int)lck, lck->lck_rw_shared_count, i, 0, 0); - - if (i != 0) { - lck_interlock_unlock(lck, istate); -#if MACH_LDEBUG - if (!--decrementer) - Debugger("timeout - lck_rw_shared_count"); -#endif /* MACH_LDEBUG */ - while (--i != 0 && lck->lck_rw_shared_count != 0) - lck_rw_lock_pause(istate); + if (istate == -1) + istate = ml_get_interrupts_enabled(); + + deadline = lck_rw_deadline_for_spin(lck); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START, + (int)lck, lck->lck_rw_shared_count, 0, 0, 0); + + while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline) + lck_rw_lock_pause(istate); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END, + (int)lck, lck->lck_rw_shared_count, 0, 0, 0); + + if ( !still_shared) + break; + /* + * if we get here, the deadline has expired w/o + * the rw_shared_count having drained to 0 + * check to see if we're allowed to do a thread_block + */ + if (lck->lck_rw_can_sleep) { + istate = lck_interlock_lock(lck); - } + + if (lck->lck_rw_shared_count != 0) { + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START, + (int)lck, lck->lck_rw_shared_count, 0, 0, 0); - if (lck->lck_rw_can_sleep && lck->lck_rw_shared_count != 0) { - lck->lck_w_waiting = TRUE; - res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); - if (res == THREAD_WAITING) { + lck->lck_w_waiting = TRUE; + + res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); lck_interlock_unlock(lck, istate); - res = thread_block(THREAD_CONTINUE_NULL); -#if CONFIG_DTRACE - slept = 1; -#endif - istate = lck_interlock_lock(lck); + + if (res == THREAD_WAITING) { + res = thread_block(THREAD_CONTINUE_NULL); + slept++; + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END, + (int)lck, res, slept, 0, 0); + } else { + lck_interlock_unlock(lck, istate); + break; } } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_END, - (int)lck, lck->lck_rw_shared_count, 0, 0, 0); } - - lck_interlock_unlock(lck, istate); #if CONFIG_DTRACE /* * We infer whether we took the sleep/spin path above by checking readers_at_sleep. */ - if (wait_interval != 0 && wait_interval != (unsigned) -1 && readers_at_sleep) { + if (dtrace_ls_enabled == TRUE) { if (slept == 0) { LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0); } else { @@ -1374,50 +1448,48 @@ lck_rw_lock_shared_to_exclusive( (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep); } } - LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1); #endif return (TRUE); } + /* * Routine: lck_rw_lock_exclusive_to_shared + * Function: + * assembly fast path has already dropped + * our exclusive state and bumped lck_rw_shared_count + * all we need to do here is determine if anyone + * needs to be awakened. 
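Both lck_rw_done_gen() earlier and lck_rw_lock_exclusive_to_shared_gen() just below apply the same wakeup policy to the snapshotted state: writers are preferred, and waiting readers are woken only when no writer is waiting or the lock was created without writer priority (lck_rw_priv_excl clear). Restated as a small predicate, with hypothetical naming:

#include <stdbool.h>

/*
 * Restatement of the reader-wakeup condition used by the rw slow paths:
 * wake readers iff a reader is waiting and it is NOT the case that
 * (writers have priority AND a writer is waiting).
 */
static bool demo_should_wake_readers(bool priv_excl, bool w_waiting, bool r_waiting)
{
    return r_waiting && !(priv_excl && w_waiting);
}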
*/ void -lck_rw_lock_exclusive_to_shared( - lck_rw_t *lck) +lck_rw_lock_exclusive_to_shared_gen( + lck_rw_t *lck, + int prior_lock_state) { - boolean_t wakeup_readers = FALSE; - boolean_t wakeup_writers = FALSE; - boolean_t istate; + lck_rw_t *fake_lck; - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START, - (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); - - istate = lck_interlock_lock(lck); - - lck->lck_rw_shared_count++; - if (lck->lck_rw_want_upgrade) - lck->lck_rw_want_upgrade = FALSE; - else - lck->lck_rw_want_write = FALSE; - - if (lck->lck_w_waiting) { - lck->lck_w_waiting = FALSE; - wakeup_writers = TRUE; - } - if (!(lck->lck_rw_priv_excl && wakeup_writers == TRUE) && - lck->lck_r_waiting) { - lck->lck_r_waiting = FALSE; - wakeup_readers = TRUE; - } + /* + * prior_lock state is a snapshot of the 1st word of the + * lock in question... we'll fake up a pointer to it + * and carefully not access anything beyond whats defined + * in the first word of a lck_rw_t + */ + fake_lck = (lck_rw_t *)&prior_lock_state; - lck_interlock_unlock(lck, istate); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START, + (int)lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0); - if (wakeup_readers) + /* + * don't wake up anyone waiting to take the lock exclusively + * since we hold a read count... when the read count drops to 0, + * the writers will be woken. + * + * wake up any waiting readers if we don't have any writers waiting, + * or the lock is NOT marked as rw_priv_excl (writers have privilege) + */ + if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) thread_wakeup(RW_LOCK_READER_EVENT(lck)); - if (wakeup_writers) - thread_wakeup(RW_LOCK_WRITER_EVENT(lck)); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END, (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0); @@ -1445,74 +1517,6 @@ lck_rw_try_lock( return(FALSE); } -/* - * Routine: lck_rw_try_lock_exclusive - * Function: - * Tries to get a write lock. - * - * Returns FALSE if the lock is not held on return. - */ - -boolean_t -lck_rw_try_lock_exclusive( - lck_rw_t *lck) -{ - boolean_t istate; - - istate = lck_interlock_lock(lck); - - if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || lck->lck_rw_shared_count) { - /* - * Can't get lock. - */ - lck_interlock_unlock(lck, istate); - return(FALSE); - } - - /* - * Have lock. - */ - - lck->lck_rw_want_write = TRUE; - - lck_interlock_unlock(lck, istate); - -#if CONFIG_DTRACE - LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lck, 1); -#endif - return(TRUE); -} - -/* - * Routine: lck_rw_try_lock_shared - * Function: - * Tries to get a read lock. - * - * Returns FALSE if the lock is not held on return. - */ - -boolean_t -lck_rw_try_lock_shared( - lck_rw_t *lck) -{ - boolean_t istate; - - istate = lck_interlock_lock(lck); -/* No reader priority check here... 
*/ - if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade) { - lck_interlock_unlock(lck, istate); - return(FALSE); - } - - lck->lck_rw_shared_count++; - - lck_interlock_unlock(lck, istate); - -#if CONFIG_DTRACE - LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lck, 0); -#endif - return(TRUE); -} void lck_rw_assert( @@ -1539,13 +1543,38 @@ lck_rw_assert( return; } break; + case LCK_RW_ASSERT_NOTHELD: + if (!(lck->lck_rw_want_write || + lck->lck_rw_want_upgrade || + lck->lck_rw_shared_count != 0)) { + return; + } + break; default: break; } - panic("rw lock (%p) not held (mode=%u)\n", lck, type); + panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck); } +/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */ +void +lck_rw_clear_promotions_x86(thread_t thread) +{ +#if MACH_LDEBUG + /* It's fatal to leave a RW lock locked and return to userspace */ + panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread); +#else + /* Paper over the issue */ + thread->rwlock_count = 0; + lck_rw_clear_promotion(thread); +#endif +} + + +#ifdef MUTEX_ZONE +extern zone_t lck_mtx_zone; +#endif /* * Routine: lck_mtx_alloc_init */ @@ -1555,10 +1584,13 @@ lck_mtx_alloc_init( lck_attr_t *attr) { lck_mtx_t *lck; - +#ifdef MUTEX_ZONE + if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0) + lck_mtx_init(lck, grp, attr); +#else if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0) lck_mtx_init(lck, grp, attr); - +#endif return(lck); } @@ -1571,7 +1603,11 @@ lck_mtx_free( lck_grp_t *grp) { lck_mtx_destroy(lck, grp); +#ifdef MUTEX_ZONE + zfree(lck_mtx_zone, lck); +#else kfree(lck, sizeof(lck_mtx_t)); +#endif } /* @@ -1593,7 +1629,10 @@ lck_mtx_ext_init( lck->lck_mtx_grp = grp; if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) - lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT; + lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT; + + lck->lck_mtx.lck_mtx_is_ext = 1; + lck->lck_mtx.lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF; } /* @@ -1620,11 +1659,10 @@ lck_mtx_init( lck->lck_mtx_ptr = lck_ext; } } else { - lck->lck_mtx_ilk = 0; - lck->lck_mtx_locked = 0; - lck->lck_mtx_waiters = 0; - lck->lck_mtx_pri = 0; + lck->lck_mtx_owner = 0; + lck->lck_mtx_state = 0; } + lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF; lck_grp_reference(grp); lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX); } @@ -1651,11 +1689,11 @@ lck_mtx_init_ext( lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT; lck->lck_mtx_ptr = lck_ext; } else { - lck->lck_mtx_ilk = 0; - lck->lck_mtx_locked = 0; - lck->lck_mtx_waiters = 0; - lck->lck_mtx_pri = 0; + lck->lck_mtx_owner = 0; + lck->lck_mtx_state = 0; } + lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF; + lck_grp_reference(grp); lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX); } @@ -1672,8 +1710,13 @@ lck_mtx_destroy( if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) return; +#if MACH_LDEBUG + lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED); +#endif lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT); - lck->lck_mtx_tag = LCK_MTX_TAG_DESTROYED; + + lck_mtx_lock_mark_destroyed(lck); + if (lck_is_indirect) kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t)); lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX); @@ -1681,51 +1724,205 @@ lck_mtx_destroy( return; } + +#define LCK_MTX_LCK_WAIT_CODE 0x20 +#define LCK_MTX_LCK_WAKEUP_CODE 0x21 +#define LCK_MTX_LCK_SPIN_CODE 0x22 +#define LCK_MTX_LCK_ACQUIRE_CODE 0x23 +#define LCK_MTX_LCK_DEMOTE_CODE 0x24 + + +/* + * Routine: lck_mtx_unlock_wakeup_x86 + * + * Invoked 
on unlock when there is + * contention (i.e. the assembly routine sees that + * that mutex->lck_mtx_waiters != 0 or + * that mutex->lck_mtx_promoted != 0... + * + * neither the mutex or interlock is held + */ +void +lck_mtx_unlock_wakeup_x86 ( + lck_mtx_t *mutex, + int prior_lock_state) +{ + lck_mtx_t fake_lck; + + /* + * prior_lock state is a snapshot of the 2nd word of the + * lock in question... we'll fake up a lock with the bits + * copied into place and carefully not access anything + * beyond whats defined in the second word of a lck_mtx_t + */ + fake_lck.lck_mtx_state = prior_lock_state; + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START, + mutex, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0); + + if (__probable(fake_lck.lck_mtx_waiters)) { + if (fake_lck.lck_mtx_waiters > 1) + thread_wakeup_one_with_pri((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)), fake_lck.lck_mtx_pri); + else + thread_wakeup_one((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int))); + } + + if (__improbable(fake_lck.lck_mtx_promoted)) { + thread_t thread = current_thread(); + + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE, + thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0); + + if (thread->promotions > 0) { + spl_t s = splsched(); + + thread_lock(thread); + + if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) { + + thread->sched_flags &= ~TH_SFLAG_PROMOTED; + + if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) { + /* Thread still has a RW lock promotion */ + } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) { + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE, + thread->sched_pri, DEPRESSPRI, 0, mutex, 0); + + set_sched_pri(thread, DEPRESSPRI); + } + else { + if (thread->priority < thread->sched_pri) { + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE, + thread->sched_pri, thread->priority, 0, mutex, 0); + + SCHED(compute_priority)(thread, FALSE); + } + } + } + thread_unlock(thread); + splx(s); + } + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END, + mutex, 0, mutex->lck_mtx_waiters, 0, 0); +} + + +/* + * Routine: lck_mtx_lock_acquire_x86 + * + * Invoked on acquiring the mutex when there is + * contention (i.e. the assembly routine sees that + * that mutex->lck_mtx_waiters != 0 or + * thread->was_promoted_on_wakeup != 0)... + * + * mutex is owned... interlock is held... 
preemption is disabled + */ +void +lck_mtx_lock_acquire_x86( + lck_mtx_t *mutex) +{ + thread_t thread; + integer_t priority; + spl_t s; + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START, + mutex, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); + + if (mutex->lck_mtx_waiters) + priority = mutex->lck_mtx_pri; + else + priority = 0; + + thread = (thread_t)mutex->lck_mtx_owner; /* faster then current_thread() */ + + if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) { + + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE, + thread->sched_pri, priority, thread->was_promoted_on_wakeup, mutex, 0); + + s = splsched(); + thread_lock(thread); + + if (thread->sched_pri < priority) { + /* Do not promote past promotion ceiling */ + assert(priority <= MAXPRI_PROMOTE); + set_sched_pri(thread, priority); + } + if (mutex->lck_mtx_promoted == 0) { + mutex->lck_mtx_promoted = 1; + + thread->promotions++; + thread->sched_flags |= TH_SFLAG_PROMOTED; + } + thread->was_promoted_on_wakeup = 0; + + thread_unlock(thread); + splx(s); + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END, + mutex, 0, mutex->lck_mtx_waiters, 0, 0); +} + + + /* - * Routine: lck_mtx_lock_spinwait + * Routine: lck_mtx_lock_spinwait_x86 * * Invoked trying to acquire a mutex when there is contention but * the holder is running on another processor. We spin for up to a maximum * time waiting for the lock to be released. * * Called with the interlock unlocked. + * returns 0 if mutex acquired + * returns 1 if we spun + * returns 2 if we didn't spin due to the holder not running */ -void -lck_mtx_lock_spinwait( - lck_mtx_t *lck) +int +lck_mtx_lock_spinwait_x86( + lck_mtx_t *mutex) { - thread_t holder; - volatile lck_mtx_t *mutex; - uint64_t deadline; + thread_t holder; + uint64_t deadline; + int retval = 1; + int loopcount = 0; - if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) - mutex = lck; - else - mutex = &lck->lck_mtx_ptr->lck_mtx; - KERNEL_DEBUG( - MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN) | DBG_FUNC_NONE, - (int)lck, (int)mutex->lck_mtx_locked, 0, 0, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START, + mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0); deadline = mach_absolute_time() + MutexSpin; + /* * Spin while: * - mutex is locked, and - * - its locked as a spin lock, or + * - its locked as a spin lock, and * - owner is running on another processor, and * - owner (processor) is not idling, and * - we haven't spun for long enough. */ - while ((holder = (thread_t) mutex->lck_mtx_locked) != NULL) { - if ((holder == (thread_t)MUTEX_LOCKED_AS_SPIN) || - ((holder->machine.specFlags & OnProc) != 0 && - (holder->state & TH_IDLE) == 0 && - mach_absolute_time() < deadline)) { - cpu_pause(); - continue; + do { + if (__probable(lck_mtx_lock_grab_mutex(mutex))) { + retval = 0; + break; } - break; - } + if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) { + + if ( !(holder->machine.specFlags & OnProc) || + (holder->state & TH_IDLE)) { + if (loopcount == 0) + retval = 2; + break; + } + } + cpu_pause(); + + loopcount++; + + } while (mach_absolute_time() < deadline); + + #if CONFIG_DTRACE /* * We've already kept a count via deadline of how long we spun. @@ -1737,165 +1934,115 @@ lck_mtx_lock_spinwait( * penalize only lock groups that have debug/stats enabled * with dtrace processing if desired. 
*/ - if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) { - LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, lck, + if (__probable(mutex->lck_mtx_is_ext == 0)) { + LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex, mach_absolute_time() - (deadline - MutexSpin)); } else { - LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, lck, + LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex, mach_absolute_time() - (deadline - MutexSpin)); } /* The lockstat acquire event is recorded by the assembly code beneath us. */ #endif -} -/* - * Called from assembly code when a destroyed mutex is detected - * during a lock/unlock/try/convert - */ + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END, + mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, retval, 0); -void -lck_mtx_interlock_panic( - lck_mtx_t *lck) -{ - panic("trying to interlock destroyed mutex %p", lck); + return retval; } -#if MACH_KDB - -void -db_show_one_lock( - lock_t *lock) -{ - db_printf("Read_count = 0x%x, %swant_upgrade, %swant_write, ", - lock->lck_rw_shared_count, - lock->lck_rw_want_upgrade ? "" : "!", - lock->lck_rw_want_write ? "" : "!"); - db_printf("%swaiting, %scan_sleep\n", - (lock->lck_r_waiting || lock->lck_w_waiting) ? "" : "!", - lock->lck_rw_can_sleep ? "" : "!"); - db_printf("Interlock:\n"); - db_show_one_simple_lock((db_expr_t) ((vm_offset_t)simple_lock_addr(lock->lck_rw_interlock)), - TRUE, (db_expr_t)0, (char *)0); -} - -#endif /* MACH_KDB */ /* - * The C portion of the mutex package. These routines are only invoked - * if the optimized assembler routines can't do the work. - */ - -/* - * Routine: lock_alloc - * Function: - * Allocate a mutex for external users who cannot - * hard-code the structure definition into their - * objects. - * For now just use kalloc, but a zone is probably - * warranted. - */ -mutex_t * -mutex_alloc( - unsigned short tag) -{ - mutex_t *m; - - if ((m = (mutex_t *)kalloc(sizeof(mutex_t))) != 0) - mutex_init(m, tag); - return(m); -} - -/* - * Routine: mutex_free - * Function: - * Free a mutex allocated for external users. - * For now just use kfree, but a zone is probably - * warranted. + * Routine: lck_mtx_lock_wait_x86 + * + * Invoked in order to wait on contention. + * + * Called with the interlock locked and + * preemption disabled... + * returns it unlocked and with preemption enabled */ void -mutex_free( - mutex_t *m) +lck_mtx_lock_wait_x86 ( + lck_mtx_t *mutex) { - kfree(m, sizeof(mutex_t)); -} + thread_t self = current_thread(); + thread_t holder; + integer_t priority; + spl_t s; +#if CONFIG_DTRACE + uint64_t sleep_start = 0; + if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) { + sleep_start = mach_absolute_time(); + } +#endif + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, + mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); -#if MACH_KDB -/* - * Routines to print out simple_locks and mutexes in a nicely-formatted - * fashion. 
- */ + priority = self->sched_pri; -const char *simple_lock_labels = "ENTRY ILK THREAD DURATION CALLER"; -const char *mutex_labels = "ENTRY LOCKED WAITERS THREAD CALLER"; + if (priority < self->priority) + priority = self->priority; + if (priority < BASEPRI_DEFAULT) + priority = BASEPRI_DEFAULT; -void -db_show_one_simple_lock ( - db_expr_t addr, - boolean_t have_addr, - __unused db_expr_t count, - __unused char * modif) -{ - simple_lock_t saddr = (simple_lock_t) ((vm_offset_t) addr); + /* Do not promote past promotion ceiling */ + priority = MIN(priority, MAXPRI_PROMOTE); - if (saddr == (simple_lock_t)0 || !have_addr) { - db_error ("No simple_lock\n"); - } -#if USLOCK_DEBUG - else if (saddr->lock_type != USLOCK_TAG) - db_error ("Not a simple_lock\n"); -#endif /* USLOCK_DEBUG */ + if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri) + mutex->lck_mtx_pri = priority; + mutex->lck_mtx_waiters++; - db_printf ("%s\n", simple_lock_labels); - db_print_simple_lock (saddr); -} - -void -db_print_simple_lock ( - simple_lock_t addr) -{ + if ( (holder = (thread_t)mutex->lck_mtx_owner) && + holder->sched_pri < mutex->lck_mtx_pri ) { + s = splsched(); + thread_lock(holder); - db_printf ("%08x %3d", addr, *hw_lock_addr(addr->interlock)); -#if USLOCK_DEBUG - db_printf (" %08x", addr->debug.lock_thread); - db_printf (" %08x ", addr->debug.duration[1]); - db_printsym ((int)addr->debug.lock_pc, DB_STGY_ANY); -#endif /* USLOCK_DEBUG */ - db_printf ("\n"); -} + /* holder priority may have been bumped by another thread + * before thread_lock was taken + */ + if (holder->sched_pri < mutex->lck_mtx_pri) { + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE, + holder->sched_pri, priority, thread_tid(holder), mutex, 0); + /* Assert that we're not altering the priority of a + * thread above the MAXPRI_PROMOTE band + */ + assert(holder->sched_pri < MAXPRI_PROMOTE); + set_sched_pri(holder, priority); + + if (mutex->lck_mtx_promoted == 0) { + holder->promotions++; + holder->sched_flags |= TH_SFLAG_PROMOTED; + + mutex->lck_mtx_promoted = 1; + } + } + thread_unlock(holder); + splx(s); + } + assert_wait((event_t)(((unsigned int*)mutex)+((sizeof(lck_mtx_t)-1)/sizeof(unsigned int))), THREAD_UNINT); -void -db_show_one_mutex ( - db_expr_t addr, - boolean_t have_addr, - __unused db_expr_t count, - __unused char * modif) -{ - mutex_t * maddr = (mutex_t *)((vm_offset_t) addr); + lck_mtx_ilk_unlock(mutex); - if (maddr == (mutex_t *)0 || !have_addr) - db_error ("No mutex\n"); -#if MACH_LDEBUG - else if (maddr->type != MUTEX_TAG) - db_error ("Not a mutex\n"); -#endif /* MACH_LDEBUG */ + thread_block(THREAD_CONTINUE_NULL); - db_printf ("%s\n", mutex_labels); - db_print_mutex (maddr); -} + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, + mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); -void -db_print_mutex ( - mutex_t * addr) -{ - db_printf ("%08x %6d %7d", - addr, *addr, addr->lck_mtx.lck_mtx_waiters); -#if MACH_LDEBUG - db_printf (" %08x ", addr->thread); - db_printsym (addr->pc, DB_STGY_ANY); -#endif /* MACH_LDEBUG */ - db_printf ("\n"); +#if CONFIG_DTRACE + /* + * Record the Dtrace lockstat probe for blocking, block time + * measured from when we were entered. 
+ */ + if (sleep_start) { + if (mutex->lck_mtx_is_ext == 0) { + LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex, + mach_absolute_time() - sleep_start); + } else { + LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex, + mach_absolute_time() - sleep_start); + } + } +#endif } - -#endif /* MACH_KDB */
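Taken together, lck_mtx_lock_acquire_x86() and lck_mtx_unlock_wakeup_x86() above form a promote/demote pair: acquiring a contended mutex may bump the owner's promotion count and set TH_SFLAG_PROMOTED, and the unlock path drops the promotion again, recomputing or depressing the priority only when the last promotion disappears. Below is a minimal analog of that counted-boost bookkeeping, with hypothetical names and none of the scheduler locking or the RW-promotion/depression interactions the real code handles.

/* Illustrative analog (not XNU code) of counted priority promotion. */
#include <assert.h>

struct demo_thread {
    int promotions;     /* counted boosts, like thread->promotions */
    int promoted;       /* like TH_SFLAG_PROMOTED */
    int sched_pri;
    int base_pri;       /* priority to fall back to when unpromoted */
};

static void demo_promote(struct demo_thread *t, int pri)
{
    if (t->sched_pri < pri)
        t->sched_pri = pri;
    t->promotions++;
    t->promoted = 1;
}

static void demo_demote(struct demo_thread *t)
{
    assert(t->promotions > 0);
    if (--t->promotions == 0 && t->promoted) {
        t->promoted = 0;
        if (t->sched_pri > t->base_pri)
            t->sched_pri = t->base_pri;   /* SCHED(compute_priority)() in XNU */
    }
}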