xnu-2422.100.13.tar.gz

[apple/xnu.git] / osfmk / i386 / locks_i386.c
diff --git a/osfmk/i386/locks_i386.c b/osfmk/i386/locks_i386.c

index 3e9b62fd749a9051db68b1542e5e3f62938659d4..69b83f1c3335ed7e279553f46fd28778021b0657 100644 (file)
--- a/osfmk/i386/locks_i386.c
+++ b/osfmk/i386/locks_i386.c
@@ -1,23 +1,29 @@
  /*
- * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
   *
- * @APPLE_LICENSE_HEADER_START@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   * 
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License").  You may not use this file except in compliance with the
- * License.  Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
   * 
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
   * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
   * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
- * License for the specific language governing rights and limitations
- * under the License.
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
   * 
- * @APPLE_LICENSE_HEADER_END@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
   */
  /*
   * @OSF_COPYRIGHT@
@@ -55,7 +61,6 @@
   *     Locking primitives implementation
   */
  
-#include <mach_kdb.h>
  #include <mach_ldebug.h>
  
  #include <kern/lock.h>
@@ -71,18 +76,22 @@
  #include <kern/debug.h>
  #include <string.h>
  
-#if    MACH_KDB
-#include <ddb/db_command.h>
-#include <ddb/db_output.h>
-#include <ddb/db_sym.h>
-#include <ddb/db_print.h>
-#endif /* MACH_KDB */
-
-#ifdef __ppc__
-#include <ppc/Firmware.h>
-#endif
+#include <i386/machine_routines.h> /* machine_timeout_suspended() */
+#include <machine/machine_cpu.h>
+#include <i386/mp.h>
  
  #include <sys/kdebug.h>
+#include <mach/branch_predicates.h>
+
+/*
+ * We need only enough declarations from the BSD-side to be able to
+ * test if our probe is active, and to call __dtrace_probe().  Setting
+ * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
+ */
+#if    CONFIG_DTRACE
+#define NEED_DTRACE_DEFS
+#include <../bsd/sys/lockstat.h>
+#endif
  
  #define        LCK_RW_LCK_EXCLUSIVE_CODE       0x100
  #define        LCK_RW_LCK_EXCLUSIVE1_CODE      0x101
@@ -91,23 +100,22 @@
  #define        LCK_RW_LCK_SH_TO_EX1_CODE       0x104
  #define        LCK_RW_LCK_EX_TO_SH_CODE        0x105
  
+#define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106
+#define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107
+#define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108
+#define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109
+#define LCK_RW_LCK_SHARED_SPIN_CODE    0x110
+#define LCK_RW_LCK_SHARED_WAIT_CODE    0x111
+#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE  0x112
+#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE  0x113
+
  
  #define        ANY_LOCK_DEBUG  (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
  
  unsigned int LcksOpts=0;
-unsigned int lock_wait_time[2] = { (unsigned int)-1, 100 } ;
  
  /* Forwards */
  
-#if    MACH_KDB
-void   db_print_simple_lock(
-                       simple_lock_t   addr);
-
-void   db_print_mutex(
-                       mutex_t         * addr);
-#endif /* MACH_KDB */
-
-
  #if    USLOCK_DEBUG
  /*
   *     Perform simple lock checks.
@@ -116,9 +124,6 @@ int uslock_check = 1;
  int    max_lock_loops  = 100000000;
  decl_simple_lock_data(extern , printf_lock)
  decl_simple_lock_data(extern , panic_lock)
-#if    MACH_KDB
-decl_simple_lock_data(extern , kdb_lock)
-#endif /* MACH_KDB */
  #endif /* USLOCK_DEBUG */
  
  
@@ -131,7 +136,7 @@ typedef void        *pc_t;
  #define        INVALID_PC      ((void *) VM_MAX_KERNEL_ADDRESS)
  #define        INVALID_THREAD  ((void *) VM_MAX_KERNEL_ADDRESS)
  #if    ANY_LOCK_DEBUG
-#define        OBTAIN_PC(pc,l) ((pc) = (void *) GET_RETURN_PC(&(l)))
+#define        OBTAIN_PC(pc)   ((pc) = GET_RETURN_PC())
  #define DECL_PC(pc)    pc_t pc;
  #else  /* ANY_LOCK_DEBUG */
  #define DECL_PC(pc)
@@ -139,9 +144,9 @@ typedef void        *pc_t;
  /*
   *     Eliminate lint complaints about unused local pc variables.
   */
-#define        OBTAIN_PC(pc,l) ++pc
+#define        OBTAIN_PC(pc)   ++pc
  #else  /* lint */
-#define        OBTAIN_PC(pc,l)
+#define        OBTAIN_PC(pc)
  #endif /* lint */
  #endif /* USLOCK_DEBUG */
  
@@ -163,6 +168,39 @@ int                usld_lock_common_checks(usimple_lock_t, char *);
  #define        USLDBG(stmt)
  #endif /* USLOCK_DEBUG */
  
+
+extern int lck_rw_grab_want(lck_rw_t *lck);
+extern int lck_rw_grab_shared(lck_rw_t *lck);
+extern int lck_rw_held_read_or_upgrade(lck_rw_t *lck);
+
+
+/*
+ * Forward definitions
+ */
+
+void lck_rw_lock_shared_gen(
+       lck_rw_t        *lck);
+
+void lck_rw_lock_exclusive_gen(
+       lck_rw_t        *lck);
+
+boolean_t lck_rw_lock_shared_to_exclusive_success(
+       lck_rw_t        *lck);
+
+boolean_t lck_rw_lock_shared_to_exclusive_failure(
+       lck_rw_t        *lck,
+       int             prior_lock_state);
+
+void lck_rw_lock_exclusive_to_shared_gen(
+       lck_rw_t        *lck,
+       int             prior_lock_state);
+
+lck_rw_type_t lck_rw_done_gen(
+       lck_rw_t        *lck,
+       int             prior_lock_state);
+
+void lck_rw_clear_promotions_x86(thread_t thread);
+
  /*
   *      Routine:        lck_spin_alloc_init
   */
@@ -213,9 +251,9 @@ lck_spin_destroy(
         lck_spin_t      *lck,
         lck_grp_t       *grp)
  {
-       if (lck->lck_spin_data[0] == LCK_SPIN_TAG_DESTROYED)
+       if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
                 return;
-       lck->lck_spin_data[0] = LCK_SPIN_TAG_DESTROYED;
+       lck->interlock = LCK_SPIN_TAG_DESTROYED;
         lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
         lck_grp_deallocate(grp);
         return;
@@ -249,7 +287,7 @@ boolean_t
  lck_spin_try_lock(
         lck_spin_t      *lck)
  {
-       usimple_lock_try((usimple_lock_t) lck);
+       return((boolean_t)usimple_lock_try((usimple_lock_t) lck));
  }
  
  /*
@@ -270,6 +308,29 @@ usimple_lock_init(
  #endif
  }
  
+volatile uint32_t spinlock_owner_cpu = ~0;
+volatile usimple_lock_t spinlock_timed_out;
+
+static uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {	/* NMI the CPU running thread_addr; returns spinlock_owner_cpu */
+       uint64_t deadline;
+       uint32_t i;
+
+       for (i = 0; i < real_ncpus; i++) {	/* scan CPUs for the one whose active thread is thread_addr */
+               if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) {
+                       spinlock_owner_cpu = i;	/* record the match in the global before anything else */
+                       if ((uint32_t) cpu_number() == i)	/* owner is on the calling CPU: no NMI is sent */
+                               break;
+                       cpu_datap(i)->cpu_NMI_acknowledged = FALSE;	/* clear the ack flag before raising the NMI */
+                       cpu_NMI_interrupt(i);
+                       deadline = mach_absolute_time() + (LockTimeOut * 2);	/* bound the ack wait to 2 * LockTimeOut */
+                       while (mach_absolute_time() < deadline && cpu_datap(i)->cpu_NMI_acknowledged == FALSE)
+                               cpu_pause();	/* spin until the flag changes or the deadline passes */
+                       break;	/* only the first matching CPU is handled */
+               }
+       }
+
+       return spinlock_owner_cpu;	/* unchanged from its prior value if no CPU matched */
+}
  
  /*
   *     Acquire a usimple_lock.
@@ -283,14 +344,27 @@ usimple_lock(
         usimple_lock_t  l)
  {
  #ifndef        MACHINE_SIMPLE_LOCK
-       pc_t            pc = NULL;
+       DECL_PC(pc);
  
-       OBTAIN_PC(pc, l);
+       OBTAIN_PC(pc);
         USLDBG(usld_lock_pre(l, pc));
  
-       if(!hw_lock_to(&l->interlock, LockTimeOut))     /* Try to get the lock with a timeout */ 
-               panic("simple lock deadlock detection - l=%08X, cpu=%d, ret=%08X", l, cpu_number(), pc);
+       if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0))        {
+               boolean_t uslock_acquired = FALSE;
+               while (machine_timeout_suspended()) {
+                       enable_preemption();
+                       if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)))
+                               break;
+               }
  
+               if (uslock_acquired == FALSE) {
+                       uint32_t lock_cpu;
+                       uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
+                       spinlock_timed_out = l;
+                       lock_cpu = spinlock_timeout_NMI(lowner);
+                       panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner,  current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data);
+               }
+       }
         USLDBG(usld_lock_post(l, pc));
  #else
         simple_lock((simple_lock_t)l);
@@ -312,7 +386,7 @@ usimple_unlock(
  #ifndef        MACHINE_SIMPLE_LOCK
         DECL_PC(pc);
  
-       OBTAIN_PC(pc, l);
+       OBTAIN_PC(pc);
         USLDBG(usld_unlock(l, pc));
         hw_lock_unlock(&l->interlock);
  #else
@@ -338,10 +412,10 @@ usimple_lock_try(
         usimple_lock_t  l)
  {
  #ifndef        MACHINE_SIMPLE_LOCK
-       DECL_PC(pc);
         unsigned int    success;
+       DECL_PC(pc);
  
-       OBTAIN_PC(pc, l);
+       OBTAIN_PC(pc);
         USLDBG(usld_lock_try_pre(l, pc));
         if ((success = hw_lock_try(&l->interlock))) {
                 USLDBG(usld_lock_try_post(l, pc));
@@ -405,10 +479,9 @@ usld_lock_common_checks(
         if (l == USIMPLE_LOCK_NULL)
                 panic("%s:  null lock pointer", caller);
         if (l->lock_type != USLOCK_TAG)
-               panic("%s:  0x%x is not a usimple lock", caller, (integer_t) l);
+               panic("%s:  %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
         if (!(l->debug.state & USLOCK_INIT))
-               panic("%s:  0x%x is not an initialized lock",
-                     caller, (integer_t) l);
+               panic("%s:  %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
         return USLOCK_CHECKING(l);
  }
  
@@ -439,11 +512,11 @@ usld_lock_pre(
  
         if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
             l->debug.lock_thread == (void *) current_thread()) {
-               printf("%s:  lock 0x%x already locked (at 0x%x) by",
-                     caller, (integer_t) l, l->debug.lock_pc);
-               printf(" current thread 0x%x (new attempt at pc 0x%x)\n",
+               printf("%s:  lock %p already locked (at %p) by",
+                     caller, l, l->debug.lock_pc);
+               printf(" current thread %p (new attempt at pc %p)\n",
                        l->debug.lock_thread, pc);
-               panic(caller);
+               panic("%s", caller);
         }
         mp_disable_preemption();
         usl_trace(l, cpu_number(), pc, caller);
@@ -470,11 +543,11 @@ usld_lock_post(
                 return;
  
         if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
-               panic("%s:  lock 0x%x became uninitialized",
-                     caller, (integer_t) l);
+               panic("%s:  lock %p became uninitialized",
+                     caller, l);
         if ((l->debug.state & USLOCK_TAKEN))
-               panic("%s:  lock 0x%x became TAKEN by someone else",
-                     caller, (integer_t) l);
+               panic("%s:  lock %p became TAKEN by someone else",
+                     caller, l);
  
         mycpu = cpu_number();
         l->debug.lock_thread = (void *)current_thread();
@@ -509,16 +582,16 @@ usld_unlock(
         mycpu = cpu_number();
  
         if (!(l->debug.state & USLOCK_TAKEN))
-               panic("%s:  lock 0x%x hasn't been taken",
-                     caller, (integer_t) l);
+               panic("%s:  lock %p hasn't been taken",
+                     caller, l);
         if (l->debug.lock_thread != (void *) current_thread())
-               panic("%s:  unlocking lock 0x%x, owned by thread 0x%x",
-                     caller, (integer_t) l, l->debug.lock_thread);
+               panic("%s:  unlocking lock %p, owned by thread %p",
+                     caller, l, l->debug.lock_thread);
         if (l->debug.lock_cpu != mycpu) {
-               printf("%s:  unlocking lock 0x%x on cpu 0x%x",
-                      caller, (integer_t) l, mycpu);
+               printf("%s:  unlocking lock %p on cpu 0x%x",
+                      caller, l, mycpu);
                 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
-               panic(caller);
+               panic("%s", caller);
         }
         usl_trace(l, mycpu, pc, caller);
  
  
@@ -571,11 +644,11 @@ usld_lock_try_post(
                 return;
  
         if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
-               panic("%s:  lock 0x%x became uninitialized",
-                     caller, (integer_t) l);
+               panic("%s:  lock %p became uninitialized",
+                     caller, l);
         if ((l->debug.state & USLOCK_TAKEN))
-               panic("%s:  lock 0x%x became TAKEN by someone else",
-                     caller, (integer_t) l);
+               panic("%s:  lock %p became TAKEN by someone else",
+                     caller, l);
  
         mycpu = cpu_number();
         l->debug.lock_thread = (void *) current_thread();
@@ -606,8 +679,8 @@ usl_trace(
         if (traced_lock == l) {
                 XPR(XPR_SLOCK,
                     "seq %d, cpu %d, %s @ %x\n",
-                   (integer_t) lock_seq, (integer_t) mycpu,
-                   (integer_t) op_name, (integer_t) pc, 0);
+                   (uintptr_t) lock_seq, (uintptr_t) mycpu,
+                   (uintptr_t) op_name, (uintptr_t) pc, 0);
                 lock_seq++;
         }
  }
@@ -665,15 +738,16 @@ lock_init(
         lock_t          *l,
         boolean_t       can_sleep,
         __unused unsigned short tag,
-       unsigned short  tag1)
+       __unused unsigned short tag1)
  {
-       (void) memset((void *) l, 0, sizeof(lock_t));
-
-       simple_lock_init(&l->interlock, tag1);
-       l->want_write = FALSE;
-       l->want_upgrade = FALSE;
-       l->read_count = 0;
-       l->can_sleep = can_sleep;
+       hw_lock_byte_init(&l->lck_rw_interlock);
+       l->lck_rw_want_write = FALSE;
+       l->lck_rw_want_upgrade = FALSE;
+       l->lck_rw_shared_count = 0;
+       l->lck_rw_can_sleep = can_sleep;
+       l->lck_rw_tag = tag;
+       l->lck_rw_priv_excl = 1;
+       l->lck_r_waiting = l->lck_w_waiting = 0;
  }
  
  
@@ -689,162 +763,21 @@ void
  lock_write(
         register lock_t * l)
  {
-        register int      i;
-       boolean_t          lock_miss = FALSE;
-#if    MACH_LDEBUG
-       int                decrementer;
-#endif /* MACH_LDEBUG */
-
-       simple_lock(&l->interlock);
-
-#if    MACH_LDEBUG
-       decrementer = DECREMENTER_TIMEOUT;
-#endif /* MACH_LDEBUG */
-
-       /*
-        *      Try to acquire the want_write bit.
-        */
-       while (l->want_write) {
-               if (!lock_miss) {
-                       lock_miss = TRUE;
-               }
-
-               i = lock_wait_time[l->can_sleep ? 1 : 0];
-               if (i != 0) {
-                       simple_unlock(&l->interlock);
-#if    MACH_LDEBUG
-                       if (!--decrementer)
-                               Debugger("timeout - want_write");
-#endif /* MACH_LDEBUG */
-                       while (--i != 0 && l->want_write)
-                               continue;
-                       simple_lock(&l->interlock);
-               }
-
-               if (l->can_sleep && l->want_write) {
-                       l->waiting = TRUE;
-                       thread_sleep_simple_lock((event_t) l,
-                                       simple_lock_addr(l->interlock),
-                                       THREAD_UNINT);
-                       /* interlock relocked */
-               }
-       }
-       l->want_write = TRUE;
-
-       /* Wait for readers (and upgrades) to finish */
-
-#if    MACH_LDEBUG
-       decrementer = DECREMENTER_TIMEOUT;
-#endif /* MACH_LDEBUG */
-       while ((l->read_count != 0) || l->want_upgrade) {
-               if (!lock_miss) {
-                       lock_miss = TRUE;
-               }
-
-               i = lock_wait_time[l->can_sleep ? 1 : 0];
-               if (i != 0) {
-                       simple_unlock(&l->interlock);
-#if    MACH_LDEBUG
-                       if (!--decrementer)
-                               Debugger("timeout - wait for readers");
-#endif /* MACH_LDEBUG */
-                       while (--i != 0 && (l->read_count != 0 ||
-                                           l->want_upgrade))
-                               continue;
-                       simple_lock(&l->interlock);
-               }
-
-               if (l->can_sleep && (l->read_count != 0 || l->want_upgrade)) {
-                       l->waiting = TRUE;
-                       thread_sleep_simple_lock((event_t) l,
-                               simple_lock_addr(l->interlock),
-                               THREAD_UNINT);
-                       /* interlock relocked */
-               }
-       }
-
-       simple_unlock(&l->interlock);
+       lck_rw_lock_exclusive(l);
  }
  
  void
  lock_done(
         register lock_t * l)
  {
-       boolean_t         do_wakeup = FALSE;
-
-
-       simple_lock(&l->interlock);
-
-       if (l->read_count != 0) {
-               l->read_count--;
-       }
-       else    
-               if (l->want_upgrade) {
-                       l->want_upgrade = FALSE;
-               }
-       else {
-               l->want_write = FALSE;
-       }
-
-       /*
-        *      There is no reason to wakeup a waiting thread
-        *      if the read-count is non-zero.  Consider:
-        *              we must be dropping a read lock
-        *              threads are waiting only if one wants a write lock
-        *              if there are still readers, they can't proceed
-        */
-
-       if (l->waiting && (l->read_count == 0)) {
-               l->waiting = FALSE;
-               do_wakeup = TRUE;
-       }
-
-       simple_unlock(&l->interlock);
-
-       if (do_wakeup)
-               thread_wakeup((event_t) l);
+       (void) lck_rw_done(l);
  }
  
  void
  lock_read(
         register lock_t * l)
  {
-       register int        i;
-#if    MACH_LDEBUG
-       int                decrementer;
-#endif /* MACH_LDEBUG */
-
-       simple_lock(&l->interlock);
-
-#if    MACH_LDEBUG
-       decrementer = DECREMENTER_TIMEOUT;
-#endif /* MACH_LDEBUG */
-       while (l->want_write || l->want_upgrade) {
-               i = lock_wait_time[l->can_sleep ? 1 : 0];
-
-               if (i != 0) {
-                       simple_unlock(&l->interlock);
-#if    MACH_LDEBUG
-                       if (!--decrementer)
-                               Debugger("timeout - wait no writers");
-#endif /* MACH_LDEBUG */
-                       while (--i != 0 && (l->want_write || l->want_upgrade))
-                               continue;
-                       simple_lock(&l->interlock);
-               }
-
-               if (l->can_sleep && (l->want_write || l->want_upgrade)) {
-                       l->waiting = TRUE;
-                       thread_sleep_simple_lock((event_t) l,
-                                       simple_lock_addr(l->interlock),
-                                       THREAD_UNINT);
-                       /* interlock relocked */
-               }
-       }
-
-       l->read_count++;
-
-       simple_unlock(&l->interlock);
+       lck_rw_lock_shared(l);
  }
  
  
@@ -856,165 +789,24 @@ lock_read(
   *             already requested an upgrade to a write lock,
   *             no lock is held upon return.
   *
- *             Returns TRUE if the upgrade *failed*.
+ *             Returns FALSE if the upgrade *failed*.
   */
  
  boolean_t
  lock_read_to_write(
         register lock_t * l)
  {
-       register int        i;
-       boolean_t           do_wakeup = FALSE;
-#if    MACH_LDEBUG
-       int                decrementer;
-#endif /* MACH_LDEBUG */
-
-       simple_lock(&l->interlock);
-
-       l->read_count--;        
-
-       if (l->want_upgrade) {
-               /*
-                *      Someone else has requested upgrade.
-                *      Since we've released a read lock, wake
-                *      him up.
-                */
-               if (l->waiting && (l->read_count == 0)) {
-                       l->waiting = FALSE;
-                       do_wakeup = TRUE;
-               }
-
-               simple_unlock(&l->interlock);
-
-               if (do_wakeup)
-                       thread_wakeup((event_t) l);
-               return (TRUE);
-       }
-
-       l->want_upgrade = TRUE;
-
-#if    MACH_LDEBUG
-       decrementer = DECREMENTER_TIMEOUT;
-#endif /* MACH_LDEBUG */
-       while (l->read_count != 0) {
-               i = lock_wait_time[l->can_sleep ? 1 : 0];
-
-               if (i != 0) {
-                       simple_unlock(&l->interlock);
-#if    MACH_LDEBUG
-                       if (!--decrementer)
-                               Debugger("timeout - read_count");
-#endif /* MACH_LDEBUG */
-                       while (--i != 0 && l->read_count != 0)
-                               continue;
-                       simple_lock(&l->interlock);
-               }
-
-               if (l->can_sleep && l->read_count != 0) {
-                       l->waiting = TRUE;
-                       thread_sleep_simple_lock((event_t) l,
-                                       simple_lock_addr(l->interlock),
-                                       THREAD_UNINT);
-                       /* interlock relocked */
-               }
-       }
-
-       simple_unlock(&l->interlock);
-
-       return (FALSE);
+       return lck_rw_lock_shared_to_exclusive(l);
  }
  
  void
  lock_write_to_read(
         register lock_t * l)
  {
-       boolean_t          do_wakeup = FALSE;
-
-       simple_lock(&l->interlock);
-
-       l->read_count++;
-       if (l->want_upgrade)
-               l->want_upgrade = FALSE;
-       else
-               l->want_write = FALSE;
-
-       if (l->waiting) {
-               l->waiting = FALSE;
-               do_wakeup = TRUE;
-       }
-
-       simple_unlock(&l->interlock);
-
-       if (do_wakeup)
-               thread_wakeup((event_t) l);
+       lck_rw_lock_exclusive_to_shared(l);
  }
  
  
-#if    0       /* Unused */
-/*
- *     Routine:        lock_try_write
- *     Function:
- *             Tries to get a write lock.
- *
- *             Returns FALSE if the lock is not held on return.
- */
-
-boolean_t
-lock_try_write(
-       register lock_t * l)
-{
-       pc_t               pc;
-
-       simple_lock(&l->interlock);
-
-       if (l->want_write || l->want_upgrade || l->read_count) {
-               /*
-                *      Can't get lock.
-                */
-               simple_unlock(&l->interlock);
-               return(FALSE);
-       }
-
-       /*
-        *      Have lock.
-        */
-
-       l->want_write = TRUE;
-
-       simple_unlock(&l->interlock);
-
-       return(TRUE);
-}
-
-/*
- *     Routine:        lock_try_read
- *     Function:
- *             Tries to get a read lock.
- *
- *             Returns FALSE if the lock is not held on return.
- */
-
-boolean_t
-lock_try_read(
-       register lock_t * l)
-{
-       pc_t               pc;
-
-       simple_lock(&l->interlock);
-
-       if (l->want_write || l->want_upgrade) {
-               simple_unlock(&l->interlock);
-               return(FALSE);
-       }
-
-       l->read_count++;
-
-       simple_unlock(&l->interlock);
-
-       return(TRUE);
-}
-#endif         /* Unused */
-
  
  /*
   *      Routine:        lck_rw_alloc_init
@@ -1025,9 +817,11 @@ lck_rw_alloc_init(
         lck_attr_t      *attr) {
         lck_rw_t        *lck;
  
-       if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0)
+       if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
+               bzero(lck, sizeof(lck_rw_t));
                 lck_rw_init(lck, grp, attr);
-               
+       }
+
         return(lck);
  }
  
@@ -1049,14 +843,20 @@ void
  lck_rw_init(
         lck_rw_t        *lck,
         lck_grp_t       *grp,
-       __unused lck_attr_t     *attr) {
-
-       hw_lock_init(&lck->interlock);
-       lck->want_write = FALSE;
-       lck->want_upgrade = FALSE;
-       lck->read_count = 0;
-       lck->can_sleep = TRUE;
+       lck_attr_t      *attr)
+{
+       lck_attr_t      *lck_attr = (attr != LCK_ATTR_NULL) ?
+                                       attr : &LockDefaultLckAttr;
+
+       hw_lock_byte_init(&lck->lck_rw_interlock);
+       lck->lck_rw_want_write = FALSE;
+       lck->lck_rw_want_upgrade = FALSE;
+       lck->lck_rw_shared_count = 0;
+       lck->lck_rw_can_sleep = TRUE;
+       lck->lck_r_waiting = lck->lck_w_waiting = 0;
         lck->lck_rw_tag = 0;
+       lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
+                               LCK_ATTR_RW_SHARED_PRIORITY) == 0);
  
         lck_grp_reference(grp);
         lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
@@ -1068,9 +868,13 @@ lck_rw_init(
  void
  lck_rw_destroy(
         lck_rw_t        *lck,
-       lck_grp_t       *grp) {
+       lck_grp_t       *grp)
+{
         if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
                 return;
+#if MACH_LDEBUG
+       lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
+#endif
         lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
         lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
         lck_grp_deallocate(grp);
@@ -1085,10 +889,15 @@ lck_rw_destroy(
  
  #define DECREMENTER_TIMEOUT 1000000
  
+#define RW_LOCK_READER_EVENT(x)                \
+               ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_tag))))
+
+#define RW_LOCK_WRITER_EVENT(x)                \
+               ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8))))
  
  /*
- * We need to disable interrupts while holding the mutex interlock
- * to prevent an IPI intervening.
+ * We disable interrupts while holding the RW interlock to prevent an
+ * interrupt from exacerbating hold time.
   * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
   */
  static boolean_t
@@ -1097,7 +906,7 @@ lck_interlock_lock(lck_rw_t *lck)
         boolean_t       istate;
  
         istate = ml_set_interrupts_enabled(FALSE);      
-       hw_lock_lock(&lck->interlock);
+       hw_lock_byte_lock(&lck->lck_rw_interlock);
  
         return istate;
  }
@@ -1105,159 +914,319 @@ lck_interlock_lock(lck_rw_t *lck)
  static void
  lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
  {               
-       hw_lock_unlock(&lck->interlock);
+       hw_lock_byte_unlock(&lck->lck_rw_interlock);
         ml_set_interrupts_enabled(istate);
  }
  
+/*
+ * This inline is used when busy-waiting for an rw lock.
+ * If interrupts were disabled when the lock primitive was called,
+ * we poll the IPI handler for pending tlb flushes.
+ * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
+ */
+static inline void
+lck_rw_lock_pause(boolean_t interrupts_enabled)
+{
+       if (!interrupts_enabled)
+               handle_pending_TLB_flushes();
+       cpu_pause();
+}
+
+
+/*
+ * compute the deadline to spin against when
+ * waiting for a change of state on a lck_rw_t
+ */
+static inline uint64_t
+lck_rw_deadline_for_spin(lck_rw_t *lck)
+{
+       if (lck->lck_rw_can_sleep) {
+               if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
+                       /*
+                        * there are already threads waiting on this lock... this
+                        * implies that they have spun beyond their deadlines waiting for 
+                        * the desired state to show up so we will not bother spinning at this time...
+                        *   or
+                        * the current number of threads sharing this lock exceeds our capacity to run them
+                        * concurrently and since all states we're going to spin for require the rw_shared_count
+                        * to be at 0, we'll not bother spinning since the latency for this to happen is
+                        * unpredictable...
+                        */
+                       return (mach_absolute_time());
+               }
+               return (mach_absolute_time() + MutexSpin);
+       } else
+               return (mach_absolute_time() + (100000LL * 1000000000LL));
+}
+
+
  /*
   *      Routine:        lck_rw_lock_exclusive
   */
  void
-lck_rw_lock_exclusive(
+lck_rw_lock_exclusive_gen(
         lck_rw_t        *lck)
  {
-       int        i;
-       boolean_t               lock_miss = FALSE;
-       wait_result_t   res;
-#if    MACH_LDEBUG
-       int                             decrementer;
-#endif /* MACH_LDEBUG */
-       boolean_t       istate;
-
-       istate = lck_interlock_lock(lck);
-
-#if    MACH_LDEBUG
-       decrementer = DECREMENTER_TIMEOUT;
-#endif /* MACH_LDEBUG */
+       uint64_t        deadline = 0;
+       int             slept = 0;
+       int             gotlock = 0;
+       int             lockheld = 0;
+       wait_result_t   res = 0;
+       boolean_t       istate = -1;
+
+#if    CONFIG_DTRACE
+       boolean_t dtrace_ls_initialized = FALSE;
+       boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE;
+       uint64_t wait_interval = 0;
+       int readers_at_sleep = 0;
+#endif
  
         /*
-        *      Try to acquire the want_write bit.
+        *      Try to acquire the lck_rw_want_write bit.
          */
-       while (lck->want_write) {
-               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
-
-               if (!lock_miss) {
-                       lock_miss = TRUE;
+       while ( !lck_rw_grab_want(lck)) {
+
+#if    CONFIG_DTRACE
+               if (dtrace_ls_initialized == FALSE) {
+                       dtrace_ls_initialized = TRUE;
+                       dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
+                       dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
+                       dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
+                       if (dtrace_ls_enabled) {
+                               /*
+                                * Either sleeping or spinning is happening,
+                                *  start a timing of our delay interval now.
+                                */
+                               readers_at_sleep = lck->lck_rw_shared_count;
+                               wait_interval = mach_absolute_time();
+                       }
                 }
+#endif
+               if (istate == -1)
+                       istate = ml_get_interrupts_enabled();
+
+               deadline = lck_rw_deadline_for_spin(lck);
+
+               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
+               
+               while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
+                       lck_rw_lock_pause(istate);
+
+               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, gotlock, 0);
+
+               if (gotlock)
+                       break;
+               /*
+                * if we get here, the deadline has expired w/o us
+                * being able to grab the lock exclusively
+                * check to see if we're allowed to do a thread_block
+                */
+               if (lck->lck_rw_can_sleep) {
  
-               i = lock_wait_time[lck->can_sleep ? 1 : 0];
-               if (i != 0) {
-                       lck_interlock_unlock(lck, istate);
-#if    MACH_LDEBUG
-                       if (!--decrementer)
-                               Debugger("timeout - want_write");
-#endif /* MACH_LDEBUG */
-                       while (--i != 0 && lck->want_write)
-                               continue;
                         istate = lck_interlock_lock(lck);
-               }
  
-               if (lck->can_sleep && lck->want_write) {
-                       lck->waiting = TRUE;
-                       res = assert_wait((event_t) lck, THREAD_UNINT);
-                       if (res == THREAD_WAITING) {
+                       if (lck->lck_rw_want_write) {
+
+                               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
+
+                               lck->lck_w_waiting = TRUE;
+
+                               res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
                                 lck_interlock_unlock(lck, istate);
-                               res = thread_block(THREAD_CONTINUE_NULL);
-                               istate = lck_interlock_lock(lck);
+
+                               if (res == THREAD_WAITING) {
+                                       res = thread_block(THREAD_CONTINUE_NULL);
+                                       slept++;
+                               }
+                               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0);
+                       } else {
+                               lck->lck_rw_want_write = TRUE;
+                               lck_interlock_unlock(lck, istate);
+                               break;
                         }
                 }
-               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_END, (int)lck, res, 0, 0, 0);
         }
-       lck->want_write = TRUE;
-
-       /* Wait for readers (and upgrades) to finish */
+       /*
+        * Wait for readers (and upgrades) to finish...
+        * the test for these conditions must be done simultaneously with
+        * a check of the interlock not being held since
+        * the rw_shared_count will drop to 0 first and then want_upgrade
+        * will be set to 1 in the shared_to_exclusive scenario... those
+        * adjustments are done behind the interlock and represent an
+        * atomic change in state and must be considered as such
+        * however, once we see the read count at 0, the want_upgrade not set
+        * and the interlock not held, we are safe to proceed
+        */
+       while (lck_rw_held_read_or_upgrade(lck)) {
  
-#if    MACH_LDEBUG
-       decrementer = DECREMENTER_TIMEOUT;
-#endif /* MACH_LDEBUG */
-       while ((lck->read_count != 0) || lck->want_upgrade) {
-               if (!lock_miss) {
-                       lock_miss = TRUE;
+#if    CONFIG_DTRACE
+               /*
+                * Either sleeping or spinning is happening, so start
+                * timing our delay interval now.  If neither lockstat
+                * probe is enabled, dtrace_ls_enabled stays FALSE and we
+                * will not later record a dtrace spin or sleep event.
+                */
+               if (dtrace_ls_initialized == FALSE) {
+                       dtrace_ls_initialized = TRUE;
+                       dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
+                       dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
+                       dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
+                       if (dtrace_ls_enabled) {
+                               /*
+                                * Either sleeping or spinning is happening,
+                                *  start a timing of our delay interval now.
+                                */
+                               readers_at_sleep = lck->lck_rw_shared_count;
+                               wait_interval = mach_absolute_time();
+                       }
                 }
+#endif
+               if (istate == -1)
+                       istate = ml_get_interrupts_enabled();
+
+               deadline = lck_rw_deadline_for_spin(lck);
+
+               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
  
-               i = lock_wait_time[lck->can_sleep ? 1 : 0];
+               while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
+                       lck_rw_lock_pause(istate);
  
-               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_START,
-                            (int)lck, lck->read_count, lck->want_upgrade, i, 0);
+               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, lockheld, 0);
+
+               if ( !lockheld)
+                       break;
+               /*
+                * if we get here, the deadline has expired w/o us
+                * being able to grab the lock exclusively
+                * check to see if we're allowed to do a thread_block
+                */
+               if (lck->lck_rw_can_sleep) {
  
-               if (i != 0) {
-                       lck_interlock_unlock(lck, istate);
-#if    MACH_LDEBUG
-                       if (!--decrementer)
-                               Debugger("timeout - wait for readers");
-#endif /* MACH_LDEBUG */
-                       while (--i != 0 && (lck->read_count != 0 ||
-                                           lck->want_upgrade))
-                               continue;
                         istate = lck_interlock_lock(lck);
-               }
  
-               if (lck->can_sleep && (lck->read_count != 0 || lck->want_upgrade)) {
-                       lck->waiting = TRUE;
-                       res = assert_wait((event_t) lck, THREAD_UNINT);
-                       if (res == THREAD_WAITING) {
+                       if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
+                               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
+
+                               lck->lck_w_waiting = TRUE;
+
+                               res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
+                               lck_interlock_unlock(lck, istate);
+
+                               if (res == THREAD_WAITING) {
+                                       res = thread_block(THREAD_CONTINUE_NULL);
+                                       slept++;
+                               }
+                               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0);
+                       } else {
                                 lck_interlock_unlock(lck, istate);
-                               res = thread_block(THREAD_CONTINUE_NULL);
-                               istate = lck_interlock_lock(lck);
+                               /*
+                                * must own the lock now, since we checked for
+                                * readers or upgrade owner behind the interlock
+                                * no need for a call to 'lck_rw_held_read_or_upgrade'
+                                */
+                               break;
                         }
                 }
-               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_END,
-                            (int)lck, lck->read_count, lck->want_upgrade, res, 0);
         }
  
-       lck_interlock_unlock(lck, istate);
+#if    CONFIG_DTRACE
+       /*
+        * Decide what latencies we suffered that are Dtrace events.
+        * If we have set wait_interval, then we either spun or slept.
+        * At least we get out from under the interlock before we record
+        * which is the best we can do here to minimize the impact
+        * of the tracing.
+        * If dtrace_ls_enabled is not set, then dtrace was not enabled when we
+        * started sleeping/spinning so we don't record this event.
+        */
+       if (dtrace_ls_enabled == TRUE) {
+               if (slept == 0) {
+                       LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
+                           mach_absolute_time() - wait_interval, 1);
+               } else {
+                       /*
+                        * For the blocking case, we also record whether the lock
+                        * was held for read or write, and how many readers.
+                        * Notice that above we recorded this before we dropped
+                        * the interlock so the count is accurate.
+                        */
+                       LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
+                           mach_absolute_time() - wait_interval, 1,
+                           (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
+               }
+       }
+       LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
+#endif
  }
  
  
  /*
- *      Routine:        lck_rw_done
+ *      Routine:        lck_rw_done_gen
+ *
+ *     called from the assembly language wrapper...
+ *     prior_lock_state is the value in the 1st
+ *     word of the lock at the time of a successful
+ *     atomic compare and exchange with the new value...
+ *     it represents the state of the lock before we
+ *     decremented the rw_shared_count or cleared either
+ *     rw_want_upgrade or rw_want_write and
+ *     the lck_x_waiting bits...  since the wrapper
+ *     routine has already changed the state atomically, 
+ *     we just need to decide if we should
+ *     wake up anyone and what value to return... we do
+ *     this by examining the state of the lock before
+ *     we changed it
   */
  lck_rw_type_t
-lck_rw_done(
-       lck_rw_t        *lck)
+lck_rw_done_gen(
+       lck_rw_t        *lck,
+       int             prior_lock_state)
  {
-       boolean_t       do_wakeup = FALSE;
-       lck_rw_type_t   lck_rw_type;
-       boolean_t       istate;
-
-
-       istate = lck_interlock_lock(lck);
+       lck_rw_t        *fake_lck;
+       lck_rw_type_t   lock_type;
+       thread_t        thread = current_thread();
+       uint32_t        rwlock_count;
  
-       if (lck->read_count != 0) {
-               lck_rw_type = LCK_RW_TYPE_SHARED;
-               lck->read_count--;
+       /* Check if dropping the lock means that we need to unpromote */
+       rwlock_count = thread->rwlock_count--;
+#if MACH_LDEBUG
+       if (rwlock_count == 0) {
+               panic("rw lock count underflow for thread %p", thread);
         }
-       else {  
-               lck_rw_type = LCK_RW_TYPE_EXCLUSIVE;
-               if (lck->want_upgrade) 
-                       lck->want_upgrade = FALSE;
-               else 
-                       lck->want_write = FALSE;
+#endif
+       if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
+               /* sched_flags checked without lock, but will be rechecked while clearing */
+               lck_rw_clear_promotion(thread);
         }
  
         /*
-        *      There is no reason to wakeup a waiting thread
-        *      if the read-count is non-zero.  Consider:
-        *              we must be dropping a read lock
-        *              threads are waiting only if one wants a write lock
-        *              if there are still readers, they can't proceed
+        * prior_lock_state is a snapshot of the 1st word of the
+        * lock in question... we'll fake up a pointer to it
+        * and carefully not access anything beyond what's defined
+        * in the first word of a lck_rw_t
          */
+       fake_lck = (lck_rw_t *)&prior_lock_state;
+
+       if (fake_lck->lck_rw_shared_count <= 1) {
+               if (fake_lck->lck_w_waiting)
+                       thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
  
-       if (lck->waiting && (lck->read_count == 0)) {
-               lck->waiting = FALSE;
-               do_wakeup = TRUE;
+               if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
+                       thread_wakeup(RW_LOCK_READER_EVENT(lck));
         }
+       if (fake_lck->lck_rw_shared_count)
+               lock_type = LCK_RW_TYPE_SHARED;
+       else
+               lock_type = LCK_RW_TYPE_EXCLUSIVE;
  
-       lck_interlock_unlock(lck, istate);
+#if CONFIG_DTRACE
+       LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
+#endif
  
-       if (do_wakeup)
-               thread_wakeup((event_t) lck);
-       return(lck_rw_type);
+       return(lock_type);
  }
  
  
-
-
  /*
   *     Routine:        lck_rw_unlock
   */
@@ -1325,186 +1294,315 @@ lck_rw_lock(
  
  
  /*
- *     Routine:        lck_rw_lock_shared
+ *     Routine:        lck_rw_lock_shared_gen
+ *     Function:
+ *             assembly fast path code has determined that this lock
+ *             is held exclusively... this is where we spin/block
+ *             until we can acquire the lock in the shared mode
   */
  void
-lck_rw_lock_shared(
+lck_rw_lock_shared_gen(
         lck_rw_t        *lck)
  {
-       int             i;
-       wait_result_t      res;
-#if    MACH_LDEBUG
-       int             decrementer;
-#endif /* MACH_LDEBUG */
-       boolean_t       istate;
+       uint64_t        deadline = 0;
+       int             gotlock = 0;
+       int             slept = 0;
+       wait_result_t   res = 0;
+       boolean_t       istate = -1;
+       
+#if    CONFIG_DTRACE
+       uint64_t wait_interval = 0;
+       int readers_at_sleep = 0;
+       boolean_t dtrace_ls_initialized = FALSE;
+       boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
+#endif
  
-       istate = lck_interlock_lock(lck);
-
-#if    MACH_LDEBUG
-       decrementer = DECREMENTER_TIMEOUT;
-#endif /* MACH_LDEBUG */
-       while (lck->want_write || lck->want_upgrade) {
-               i = lock_wait_time[lck->can_sleep ? 1 : 0];
-
-               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_START,
-                            (int)lck, lck->want_write, lck->want_upgrade, i, 0);
-
-               if (i != 0) {
-                       lck_interlock_unlock(lck, istate);
-#if    MACH_LDEBUG
-                       if (!--decrementer)
-                               Debugger("timeout - wait no writers");
-#endif /* MACH_LDEBUG */
-                       while (--i != 0 && (lck->want_write || lck->want_upgrade))
-                               continue;
-                       istate = lck_interlock_lock(lck);
+       while ( !lck_rw_grab_shared(lck)) {
+
+#if    CONFIG_DTRACE
+               if (dtrace_ls_initialized == FALSE) {
+                       dtrace_ls_initialized = TRUE;
+                       dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
+                       dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
+                       dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
+                       if (dtrace_ls_enabled) {
+                               /*
+                                * Either sleeping or spinning is happening,
+                                *  start a timing of our delay interval now.
+                                */
+                               readers_at_sleep = lck->lck_rw_shared_count;
+                               wait_interval = mach_absolute_time();
+                       }
                 }
+#endif
+               if (istate == -1)
+                       istate = ml_get_interrupts_enabled();
+
+               deadline = lck_rw_deadline_for_spin(lck);
+
+               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
+                            (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
+
+               while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
+                       lck_rw_lock_pause(istate);
+
+               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
+                            (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
+
+               if (gotlock)
+                       break;
+               /*
+                * if we get here, the deadline has expired w/o us
+                * being able to grab the lock for read
+                * check to see if we're allowed to do a thread_block
+                */
+               if (lck->lck_rw_can_sleep) {
+
+                       istate = lck_interlock_lock(lck);
  
-               if (lck->can_sleep && (lck->want_write || lck->want_upgrade)) {
-                       lck->waiting = TRUE;
-                       res = assert_wait((event_t) lck, THREAD_UNINT);
-                       if (res == THREAD_WAITING) {
+                       if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
+                           ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
+
+                               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
+                                            (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
+
+                               lck->lck_r_waiting = TRUE;
+
+                               res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
                                 lck_interlock_unlock(lck, istate);
-                               res = thread_block(THREAD_CONTINUE_NULL);
-                               istate = lck_interlock_lock(lck);
+
+                               if (res == THREAD_WAITING) {
+                                       res = thread_block(THREAD_CONTINUE_NULL);
+                                       slept++;
+                               }
+                               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
+                                            (int)lck, res, slept, 0, 0);
+                       } else {
+                               lck->lck_rw_shared_count++;
+                               lck_interlock_unlock(lck, istate);
+                               break;
                         }
                 }
-               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_END,
-                            (int)lck, lck->want_write, lck->want_upgrade, res, 0);
         }
  
-       lck->read_count++;
-
-       lck_interlock_unlock(lck, istate);
+#if    CONFIG_DTRACE
+       if (dtrace_ls_enabled == TRUE) {
+               if (slept == 0) {
+                       LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
+               } else {
+                       LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
+                           mach_absolute_time() - wait_interval, 0,
+                           (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
+               }
+       }
+       LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
+#endif
  }
  
  
  /*
- *     Routine:        lck_rw_lock_shared_to_exclusive
+ *     Routine:        lck_rw_lock_shared_to_exclusive_failure
   *     Function:
- *             Improves a read-only lock to one with
- *             write permission.  If another reader has
- *             already requested an upgrade to a write lock,
- *             no lock is held upon return.
- *
- *             Returns TRUE if the upgrade *failed*.
+ *             assembly fast path code has already dropped our read
+ *             count and determined that someone else owns 'lck_rw_want_upgrade'
+ *             if 'lck_rw_shared_count' == 0, it's also already dropped 'lck_w_waiting'
+ *             all we need to do here is determine if a wakeup is needed
   */
-
  boolean_t
-lck_rw_lock_shared_to_exclusive(
-       lck_rw_t        *lck)
+lck_rw_lock_shared_to_exclusive_failure(
+       lck_rw_t        *lck,
+       int             prior_lock_state)
  {
-       int         i;
-       boolean_t           do_wakeup = FALSE;
-       wait_result_t      res;
-#if    MACH_LDEBUG
-       int                decrementer;
-#endif /* MACH_LDEBUG */
-       boolean_t       istate;
-
-       istate = lck_interlock_lock(lck);
+       lck_rw_t        *fake_lck;
+       thread_t        thread = current_thread();
+       uint32_t        rwlock_count;
  
-       lck->read_count--;      
+       /* Check if dropping the lock means that we need to unpromote */
+       rwlock_count = thread->rwlock_count--;
+#if MACH_LDEBUG
+       if (rwlock_count == 0) {
+               panic("rw lock count underflow for thread %p", thread);
+       }
+#endif
+       if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
+               /* sched_flags checked without lock, but will be rechecked while clearing */
+               lck_rw_clear_promotion(thread);
+       }
  
-       if (lck->want_upgrade) {
-               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_START,
-                            (int)lck, lck->read_count, lck->want_upgrade, 0, 0);
+       /*
+        * prior_lock_state is a snapshot of the 1st word of the
+        * lock in question... we'll fake up a pointer to it
+        * and carefully not access anything beyond what's defined
+        * in the first word of a lck_rw_t
+        */
+       fake_lck = (lck_rw_t *)&prior_lock_state;
  
+       if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
                 /*
                  *      Someone else has requested upgrade.
-                *      Since we've released a read lock, wake
-                *      him up.
+                *      Since we've released the read lock, wake
+                *      him up if he's blocked waiting
                  */
-               if (lck->waiting && (lck->read_count == 0)) {
-                       lck->waiting = FALSE;
-                       do_wakeup = TRUE;
+               thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
+       }
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
+                    (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
+
+       return (FALSE);
+}
+
+
+/*
+ *     Routine:        lck_rw_lock_shared_to_exclusive_success
+ *     Function:
+ *             assembly fast path code has already dropped our read
+ *             count and successfully acquired 'lck_rw_want_upgrade'
+ *             we just need to wait for the rest of the readers to drain
+ *             and then we can return as the exclusive holder of this lock
+ */
+boolean_t
+lck_rw_lock_shared_to_exclusive_success(
+       lck_rw_t        *lck)
+{
+       uint64_t        deadline = 0;
+       int             slept = 0;
+       int             still_shared = 0;
+       wait_result_t   res;
+       boolean_t       istate = -1;
+
+#if    CONFIG_DTRACE
+       uint64_t wait_interval = 0;
+       int readers_at_sleep = 0;
+       boolean_t dtrace_ls_initialized = FALSE;
+       boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
+#endif
+
+       while (lck->lck_rw_shared_count != 0) {
+
+#if    CONFIG_DTRACE
+               if (dtrace_ls_initialized == FALSE) {
+                       dtrace_ls_initialized = TRUE;
+                       dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
+                       dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
+                       dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
+                       if (dtrace_ls_enabled) {
+                               /*
+                                * Either sleeping or spinning is happening,
+                                *  start a timing of our delay interval now.
+                                */
+                               readers_at_sleep = lck->lck_rw_shared_count;
+                               wait_interval = mach_absolute_time();
+                       }
                 }
+#endif
+               if (istate == -1)
+                       istate = ml_get_interrupts_enabled();
  
-               lck_interlock_unlock(lck, istate);
+               deadline = lck_rw_deadline_for_spin(lck);
  
-               if (do_wakeup)
-                       thread_wakeup((event_t) lck);
+               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
+                            (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
  
-               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_END,
-                            (int)lck, lck->read_count, lck->want_upgrade, 0, 0);
+               while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
+                       lck_rw_lock_pause(istate);
  
-               return (TRUE);
-       }
+               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
+                            (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
  
-       lck->want_upgrade = TRUE;
-
-#if    MACH_LDEBUG
-       decrementer = DECREMENTER_TIMEOUT;
-#endif /* MACH_LDEBUG */
-       while (lck->read_count != 0) {
-               i = lock_wait_time[lck->can_sleep ? 1 : 0];
-
-               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_START,
-                            (int)lck, lck->read_count, i, 0, 0);
-
-               if (i != 0) {
-                       lck_interlock_unlock(lck, istate);
-#if    MACH_LDEBUG
-                       if (!--decrementer)
-                               Debugger("timeout - read_count");
-#endif /* MACH_LDEBUG */
-                       while (--i != 0 && lck->read_count != 0)
-                               continue;
+               if ( !still_shared)
+                       break;
+               /*
+                * if we get here, the deadline has expired w/o
+                * the rw_shared_count having drained to 0
+                * check to see if we're allowed to do a thread_block
+                */
+               if (lck->lck_rw_can_sleep) {
+                       
                         istate = lck_interlock_lock(lck);
-               }
+                       
+                       if (lck->lck_rw_shared_count != 0) {
+                               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
+                                            (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
+
+                               lck->lck_w_waiting = TRUE;
+
+                               res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
+                               lck_interlock_unlock(lck, istate);
  
-               if (lck->can_sleep && lck->read_count != 0) {
-                       lck->waiting = TRUE;
-                       res = assert_wait((event_t) lck, THREAD_UNINT);
-                       if (res == THREAD_WAITING) {
+                               if (res == THREAD_WAITING) {
+                                       res = thread_block(THREAD_CONTINUE_NULL);
+                                       slept++;
+                               }
+                               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
+                                            (int)lck, res, slept, 0, 0);
+                       } else {
                                 lck_interlock_unlock(lck, istate);
-                               res = thread_block(THREAD_CONTINUE_NULL);
-                               istate = lck_interlock_lock(lck);
+                               break;
                         }
                 }
-               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_END,
-                            (int)lck, lck->read_count, 0, 0, 0);
         }
-
-       lck_interlock_unlock(lck, istate);
-
-       return (FALSE);
+#if    CONFIG_DTRACE
+       /*
+        * We infer whether we took the sleep/spin path above by checking slept.
+        */
+       if (dtrace_ls_enabled == TRUE) {
+               if (slept == 0) {
+                       LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
+               } else {
+                       LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
+                           mach_absolute_time() - wait_interval, 1,
+                           (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
+               }
+       }
+       LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
+#endif
+       return (TRUE);
  }
  
+
  /*
   *      Routine:        lck_rw_lock_exclusive_to_shared
+ *     Function:
+ *             assembly fast path has already dropped
+ *             our exclusive state and bumped lck_rw_shared_count
+ *             all we need to do here is determine if anyone
+ *             needs to be awakened.
+ *
+ *             lck is the lock that was downgraded.  prior_lock_state
+ *             is a snapshot of the first word of that lock; the
+ *             wakeup decision is made from the snapshot's waiter and
+ *             privilege bits rather than from the live lock.
   */
  void
-lck_rw_lock_exclusive_to_shared(
-       lck_rw_t        *lck)
+lck_rw_lock_exclusive_to_shared_gen(
+       lck_rw_t        *lck,
+       int             prior_lock_state)
  {
-       boolean_t       do_wakeup = FALSE;
-       boolean_t       istate;
+       lck_rw_t        *fake_lck;
  
-       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
-                            (int)lck, lck->want_write, lck->want_upgrade, 0, 0);
-
-       istate = lck_interlock_lock(lck);
-
-       lck->read_count++;
-       if (lck->want_upgrade)
-               lck->want_upgrade = FALSE;
-       else
-               lck->want_write = FALSE;
-
-       if (lck->waiting) {
-               lck->waiting = FALSE;
-               do_wakeup = TRUE;
-       }
+       /*
+        * prior_lock_state is a snapshot of the 1st word of the
+        * lock in question... we'll fake up a pointer to it
+        * and carefully not access anything beyond what's defined
+        * in the first word of a lck_rw_t
+        */
+       fake_lck = (lck_rw_t *)&prior_lock_state;
  
-       lck_interlock_unlock(lck, istate);
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
+                            (int)lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
  
-       if (do_wakeup)
-               thread_wakeup((event_t) lck);
+       /*
+        * don't wake up anyone waiting to take the lock exclusively
+        * since we hold a read count... when the read count drops to 0,
+        * the writers will be woken.
+        *
+        * wake up any waiting readers if we don't have any writers waiting,
+        * or the lock is NOT marked as rw_priv_excl (writers have privilege)
+        */
+       if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
+               thread_wakeup(RW_LOCK_READER_EVENT(lck));
  
         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
-                            (int)lck, lck->want_write, lck->want_upgrade, lck->read_count, 0);
+                            (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);       /* traces the live lock fields, not the snapshot */
  
+#if CONFIG_DTRACE
+       LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
+#endif
  }
  
  
@@ -1525,69 +1623,64 @@ lck_rw_try_lock(
         return(FALSE);
  }
  
-/*
- *     Routine:        lck_rw_try_lock_exclusive
- *     Function:
- *             Tries to get a write lock.
- *
- *             Returns FALSE if the lock is not held on return.
- */
  
-boolean_t
-lck_rw_try_lock_exclusive(
-       lck_rw_t        *lck)
+void
+lck_rw_assert(
+       lck_rw_t        *lck,
+       unsigned int    type)
  {
-       boolean_t       istate;
-
-       istate = lck_interlock_lock(lck);
-
-       if (lck->want_write || lck->want_upgrade || lck->read_count) {
-               /*
-                *      Can't get lock.
-                */
-               lck_interlock_unlock(lck, istate);
-               return(FALSE);
+       switch (type) { /* return quietly when the lock's state fields satisfy the asserted mode; every break reaches the panic below */
+       case LCK_RW_ASSERT_SHARED:      /* satisfied when the shared count is non-zero */
+               if (lck->lck_rw_shared_count != 0) {
+                       return;
+               }
+               break;
+       case LCK_RW_ASSERT_EXCLUSIVE:   /* satisfied when want_write or want_upgrade is set and the shared count is zero */
+               if ((lck->lck_rw_want_write ||
+                    lck->lck_rw_want_upgrade) &&
+                   lck->lck_rw_shared_count == 0) {
+                       return;
+               }
+               break;
+       case LCK_RW_ASSERT_HELD:        /* satisfied when any of want_write, want_upgrade or a shared count is present */
+               if (lck->lck_rw_want_write ||
+                   lck->lck_rw_want_upgrade ||
+                   lck->lck_rw_shared_count != 0) {
+                       return;
+               }
+               break;
+       case LCK_RW_ASSERT_NOTHELD:     /* satisfied only when none of the three is present */
+               if (!(lck->lck_rw_want_write ||
+                         lck->lck_rw_want_upgrade ||
+                         lck->lck_rw_shared_count != 0)) {
+                       return;
+               }
+               break;
+       default:        /* unrecognized type: fall out to the panic */
+               break;
         }
  
-       /*
-        *      Have lock.
-        */
-
-       lck->want_write = TRUE;
-
-       lck_interlock_unlock(lck, istate);
-
-       return(TRUE);
+       panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
  }
  
-/*
- *     Routine:        lck_rw_try_lock_shared
- *     Function:
- *             Tries to get a read lock.
- *
- *             Returns FALSE if the lock is not held on return.
- */
-
-boolean_t
-lck_rw_try_lock_shared(
-       lck_rw_t        *lck)
+/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced.
+ * When MACH_LDEBUG is set the imbalance is treated as fatal and the routine panics,
+ * reporting thread->rwlock_count; otherwise rwlock_count is reset to 0 and
+ * lck_rw_clear_promotion() is called for the thread. */
+void
+lck_rw_clear_promotions_x86(thread_t thread)
  {
-       boolean_t       istate;
-
-       istate = lck_interlock_lock(lck);
-
-       if (lck->want_write || lck->want_upgrade) {
-               lck_interlock_unlock(lck, istate);
-               return(FALSE);
-       }
-
-       lck->read_count++;
-
-       lck_interlock_unlock(lck, istate);
-
-       return(TRUE);
+#if MACH_LDEBUG
+       /* It's fatal to leave a RW lock locked and return to userspace */
+       panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
+#else
+       /* Paper over the issue */
+       thread->rwlock_count = 0;
+       lck_rw_clear_promotion(thread);
+#endif
  }
  
+
+#ifdef MUTEX_ZONE
+extern zone_t lck_mtx_zone;
+#endif
  /*
   *      Routine:        lck_mtx_alloc_init
   */
@@ -1597,10 +1690,13 @@ lck_mtx_alloc_init(
         lck_attr_t      *attr)
  {
         lck_mtx_t       *lck;
-
+#ifdef MUTEX_ZONE
+       if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
+               lck_mtx_init(lck, grp, attr);
+#else
         if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
                 lck_mtx_init(lck, grp, attr);
-               
+#endif         
         return(lck);
  }
  
@@ -1613,7 +1709,11 @@ lck_mtx_free(
         lck_grp_t       *grp)
  {
         lck_mtx_destroy(lck, grp);
+#ifdef MUTEX_ZONE
+       zfree(lck_mtx_zone, lck);
+#else
         kfree(lck, sizeof(lck_mtx_t));
+#endif
  }
  
  /*
@@ -1625,20 +1725,20 @@ lck_mtx_ext_init(
         lck_grp_t       *grp,
         lck_attr_t      *attr)
  {
-       lck->lck_mtx.lck_mtx_ilk = 0;
-       lck->lck_mtx.lck_mtx_locked = 0;
-       lck->lck_mtx.lck_mtx_waiters = 0;
-       lck->lck_mtx.lck_mtx_pri = 0;
-       lck->lck_mtx_attr = 0;
+       bzero((void *)lck, sizeof(lck_mtx_ext_t));
  
         if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
-               lck->lck_mtx_deb.pc = 0;
-               lck->lck_mtx_deb.thread = 0;
                 lck->lck_mtx_deb.type = MUTEX_TAG;
                 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
         }
  
         lck->lck_mtx_grp = grp;
+
+       if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
+               lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
+
+       lck->lck_mtx.lck_mtx_is_ext = 1;
+       lck->lck_mtx.lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
  }
  
  /*
@@ -1651,19 +1751,55 @@ lck_mtx_init(
         lck_attr_t      *attr)
  {
         lck_mtx_ext_t   *lck_ext;
+       lck_attr_t      *lck_attr;
+
+       if (attr != LCK_ATTR_NULL)
+               lck_attr = attr;
+       else
+               lck_attr = &LockDefaultLckAttr;
  
-       if ((attr != LCK_ATTR_NULL) && ((attr->lck_attr_val) & LCK_ATTR_DEBUG)) {
+       if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
                 if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
-                       lck_mtx_ext_init(lck_ext, grp, attr);   
+                       lck_mtx_ext_init(lck_ext, grp, lck_attr);       
                         lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
                         lck->lck_mtx_ptr = lck_ext;
                 }
         } else {
-               lck->lck_mtx_ilk = 0;
-               lck->lck_mtx_locked = 0;
-               lck->lck_mtx_waiters = 0;
-               lck->lck_mtx_pri = 0;
+               lck->lck_mtx_owner = 0;
+               lck->lck_mtx_state = 0;
+       }
+       lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
+       lck_grp_reference(grp);
+       lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
+}
+
+/*
+ *      Routine:        lck_mtx_init_ext
+ *
+ *      Initializes lck using caller-supplied storage (lck_ext) for the
+ *      extended lock; nothing is allocated here.  A null attr
+ *      (LCK_ATTR_NULL) selects LockDefaultLckAttr.  When the effective
+ *      attributes have LCK_ATTR_DEBUG set, lck_ext is initialized via
+ *      lck_mtx_ext_init() and lck is tagged LCK_MTX_TAG_INDIRECT with
+ *      lck_mtx_ptr pointing at lck_ext; otherwise lck's owner and state
+ *      are cleared and lck_ext is not touched.  In both cases
+ *      lck_grp_reference() and lck_grp_lckcnt_incr() are called for grp.
+ */
+void
+lck_mtx_init_ext(
+       lck_mtx_t       *lck,
+       lck_mtx_ext_t   *lck_ext,
+       lck_grp_t       *grp,
+       lck_attr_t      *attr)
+{
+       lck_attr_t      *lck_attr;
+
+       if (attr != LCK_ATTR_NULL)
+               lck_attr = attr;
+       else
+               lck_attr = &LockDefaultLckAttr;
+
+       if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
+               lck_mtx_ext_init(lck_ext, grp, lck_attr);
+               lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
+               lck->lck_mtx_ptr = lck_ext;
+       } else {
+               lck->lck_mtx_owner = 0;
+               lck->lck_mtx_state = 0;
         }
+       lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
+
         lck_grp_reference(grp);
         lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
  }
@@ -1680,8 +1816,13 @@ lck_mtx_destroy(
         
         if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
                 return;
+#if MACH_LDEBUG
+       lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
+#endif
         lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
-       lck->lck_mtx_tag = LCK_MTX_TAG_DESTROYED;
+
+       lck_mtx_lock_mark_destroyed(lck);
+
         if (lck_is_indirect)
                 kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
         lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
@@ -1689,182 +1830,321 @@ lck_mtx_destroy(
         return;
  }
  
+
+#define        LCK_MTX_LCK_WAIT_CODE           0x20    /* traced by lck_mtx_lock_wait_x86 */
+#define        LCK_MTX_LCK_WAKEUP_CODE         0x21    /* traced by lck_mtx_unlock_wakeup_x86 */
+#define        LCK_MTX_LCK_SPIN_CODE           0x22    /* traced by lck_mtx_lock_spinwait_x86 */
+#define        LCK_MTX_LCK_ACQUIRE_CODE        0x23    /* traced by lck_mtx_lock_acquire_x86 */
+#define LCK_MTX_LCK_DEMOTE_CODE                0x24    /* traced when lck_mtx_unlock_wakeup_x86 handles a promoted mutex */
+
+
  /*
- *      Routine:        lck_mtx_assert
+ * Routine:    lck_mtx_unlock_wakeup_x86
+ *
+ * Invoked on unlock when there is
+ * contention (i.e. the assembly routine sees
+ * that mutex->lck_mtx_waiters != 0 or
+ * that mutex->lck_mtx_promoted != 0)...
+ *
+ * neither the mutex nor the interlock is held
+ *
+ * prior_lock_state is a snapshot of the lock's state word; the
+ * waiters/promoted/pri values acted on here come from that snapshot.
+ * If the snapshot shows waiters, one thread is woken on the mutex's
+ * wait event (the last unsigned-int-sized word of the lck_mtx_t),
+ * passing lck_mtx_pri along when more than one waiter is recorded.
+ * If the snapshot shows a promotion and the current thread has
+ * promotions outstanding, its promotion count is decremented; when
+ * that reaches zero with TH_SFLAG_PROMOTED set, the flag is cleared
+ * and the thread is either set to DEPRESSPRI (if depressed) or has
+ * its priority recomputed (if thread->priority < thread->sched_pri).
   */
  void
-lck_mtx_assert(
-       __unused lck_mtx_t      *lck,
-       __unused unsigned int   type)
+lck_mtx_unlock_wakeup_x86 (
+       lck_mtx_t       *mutex,
+       int             prior_lock_state)
  {
-}
+       lck_mtx_t       fake_lck;
  
-#if    MACH_KDB
+       /*
+        * prior_lock_state is a snapshot of the 2nd word of the
+        * lock in question... we'll fake up a lock with the bits
+        * copied into place and carefully not access anything
+        * beyond what's defined in the second word of a lck_mtx_t
+        */
+       fake_lck.lck_mtx_state = prior_lock_state;
  
-void   db_show_one_lock(lock_t  *);
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
+                    mutex, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
  
-void
-db_show_one_lock(
-       lock_t  *lock)
-{
-       db_printf("Read_count = 0x%x, %swant_upgrade, %swant_write, ",
-                 lock->read_count,
-                 lock->want_upgrade ? "" : "!",
-                 lock->want_write ? "" : "!");
-       db_printf("%swaiting, %scan_sleep\n", 
-                 lock->waiting ? "" : "!", lock->can_sleep ? "" : "!");
-       db_printf("Interlock:\n");
-       db_show_one_simple_lock((db_expr_t)simple_lock_addr(lock->interlock),
-                       TRUE, (db_expr_t)0, (char *)0);
-}
+       if (__probable(fake_lck.lck_mtx_waiters)) {
+               if (fake_lck.lck_mtx_waiters > 1)
+                       thread_wakeup_one_with_pri((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)), fake_lck.lck_mtx_pri);
+               else
+                       thread_wakeup_one((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)));
+       }
  
-#endif /* MACH_KDB */
+       if (__improbable(fake_lck.lck_mtx_promoted)) {
+               thread_t        thread = current_thread();
  
-/*
- * The C portion of the mutex package.  These routines are only invoked
- * if the optimized assembler routines can't do the work.
- */
  
-/*
- *     Routine:        lock_alloc
- *     Function:
- *             Allocate a mutex for external users who cannot
- *             hard-code the structure definition into their
- *             objects.
- *             For now just use kalloc, but a zone is probably
- *             warranted.
- */
-mutex_t *
-mutex_alloc(
-       unsigned short  tag)
-{
-       mutex_t         *m;
+               KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
+                            thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
  
-       if ((m = (mutex_t *)kalloc(sizeof(mutex_t))) != 0)
-         mutex_init(m, tag);
-       return(m);
+               if (thread->promotions > 0) {
+                       spl_t   s = splsched();
+
+                       thread_lock(thread);
+
+                       if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) {
+
+                               thread->sched_flags &= ~TH_SFLAG_PROMOTED;
+
+                               if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
+                                       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
+                                                             thread->sched_pri, DEPRESSPRI, 0, mutex, 0);
+
+                                       set_sched_pri(thread, DEPRESSPRI);
+                               }
+                               else {
+                                       if (thread->priority < thread->sched_pri) {
+                                               KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
+                                                                     thread->sched_pri, thread->priority, 0, mutex, 0);
+
+                                               SCHED(compute_priority)(thread, FALSE);
+                                       }
+                               }
+                       }
+                       thread_unlock(thread);
+                       splx(s);
+               }
+       }
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
+                    mutex, 0, mutex->lck_mtx_waiters, 0, 0);   /* traces the live waiter count, not the snapshot */
  }
  
+
  /*
- *     Routine:        mutex_free
- *     Function:
- *             Free a mutex allocated for external users.
- *             For now just use kfree, but a zone is probably
- *             warranted.
+ * Routine:    lck_mtx_lock_acquire_x86
+ *
+ * Invoked on acquiring the mutex when there is
+ * contention (i.e. the assembly routine sees that
+ * that mutex->lck_mtx_waiters != 0 or 
+ * thread->was_promoted_on_wakeup != 0)...
+ *
+ * mutex is owned...  interlock is held... preemption is disabled
   */
  void
-mutex_free(
-       mutex_t         *m)
+lck_mtx_lock_acquire_x86(
+       lck_mtx_t       *mutex)
  {
-       kfree(m, sizeof(mutex_t));
+       thread_t        thread;
+       integer_t       priority;
+       spl_t           s;
+
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
+                    mutex, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
+
+       if (mutex->lck_mtx_waiters)
+               priority = mutex->lck_mtx_pri;
+       else
+               priority = 0;
+
+       thread = (thread_t)mutex->lck_mtx_owner;        /* faster then current_thread() */
+
+       if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
+
+               KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
+                                     thread->sched_pri, priority, thread->was_promoted_on_wakeup, mutex, 0);
+
+               s = splsched();
+               thread_lock(thread);
+
+               if (thread->sched_pri < priority) {
+                       /* Do not promote into the realtime priority band */
+                       assert(priority <= MAXPRI_KERNEL);
+                       set_sched_pri(thread, priority);
+               }
+               if (mutex->lck_mtx_promoted == 0) {
+                       mutex->lck_mtx_promoted = 1;
+                       
+                       thread->promotions++;
+                       thread->sched_flags |= TH_SFLAG_PROMOTED;
+               }
+               thread->was_promoted_on_wakeup = 0;
+               
+               thread_unlock(thread);
+               splx(s);
+       }
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
+                    mutex, 0, mutex->lck_mtx_waiters, 0, 0);
  }
  
+
+
  /*
- *     Routine:        _mutex_assert
+ * Routine:    lck_mtx_lock_spinwait_x86
+ *
+ * Invoked trying to acquire a mutex when there is contention but
+ * the holder is running on another processor. We spin for up to a maximum
+ * time (MutexSpin past the time of entry) waiting for the lock to be released.
+ *
+ * Called with the interlock unlocked.
+ * returns 0 if mutex acquired
+ * returns 1 if we spun without acquiring it (the deadline passed, or
+ *           the holder was seen not running after at least one spin)
+ * returns 2 if we didn't spin due to the holder not running
   */
-void
-_mutex_assert (
-       mutex_t         *mutex,
-       unsigned int    what)
+int
+lck_mtx_lock_spinwait_x86(
+       lck_mtx_t       *mutex)
  {
-
-       thread_t        thread = current_thread();
         thread_t        holder;
+       uint64_t        deadline;
+       int             retval = 1;
+       int             loopcount = 0;
  
-        if (panicstr != NULL)
-               return;
  
-       holder = (thread_t) mutex->lck_mtx.lck_mtx_locked;
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
+                    mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0);
  
-       switch (what) {
-       case MA_OWNED:
-               if (thread != holder)
-                       panic("mutex %x not owned\n", mutex);
-               break;
+       deadline = mach_absolute_time() + MutexSpin;
  
-        case MA_NOTOWNED:
-               if (thread == holder)
-                       panic("mutex %x owned\n", mutex);
-               break;
+       /*
+        * Spin while:
+        *   - mutex is locked, and
+        *   - it's locked as a spin lock, and
+        *   - owner is running on another processor, and
+        *   - owner (processor) is not idling, and
+        *   - we haven't spun for long enough.
+        */
+       do {
+               if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
+                       retval = 0;
+                       break;
+               }
+               if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
+
+                       if ( !(holder->machine.specFlags & OnProc) ||
+                            (holder->state & TH_IDLE)) {
+                               if (loopcount == 0)     /* only report "didn't spin" when we bail on the very first pass */
+                                       retval = 2;
+                               break;
+                       }
+               }
+               cpu_pause();
+
+               loopcount++;
+
+       } while (mach_absolute_time() < deadline);
+
+
+#if    CONFIG_DTRACE
+       /*
+        * We've already kept a count via deadline of how long we spun.
+        * If dtrace is active, then we compute backwards to decide how
+        * long we spun.
+        *
+        * Note that we record a different probe id depending on whether
+        * this is a direct or indirect mutex.  This allows us to
+        * penalize only lock groups that have debug/stats enabled
+        * with dtrace processing if desired.
+        */
+       if (__probable(mutex->lck_mtx_is_ext == 0)) {
+               LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
+                   mach_absolute_time() - (deadline - MutexSpin));
+       } else {
+               LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
+                   mach_absolute_time() - (deadline - MutexSpin));
         }
+       /* The lockstat acquire event is recorded by the assembly code beneath us. */
+#endif
+
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
+                    mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, retval, 0);
  
+       return retval;
  }
  
-#if    MACH_KDB
-/*
- * Routines to print out simple_locks and mutexes in a nicely-formatted
- * fashion.
- */
  
-char *simple_lock_labels =     "ENTRY    ILK THREAD   DURATION CALLER";
-char *mutex_labels =           "ENTRY    LOCKED WAITERS   THREAD CALLER";
  
+/*
+ * Routine:    lck_mtx_lock_wait_x86
+ *
+ * Invoked in order to wait on contention.
+ *
+ * Called with the interlock locked and
+ * preemption disabled...
+ * returns it unlocked and with preemption enabled
+ *
+ * The waiter's priority (the larger of its sched_pri and priority,
+ * raised to at least BASEPRI_DEFAULT and capped at MAXPRI_KERNEL) is
+ * stored in lck_mtx_pri when it is the first waiter or exceeds the
+ * stored value, and lck_mtx_waiters is incremented.  If the mutex has
+ * an owner whose sched_pri is below lck_mtx_pri, the owner's sched_pri
+ * is set to this waiter's priority and the mutex is marked promoted
+ * (bumping the owner's promotion count the first time).  The thread
+ * then asserts an uninterruptible wait on the mutex's wait event (the
+ * last unsigned-int-sized word of the lck_mtx_t), drops the interlock
+ * and blocks.
+ */
  void
-db_show_one_simple_lock (
-       db_expr_t       addr,
-       boolean_t       have_addr,
-       db_expr_t       count,
-       char            * modif)
+lck_mtx_lock_wait_x86 (
+       lck_mtx_t       *mutex)
  {
-       simple_lock_t   saddr = (simple_lock_t)addr;
+       thread_t        self = current_thread();
+       thread_t        holder;
+       integer_t       priority;
+       spl_t           s;
+#if    CONFIG_DTRACE
+       uint64_t        sleep_start = 0;
  
-       if (saddr == (simple_lock_t)0 || !have_addr) {
-               db_error ("No simple_lock\n");
+       if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
+               sleep_start = mach_absolute_time();
         }
-#if    USLOCK_DEBUG
-       else if (saddr->lock_type != USLOCK_TAG)
-               db_error ("Not a simple_lock\n");
-#endif /* USLOCK_DEBUG */
+#endif
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
+                    mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
  
-       db_printf ("%s\n", simple_lock_labels);
-       db_print_simple_lock (saddr);
-}
+       priority = self->sched_pri;
  
-void
-db_print_simple_lock (
-       simple_lock_t   addr)
-{
+       if (priority < self->priority)
+               priority = self->priority;
+       if (priority < BASEPRI_DEFAULT)
+               priority = BASEPRI_DEFAULT;
  
-       db_printf ("%08x %3d", addr, *hw_lock_addr(addr->interlock));
-#if    USLOCK_DEBUG
-       db_printf (" %08x", addr->debug.lock_thread);
-       db_printf (" %08x ", addr->debug.duration[1]);
-       db_printsym ((int)addr->debug.lock_pc, DB_STGY_ANY);
-#endif /* USLOCK_DEBUG */
-       db_printf ("\n");
-}
+       /* Do not promote into the realtime priority band */
+       priority = MIN(priority, MAXPRI_KERNEL);
  
-void
-db_show_one_mutex (
-       db_expr_t       addr,
-       boolean_t       have_addr,
-       db_expr_t       count,
-       char            * modif)
-{
-       mutex_t         * maddr = (mutex_t *)addr;
+       if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri)
+               mutex->lck_mtx_pri = priority;
+       mutex->lck_mtx_waiters++;
  
-       if (maddr == (mutex_t *)0 || !have_addr)
-               db_error ("No mutex\n");
-#if    MACH_LDEBUG
-       else if (maddr->type != MUTEX_TAG)
-               db_error ("Not a mutex\n");
-#endif /* MACH_LDEBUG */
+       if ( (holder = (thread_t)mutex->lck_mtx_owner) &&
+            holder->sched_pri < mutex->lck_mtx_pri ) {
+               /* Assert that we're not altering the priority of a
+                * MAXPRI_KERNEL or RT prio band thread
+                */
+               assert(holder->sched_pri < MAXPRI_KERNEL);
+               s = splsched();
+               thread_lock(holder);
+
+               if (holder->sched_pri < mutex->lck_mtx_pri) {   /* re-checked now that the holder's thread lock is held */
+                       KERNEL_DEBUG_CONSTANT(
+                               MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
+                               holder->sched_pri, priority, thread_tid(holder), mutex, 0);
+
+                       set_sched_pri(holder, priority);
+                       
+                       if (mutex->lck_mtx_promoted == 0) {
+                               holder->promotions++;
+                               holder->sched_flags |= TH_SFLAG_PROMOTED;
+                               
+                               mutex->lck_mtx_promoted = 1;
+                       }
+               }
+               thread_unlock(holder);
+               splx(s);
+       }
+       assert_wait((event_t)(((unsigned int*)mutex)+((sizeof(lck_mtx_t)-1)/sizeof(unsigned int))), THREAD_UNINT);
  
-       db_printf ("%s\n", mutex_labels);
-       db_print_mutex (maddr);
-}
+       lck_mtx_ilk_unlock(mutex);
  
-void
-db_print_mutex (
-       mutex_t         * addr)
-{
-       db_printf ("%08x %6d %7d",
-                  addr, *addr, addr->lck_mtx.lck_mtx_waiters);
-#if    MACH_LDEBUG
-       db_printf (" %08x ", addr->thread);
-       db_printsym (addr->pc, DB_STGY_ANY);
-#endif /* MACH_LDEBUG */
-       db_printf ("\n");
-}
+       thread_block(THREAD_CONTINUE_NULL);
  
-#endif /* MACH_KDB */
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
+                    mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
+
+#if    CONFIG_DTRACE
+       /*
+        * Record the Dtrace lockstat probe for blocking, block time
+        * measured from when we were entered.
+        */
+       if (sleep_start) {
+               if (mutex->lck_mtx_is_ext == 0) {
+                       LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
+                           mach_absolute_time() - sleep_start);
+               } else {
+                       LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
+                           mach_absolute_time() - sleep_start);
+               }
+       }
+#endif
+}