/*
- * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
*
- * @APPLE_LICENSE_HEADER_START@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License"). You may not use this file except in compliance with the
- * License. Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
*
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
- * License for the specific language governing rights and limitations
- * under the License.
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
*
- * @APPLE_LICENSE_HEADER_END@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
/*
* @OSF_COPYRIGHT@
* Locking primitives implementation
*/
-#include <mach_kdb.h>
#include <mach_ldebug.h>
#include <kern/lock.h>
#include <kern/debug.h>
#include <string.h>
-#if MACH_KDB
-#include <ddb/db_command.h>
-#include <ddb/db_output.h>
-#include <ddb/db_sym.h>
-#include <ddb/db_print.h>
-#endif /* MACH_KDB */
-
-#ifdef __ppc__
-#include <ppc/Firmware.h>
-#endif
+#include <i386/machine_routines.h> /* machine_timeout_suspended() */
+#include <machine/machine_cpu.h>
+#include <i386/mp.h>
#include <sys/kdebug.h>
+#include <mach/branch_predicates.h>
+
+/*
+ * We need only enough declarations from the BSD-side to be able to
+ * test if our probe is active, and to call __dtrace_probe(). Setting
+ * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
+ */
+#if CONFIG_DTRACE
+#define NEED_DTRACE_DEFS
+#include <../bsd/sys/lockstat.h>
+#endif
#define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
#define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
#define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
#define LCK_RW_LCK_EX_TO_SH_CODE 0x105
+#define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106
+#define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107
+#define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108
+#define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109
+#define LCK_RW_LCK_SHARED_SPIN_CODE 0x110
+#define LCK_RW_LCK_SHARED_WAIT_CODE 0x111
+#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112
+#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113
+
#define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
unsigned int LcksOpts=0;
-unsigned int lock_wait_time[2] = { (unsigned int)-1, 100 } ;
/* Forwards */
-#if MACH_KDB
-void db_print_simple_lock(
- simple_lock_t addr);
-
-void db_print_mutex(
- mutex_t * addr);
-#endif /* MACH_KDB */
-
-
#if USLOCK_DEBUG
/*
* Perform simple lock checks.
int max_lock_loops = 100000000;
decl_simple_lock_data(extern , printf_lock)
decl_simple_lock_data(extern , panic_lock)
-#if MACH_KDB
-decl_simple_lock_data(extern , kdb_lock)
-#endif /* MACH_KDB */
#endif /* USLOCK_DEBUG */
#define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
#define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
#if ANY_LOCK_DEBUG
-#define OBTAIN_PC(pc,l) ((pc) = (void *) GET_RETURN_PC(&(l)))
+#define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
#define DECL_PC(pc) pc_t pc;
#else /* ANY_LOCK_DEBUG */
#define DECL_PC(pc)
/*
* Eliminate lint complaints about unused local pc variables.
*/
-#define OBTAIN_PC(pc,l) ++pc
+#define OBTAIN_PC(pc) ++pc
#else /* lint */
-#define OBTAIN_PC(pc,l)
+#define OBTAIN_PC(pc)
#endif /* lint */
#endif /* USLOCK_DEBUG */
#define USLDBG(stmt)
#endif /* USLOCK_DEBUG */
+
+extern int lck_rw_grab_want(lck_rw_t *lck);
+extern int lck_rw_grab_shared(lck_rw_t *lck);
+extern int lck_rw_held_read_or_upgrade(lck_rw_t *lck);
+
+
+/*
+ * Forward declarations
+ */
+
+void lck_rw_lock_shared_gen(
+ lck_rw_t *lck);
+
+void lck_rw_lock_exclusive_gen(
+ lck_rw_t *lck);
+
+boolean_t lck_rw_lock_shared_to_exclusive_success(
+ lck_rw_t *lck);
+
+boolean_t lck_rw_lock_shared_to_exclusive_failure(
+ lck_rw_t *lck,
+ int prior_lock_state);
+
+void lck_rw_lock_exclusive_to_shared_gen(
+ lck_rw_t *lck,
+ int prior_lock_state);
+
+lck_rw_type_t lck_rw_done_gen(
+ lck_rw_t *lck,
+ int prior_lock_state);
+
+void lck_rw_clear_promotions_x86(thread_t thread);
+
/*
* Routine: lck_spin_alloc_init
*/
lck_spin_t *lck,
lck_grp_t *grp)
{
- if (lck->lck_spin_data[0] == LCK_SPIN_TAG_DESTROYED)
+ if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
return;
- lck->lck_spin_data[0] = LCK_SPIN_TAG_DESTROYED;
+ lck->interlock = LCK_SPIN_TAG_DESTROYED;
lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
lck_grp_deallocate(grp);
return;
lck_spin_try_lock(
lck_spin_t *lck)
{
- usimple_lock_try((usimple_lock_t) lck);
+ return((boolean_t)usimple_lock_try((usimple_lock_t) lck));
}
/*
#endif
}
+volatile uint32_t spinlock_owner_cpu = ~0;
+volatile usimple_lock_t spinlock_timed_out;
+
+/*
+ * Find the CPU whose active thread is 'thread_addr' (the owner recorded
+ * in a spinlock that timed out) and publish its index in
+ * spinlock_owner_cpu.  If that CPU is not the one we are running on,
+ * send it an NMI and spin until it acknowledges or until twice
+ * LockTimeOut has elapsed.
+ *
+ * Returns spinlock_owner_cpu; when no CPU matches, the global is left
+ * untouched and its current value is returned.
+ */
+static uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
+	uint32_t cpu;
+
+	for (cpu = 0; cpu < real_ncpus; cpu++) {
+		uint64_t nmi_deadline;
+
+		/* Not the owner's CPU: keep scanning. */
+		if ((uintptr_t)cpu_data_ptr[cpu]->cpu_active_thread != thread_addr)
+			continue;
+
+		spinlock_owner_cpu = cpu;
+
+		/* Only a remote CPU needs to be interrupted. */
+		if ((uint32_t) cpu_number() != cpu) {
+			cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
+			cpu_NMI_interrupt(cpu);
+			nmi_deadline = mach_absolute_time() + (LockTimeOut * 2);
+			while (mach_absolute_time() < nmi_deadline && cpu_datap(cpu)->cpu_NMI_acknowledged == FALSE)
+				cpu_pause();
+		}
+
+		/* First match wins. */
+		break;
+	}
+
+	return spinlock_owner_cpu;
+}
/*
* Acquire a usimple_lock.
usimple_lock_t l)
{
#ifndef MACHINE_SIMPLE_LOCK
- pc_t pc = NULL;
+ DECL_PC(pc);
- OBTAIN_PC(pc, l);
+ OBTAIN_PC(pc);
USLDBG(usld_lock_pre(l, pc));
- if(!hw_lock_to(&l->interlock, LockTimeOut)) /* Try to get the lock with a timeout */
- panic("simple lock deadlock detection - l=%08X, cpu=%d, ret=%08X", l, cpu_number(), pc);
+ if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0)) {
+ boolean_t uslock_acquired = FALSE;
+ while (machine_timeout_suspended()) {
+ enable_preemption();
+ if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)))
+ break;
+ }
+ if (uslock_acquired == FALSE) {
+ uint32_t lock_cpu;
+ uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
+ spinlock_timed_out = l;
+ lock_cpu = spinlock_timeout_NMI(lowner);
+ panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data);
+ }
+ }
USLDBG(usld_lock_post(l, pc));
#else
simple_lock((simple_lock_t)l);
#ifndef MACHINE_SIMPLE_LOCK
DECL_PC(pc);
- OBTAIN_PC(pc, l);
+ OBTAIN_PC(pc);
USLDBG(usld_unlock(l, pc));
hw_lock_unlock(&l->interlock);
#else
usimple_lock_t l)
{
#ifndef MACHINE_SIMPLE_LOCK
- DECL_PC(pc);
unsigned int success;
+ DECL_PC(pc);
- OBTAIN_PC(pc, l);
+ OBTAIN_PC(pc);
USLDBG(usld_lock_try_pre(l, pc));
if ((success = hw_lock_try(&l->interlock))) {
USLDBG(usld_lock_try_post(l, pc));
if (l == USIMPLE_LOCK_NULL)
panic("%s: null lock pointer", caller);
if (l->lock_type != USLOCK_TAG)
- panic("%s: 0x%x is not a usimple lock", caller, (integer_t) l);
+ panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
if (!(l->debug.state & USLOCK_INIT))
- panic("%s: 0x%x is not an initialized lock",
- caller, (integer_t) l);
+ panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
return USLOCK_CHECKING(l);
}
if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
l->debug.lock_thread == (void *) current_thread()) {
- printf("%s: lock 0x%x already locked (at 0x%x) by",
- caller, (integer_t) l, l->debug.lock_pc);
- printf(" current thread 0x%x (new attempt at pc 0x%x)\n",
+ printf("%s: lock %p already locked (at %p) by",
+ caller, l, l->debug.lock_pc);
+ printf(" current thread %p (new attempt at pc %p)\n",
l->debug.lock_thread, pc);
- panic(caller);
+ panic("%s", caller);
}
mp_disable_preemption();
usl_trace(l, cpu_number(), pc, caller);
return;
if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
- panic("%s: lock 0x%x became uninitialized",
- caller, (integer_t) l);
+ panic("%s: lock %p became uninitialized",
+ caller, l);
if ((l->debug.state & USLOCK_TAKEN))
- panic("%s: lock 0x%x became TAKEN by someone else",
- caller, (integer_t) l);
+ panic("%s: lock 0x%p became TAKEN by someone else",
+ caller, l);
mycpu = cpu_number();
l->debug.lock_thread = (void *)current_thread();
mycpu = cpu_number();
if (!(l->debug.state & USLOCK_TAKEN))
- panic("%s: lock 0x%x hasn't been taken",
- caller, (integer_t) l);
+ panic("%s: lock 0x%p hasn't been taken",
+ caller, l);
if (l->debug.lock_thread != (void *) current_thread())
- panic("%s: unlocking lock 0x%x, owned by thread 0x%x",
- caller, (integer_t) l, l->debug.lock_thread);
+ panic("%s: unlocking lock 0x%p, owned by thread %p",
+ caller, l, l->debug.lock_thread);
if (l->debug.lock_cpu != mycpu) {
- printf("%s: unlocking lock 0x%x on cpu 0x%x",
- caller, (integer_t) l, mycpu);
+ printf("%s: unlocking lock 0x%p on cpu 0x%x",
+ caller, l, mycpu);
printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
- panic(caller);
+ panic("%s", caller);
}
usl_trace(l, mycpu, pc, caller);
return;
if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
- panic("%s: lock 0x%x became uninitialized",
- caller, (integer_t) l);
+ panic("%s: lock 0x%p became uninitialized",
+ caller, l);
if ((l->debug.state & USLOCK_TAKEN))
- panic("%s: lock 0x%x became TAKEN by someone else",
- caller, (integer_t) l);
+ panic("%s: lock 0x%p became TAKEN by someone else",
+ caller, l);
mycpu = cpu_number();
l->debug.lock_thread = (void *) current_thread();
if (traced_lock == l) {
XPR(XPR_SLOCK,
"seq %d, cpu %d, %s @ %x\n",
- (integer_t) lock_seq, (integer_t) mycpu,
- (integer_t) op_name, (integer_t) pc, 0);
+ (uintptr_t) lock_seq, (uintptr_t) mycpu,
+ (uintptr_t) op_name, (uintptr_t) pc, 0);
lock_seq++;
}
}
lock_t *l,
boolean_t can_sleep,
__unused unsigned short tag,
- unsigned short tag1)
+ __unused unsigned short tag1)
{
- (void) memset((void *) l, 0, sizeof(lock_t));
-
- simple_lock_init(&l->interlock, tag1);
- l->want_write = FALSE;
- l->want_upgrade = FALSE;
- l->read_count = 0;
- l->can_sleep = can_sleep;
+ hw_lock_byte_init(&l->lck_rw_interlock);
+ l->lck_rw_want_write = FALSE;
+ l->lck_rw_want_upgrade = FALSE;
+ l->lck_rw_shared_count = 0;
+ l->lck_rw_can_sleep = can_sleep;
+ l->lck_rw_tag = tag;
+ l->lck_rw_priv_excl = 1;
+ l->lck_r_waiting = l->lck_w_waiting = 0;
}
lock_write(
register lock_t * l)
{
- register int i;
- boolean_t lock_miss = FALSE;
-#if MACH_LDEBUG
- int decrementer;
-#endif /* MACH_LDEBUG */
-
- simple_lock(&l->interlock);
-
-#if MACH_LDEBUG
- decrementer = DECREMENTER_TIMEOUT;
-#endif /* MACH_LDEBUG */
-
- /*
- * Try to acquire the want_write bit.
- */
- while (l->want_write) {
- if (!lock_miss) {
- lock_miss = TRUE;
- }
-
- i = lock_wait_time[l->can_sleep ? 1 : 0];
- if (i != 0) {
- simple_unlock(&l->interlock);
-#if MACH_LDEBUG
- if (!--decrementer)
- Debugger("timeout - want_write");
-#endif /* MACH_LDEBUG */
- while (--i != 0 && l->want_write)
- continue;
- simple_lock(&l->interlock);
- }
-
- if (l->can_sleep && l->want_write) {
- l->waiting = TRUE;
- thread_sleep_simple_lock((event_t) l,
- simple_lock_addr(l->interlock),
- THREAD_UNINT);
- /* interlock relocked */
- }
- }
- l->want_write = TRUE;
-
- /* Wait for readers (and upgrades) to finish */
-
-#if MACH_LDEBUG
- decrementer = DECREMENTER_TIMEOUT;
-#endif /* MACH_LDEBUG */
- while ((l->read_count != 0) || l->want_upgrade) {
- if (!lock_miss) {
- lock_miss = TRUE;
- }
-
- i = lock_wait_time[l->can_sleep ? 1 : 0];
- if (i != 0) {
- simple_unlock(&l->interlock);
-#if MACH_LDEBUG
- if (!--decrementer)
- Debugger("timeout - wait for readers");
-#endif /* MACH_LDEBUG */
- while (--i != 0 && (l->read_count != 0 ||
- l->want_upgrade))
- continue;
- simple_lock(&l->interlock);
- }
-
- if (l->can_sleep && (l->read_count != 0 || l->want_upgrade)) {
- l->waiting = TRUE;
- thread_sleep_simple_lock((event_t) l,
- simple_lock_addr(l->interlock),
- THREAD_UNINT);
- /* interlock relocked */
- }
- }
-
- simple_unlock(&l->interlock);
+ lck_rw_lock_exclusive(l);
}
void
lock_done(
register lock_t * l)
{
- boolean_t do_wakeup = FALSE;
-
-
- simple_lock(&l->interlock);
-
- if (l->read_count != 0) {
- l->read_count--;
- }
- else
- if (l->want_upgrade) {
- l->want_upgrade = FALSE;
- }
- else {
- l->want_write = FALSE;
- }
-
- /*
- * There is no reason to wakeup a waiting thread
- * if the read-count is non-zero. Consider:
- * we must be dropping a read lock
- * threads are waiting only if one wants a write lock
- * if there are still readers, they can't proceed
- */
-
- if (l->waiting && (l->read_count == 0)) {
- l->waiting = FALSE;
- do_wakeup = TRUE;
- }
-
- simple_unlock(&l->interlock);
-
- if (do_wakeup)
- thread_wakeup((event_t) l);
+ (void) lck_rw_done(l); /* lock_done() is now a thin wrapper: release via lck_rw_done() and discard its return value */
}
void
lock_read(
register lock_t * l)
{
- register int i;
-#if MACH_LDEBUG
- int decrementer;
-#endif /* MACH_LDEBUG */
-
- simple_lock(&l->interlock);
-
-#if MACH_LDEBUG
- decrementer = DECREMENTER_TIMEOUT;
-#endif /* MACH_LDEBUG */
- while (l->want_write || l->want_upgrade) {
- i = lock_wait_time[l->can_sleep ? 1 : 0];
-
- if (i != 0) {
- simple_unlock(&l->interlock);
-#if MACH_LDEBUG
- if (!--decrementer)
- Debugger("timeout - wait no writers");
-#endif /* MACH_LDEBUG */
- while (--i != 0 && (l->want_write || l->want_upgrade))
- continue;
- simple_lock(&l->interlock);
- }
-
- if (l->can_sleep && (l->want_write || l->want_upgrade)) {
- l->waiting = TRUE;
- thread_sleep_simple_lock((event_t) l,
- simple_lock_addr(l->interlock),
- THREAD_UNINT);
- /* interlock relocked */
- }
- }
-
- l->read_count++;
-
- simple_unlock(&l->interlock);
+ lck_rw_lock_shared(l); /* lock_read() is now a thin wrapper: the hand-rolled spin/sleep loop is replaced by lck_rw_lock_shared() */
}
* already requested an upgrade to a write lock,
* no lock is held upon return.
*
- * Returns TRUE if the upgrade *failed*.
+ * Returns FALSE if the upgrade *failed*.
*/
boolean_t
lock_read_to_write(
register lock_t * l)
{
- register int i;
- boolean_t do_wakeup = FALSE;
-#if MACH_LDEBUG
- int decrementer;
-#endif /* MACH_LDEBUG */
-
- simple_lock(&l->interlock);
-
- l->read_count--;
-
- if (l->want_upgrade) {
- /*
- * Someone else has requested upgrade.
- * Since we've released a read lock, wake
- * him up.
- */
- if (l->waiting && (l->read_count == 0)) {
- l->waiting = FALSE;
- do_wakeup = TRUE;
- }
-
- simple_unlock(&l->interlock);
-
- if (do_wakeup)
- thread_wakeup((event_t) l);
- return (TRUE);
- }
-
- l->want_upgrade = TRUE;
-
-#if MACH_LDEBUG
- decrementer = DECREMENTER_TIMEOUT;
-#endif /* MACH_LDEBUG */
- while (l->read_count != 0) {
- i = lock_wait_time[l->can_sleep ? 1 : 0];
-
- if (i != 0) {
- simple_unlock(&l->interlock);
-#if MACH_LDEBUG
- if (!--decrementer)
- Debugger("timeout - read_count");
-#endif /* MACH_LDEBUG */
- while (--i != 0 && l->read_count != 0)
- continue;
- simple_lock(&l->interlock);
- }
-
- if (l->can_sleep && l->read_count != 0) {
- l->waiting = TRUE;
- thread_sleep_simple_lock((event_t) l,
- simple_lock_addr(l->interlock),
- THREAD_UNINT);
- /* interlock relocked */
- }
- }
-
- simple_unlock(&l->interlock);
-
- return (FALSE);
+ return lck_rw_lock_shared_to_exclusive(l); /* result is passed through unchanged. NOTE(review): the removed code returned TRUE when the upgrade failed and FALSE on success; the routine comment now documents FALSE on failure, so the polarity seen by callers is inverted -- confirm every caller was updated */
}
void
lock_write_to_read(
register lock_t * l)
{
- boolean_t do_wakeup = FALSE;
-
- simple_lock(&l->interlock);
-
- l->read_count++;
- if (l->want_upgrade)
- l->want_upgrade = FALSE;
- else
- l->want_write = FALSE;
-
- if (l->waiting) {
- l->waiting = FALSE;
- do_wakeup = TRUE;
- }
-
- simple_unlock(&l->interlock);
-
- if (do_wakeup)
- thread_wakeup((event_t) l);
+ lck_rw_lock_exclusive_to_shared(l); /* lock_write_to_read() is now a thin wrapper: the downgrade is delegated to lck_rw_lock_exclusive_to_shared() */
}
-#if 0 /* Unused */
-/*
- * Routine: lock_try_write
- * Function:
- * Tries to get a write lock.
- *
- * Returns FALSE if the lock is not held on return.
- */
-
-boolean_t
-lock_try_write(
- register lock_t * l)
-{
- pc_t pc;
-
- simple_lock(&l->interlock);
-
- if (l->want_write || l->want_upgrade || l->read_count) {
- /*
- * Can't get lock.
- */
- simple_unlock(&l->interlock);
- return(FALSE);
- }
-
- /*
- * Have lock.
- */
-
- l->want_write = TRUE;
-
- simple_unlock(&l->interlock);
-
- return(TRUE);
-}
-
-/*
- * Routine: lock_try_read
- * Function:
- * Tries to get a read lock.
- *
- * Returns FALSE if the lock is not held on return.
- */
-
-boolean_t
-lock_try_read(
- register lock_t * l)
-{
- pc_t pc;
-
- simple_lock(&l->interlock);
-
- if (l->want_write || l->want_upgrade) {
- simple_unlock(&l->interlock);
- return(FALSE);
- }
-
- l->read_count++;
-
- simple_unlock(&l->interlock);
-
- return(TRUE);
-}
-#endif /* Unused */
-
/*
* Routine: lck_rw_alloc_init
lck_attr_t *attr) {
lck_rw_t *lck;
- if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0)
+ if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
+ bzero(lck, sizeof(lck_rw_t));
lck_rw_init(lck, grp, attr);
-
+ }
+
return(lck);
}
lck_rw_init(
lck_rw_t *lck,
lck_grp_t *grp,
- __unused lck_attr_t *attr) {
-
- hw_lock_init(&lck->interlock);
- lck->want_write = FALSE;
- lck->want_upgrade = FALSE;
- lck->read_count = 0;
- lck->can_sleep = TRUE;
+ lck_attr_t *attr)
+{
+ lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
+ attr : &LockDefaultLckAttr;
+
+ hw_lock_byte_init(&lck->lck_rw_interlock);
+ lck->lck_rw_want_write = FALSE;
+ lck->lck_rw_want_upgrade = FALSE;
+ lck->lck_rw_shared_count = 0;
+ lck->lck_rw_can_sleep = TRUE;
+ lck->lck_r_waiting = lck->lck_w_waiting = 0;
lck->lck_rw_tag = 0;
+ lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
+ LCK_ATTR_RW_SHARED_PRIORITY) == 0);
lck_grp_reference(grp);
lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
void
lck_rw_destroy(
lck_rw_t *lck,
- lck_grp_t *grp) {
+ lck_grp_t *grp)
+{
if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
return;
+#if MACH_LDEBUG
+ lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
+#endif
lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
lck_grp_deallocate(grp);
#define DECREMENTER_TIMEOUT 1000000
+/*
+ * Wait/wakeup event addresses for an rw lock.  Readers and writers use
+ * two distinct addresses inside the lck_rw_t (the lck_rw_tag field for
+ * readers, the lck_rw_pad8 field for writers), so assert_wait() and
+ * thread_wakeup() can target each class of waiter separately.
+ */
+#define RW_LOCK_READER_EVENT(x) \
+ ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_tag))))
+
+#define RW_LOCK_WRITER_EVENT(x) \
+ ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8))))
/*
- * We need to disable interrupts while holding the mutex interlock
- * to prevent an IPI intervening.
+ * We disable interrupts while holding the RW interlock to prevent an
+ * interrupt from exacerbating hold time.
* Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
*/
static boolean_t
boolean_t istate;
istate = ml_set_interrupts_enabled(FALSE);
- hw_lock_lock(&lck->interlock);
+ hw_lock_byte_lock(&lck->lck_rw_interlock);
return istate;
}
static void
lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
{
- hw_lock_unlock(&lck->interlock);
+ hw_lock_byte_unlock(&lck->lck_rw_interlock); /* drop the byte interlock first; the next statement then restores the interrupt state passed in as istate */
ml_set_interrupts_enabled(istate);
}
+/*
+ * Executed on every iteration of an rw lock busy-wait loop.
+ * When the lock primitive was entered with interrupts disabled,
+ * pending TLB flush requests are serviced by polling the IPI handler
+ * here before pausing the CPU.
+ * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
+ */
+static inline void
+lck_rw_lock_pause(boolean_t interrupts_enabled)
+{
+	if (interrupts_enabled == FALSE)
+		handle_pending_TLB_flushes();
+
+	cpu_pause();
+}
+
+
+/*
+ * Pick the mach_absolute_time() deadline a caller should spin against
+ * while waiting for a lck_rw_t to change state.
+ *
+ *  - The lock cannot sleep: spinning is all the caller can do, so a
+ *    very distant deadline (now + 100000 * 1000000000) is returned.
+ *  - The lock can sleep and threads are already waiting on it: they
+ *    spun past their own deadlines without seeing the desired state,
+ *    so return "now" and do not bother spinning.
+ *  - The lock can sleep and rw_shared_count exceeds
+ *    machine_info.max_cpus: more threads share the lock than can run
+ *    concurrently, and every state we spin for needs the count to
+ *    reach 0, which has unpredictable latency; return "now" as well.
+ *  - Otherwise: spin for MutexSpin.
+ */
+static inline uint64_t
+lck_rw_deadline_for_spin(lck_rw_t *lck)
+{
+	if (!lck->lck_rw_can_sleep)
+		return (mach_absolute_time() + (100000LL * 1000000000LL));
+
+	if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus)
+		return (mach_absolute_time());
+
+	return (mach_absolute_time() + MutexSpin);
+}
+
+
/*
* Routine: lck_rw_lock_exclusive
+ *
+ * lck_rw_lock_exclusive_gen() is the spin/block path for taking the
+ * lock exclusively.  It works in two phases:
+ *   1. claim the want_write bit via lck_rw_grab_want();
+ *   2. wait until lck_rw_held_read_or_upgrade() reports that no
+ *      readers and no upgrader remain.
+ * In each phase the caller spins until lck_rw_deadline_for_spin()
+ * expires and then, if the lock can sleep, re-checks the state under
+ * the interlock and blocks on RW_LOCK_WRITER_EVENT.
*/
void
-lck_rw_lock_exclusive(
+lck_rw_lock_exclusive_gen(
lck_rw_t *lck)
{
- int i;
- boolean_t lock_miss = FALSE;
- wait_result_t res;
-#if MACH_LDEBUG
- int decrementer;
-#endif /* MACH_LDEBUG */
- boolean_t istate;
-
- istate = lck_interlock_lock(lck);
-
-#if MACH_LDEBUG
- decrementer = DECREMENTER_TIMEOUT;
-#endif /* MACH_LDEBUG */
+ uint64_t deadline = 0;
+ int slept = 0;
+ int gotlock = 0;
+ int lockheld = 0;
+ wait_result_t res = 0;
+ boolean_t istate = -1; /* -1 means the interrupt state has not been sampled yet; see the istate == -1 checks below */
+
+#if CONFIG_DTRACE
+ boolean_t dtrace_ls_initialized = FALSE;
+ boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE;
+ uint64_t wait_interval = 0;
+ int readers_at_sleep = 0;
+#endif
/*
- * Try to acquire the want_write bit.
+ * Try to acquire the lck_rw_want_write bit.
*/
- while (lck->want_write) {
- KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
-
- if (!lock_miss) {
- lock_miss = TRUE;
+ while ( !lck_rw_grab_want(lck)) {
+
+#if CONFIG_DTRACE
+ if (dtrace_ls_initialized == FALSE) {
+ dtrace_ls_initialized = TRUE;
+ dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
+ dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
+ dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
+ if (dtrace_ls_enabled) {
+ /*
+ * Either sleeping or spinning is happening,
+ * start a timing of our delay interval now.
+ */
+ readers_at_sleep = lck->lck_rw_shared_count;
+ wait_interval = mach_absolute_time();
+ }
}
+#endif
+ if (istate == -1)
+ istate = ml_get_interrupts_enabled();
+
+ deadline = lck_rw_deadline_for_spin(lck);
+
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
+
+ while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
+ lck_rw_lock_pause(istate);
+
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, gotlock, 0);
+
+ if (gotlock)
+ break;
+ /*
+ * if we get here, the deadline has expired w/o us
+ * being able to grab the lock exclusively
+ * check to see if we're allowed to do a thread_block
+ */
+ if (lck->lck_rw_can_sleep) {
- i = lock_wait_time[lck->can_sleep ? 1 : 0];
- if (i != 0) {
- lck_interlock_unlock(lck, istate);
-#if MACH_LDEBUG
- if (!--decrementer)
- Debugger("timeout - want_write");
-#endif /* MACH_LDEBUG */
- while (--i != 0 && lck->want_write)
- continue;
istate = lck_interlock_lock(lck);
- }
- if (lck->can_sleep && lck->want_write) {
- lck->waiting = TRUE;
- res = assert_wait((event_t) lck, THREAD_UNINT);
- if (res == THREAD_WAITING) {
+ if (lck->lck_rw_want_write) {
+
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
+
+ lck->lck_w_waiting = TRUE;
+
+ res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
lck_interlock_unlock(lck, istate);
- res = thread_block(THREAD_CONTINUE_NULL);
- istate = lck_interlock_lock(lck);
+
+ if (res == THREAD_WAITING) {
+ res = thread_block(THREAD_CONTINUE_NULL);
+ slept++;
+ }
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0);
+ } else {
+ lck->lck_rw_want_write = TRUE; /* want_write was clear under the interlock, so claim it here and leave the loop */
+ lck_interlock_unlock(lck, istate);
+ break;
}
}
- KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_END, (int)lck, res, 0, 0, 0);
}
- lck->want_write = TRUE;
-
- /* Wait for readers (and upgrades) to finish */
+ /*
+ * Wait for readers (and upgrades) to finish...
+ * the test for these conditions must be done simultaneously with
+ * a check of the interlock not being held since
+ * the rw_shared_count will drop to 0 first and then want_upgrade
+ * will be set to 1 in the shared_to_exclusive scenario... those
+ * adjustments are done behind the interlock and represent an
+ * atomic change in state and must be considered as such
+ * however, once we see the read count at 0, the want_upgrade not set
+ * and the interlock not held, we are safe to proceed
+ */
+ while (lck_rw_held_read_or_upgrade(lck)) {
-#if MACH_LDEBUG
- decrementer = DECREMENTER_TIMEOUT;
-#endif /* MACH_LDEBUG */
- while ((lck->read_count != 0) || lck->want_upgrade) {
- if (!lock_miss) {
- lock_miss = TRUE;
+#if CONFIG_DTRACE
+ /*
+ * Either sleeping or spinning is happening, start
+ * a timing of our delay interval now. If neither lockstat
+ * probe is enabled, dtrace_ls_enabled stays FALSE and we
+ * will not record a dtrace spin or sleep event later.
+ */
+ if (dtrace_ls_initialized == FALSE) {
+ dtrace_ls_initialized = TRUE;
+ dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
+ dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
+ dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
+ if (dtrace_ls_enabled) {
+ /*
+ * Either sleeping or spinning is happening,
+ * start a timing of our delay interval now.
+ */
+ readers_at_sleep = lck->lck_rw_shared_count;
+ wait_interval = mach_absolute_time();
+ }
}
+#endif
+ if (istate == -1)
+ istate = ml_get_interrupts_enabled();
+
+ deadline = lck_rw_deadline_for_spin(lck);
+
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
- i = lock_wait_time[lck->can_sleep ? 1 : 0];
+ while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
+ lck_rw_lock_pause(istate);
- KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_START,
- (int)lck, lck->read_count, lck->want_upgrade, i, 0);
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, lockheld, 0);
+
+ if ( !lockheld)
+ break;
+ /*
+ * if we get here, the deadline has expired w/o us
+ * being able to grab the lock exclusively
+ * check to see if we're allowed to do a thread_block
+ */
+ if (lck->lck_rw_can_sleep) {
- if (i != 0) {
- lck_interlock_unlock(lck, istate);
-#if MACH_LDEBUG
- if (!--decrementer)
- Debugger("timeout - wait for readers");
-#endif /* MACH_LDEBUG */
- while (--i != 0 && (lck->read_count != 0 ||
- lck->want_upgrade))
- continue;
istate = lck_interlock_lock(lck);
- }
- if (lck->can_sleep && (lck->read_count != 0 || lck->want_upgrade)) {
- lck->waiting = TRUE;
- res = assert_wait((event_t) lck, THREAD_UNINT);
- if (res == THREAD_WAITING) {
+ if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
+
+ lck->lck_w_waiting = TRUE;
+
+ res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
+ lck_interlock_unlock(lck, istate);
+
+ if (res == THREAD_WAITING) {
+ res = thread_block(THREAD_CONTINUE_NULL);
+ slept++;
+ }
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0);
+ } else {
lck_interlock_unlock(lck, istate);
- res = thread_block(THREAD_CONTINUE_NULL);
- istate = lck_interlock_lock(lck);
+ /*
+ * must own the lock now, since we checked for
+ * readers or upgrade owner behind the interlock
+ * no need for a call to 'lck_rw_held_read_or_upgrade'
+ */
+ break;
}
}
- KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_END,
- (int)lck, lck->read_count, lck->want_upgrade, res, 0);
}
- lck_interlock_unlock(lck, istate);
+#if CONFIG_DTRACE
+ /*
+ * Decide what latencies we suffered that are Dtrace events.
+ * If we have set wait_interval, then we either spun or slept.
+ * At least we get out from under the interlock before we record
+ * which is the best we can do here to minimize the impact
+ * of the tracing.
+ * If dtrace_ls_enabled is FALSE, then dtrace was not enabled when we
+ * started sleeping/spinning so we don't record this event.
+ */
+ if (dtrace_ls_enabled == TRUE) {
+ if (slept == 0) {
+ LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
+ mach_absolute_time() - wait_interval, 1);
+ } else {
+ /*
+ * For the blocking case, we also record whether the lock
+ * was held for read or write, and how many readers.
+ * NOTE(review): readers_at_sleep is sampled at first
+ * contention, in code that does not take the interlock,
+ * so confirm how accurate the recorded count really is.
+ */
+ LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
+ mach_absolute_time() - wait_interval, 1,
+ (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
+ }
+ }
+ LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
+#endif
}
/*
- * Routine: lck_rw_done
+ * Routine: lck_rw_done_gen
+ *
+ * called from the assembly language wrapper...
+ * prior_lock_state is the value in the 1st
+ * word of the lock at the time of a successful
+ * atomic compare and exchange with the new value...
+ * it represents the state of the lock before we
+ * decremented the rw_shared_count or cleared either
+ * rw_want_upgrade or rw_want_write and
+ * the lck_x_waiting bits... since the wrapper
+ * routine has already changed the state atomically,
+ * we just need to decide if we should
+ * wake up anyone and what value to return... we do
+ * this by examining the state of the lock before
+ * we changed it
+ *
+ * Also decrements the current thread's rwlock_count and, when
+ * that count reaches 0 while TH_SFLAG_RW_PROMOTED is set,
+ * calls lck_rw_clear_promotion().
+ *
+ * Returns LCK_RW_TYPE_SHARED if the prior state had a non-zero
+ * rw_shared_count, LCK_RW_TYPE_EXCLUSIVE otherwise.
*/
lck_rw_type_t
-lck_rw_done(
- lck_rw_t *lck)
+lck_rw_done_gen(
+ lck_rw_t *lck,
+ int prior_lock_state)
{
- boolean_t do_wakeup = FALSE;
- lck_rw_type_t lck_rw_type;
- boolean_t istate;
-
-
- istate = lck_interlock_lock(lck);
+ lck_rw_t *fake_lck;
+ lck_rw_type_t lock_type;
+ thread_t thread = current_thread();
+ uint32_t rwlock_count;
- if (lck->read_count != 0) {
- lck_rw_type = LCK_RW_TYPE_SHARED;
- lck->read_count--;
+ /* Check if dropping the lock means that we need to unpromote */
+ rwlock_count = thread->rwlock_count--; /* post-decrement: the local holds the count BEFORE this release */
+#if MACH_LDEBUG
+ if (rwlock_count == 0) {
+ panic("rw lock count underflow for thread %p", thread);
}
- else {
- lck_rw_type = LCK_RW_TYPE_EXCLUSIVE;
- if (lck->want_upgrade)
- lck->want_upgrade = FALSE;
- else
- lck->want_write = FALSE;
+#endif
+ if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
+ /* sched_flags checked without lock, but will be rechecked while clearing */
+ lck_rw_clear_promotion(thread);
}
/*
- * There is no reason to wakeup a waiting thread
- * if the read-count is non-zero. Consider:
- * we must be dropping a read lock
- * threads are waiting only if one wants a write lock
- * if there are still readers, they can't proceed
+ * prior_lock state is a snapshot of the 1st word of the
+ * lock in question... we'll fake up a pointer to it
+ * and carefully not access anything beyond whats defined
+ * in the first word of a lck_rw_t
*/
+ fake_lck = (lck_rw_t *)&prior_lock_state;
+
+ if (fake_lck->lck_rw_shared_count <= 1) { /* prior count of 0 or 1: no reader remains after this release, so waiters may proceed */
+ if (fake_lck->lck_w_waiting)
+ thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
- if (lck->waiting && (lck->read_count == 0)) {
- lck->waiting = FALSE;
- do_wakeup = TRUE;
+ if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) /* readers are not woken when priv_excl is set and a writer was waiting */
+ thread_wakeup(RW_LOCK_READER_EVENT(lck));
}
+ if (fake_lck->lck_rw_shared_count)
+ lock_type = LCK_RW_TYPE_SHARED;
+ else
+ lock_type = LCK_RW_TYPE_EXCLUSIVE;
- lck_interlock_unlock(lck, istate);
+#if CONFIG_DTRACE
+ LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
+#endif
- if (do_wakeup)
- thread_wakeup((event_t) lck);
- return(lck_rw_type);
+ return(lock_type);
}
-
-
/*
* Routine: lck_rw_unlock
*/
/*
- * Routine: lck_rw_lock_shared
+ * Routine: lck_rw_lock_shared_gen
+ * Function:
+ * assembly fast path code has determined that this lock
+ * is held exclusively... this is where we spin/block
+ * until we can acquire the lock in the shared mode
*/
void
-lck_rw_lock_shared(
+lck_rw_lock_shared_gen(
lck_rw_t *lck)
{
- int i;
- wait_result_t res;
-#if MACH_LDEBUG
- int decrementer;
-#endif /* MACH_LDEBUG */
- boolean_t istate;
+ uint64_t deadline = 0;
+ int gotlock = 0;
+ int slept = 0;
+ wait_result_t res = 0;
+ boolean_t istate = -1;
+
+#if CONFIG_DTRACE
+ uint64_t wait_interval = 0;
+ int readers_at_sleep = 0;
+ boolean_t dtrace_ls_initialized = FALSE;
+ boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
+#endif
- istate = lck_interlock_lock(lck);
-
-#if MACH_LDEBUG
- decrementer = DECREMENTER_TIMEOUT;
-#endif /* MACH_LDEBUG */
- while (lck->want_write || lck->want_upgrade) {
- i = lock_wait_time[lck->can_sleep ? 1 : 0];
-
- KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_START,
- (int)lck, lck->want_write, lck->want_upgrade, i, 0);
-
- if (i != 0) {
- lck_interlock_unlock(lck, istate);
-#if MACH_LDEBUG
- if (!--decrementer)
- Debugger("timeout - wait no writers");
-#endif /* MACH_LDEBUG */
- while (--i != 0 && (lck->want_write || lck->want_upgrade))
- continue;
- istate = lck_interlock_lock(lck);
+ while ( !lck_rw_grab_shared(lck)) {
+
+#if CONFIG_DTRACE
+ if (dtrace_ls_initialized == FALSE) {
+ dtrace_ls_initialized = TRUE;
+ dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
+ dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
+ dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
+ if (dtrace_ls_enabled) {
+ /*
+ * Either sleeping or spinning is happening,
+ * start a timing of our delay interval now.
+ */
+ readers_at_sleep = lck->lck_rw_shared_count;
+ wait_interval = mach_absolute_time();
+ }
}
+#endif
+ if (istate == -1)
+ istate = ml_get_interrupts_enabled();
+
+ deadline = lck_rw_deadline_for_spin(lck);
+
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
+ (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
+
+ while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
+ lck_rw_lock_pause(istate);
+
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
+ (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
+
+ if (gotlock)
+ break;
+ /*
+ * if we get here, the deadline has expired w/o us
+ * being able to grab the lock for read
+ * check to see if we're allowed to do a thread_block
+ */
+ if (lck->lck_rw_can_sleep) {
+
+ istate = lck_interlock_lock(lck);
- if (lck->can_sleep && (lck->want_write || lck->want_upgrade)) {
- lck->waiting = TRUE;
- res = assert_wait((event_t) lck, THREAD_UNINT);
- if (res == THREAD_WAITING) {
+ if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
+ ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
+
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
+ (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
+
+ lck->lck_r_waiting = TRUE;
+
+ res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
lck_interlock_unlock(lck, istate);
- res = thread_block(THREAD_CONTINUE_NULL);
- istate = lck_interlock_lock(lck);
+
+ if (res == THREAD_WAITING) {
+ res = thread_block(THREAD_CONTINUE_NULL);
+ slept++;
+ }
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
+ (int)lck, res, slept, 0, 0);
+ } else {
+ lck->lck_rw_shared_count++;
+ lck_interlock_unlock(lck, istate);
+ break;
}
}
- KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_END,
- (int)lck, lck->want_write, lck->want_upgrade, res, 0);
}
- lck->read_count++;
-
- lck_interlock_unlock(lck, istate);
+#if CONFIG_DTRACE
+ if (dtrace_ls_enabled == TRUE) {
+ if (slept == 0) {
+ LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
+ } else {
+ LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
+ mach_absolute_time() - wait_interval, 0,
+ (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
+ }
+ }
+ LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
+#endif
}
/*
- * Routine: lck_rw_lock_shared_to_exclusive
+ * Routine: lck_rw_lock_shared_to_exclusive_failure
* Function:
- * Improves a read-only lock to one with
- * write permission. If another reader has
- * already requested an upgrade to a write lock,
- * no lock is held upon return.
- *
- * Returns TRUE if the upgrade *failed*.
+ * assembly fast path code has already dropped our read
+ * count and determined that someone else owns 'lck_rw_want_upgrade'
+ * if 'lck_rw_shared_count' == 0, it has also already dropped 'lck_w_waiting'
+ * all we need to do here is determine if a wakeup is needed
*/
-
boolean_t
-lck_rw_lock_shared_to_exclusive(
- lck_rw_t *lck)
+lck_rw_lock_shared_to_exclusive_failure(
+ lck_rw_t *lck,
+ int prior_lock_state)
{
- int i;
- boolean_t do_wakeup = FALSE;
- wait_result_t res;
-#if MACH_LDEBUG
- int decrementer;
-#endif /* MACH_LDEBUG */
- boolean_t istate;
-
- istate = lck_interlock_lock(lck);
+ lck_rw_t *fake_lck;
+ thread_t thread = current_thread();
+ uint32_t rwlock_count;
- lck->read_count--;
+ /* Check if dropping the lock means that we need to unpromote */
+ rwlock_count = thread->rwlock_count--;
+#if MACH_LDEBUG
+ if (rwlock_count == 0) {
+ panic("rw lock count underflow for thread %p", thread);
+ }
+#endif
+ if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
+ /* sched_flags checked without lock, but will be rechecked while clearing */
+ lck_rw_clear_promotion(thread);
+ }
- if (lck->want_upgrade) {
- KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_START,
- (int)lck, lck->read_count, lck->want_upgrade, 0, 0);
+ /*
+ * prior_lock_state is a snapshot of the 1st word of the
+ * lock in question... we'll fake up a pointer to it
+ * and carefully not access anything beyond what is defined
+ * in the first word of a lck_rw_t
+ */
+ fake_lck = (lck_rw_t *)&prior_lock_state;
+ if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
/*
* Someone else has requested upgrade.
- * Since we've released a read lock, wake
- * him up.
+ * Since we've released the read lock, wake
+ * him up if he's blocked waiting
*/
- if (lck->waiting && (lck->read_count == 0)) {
- lck->waiting = FALSE;
- do_wakeup = TRUE;
+ thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
+ }
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
+ (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
+
+ return (FALSE);
+}
+
+
+/*
+ * Routine: lck_rw_lock_shared_to_exclusive_success
+ * Function:
+ * assembly fast path code has already dropped our read
+ * count and successfully acquired 'lck_rw_want_upgrade'
+ * we just need to wait for the rest of the readers to drain
+ * and then we can return as the exclusive holder of this lock
+ */
+boolean_t
+lck_rw_lock_shared_to_exclusive_success(
+ lck_rw_t *lck)
+{
+ uint64_t deadline = 0;
+ int slept = 0;
+ int still_shared = 0;
+ wait_result_t res;
+ boolean_t istate = -1;
+
+#if CONFIG_DTRACE
+ uint64_t wait_interval = 0;
+ int readers_at_sleep = 0;
+ boolean_t dtrace_ls_initialized = FALSE;
+ boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
+#endif
+
+ while (lck->lck_rw_shared_count != 0) {
+
+#if CONFIG_DTRACE
+ if (dtrace_ls_initialized == FALSE) {
+ dtrace_ls_initialized = TRUE;
+ dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
+ dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
+ dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
+ if (dtrace_ls_enabled) {
+ /*
+ * Either sleeping or spinning is happening,
+ * start a timing of our delay interval now.
+ */
+ readers_at_sleep = lck->lck_rw_shared_count;
+ wait_interval = mach_absolute_time();
+ }
}
+#endif
+ if (istate == -1)
+ istate = ml_get_interrupts_enabled();
- lck_interlock_unlock(lck, istate);
+ deadline = lck_rw_deadline_for_spin(lck);
- if (do_wakeup)
- thread_wakeup((event_t) lck);
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
+ (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
- KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_END,
- (int)lck, lck->read_count, lck->want_upgrade, 0, 0);
+ while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
+ lck_rw_lock_pause(istate);
- return (TRUE);
- }
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
+ (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
- lck->want_upgrade = TRUE;
-
-#if MACH_LDEBUG
- decrementer = DECREMENTER_TIMEOUT;
-#endif /* MACH_LDEBUG */
- while (lck->read_count != 0) {
- i = lock_wait_time[lck->can_sleep ? 1 : 0];
-
- KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_START,
- (int)lck, lck->read_count, i, 0, 0);
-
- if (i != 0) {
- lck_interlock_unlock(lck, istate);
-#if MACH_LDEBUG
- if (!--decrementer)
- Debugger("timeout - read_count");
-#endif /* MACH_LDEBUG */
- while (--i != 0 && lck->read_count != 0)
- continue;
+ if ( !still_shared)
+ break;
+ /*
+ * if we get here, the deadline has expired w/o
+ * the rw_shared_count having drained to 0
+ * check to see if we're allowed to do a thread_block
+ */
+ if (lck->lck_rw_can_sleep) {
+
istate = lck_interlock_lock(lck);
- }
+
+ if (lck->lck_rw_shared_count != 0) {
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
+ (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
+
+ lck->lck_w_waiting = TRUE;
+
+ res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
+ lck_interlock_unlock(lck, istate);
- if (lck->can_sleep && lck->read_count != 0) {
- lck->waiting = TRUE;
- res = assert_wait((event_t) lck, THREAD_UNINT);
- if (res == THREAD_WAITING) {
+ if (res == THREAD_WAITING) {
+ res = thread_block(THREAD_CONTINUE_NULL);
+ slept++;
+ }
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
+ (int)lck, res, slept, 0, 0);
+ } else {
lck_interlock_unlock(lck, istate);
- res = thread_block(THREAD_CONTINUE_NULL);
- istate = lck_interlock_lock(lck);
+ break;
}
}
- KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_END,
- (int)lck, lck->read_count, 0, 0, 0);
}
-
- lck_interlock_unlock(lck, istate);
-
- return (FALSE);
+#if CONFIG_DTRACE
+ /*
+ * Only report if lockstat probes were enabled when we began waiting;
+ * 'slept' tells us whether we only spun or actually blocked.
+ */
+ if (dtrace_ls_enabled == TRUE) {
+ if (slept == 0) {
+ LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
+ } else {
+ LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
+ mach_absolute_time() - wait_interval, 1,
+ (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
+ }
+ }
+ LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
+#endif
+ return (TRUE);
}
+
/*
* Routine: lck_rw_lock_exclusive_to_shared
+ * Function:
+ * assembly fast path has already dropped
+ * our exclusive state and bumped lck_rw_shared_count
+ * all we need to do here is determine if anyone
+ * needs to be awakened.
*/
void
-lck_rw_lock_exclusive_to_shared(
- lck_rw_t *lck)
+lck_rw_lock_exclusive_to_shared_gen(
+ lck_rw_t *lck,
+ int prior_lock_state)
{
- boolean_t do_wakeup = FALSE;
- boolean_t istate;
+ lck_rw_t *fake_lck;
- KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
- (int)lck, lck->want_write, lck->want_upgrade, 0, 0);
-
- istate = lck_interlock_lock(lck);
-
- lck->read_count++;
- if (lck->want_upgrade)
- lck->want_upgrade = FALSE;
- else
- lck->want_write = FALSE;
-
- if (lck->waiting) {
- lck->waiting = FALSE;
- do_wakeup = TRUE;
- }
+ /*
+ * prior_lock_state is a snapshot of the 1st word of the
+ * lock in question... we'll fake up a pointer to it
+ * and carefully not access anything beyond what is defined
+ * in the first word of a lck_rw_t
+ */
+ fake_lck = (lck_rw_t *)&prior_lock_state;
- lck_interlock_unlock(lck, istate);
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
+ (int)lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
- if (do_wakeup)
- thread_wakeup((event_t) lck);
+ /*
+ * don't wake up anyone waiting to take the lock exclusively
+ * since we hold a read count... when the read count drops to 0,
+ * the writers will be woken.
+ *
+ * wake up any waiting readers if we don't have any writers waiting,
+ * or the lock is NOT marked as rw_priv_excl (writers have privilege)
+ */
+ if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
+ thread_wakeup(RW_LOCK_READER_EVENT(lck));
KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
- (int)lck, lck->want_write, lck->want_upgrade, lck->read_count, 0);
+ (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
+#if CONFIG_DTRACE
+ LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
+#endif
}
return(FALSE);
}
-/*
- * Routine: lck_rw_try_lock_exclusive
- * Function:
- * Tries to get a write lock.
- *
- * Returns FALSE if the lock is not held on return.
- */
-boolean_t
-lck_rw_try_lock_exclusive(
- lck_rw_t *lck)
+void
+lck_rw_assert(
+ lck_rw_t *lck,
+ unsigned int type)
{
- boolean_t istate;
-
- istate = lck_interlock_lock(lck);
-
- if (lck->want_write || lck->want_upgrade || lck->read_count) {
- /*
- * Can't get lock.
- */
- lck_interlock_unlock(lck, istate);
- return(FALSE);
+ switch (type) {
+ case LCK_RW_ASSERT_SHARED:
+ if (lck->lck_rw_shared_count != 0) {
+ return;
+ }
+ break;
+ case LCK_RW_ASSERT_EXCLUSIVE:
+ if ((lck->lck_rw_want_write ||
+ lck->lck_rw_want_upgrade) &&
+ lck->lck_rw_shared_count == 0) {
+ return;
+ }
+ break;
+ case LCK_RW_ASSERT_HELD:
+ if (lck->lck_rw_want_write ||
+ lck->lck_rw_want_upgrade ||
+ lck->lck_rw_shared_count != 0) {
+ return;
+ }
+ break;
+ case LCK_RW_ASSERT_NOTHELD:
+ if (!(lck->lck_rw_want_write ||
+ lck->lck_rw_want_upgrade ||
+ lck->lck_rw_shared_count != 0)) {
+ return;
+ }
+ break;
+ default:
+ break;
}
- /*
- * Have lock.
- */
-
- lck->want_write = TRUE;
-
- lck_interlock_unlock(lck, istate);
-
- return(TRUE);
+ panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
}
-/*
- * Routine: lck_rw_try_lock_shared
- * Function:
- * Tries to get a read lock.
- *
- * Returns FALSE if the lock is not held on return.
- */
-
-boolean_t
-lck_rw_try_lock_shared(
- lck_rw_t *lck)
+/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
+void
+lck_rw_clear_promotions_x86(thread_t thread)
{
- boolean_t istate;
-
- istate = lck_interlock_lock(lck);
-
- if (lck->want_write || lck->want_upgrade) {
- lck_interlock_unlock(lck, istate);
- return(FALSE);
- }
-
- lck->read_count++;
-
- lck_interlock_unlock(lck, istate);
-
- return(TRUE);
+#if MACH_LDEBUG
+ /* It's fatal to leave a RW lock locked and return to userspace */
+ panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
+#else
+ /* Paper over the issue */
+ thread->rwlock_count = 0;
+ lck_rw_clear_promotion(thread);
+#endif
}
+
+#ifdef MUTEX_ZONE
+extern zone_t lck_mtx_zone;
+#endif
/*
* Routine: lck_mtx_alloc_init
*/
lck_attr_t *attr)
{
lck_mtx_t *lck;
-
+#ifdef MUTEX_ZONE
+ if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
+ lck_mtx_init(lck, grp, attr);
+#else
if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
lck_mtx_init(lck, grp, attr);
-
+#endif
return(lck);
}
lck_grp_t *grp)
{
lck_mtx_destroy(lck, grp);
+#ifdef MUTEX_ZONE
+ zfree(lck_mtx_zone, lck);
+#else
kfree(lck, sizeof(lck_mtx_t));
+#endif
}
/*
lck_grp_t *grp,
lck_attr_t *attr)
{
- lck->lck_mtx.lck_mtx_ilk = 0;
- lck->lck_mtx.lck_mtx_locked = 0;
- lck->lck_mtx.lck_mtx_waiters = 0;
- lck->lck_mtx.lck_mtx_pri = 0;
- lck->lck_mtx_attr = 0;
+ bzero((void *)lck, sizeof(lck_mtx_ext_t));
if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
- lck->lck_mtx_deb.pc = 0;
- lck->lck_mtx_deb.thread = 0;
lck->lck_mtx_deb.type = MUTEX_TAG;
lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
}
lck->lck_mtx_grp = grp;
+
+ if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
+ lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
+
+ lck->lck_mtx.lck_mtx_is_ext = 1;
+ lck->lck_mtx.lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
}
/*
lck_attr_t *attr)
{
lck_mtx_ext_t *lck_ext;
+ lck_attr_t *lck_attr;
+
+ if (attr != LCK_ATTR_NULL)
+ lck_attr = attr;
+ else
+ lck_attr = &LockDefaultLckAttr;
- if ((attr != LCK_ATTR_NULL) && ((attr->lck_attr_val) & LCK_ATTR_DEBUG)) {
+ if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
- lck_mtx_ext_init(lck_ext, grp, attr);
+ lck_mtx_ext_init(lck_ext, grp, lck_attr);
lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
lck->lck_mtx_ptr = lck_ext;
}
} else {
- lck->lck_mtx_ilk = 0;
- lck->lck_mtx_locked = 0;
- lck->lck_mtx_waiters = 0;
- lck->lck_mtx_pri = 0;
+ lck->lck_mtx_owner = 0;
+ lck->lck_mtx_state = 0;
+ }
+ lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
+ lck_grp_reference(grp);
+ lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
+}
+
+/*
+ * Routine: lck_mtx_init_ext
+ */
+void
+lck_mtx_init_ext(
+ lck_mtx_t *lck,
+ lck_mtx_ext_t *lck_ext,
+ lck_grp_t *grp,
+ lck_attr_t *attr)
+{
+ lck_attr_t *lck_attr;
+
+ if (attr != LCK_ATTR_NULL)
+ lck_attr = attr;
+ else
+ lck_attr = &LockDefaultLckAttr;
+
+ if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
+ lck_mtx_ext_init(lck_ext, grp, lck_attr);
+ lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
+ lck->lck_mtx_ptr = lck_ext;
+ } else {
+ lck->lck_mtx_owner = 0;
+ lck->lck_mtx_state = 0;
}
+ lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
+
lck_grp_reference(grp);
lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
}
if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
return;
+#if MACH_LDEBUG
+ lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
+#endif
lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
- lck->lck_mtx_tag = LCK_MTX_TAG_DESTROYED;
+
+ lck_mtx_lock_mark_destroyed(lck);
+
if (lck_is_indirect)
kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
return;
}
+
+#define LCK_MTX_LCK_WAIT_CODE 0x20
+#define LCK_MTX_LCK_WAKEUP_CODE 0x21
+#define LCK_MTX_LCK_SPIN_CODE 0x22
+#define LCK_MTX_LCK_ACQUIRE_CODE 0x23
+#define LCK_MTX_LCK_DEMOTE_CODE 0x24
+
+
/*
- * Routine: lck_mtx_assert
+ * Routine: lck_mtx_unlock_wakeup_x86
+ *
+ * Invoked on unlock when there is
+ * contention (i.e. the assembly routine sees
+ * that mutex->lck_mtx_waiters != 0 or
+ * that mutex->lck_mtx_promoted != 0)...
+ *
+ * neither the mutex nor the interlock is held
*/
void
-lck_mtx_assert(
- __unused lck_mtx_t *lck,
- __unused unsigned int type)
+lck_mtx_unlock_wakeup_x86 (
+ lck_mtx_t *mutex,
+ int prior_lock_state)
{
-}
+ lck_mtx_t fake_lck;
-#if MACH_KDB
+ /*
+ * prior_lock_state is a snapshot of the 2nd word of the
+ * lock in question... we'll fake up a lock with the bits
+ * copied into place and carefully not access anything
+ * beyond what is defined in the second word of a lck_mtx_t
+ */
+ fake_lck.lck_mtx_state = prior_lock_state;
-void db_show_one_lock(lock_t *);
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
+ mutex, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
-void
-db_show_one_lock(
- lock_t *lock)
-{
- db_printf("Read_count = 0x%x, %swant_upgrade, %swant_write, ",
- lock->read_count,
- lock->want_upgrade ? "" : "!",
- lock->want_write ? "" : "!");
- db_printf("%swaiting, %scan_sleep\n",
- lock->waiting ? "" : "!", lock->can_sleep ? "" : "!");
- db_printf("Interlock:\n");
- db_show_one_simple_lock((db_expr_t)simple_lock_addr(lock->interlock),
- TRUE, (db_expr_t)0, (char *)0);
-}
+ if (__probable(fake_lck.lck_mtx_waiters)) {
+ if (fake_lck.lck_mtx_waiters > 1)
+ thread_wakeup_one_with_pri((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)), fake_lck.lck_mtx_pri);
+ else
+ thread_wakeup_one((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)));
+ }
-#endif /* MACH_KDB */
+ if (__improbable(fake_lck.lck_mtx_promoted)) {
+ thread_t thread = current_thread();
-/*
- * The C portion of the mutex package. These routines are only invoked
- * if the optimized assembler routines can't do the work.
- */
-/*
- * Routine: lock_alloc
- * Function:
- * Allocate a mutex for external users who cannot
- * hard-code the structure definition into their
- * objects.
- * For now just use kalloc, but a zone is probably
- * warranted.
- */
-mutex_t *
-mutex_alloc(
- unsigned short tag)
-{
- mutex_t *m;
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
+ thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
- if ((m = (mutex_t *)kalloc(sizeof(mutex_t))) != 0)
- mutex_init(m, tag);
- return(m);
+ if (thread->promotions > 0) {
+ spl_t s = splsched();
+
+ thread_lock(thread);
+
+ if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) {
+
+ thread->sched_flags &= ~TH_SFLAG_PROMOTED;
+
+ if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
+ KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
+ thread->sched_pri, DEPRESSPRI, 0, mutex, 0);
+
+ set_sched_pri(thread, DEPRESSPRI);
+ }
+ else {
+ if (thread->priority < thread->sched_pri) {
+ KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
+ thread->sched_pri, thread->priority, 0, mutex, 0);
+
+ SCHED(compute_priority)(thread, FALSE);
+ }
+ }
+ }
+ thread_unlock(thread);
+ splx(s);
+ }
+ }
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
+ mutex, 0, mutex->lck_mtx_waiters, 0, 0);
}
+
/*
- * Routine: mutex_free
- * Function:
- * Free a mutex allocated for external users.
- * For now just use kfree, but a zone is probably
- * warranted.
+ * Routine: lck_mtx_lock_acquire_x86
+ *
+ * Invoked on acquiring the mutex when there is
+ * contention (i.e. the assembly routine sees
+ * that mutex->lck_mtx_waiters != 0 or
+ * thread->was_promoted_on_wakeup != 0)...
+ *
+ * mutex is owned... interlock is held... preemption is disabled
*/
void
-mutex_free(
- mutex_t *m)
+lck_mtx_lock_acquire_x86(
+ lck_mtx_t *mutex)
{
- kfree(m, sizeof(mutex_t));
+ thread_t thread;
+ integer_t priority;
+ spl_t s;
+
+ /*
+ * Resolve the owning thread before the first tracepoint: the trace
+ * arguments below dereference 'thread', so it must be assigned first
+ * rather than read while still uninitialized.
+ */
+ thread = (thread_t)mutex->lck_mtx_owner; /* faster than current_thread() */
+
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
+ mutex, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
+
+ /* With no waiters there is no priority to inherit from the mutex */
+ if (mutex->lck_mtx_waiters)
+ priority = mutex->lck_mtx_pri;
+ else
+ priority = 0;
+
+ if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
+
+ KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
+ thread->sched_pri, priority, thread->was_promoted_on_wakeup, mutex, 0);
+
+ s = splsched();
+ thread_lock(thread);
+
+ /* Re-test under the thread lock; the first test above was made without it */
+ if (thread->sched_pri < priority) {
+ /* Do not promote into the realtime priority band */
+ assert(priority <= MAXPRI_KERNEL);
+ set_sched_pri(thread, priority);
+ }
+ /* Count the promotion only once per mutex */
+ if (mutex->lck_mtx_promoted == 0) {
+ mutex->lck_mtx_promoted = 1;
+
+ thread->promotions++;
+ thread->sched_flags |= TH_SFLAG_PROMOTED;
+ }
+ thread->was_promoted_on_wakeup = 0;
+
+ thread_unlock(thread);
+ splx(s);
+ }
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
+ mutex, 0, mutex->lck_mtx_waiters, 0, 0);
}
+
+
/*
- * Routine: _mutex_assert
+ * Routine: lck_mtx_lock_spinwait_x86
+ *
+ * Invoked trying to acquire a mutex when there is contention but
+ * the holder is running on another processor. We spin for up to a maximum
+ * time waiting for the lock to be released.
+ *
+ * Called with the interlock unlocked.
+ * returns 0 if mutex acquired
+ * returns 1 if we spun
+ * returns 2 if we didn't spin due to the holder not running
*/
-void
-_mutex_assert (
- mutex_t *mutex,
- unsigned int what)
+int
+lck_mtx_lock_spinwait_x86(
+ lck_mtx_t *mutex)
{
-
- thread_t thread = current_thread();
thread_t holder;
+ uint64_t deadline;
+ int retval = 1;
+ int loopcount = 0;
- if (panicstr != NULL)
- return;
- holder = (thread_t) mutex->lck_mtx.lck_mtx_locked;
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
+ mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0);
- switch (what) {
- case MA_OWNED:
- if (thread != holder)
- panic("mutex %x not owned\n", mutex);
- break;
+ deadline = mach_absolute_time() + MutexSpin;
- case MA_NOTOWNED:
- if (thread == holder)
- panic("mutex %x owned\n", mutex);
- break;
+ /*
+ * Spin while:
+ * - mutex is locked, and
+ * - it is locked as a spin lock, and
+ * - owner is running on another processor, and
+ * - owner (processor) is not idling, and
+ * - we haven't spun for long enough.
+ */
+ do {
+ if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
+ retval = 0;
+ break;
+ }
+ if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
+
+ if ( !(holder->machine.specFlags & OnProc) ||
+ (holder->state & TH_IDLE)) {
+ if (loopcount == 0)
+ retval = 2;
+ break;
+ }
+ }
+ cpu_pause();
+
+ loopcount++;
+
+ } while (mach_absolute_time() < deadline);
+
+
+#if CONFIG_DTRACE
+ /*
+ * We've already kept a count via deadline of how long we spun.
+ * If dtrace is active, then we compute backwards to decide how
+ * long we spun.
+ *
+ * Note that we record a different probe id depending on whether
+ * this is a direct or indirect mutex. This allows us to
+ * penalize only lock groups that have debug/stats enabled
+ * with dtrace processing if desired.
+ */
+ if (__probable(mutex->lck_mtx_is_ext == 0)) {
+ LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
+ mach_absolute_time() - (deadline - MutexSpin));
+ } else {
+ LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
+ mach_absolute_time() - (deadline - MutexSpin));
}
+ /* The lockstat acquire event is recorded by the assembly code beneath us. */
+#endif
+
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
+ mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, retval, 0);
+ return retval;
}
-#if MACH_KDB
-/*
- * Routines to print out simple_locks and mutexes in a nicely-formatted
- * fashion.
- */
-char *simple_lock_labels = "ENTRY ILK THREAD DURATION CALLER";
-char *mutex_labels = "ENTRY LOCKED WAITERS THREAD CALLER";
+/*
+ * Routine: lck_mtx_lock_wait_x86
+ *
+ * Invoked in order to wait on contention.
+ *
+ * Called with the interlock locked and
+ * preemption disabled...
+ * returns it unlocked and with preemption enabled
+ */
void
-db_show_one_simple_lock (
- db_expr_t addr,
- boolean_t have_addr,
- db_expr_t count,
- char * modif)
+lck_mtx_lock_wait_x86 (
+ lck_mtx_t *mutex)
{
- simple_lock_t saddr = (simple_lock_t)addr;
+ thread_t self = current_thread();
+ thread_t holder;
+ integer_t priority;
+ spl_t s;
+#if CONFIG_DTRACE
+ uint64_t sleep_start = 0;
- if (saddr == (simple_lock_t)0 || !have_addr) {
- db_error ("No simple_lock\n");
+ if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
+ sleep_start = mach_absolute_time();
}
-#if USLOCK_DEBUG
- else if (saddr->lock_type != USLOCK_TAG)
- db_error ("Not a simple_lock\n");
-#endif /* USLOCK_DEBUG */
+#endif
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
+ mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
- db_printf ("%s\n", simple_lock_labels);
- db_print_simple_lock (saddr);
-}
+ priority = self->sched_pri;
-void
-db_print_simple_lock (
- simple_lock_t addr)
-{
+ if (priority < self->priority)
+ priority = self->priority;
+ if (priority < BASEPRI_DEFAULT)
+ priority = BASEPRI_DEFAULT;
- db_printf ("%08x %3d", addr, *hw_lock_addr(addr->interlock));
-#if USLOCK_DEBUG
- db_printf (" %08x", addr->debug.lock_thread);
- db_printf (" %08x ", addr->debug.duration[1]);
- db_printsym ((int)addr->debug.lock_pc, DB_STGY_ANY);
-#endif /* USLOCK_DEBUG */
- db_printf ("\n");
-}
+ /* Do not promote into the realtime priority band */
+ priority = MIN(priority, MAXPRI_KERNEL);
-void
-db_show_one_mutex (
- db_expr_t addr,
- boolean_t have_addr,
- db_expr_t count,
- char * modif)
-{
- mutex_t * maddr = (mutex_t *)addr;
+ if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri)
+ mutex->lck_mtx_pri = priority;
+ mutex->lck_mtx_waiters++;
- if (maddr == (mutex_t *)0 || !have_addr)
- db_error ("No mutex\n");
-#if MACH_LDEBUG
- else if (maddr->type != MUTEX_TAG)
- db_error ("Not a mutex\n");
-#endif /* MACH_LDEBUG */
+ if ( (holder = (thread_t)mutex->lck_mtx_owner) &&
+ holder->sched_pri < mutex->lck_mtx_pri ) {
+ /* Assert that we're not altering the priority of a
+ * MAXPRI_KERNEL or RT prio band thread
+ */
+ assert(holder->sched_pri < MAXPRI_KERNEL);
+ s = splsched();
+ thread_lock(holder);
+
+ if (holder->sched_pri < mutex->lck_mtx_pri) {
+ KERNEL_DEBUG_CONSTANT(
+ MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
+ holder->sched_pri, priority, thread_tid(holder), mutex, 0);
+
+ set_sched_pri(holder, priority);
+
+ if (mutex->lck_mtx_promoted == 0) {
+ holder->promotions++;
+ holder->sched_flags |= TH_SFLAG_PROMOTED;
+
+ mutex->lck_mtx_promoted = 1;
+ }
+ }
+ thread_unlock(holder);
+ splx(s);
+ }
+ assert_wait((event_t)(((unsigned int*)mutex)+((sizeof(lck_mtx_t)-1)/sizeof(unsigned int))), THREAD_UNINT);
- db_printf ("%s\n", mutex_labels);
- db_print_mutex (maddr);
-}
+ lck_mtx_ilk_unlock(mutex);
-void
-db_print_mutex (
- mutex_t * addr)
-{
- db_printf ("%08x %6d %7d",
- addr, *addr, addr->lck_mtx.lck_mtx_waiters);
-#if MACH_LDEBUG
- db_printf (" %08x ", addr->thread);
- db_printsym (addr->pc, DB_STGY_ANY);
-#endif /* MACH_LDEBUG */
- db_printf ("\n");
-}
+ thread_block(THREAD_CONTINUE_NULL);
-#endif /* MACH_KDB */
+ KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
+ mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
+
+#if CONFIG_DTRACE
+ /*
+ * Record the Dtrace lockstat probe for blocking, block time
+ * measured from when we were entered.
+ */
+ if (sleep_start) {
+ if (mutex->lck_mtx_is_ext == 0) {
+ LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
+ mach_absolute_time() - sleep_start);
+ } else {
+ LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
+ mach_absolute_time() - sleep_start);
+ }
+ }
+#endif
+}