diff --git a/osfmk/i386/locks_i386.c b/osfmk/i386/locks_i386.c
index 846038183..38d332b00 100644
--- a/osfmk/i386/locks_i386.c
+++ b/osfmk/i386/locks_i386.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -84,12 +84,20 @@
 #include
 #endif	/* MACH_KDB */
 
-#ifdef	__ppc__
-#include
-#endif
+#include
 
 #include
 
+/*
+ * We need only enough declarations from the BSD-side to be able to
+ * test if our probe is active, and to call __dtrace_probe().  Setting
+ * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
+ */
+#if	CONFIG_DTRACE
+#define NEED_DTRACE_DEFS
+#include <../bsd/sys/lockstat.h>
+#endif
+
 #define	LCK_RW_LCK_EXCLUSIVE_CODE	0x100
 #define	LCK_RW_LCK_EXCLUSIVE1_CODE	0x101
 #define	LCK_RW_LCK_SHARED_CODE		0x102
@@ -97,11 +105,12 @@
 #define	LCK_RW_LCK_SH_TO_EX1_CODE	0x104
 #define	LCK_RW_LCK_EX_TO_SH_CODE	0x105
 
+#define	LCK_MTX_LCK_SPIN		0x200
 
 #define	ANY_LOCK_DEBUG	(USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
 
 unsigned int LcksOpts=0;
-unsigned int lock_wait_time[2] = { (unsigned int)-1, 100 } ;
+unsigned int lock_wait_time[2] = { (unsigned int)-1, 0 } ;
 
 /* Forwards */
 
@@ -169,6 +178,16 @@ int	usld_lock_common_checks(usimple_lock_t, char *);
 #define	USLDBG(stmt)
 #endif	/* USLOCK_DEBUG */
 
+/*
+ * Forward definitions
+ */
+
+void lck_rw_lock_shared_gen(
+	lck_rw_t	*lck);
+
+lck_rw_type_t lck_rw_done_gen(
+	lck_rw_t	*lck);
+
 /*
  * Routine:	lck_spin_alloc_init
  */
@@ -255,7 +274,7 @@ boolean_t
 lck_spin_try_lock(
 	lck_spin_t	*lck)
 {
-	usimple_lock_try((usimple_lock_t) lck);
+	return((boolean_t)usimple_lock_try((usimple_lock_t) lck));
 }
 
 /*
@@ -289,13 +308,13 @@ usimple_lock(
 	usimple_lock_t	l)
 {
 #ifndef	MACHINE_SIMPLE_LOCK
-	pc_t		pc = NULL;
+	DECL_PC(pc);
 
 	OBTAIN_PC(pc, l);
 	USLDBG(usld_lock_pre(l, pc));
 
-	if(!hw_lock_to(&l->interlock, LockTimeOut))	/* Try to get the lock with a timeout */
-		panic("simple lock deadlock detection - l=%08X, cpu=%d, ret=%08X", l, cpu_number(), pc);
+	if(!hw_lock_to(&l->interlock, LockTimeOutTSC))	/* Try to get the lock with a timeout */
+		panic("simple lock deadlock detection: lock=%p, cpu=%d, owning thread=0x%x", l, cpu_number(), l->interlock.lock_data);
 
 	USLDBG(usld_lock_post(l, pc));
 #else
@@ -344,8 +363,8 @@ usimple_lock_try(
 	usimple_lock_t	l)
 {
 #ifndef	MACHINE_SIMPLE_LOCK
-	DECL_PC(pc);
 	unsigned int	success;
+	DECL_PC(pc);
 
 	OBTAIN_PC(pc, l);
 	USLDBG(usld_lock_try_pre(l, pc));
@@ -445,11 +464,11 @@ usld_lock_pre(
 	if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
 	    l->debug.lock_thread == (void *) current_thread()) {
-		printf("%s: lock 0x%x already locked (at 0x%x) by",
-		      caller, (integer_t) l, l->debug.lock_pc);
-		printf(" current thread 0x%x (new attempt at pc 0x%x)\n",
+		printf("%s: lock %p already locked (at %p) by",
+		      caller, l, l->debug.lock_pc);
+		printf(" current thread %p (new attempt at pc %p)\n",
 			l->debug.lock_thread, pc);
-		panic(caller);
+		panic("%s", caller);
 	}
 	mp_disable_preemption();
 	usl_trace(l, cpu_number(), pc, caller);
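A note on the panic(caller) to panic("%s", caller) changes in this hunk and the next: passing a caller-supplied string as the format argument is a classic format-string hole. The caller tag here is a fixed function name, but if it ever contained a '%' conversion, panic would walk nonexistent variadic arguments. A minimal stand-alone illustration of the hazard (the caller string is hypothetical; printf stands in for panic):

	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical caller tag that happens to contain conversions. */
		const char *caller = "usimple_lock: saw %x in %s";

		/* printf(caller);       unsafe: interprets %x and %s, reads garbage */
		printf("%s\n", caller);	/* safe: the string is data, not a format */
		return 0;
	}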
@@ -518,13 +537,13 @@ usld_unlock(
 		panic("%s: lock 0x%x hasn't been taken",
 		      caller, (integer_t) l);
 	if (l->debug.lock_thread != (void *) current_thread())
-		panic("%s: unlocking lock 0x%x, owned by thread 0x%x",
+		panic("%s: unlocking lock 0x%x, owned by thread %p",
 		      caller, (integer_t) l,
 		      l->debug.lock_thread);
 	if (l->debug.lock_cpu != mycpu) {
 		printf("%s: unlocking lock 0x%x on cpu 0x%x",
 		       caller, (integer_t) l, mycpu);
 		printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
-		panic(caller);
+		panic("%s", caller);
 	}
 	usl_trace(l, mycpu, pc, caller);
@@ -671,15 +690,15 @@ lock_init(
 	lock_t		*l,
 	boolean_t	can_sleep,
 	__unused unsigned short	tag,
-	unsigned short	tag1)
+	__unused unsigned short	tag1)
 {
-	(void) memset((void *) l, 0, sizeof(lock_t));
-
-	simple_lock_init(&l->interlock, tag1);
-	l->want_write = FALSE;
-	l->want_upgrade = FALSE;
-	l->read_count = 0;
-	l->can_sleep = can_sleep;
+	hw_lock_byte_init(&l->lck_rw_interlock);
+	l->lck_rw_want_write = FALSE;
+	l->lck_rw_want_upgrade = FALSE;
+	l->lck_rw_shared_count = 0;
+	l->lck_rw_can_sleep = can_sleep;
+	l->lck_rw_tag = tag;
+	l->lck_rw_priv_excl = 1;
 }
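lock_init above now lays the legacy lock_t out as an lck_rw_t (byte-sized interlock, shared count, writer-priority by default), and the following hunk collapses the old open-coded read/write lock routines into one-line wrappers over the lck_rw_* primitives. A sketch of how an existing client keeps working unchanged (map_lock and the update sequence are hypothetical):

	lock_t	map_lock;	/* hypothetical legacy client */

	void
	map_update(void)
	{
		lock_write(&map_lock);		/* now lck_rw_lock_exclusive() */
		/* ... mutate the shared structure ... */
		lock_write_to_read(&map_lock);	/* now lck_rw_lock_exclusive_to_shared() */
		/* ... read back the result ... */
		lock_done(&map_lock);		/* now lck_rw_done() */
	}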
@@ -695,162 +714,21 @@ void
 lock_write(
 	register lock_t	* l)
 {
-	register int	   i;
-	boolean_t	   lock_miss = FALSE;
-#if	MACH_LDEBUG
-	int		   decrementer;
-#endif	/* MACH_LDEBUG */
-
-	simple_lock(&l->interlock);
-
-#if	MACH_LDEBUG
-	decrementer = DECREMENTER_TIMEOUT;
-#endif	/* MACH_LDEBUG */
-
-	/*
-	 *	Try to acquire the want_write bit.
-	 */
-	while (l->want_write) {
-		if (!lock_miss) {
-			lock_miss = TRUE;
-		}
-
-		i = lock_wait_time[l->can_sleep ? 1 : 0];
-		if (i != 0) {
-			simple_unlock(&l->interlock);
-#if	MACH_LDEBUG
-			if (!--decrementer)
-				Debugger("timeout - want_write");
-#endif	/* MACH_LDEBUG */
-			while (--i != 0 && l->want_write)
-				continue;
-			simple_lock(&l->interlock);
-		}
-
-		if (l->can_sleep && l->want_write) {
-			l->waiting = TRUE;
-			thread_sleep_simple_lock((event_t) l,
-					simple_lock_addr(l->interlock),
-					THREAD_UNINT);
-			/* interlock relocked */
-		}
-	}
-	l->want_write = TRUE;
-
-	/* Wait for readers (and upgrades) to finish */
-
-#if	MACH_LDEBUG
-	decrementer = DECREMENTER_TIMEOUT;
-#endif	/* MACH_LDEBUG */
-	while ((l->read_count != 0) || l->want_upgrade) {
-		if (!lock_miss) {
-			lock_miss = TRUE;
-		}
-
-		i = lock_wait_time[l->can_sleep ? 1 : 0];
-		if (i != 0) {
-			simple_unlock(&l->interlock);
-#if	MACH_LDEBUG
-			if (!--decrementer)
-				Debugger("timeout - wait for readers");
-#endif	/* MACH_LDEBUG */
-			while (--i != 0 && (l->read_count != 0 ||
-					    l->want_upgrade))
-				continue;
-			simple_lock(&l->interlock);
-		}
-
-		if (l->can_sleep && (l->read_count != 0 || l->want_upgrade)) {
-			l->waiting = TRUE;
-			thread_sleep_simple_lock((event_t) l,
-				simple_lock_addr(l->interlock),
-				THREAD_UNINT);
-			/* interlock relocked */
-		}
-	}
-
-	simple_unlock(&l->interlock);
+	lck_rw_lock_exclusive(l);
 }
 
 void
 lock_done(
 	register lock_t	* l)
 {
-	boolean_t	  do_wakeup = FALSE;
-
-
-	simple_lock(&l->interlock);
-
-	if (l->read_count != 0) {
-		l->read_count--;
-	}
-	else
-	if (l->want_upgrade) {
-		l->want_upgrade = FALSE;
-	}
-	else {
-		l->want_write = FALSE;
-	}
-
-	/*
-	 *	There is no reason to wakeup a waiting thread
-	 *	if the read-count is non-zero.  Consider:
-	 *		we must be dropping a read lock
-	 *		threads are waiting only if one wants a write lock
-	 *		if there are still readers, they can't proceed
-	 */
-
-	if (l->waiting && (l->read_count == 0)) {
-		l->waiting = FALSE;
-		do_wakeup = TRUE;
-	}
-
-	simple_unlock(&l->interlock);
-
-	if (do_wakeup)
-		thread_wakeup((event_t) l);
+	(void) lck_rw_done(l);
 }
 
 void
 lock_read(
 	register lock_t	* l)
 {
-	register int	i;
-#if	MACH_LDEBUG
-	int		decrementer;
-#endif	/* MACH_LDEBUG */
-
-	simple_lock(&l->interlock);
-
-#if	MACH_LDEBUG
-	decrementer = DECREMENTER_TIMEOUT;
-#endif	/* MACH_LDEBUG */
-	while (l->want_write || l->want_upgrade) {
-		i = lock_wait_time[l->can_sleep ? 1 : 0];
-
-		if (i != 0) {
-			simple_unlock(&l->interlock);
-#if	MACH_LDEBUG
-			if (!--decrementer)
-				Debugger("timeout - wait no writers");
-#endif	/* MACH_LDEBUG */
-			while (--i != 0 && (l->want_write || l->want_upgrade))
-				continue;
-			simple_lock(&l->interlock);
-		}
-
-		if (l->can_sleep && (l->want_write || l->want_upgrade)) {
-			l->waiting = TRUE;
-			thread_sleep_simple_lock((event_t) l,
-				simple_lock_addr(l->interlock),
-				THREAD_UNINT);
-			/* interlock relocked */
-		}
-	}
-
-	l->read_count++;
-
-	simple_unlock(&l->interlock);
+	lck_rw_lock_shared(l);
 }
@@ -862,164 +740,23 @@ lock_read(
  *	already requested an upgrade to a write lock,
  *	no lock is held upon return.
  *
- *	Returns TRUE if the upgrade *failed*.
+ *	Returns FALSE if the upgrade *failed*.
  */
 
 boolean_t
 lock_read_to_write(
 	register lock_t	* l)
 {
-	register int	    i;
-	boolean_t	    do_wakeup = FALSE;
-#if	MACH_LDEBUG
-	int		   decrementer;
-#endif	/* MACH_LDEBUG */
-
-	simple_lock(&l->interlock);
-
-	l->read_count--;
-
-	if (l->want_upgrade) {
-		/*
-		 *	Someone else has requested upgrade.
-		 *	Since we've released a read lock, wake
-		 *	him up.
-		 */
-		if (l->waiting && (l->read_count == 0)) {
-			l->waiting = FALSE;
-			do_wakeup = TRUE;
-		}
-
-		simple_unlock(&l->interlock);
-
-		if (do_wakeup)
-			thread_wakeup((event_t) l);
-		return (TRUE);
-	}
-
-	l->want_upgrade = TRUE;
-
-#if	MACH_LDEBUG
-	decrementer = DECREMENTER_TIMEOUT;
-#endif	/* MACH_LDEBUG */
-	while (l->read_count != 0) {
-		i = lock_wait_time[l->can_sleep ? 1 : 0];
-
-		if (i != 0) {
-			simple_unlock(&l->interlock);
-#if	MACH_LDEBUG
-			if (!--decrementer)
-				Debugger("timeout - read_count");
-#endif	/* MACH_LDEBUG */
-			while (--i != 0 && l->read_count != 0)
-				continue;
-			simple_lock(&l->interlock);
-		}
-
-		if (l->can_sleep && l->read_count != 0) {
-			l->waiting = TRUE;
-			thread_sleep_simple_lock((event_t) l,
-				simple_lock_addr(l->interlock),
-				THREAD_UNINT);
-			/* interlock relocked */
-		}
-	}
-
-	simple_unlock(&l->interlock);
-
-	return (FALSE);
+	return lck_rw_lock_shared_to_exclusive(l);
 }
 
 void
 lock_write_to_read(
 	register lock_t	* l)
 {
-	boolean_t	   do_wakeup = FALSE;
-
-	simple_lock(&l->interlock);
-
-	l->read_count++;
-	if (l->want_upgrade)
-		l->want_upgrade = FALSE;
-	else
-	 	l->want_write = FALSE;
-
-	if (l->waiting) {
-		l->waiting = FALSE;
-		do_wakeup = TRUE;
-	}
-
-	simple_unlock(&l->interlock);
-
-	if (do_wakeup)
-		thread_wakeup((event_t) l);
-}
-
-
-#if	0	/* Unused */
-/*
- *	Routine:	lock_try_write
- *	Function:
- *		Tries to get a write lock.
- *
- *		Returns FALSE if the lock is not held on return.
- */
-
-boolean_t
-lock_try_write(
-	register lock_t	* l)
-{
-	pc_t		pc;
-
-	simple_lock(&l->interlock);
-
-	if (l->want_write || l->want_upgrade || l->read_count) {
-		/*
-		 *	Can't get lock.
-		 */
-		simple_unlock(&l->interlock);
-		return(FALSE);
-	}
-
-	/*
-	 *	Have lock.
-	 */
-
-	l->want_write = TRUE;
-
-	simple_unlock(&l->interlock);
-
-	return(TRUE);
+	lck_rw_lock_exclusive_to_shared(l);
 }
 
-/*
- *	Routine:	lock_try_read
- *	Function:
- *		Tries to get a read lock.
- *
- *		Returns FALSE if the lock is not held on return.
- */
-
-boolean_t
-lock_try_read(
-	register lock_t	* l)
-{
-	pc_t		pc;
-
-	simple_lock(&l->interlock);
-
-	if (l->want_write || l->want_upgrade) {
-		simple_unlock(&l->interlock);
-		return(FALSE);
-	}
-
-	l->read_count++;
-
-	simple_unlock(&l->interlock);
-
-	return(TRUE);
-}
-#endif	/* Unused */
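Note the semantic flip documented above: lock_read_to_write now inherits lck_rw_lock_shared_to_exclusive's convention, where TRUE means the upgrade succeeded and FALSE means it failed because another thread already requested an upgrade, in which case the read hold was dropped on the way out. A hedged sketch of the retry pattern a caller needs (cache_lock and the revalidation step are hypothetical):

	lock_read(&cache_lock);
	while (!lock_read_to_write(&cache_lock)) {
		/* Lost the upgrade race: we hold nothing at this point. */
		lock_read(&cache_lock);
		/* ... revalidate whatever observation justified the upgrade ... */
	}
	/* write lock held here */
	lock_done(&cache_lock);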
 
 
 /*
@@ -1055,14 +792,19 @@ void
 lck_rw_init(
 	lck_rw_t	*lck,
 	lck_grp_t	*grp,
-	__unused lck_attr_t	*attr) {
-
-	hw_lock_init(&lck->interlock);
-	lck->want_write = FALSE;
-	lck->want_upgrade = FALSE;
-	lck->read_count = 0;
-	lck->can_sleep = TRUE;
+	lck_attr_t	*attr)
+{
+	lck_attr_t	*lck_attr = (attr != LCK_ATTR_NULL) ?
+					attr : &LockDefaultLckAttr;
+
+	hw_lock_byte_init(&lck->lck_rw_interlock);
+	lck->lck_rw_want_write = FALSE;
+	lck->lck_rw_want_upgrade = FALSE;
+	lck->lck_rw_shared_count = 0;
+	lck->lck_rw_can_sleep = TRUE;
 	lck->lck_rw_tag = 0;
+	lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
+				LCK_ATTR_RW_SHARED_PRIORITY) == 0);
 
 	lck_grp_reference(grp);
 	lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
@@ -1091,6 +833,11 @@ lck_rw_destroy(
 
 #define DECREMENTER_TIMEOUT 1000000
 
+#define RW_LOCK_READER_EVENT(x)		\
+	((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_tag))))
+
+#define RW_LOCK_WRITER_EVENT(x)		\
+	((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8))))
 
 /*
  * We need to disable interrupts while holding the mutex interlock
@@ -1103,7 +850,7 @@ lck_interlock_lock(lck_rw_t *lck)
 	boolean_t	istate;
 
 	istate = ml_set_interrupts_enabled(FALSE);
-	hw_lock_lock(&lck->interlock);
+	hw_lock_byte_lock(&lck->lck_rw_interlock);
 
 	return istate;
 }
@@ -1111,10 +858,24 @@ lck_interlock_lock(lck_rw_t *lck)
 static void
 lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
 {
-	hw_lock_unlock(&lck->interlock);
+	hw_lock_byte_unlock(&lck->lck_rw_interlock);
 	ml_set_interrupts_enabled(istate);
 }
 
+/*
+ * This inline is used when busy-waiting for an rw lock.
+ * If interrupts were disabled when the lock primitive was called,
+ * we poll the IPI handler for pending tlb flushes.
+ * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
+ */
+static inline void
+lck_rw_lock_pause(boolean_t interrupts_enabled)
+{
+	if (!interrupts_enabled)
+		handle_pending_TLB_flushes();
+	cpu_pause();
+}
+
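The two RW_LOCK_*_EVENT macros above give readers and writers distinct wait channels inside the same lock, so the release path can wake exactly the class it chooses instead of thundering everyone on one event. An event is just a unique kernel address, so any two distinct member offsets work. A stand-alone sketch of the idea (the struct layout is hypothetical):

	#include <stddef.h>

	typedef void *event_t;

	/* Hypothetical lock layout: two members, two unique addresses. */
	struct rw_sketch {
		unsigned char	interlock;
		unsigned char	pad8;	/* writer wait channel */
		unsigned int	tag;	/* reader wait channel */
	};

	#define READER_EVENT(x) \
		((event_t)(((unsigned char *)(x)) + offsetof(struct rw_sketch, tag)))
	#define WRITER_EVENT(x) \
		((event_t)(((unsigned char *)(x)) + offsetof(struct rw_sketch, pad8)))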
 /*
  *      Routine:        lck_rw_lock_exclusive
  */
@@ -1123,68 +884,98 @@ lck_rw_lock_exclusive(
 	lck_rw_t	*lck)
 {
 	int	   i;
-	boolean_t	   lock_miss = FALSE;
 	wait_result_t	res;
 #if	MACH_LDEBUG
 	int		decrementer;
 #endif	/* MACH_LDEBUG */
 	boolean_t	istate;
+#if	CONFIG_DTRACE
+	uint64_t wait_interval = 0;
+	int slept = 0;
+	int readers_at_sleep;
+#endif
 
 	istate = lck_interlock_lock(lck);
+#if	CONFIG_DTRACE
+	readers_at_sleep = lck->lck_rw_shared_count;
+#endif
 
 #if	MACH_LDEBUG
 	decrementer = DECREMENTER_TIMEOUT;
 #endif	/* MACH_LDEBUG */
 
 	/*
-	 *	Try to acquire the want_write bit.
+	 *	Try to acquire the lck_rw_want_write bit.
 	 */
-	while (lck->want_write) {
-		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
+	while (lck->lck_rw_want_write) {
 
-		if (!lock_miss) {
-			lock_miss = TRUE;
-		}
+		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
+		/*
+		 * Either sleeping or spinning is happening, start
+		 * a timing of our delay interval now.
+		 */
+#if	CONFIG_DTRACE
+		if ((lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK]) && wait_interval == 0) {
+			wait_interval = mach_absolute_time();
+		} else {
+			wait_interval = (unsigned) -1;
+		}
+#endif
 
-		i = lock_wait_time[lck->can_sleep ? 1 : 0];
+		i = lock_wait_time[lck->lck_rw_can_sleep ? 1 : 0];
 		if (i != 0) {
 			lck_interlock_unlock(lck, istate);
 #if	MACH_LDEBUG
 			if (!--decrementer)
-				Debugger("timeout - want_write");
+				Debugger("timeout - lck_rw_want_write");
 #endif	/* MACH_LDEBUG */
-			while (--i != 0 && lck->want_write)
-				continue;
+			while (--i != 0 && lck->lck_rw_want_write)
+				lck_rw_lock_pause(istate);
 			istate = lck_interlock_lock(lck);
 		}
 
-		if (lck->can_sleep && lck->want_write) {
-			lck->waiting = TRUE;
-			res = assert_wait((event_t) lck, THREAD_UNINT);
+		if (lck->lck_rw_can_sleep && lck->lck_rw_want_write) {
+			lck->lck_w_waiting = TRUE;
+			res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
 			if (res == THREAD_WAITING) {
 				lck_interlock_unlock(lck, istate);
 				res = thread_block(THREAD_CONTINUE_NULL);
+#if	CONFIG_DTRACE
+				slept = 1;
+#endif
 				istate = lck_interlock_lock(lck);
 			}
 		}
 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_END, (int)lck, res, 0, 0, 0);
 	}
-	lck->want_write = TRUE;
+	lck->lck_rw_want_write = TRUE;
 
 	/* Wait for readers (and upgrades) to finish */
 
 #if	MACH_LDEBUG
 	decrementer = DECREMENTER_TIMEOUT;
 #endif	/* MACH_LDEBUG */
-	while ((lck->read_count != 0) || lck->want_upgrade) {
-		if (!lock_miss) {
-			lock_miss = TRUE;
-		}
+	while ((lck->lck_rw_shared_count != 0) || lck->lck_rw_want_upgrade) {
 
-		i = lock_wait_time[lck->can_sleep ? 1 : 0];
+		i = lock_wait_time[lck->lck_rw_can_sleep ? 1 : 0];
 
 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_START,
-			     (int)lck, lck->read_count, lck->want_upgrade, i, 0);
+			     (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, i, 0);
+
+#if	CONFIG_DTRACE
+		/*
+		 * Either sleeping or spinning is happening, start
+		 * a timing of our delay interval now.  If we set it
+		 * to -1 we don't have accurate data, so we cannot later
+		 * decide to record a DTrace spin or sleep event.
+		 */
+		if ((lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK]) && wait_interval == 0) {
+			wait_interval = mach_absolute_time();
+		} else {
+			wait_interval = (unsigned) -1;
+		}
+#endif
 
 		if (i != 0) {
 			lck_interlock_unlock(lck, istate);
@@ -1192,53 +983,84 @@ lck_rw_lock_exclusive(
 			if (!--decrementer)
 				Debugger("timeout - wait for readers");
 #endif	/* MACH_LDEBUG */
-			while (--i != 0 && (lck->read_count != 0 ||
-					    lck->want_upgrade))
-				continue;
+			while (--i != 0 && (lck->lck_rw_shared_count != 0 ||
+					    lck->lck_rw_want_upgrade))
+				lck_rw_lock_pause(istate);
 			istate = lck_interlock_lock(lck);
 		}
 
-		if (lck->can_sleep && (lck->read_count != 0 || lck->want_upgrade)) {
-			lck->waiting = TRUE;
-			res = assert_wait((event_t) lck, THREAD_UNINT);
+		if (lck->lck_rw_can_sleep && (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade)) {
+			lck->lck_w_waiting = TRUE;
+			res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
 			if (res == THREAD_WAITING) {
 				lck_interlock_unlock(lck, istate);
 				res = thread_block(THREAD_CONTINUE_NULL);
+#if	CONFIG_DTRACE
+				slept = 1;
+#endif
 				istate = lck_interlock_lock(lck);
 			}
 		}
 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_END,
-			     (int)lck, lck->read_count, lck->want_upgrade, res, 0);
+			     (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, res, 0);
 	}
 
 	lck_interlock_unlock(lck, istate);
+#if	CONFIG_DTRACE
+	/*
+	 * Decide what latencies we suffered that are DTrace events.
+	 * If we have set wait_interval, then we either spun or slept.
+	 * At least we get out from under the interlock before we record
+	 * the event, which is the best we can do here to minimize the
+	 * impact of the tracing.
+	 * If we have set wait_interval to -1, then DTrace was not enabled
+	 * when we started sleeping/spinning, so we don't record this event.
+	 */
+	if (wait_interval != 0 && wait_interval != (unsigned) -1) {
+		if (slept == 0) {
+			LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
+			    mach_absolute_time() - wait_interval, 1);
+		} else {
+			/*
+			 * For the blocking case, we also record if when we blocked
+			 * it was held for read or write, and how many readers.
+			 * Notice that above we recorded this before we dropped
+			 * the interlock so the count is accurate.
+			 */
+			LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
+			    mach_absolute_time() - wait_interval, 1,
+			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
+		}
+	}
+	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
+#endif
 }
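The wait_interval bookkeeping above is a pattern worth isolating: sample the clock only if a lockstat consumer is attached when the wait begins, park a sentinel when one is not, and emit a spin or block probe after the interlock is dropped. A compressed user-space sketch of the same control flow (all names are hypothetical stand-ins; printf replaces the LOCKSTAT_RECORD macros):

	#include <stdint.h>
	#include <stdio.h>

	static int      probe_armed = 1;	/* stands in for lockstat_probemap[] */
	static uint64_t fake_time = 100;
	static uint64_t now(void) { return fake_time += 7; }	/* fake clock */

	static void
	contended_path(int slept)
	{
		uint64_t wait_interval = 0;

		/* Arm the timer only if a consumer is attached when the wait
		 * begins; otherwise park the sentinel so a partially-timed
		 * wait is never reported. */
		if (probe_armed && wait_interval == 0)
			wait_interval = now();
		else
			wait_interval = (unsigned) -1;

		/* ... spin or block here; slept records which ... */

		if (wait_interval != 0 && wait_interval != (unsigned) -1)
			printf("%s for %llu units\n", slept ? "blocked" : "spun",
			    (unsigned long long)(now() - wait_interval));
	}

	int main(void) { contended_path(0); contended_path(1); return 0; }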
 
 /*
- *      Routine:        lck_rw_done
+ *      Routine:        lck_rw_done_gen
  */
 lck_rw_type_t
-lck_rw_done(
+lck_rw_done_gen(
 	lck_rw_t	*lck)
 {
-	boolean_t	do_wakeup = FALSE;
+	boolean_t	wakeup_readers = FALSE;
+	boolean_t	wakeup_writers = FALSE;
 	lck_rw_type_t	lck_rw_type;
 	boolean_t	istate;
-
 	istate = lck_interlock_lock(lck);
 
-	if (lck->read_count != 0) {
+	if (lck->lck_rw_shared_count != 0) {
 		lck_rw_type = LCK_RW_TYPE_SHARED;
-		lck->read_count--;
+		lck->lck_rw_shared_count--;
 	}
 	else {
 		lck_rw_type = LCK_RW_TYPE_EXCLUSIVE;
-		if (lck->want_upgrade)
-			lck->want_upgrade = FALSE;
+		if (lck->lck_rw_want_upgrade)
+			lck->lck_rw_want_upgrade = FALSE;
 		else
-			lck->want_write = FALSE;
+			lck->lck_rw_want_write = FALSE;
 	}
 
 	/*
@@ -1249,15 +1071,29 @@ lck_rw_done(
 	 *	There is no reason to wakeup a waiting thread
 	 *	if the read-count is non-zero.  Consider:
 	 *		we must be dropping a read lock
 	 *		threads are waiting only if one wants a write lock
 	 *		if there are still readers, they can't proceed
 	 */
 
-	if (lck->waiting && (lck->read_count == 0)) {
-		lck->waiting = FALSE;
-		do_wakeup = TRUE;
+	if (lck->lck_rw_shared_count == 0) {
+		if (lck->lck_w_waiting) {
+			lck->lck_w_waiting = FALSE;
+			wakeup_writers = TRUE;
+		}
+		if (!(lck->lck_rw_priv_excl && wakeup_writers == TRUE) &&
+		    lck->lck_r_waiting) {
+			lck->lck_r_waiting = FALSE;
+			wakeup_readers = TRUE;
+		}
 	}
 
 	lck_interlock_unlock(lck, istate);
 
-	if (do_wakeup)
-		thread_wakeup((event_t) lck);
+	if (wakeup_readers)
+		thread_wakeup(RW_LOCK_READER_EVENT(lck));
+	if (wakeup_writers)
+		thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
+
+#if	CONFIG_DTRACE
+	LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE ? 1 : 0));
+#endif
+
 	return(lck_rw_type);
 }
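lck_rw_done_gen's wakeup policy is compact: once the last reader is gone, a waiting writer is always woken, and waiting readers are woken too unless the lock is writer-priority (lck_rw_priv_excl) and a writer was just woken. A truth-table sketch of the reader-wakeup predicate (the helper is hypothetical, the logic mirrors the code above):

	#include <stdbool.h>

	/* Should pending readers be woken when the lock goes free?
	 * Mirrors: !(priv_excl && wakeup_writers) && lck_r_waiting */
	static bool
	wake_readers(bool priv_excl, bool woke_writer, bool readers_waiting)
	{
		return !(priv_excl && woke_writer) && readers_waiting;
	}
	/*
	 * priv_excl  woke_writer  readers_waiting -> wake_readers
	 *     1           1              1                 0   (writer-priority)
	 *     1           0              1                 1
	 *     0           1              1                 1   (shared-priority)
	 *     x           x              0                 0
	 */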
@@ -1331,10 +1167,10 @@ lck_rw_lock(
 
 /*
- *      Routine:        lck_rw_lock_shared
+ *      Routine:        lck_rw_lock_shared_gen
  */
 void
-lck_rw_lock_shared(
+lck_rw_lock_shared_gen(
 	lck_rw_t	*lck)
 {
 	int		i;
@@ -1343,17 +1179,34 @@ lck_rw_lock_shared(
 	int		decrementer;
 #endif	/* MACH_LDEBUG */
 	boolean_t	istate;
+#if	CONFIG_DTRACE
+	uint64_t wait_interval = 0;
+	int slept = 0;
+	int readers_at_sleep;
+#endif
 
 	istate = lck_interlock_lock(lck);
+#if	CONFIG_DTRACE
+	readers_at_sleep = lck->lck_rw_shared_count;
+#endif
 
 #if	MACH_LDEBUG
 	decrementer = DECREMENTER_TIMEOUT;
 #endif	/* MACH_LDEBUG */
-	while (lck->want_write || lck->want_upgrade) {
-		i = lock_wait_time[lck->can_sleep ? 1 : 0];
+	while ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
+	       ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
+
+		i = lock_wait_time[lck->lck_rw_can_sleep ? 1 : 0];
 
 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_START,
-			     (int)lck, lck->want_write, lck->want_upgrade, i, 0);
+			     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, i, 0);
+
+#if	CONFIG_DTRACE
+		if ((lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK]) && wait_interval == 0) {
+			wait_interval = mach_absolute_time();
+		} else {
+			wait_interval = (unsigned) -1;
+		}
+#endif
 
 		if (i != 0) {
 			lck_interlock_unlock(lck, istate);
@@ -1361,27 +1214,46 @@ lck_rw_lock_shared(
 			if (!--decrementer)
 				Debugger("timeout - wait no writers");
 #endif	/* MACH_LDEBUG */
-			while (--i != 0 && (lck->want_write || lck->want_upgrade))
-				continue;
+			while (--i != 0 &&
+			       (lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
+			       ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl))
+				lck_rw_lock_pause(istate);
 			istate = lck_interlock_lock(lck);
 		}
 
-		if (lck->can_sleep && (lck->want_write || lck->want_upgrade)) {
-			lck->waiting = TRUE;
-			res = assert_wait((event_t) lck, THREAD_UNINT);
+		if (lck->lck_rw_can_sleep &&
+		    (lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
+		    ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
+			lck->lck_r_waiting = TRUE;
+			res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
 			if (res == THREAD_WAITING) {
 				lck_interlock_unlock(lck, istate);
 				res = thread_block(THREAD_CONTINUE_NULL);
+#if	CONFIG_DTRACE
+				slept = 1;
+#endif
 				istate = lck_interlock_lock(lck);
 			}
 		}
 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_END,
-			     (int)lck, lck->want_write, lck->want_upgrade, res, 0);
+			     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, res, 0);
 	}
 
-	lck->read_count++;
+	lck->lck_rw_shared_count++;
 
 	lck_interlock_unlock(lck, istate);
+
+#if	CONFIG_DTRACE
+	if (wait_interval != 0 && wait_interval != (unsigned) -1) {
+		if (slept == 0) {
+			LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
+		} else {
+			LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
+			    mach_absolute_time() - wait_interval, 0,
+			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
+		}
+	}
+	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
+#endif
 }
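The widened while-condition above is the heart of the new shared-priority attribute: an arriving reader now blocks behind a pending writer only when no readers currently hold the lock or when the lock is writer-priority. A small predicate sketch of that admission test (the helper name is hypothetical):

	#include <stdbool.h>

	/* Must an arriving reader wait?  Mirrors the loop condition:
	 * a writer (or upgrader) is pending, AND either nobody holds the
	 * lock shared right now or the lock prefers writers (priv_excl). */
	static bool
	reader_must_wait(bool writer_pending, unsigned shared_count, bool priv_excl)
	{
		return writer_pending && (shared_count == 0 || priv_excl);
	}

Under shared priority (priv_excl false), readers keep piggybacking onto an already-shared lock even while a writer waits, which trades writer latency for reader throughput.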
@@ -1393,7 +1265,7 @@ lck_rw_lock_shared(
  *	already requested an upgrade to a write lock,
  *	no lock is held upon return.
  *
- *	Returns TRUE if the upgrade *failed*.
+ *	Returns FALSE if the upgrade *failed*.
  */
 
 boolean_t
@@ -1407,74 +1279,105 @@ lck_rw_lock_shared_to_exclusive(
 	int	    decrementer;
 #endif	/* MACH_LDEBUG */
 	boolean_t	istate;
+#if	CONFIG_DTRACE
+	uint64_t wait_interval = 0;
+	int slept = 0;
+	int readers_at_sleep = 0;
+#endif
 
 	istate = lck_interlock_lock(lck);
 
-	lck->read_count--;
+	lck->lck_rw_shared_count--;
 
-	if (lck->want_upgrade) {
+	if (lck->lck_rw_want_upgrade) {
 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_START,
-			     (int)lck, lck->read_count, lck->want_upgrade, 0, 0);
+			     (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
 
 		/*
 		 *	Someone else has requested upgrade.
 		 *	Since we've released a read lock, wake
 		 *	him up.
 		 */
-		if (lck->waiting && (lck->read_count == 0)) {
-			lck->waiting = FALSE;
+		if (lck->lck_w_waiting && (lck->lck_rw_shared_count == 0)) {
+			lck->lck_w_waiting = FALSE;
 			do_wakeup = TRUE;
 		}
 
 		lck_interlock_unlock(lck, istate);
 
-		if (do_wakeup)
-			thread_wakeup((event_t) lck);
+		if (do_wakeup)
+			thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
 
 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_END,
-			     (int)lck, lck->read_count, lck->want_upgrade, 0, 0);
+			     (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
 
-		return (TRUE);
+		return (FALSE);
 	}
 
-	lck->want_upgrade = TRUE;
+	lck->lck_rw_want_upgrade = TRUE;
 
 #if	MACH_LDEBUG
 	decrementer = DECREMENTER_TIMEOUT;
 #endif	/* MACH_LDEBUG */
-	while (lck->read_count != 0) {
-		i = lock_wait_time[lck->can_sleep ? 1 : 0];
+	while (lck->lck_rw_shared_count != 0) {
+#if	CONFIG_DTRACE
+		if (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] && wait_interval == 0) {
+			wait_interval = mach_absolute_time();
+			readers_at_sleep = lck->lck_rw_shared_count;
+		} else {
+			wait_interval = (unsigned) -1;
+		}
+#endif
+		i = lock_wait_time[lck->lck_rw_can_sleep ? 1 : 0];
 
 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_START,
-			     (int)lck, lck->read_count, i, 0, 0);
+			     (int)lck, lck->lck_rw_shared_count, i, 0, 0);
 
 		if (i != 0) {
 			lck_interlock_unlock(lck, istate);
 #if	MACH_LDEBUG
 			if (!--decrementer)
-				Debugger("timeout - read_count");
+				Debugger("timeout - lck_rw_shared_count");
 #endif	/* MACH_LDEBUG */
-			while (--i != 0 && lck->read_count != 0)
-				continue;
+			while (--i != 0 && lck->lck_rw_shared_count != 0)
+				lck_rw_lock_pause(istate);
 			istate = lck_interlock_lock(lck);
 		}
 
-		if (lck->can_sleep && lck->read_count != 0) {
-			lck->waiting = TRUE;
-			res = assert_wait((event_t) lck, THREAD_UNINT);
+		if (lck->lck_rw_can_sleep && lck->lck_rw_shared_count != 0) {
+			lck->lck_w_waiting = TRUE;
+			res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
 			if (res == THREAD_WAITING) {
 				lck_interlock_unlock(lck, istate);
 				res = thread_block(THREAD_CONTINUE_NULL);
+#if	CONFIG_DTRACE
+				slept = 1;
+#endif
 				istate = lck_interlock_lock(lck);
 			}
 		}
 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_END,
-			     (int)lck, lck->read_count, 0, 0, 0);
+			     (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
 	}
 
 	lck_interlock_unlock(lck, istate);
+
+#if	CONFIG_DTRACE
+	/*
+	 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
+	 */
+	if (wait_interval != 0 && wait_interval != (unsigned) -1 && readers_at_sleep) {
+		if (slept == 0) {
+			LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
+		} else {
+			LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
+			    mach_absolute_time() - wait_interval, 1,
+			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
+		}
	}
 
-	return (FALSE);
+	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
+#endif
+	return (TRUE);
 }
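One subtlety in the wait_interval bookkeeping, normalized in the hunks above: wait_interval is a uint64_t, so assigning plain -1 stores 2^64-1 while the later guard compares against (unsigned) -1, which is 2^32-1. The sentinel must be written with the same cast it is tested with, or the guard never matches and a bogus interval gets recorded. A two-line demonstration:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t a = -1;		/* sign-extends: 0xFFFFFFFFFFFFFFFF */
		uint64_t b = (unsigned) -1;	/* zero-extends: 0x00000000FFFFFFFF */

		printf("%d\n", a == b);		/* prints 0: the sentinel test would miss */
		return 0;
	}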
 
 /*
@@ -1484,33 +1387,44 @@ void
 lck_rw_lock_exclusive_to_shared(
 	lck_rw_t	*lck)
 {
-	boolean_t	do_wakeup = FALSE;
+	boolean_t	wakeup_readers = FALSE;
+	boolean_t	wakeup_writers = FALSE;
 	boolean_t	istate;
 
 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
-		     (int)lck, lck->want_write, lck->want_upgrade, 0, 0);
+		     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
 
 	istate = lck_interlock_lock(lck);
 
-	lck->read_count++;
-	if (lck->want_upgrade)
-		lck->want_upgrade = FALSE;
+	lck->lck_rw_shared_count++;
+	if (lck->lck_rw_want_upgrade)
+		lck->lck_rw_want_upgrade = FALSE;
 	else
-	 	lck->want_write = FALSE;
-
-	if (lck->waiting) {
-		lck->waiting = FALSE;
-		do_wakeup = TRUE;
+	 	lck->lck_rw_want_write = FALSE;
+
+	if (lck->lck_w_waiting) {
+		lck->lck_w_waiting = FALSE;
+		wakeup_writers = TRUE;
+	}
+	if (!(lck->lck_rw_priv_excl && wakeup_writers == TRUE) &&
+	    lck->lck_r_waiting) {
+		lck->lck_r_waiting = FALSE;
+		wakeup_readers = TRUE;
 	}
 
 	lck_interlock_unlock(lck, istate);
 
-	if (do_wakeup)
-		thread_wakeup((event_t) lck);
+	if (wakeup_readers)
+		thread_wakeup(RW_LOCK_READER_EVENT(lck));
+	if (wakeup_writers)
+		thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
 
 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
-		     (int)lck, lck->want_write, lck->want_upgrade, lck->read_count, 0);
+		     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
 
+#if	CONFIG_DTRACE
+	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
+#endif
 }
 
 
@@ -1547,7 +1461,7 @@ lck_rw_try_lock_exclusive(
 
 	istate = lck_interlock_lock(lck);
 
-	if (lck->want_write || lck->want_upgrade || lck->read_count) {
+	if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || lck->lck_rw_shared_count) {
 		/*
 		 *	Can't get lock.
 		 */
@@ -1559,10 +1473,13 @@ lck_rw_try_lock_exclusive(
 	 *	Have lock.
 	 */
 
-	lck->want_write = TRUE;
+	lck->lck_rw_want_write = TRUE;
 
 	lck_interlock_unlock(lck, istate);
 
+#if	CONFIG_DTRACE
+	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lck, 1);
+#endif
 	return(TRUE);
 }
 
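The try variants fail fast under the interlock instead of spinning or sleeping, which makes them the usual tool for lock-ordering escapes. A hedged usage sketch, with hypothetical locks and a hypothetical documented order of map_lock before obj->lock:

	/* Sketch: we hold obj->lock (a mutex) and want map_lock, but that
	 * is the wrong acquisition order.  Try it; on failure, back out
	 * and retake both in the legal order. */
	if (!lck_rw_try_lock_exclusive(&map_lock)) {
		lck_mtx_unlock(&obj->lock);
		lck_rw_lock_exclusive(&map_lock);	/* legal order, may block */
		lck_mtx_lock(&obj->lock);
		/* ... revalidate obj state: it may have changed meanwhile ... */
	}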
@@ -1581,19 +1498,54 @@ lck_rw_try_lock_shared(
 	boolean_t	istate;
 
 	istate = lck_interlock_lock(lck);
-
-	if (lck->want_write || lck->want_upgrade) {
+/* No reader priority check here... */
+	if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade) {
 		lck_interlock_unlock(lck, istate);
 		return(FALSE);
 	}
 
-	lck->read_count++;
+	lck->lck_rw_shared_count++;
 
 	lck_interlock_unlock(lck, istate);
 
+#if	CONFIG_DTRACE
+	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lck, 0);
+#endif
 	return(TRUE);
 }
 
+void
+lck_rw_assert(
+	lck_rw_t	*lck,
+	unsigned int	type)
+{
+	switch (type) {
+	case LCK_RW_ASSERT_SHARED:
+		if (lck->lck_rw_shared_count != 0) {
+			return;
+		}
+		break;
+	case LCK_RW_ASSERT_EXCLUSIVE:
+		if ((lck->lck_rw_want_write ||
+		     lck->lck_rw_want_upgrade) &&
+		    lck->lck_rw_shared_count == 0) {
+			return;
+		}
+		break;
+	case LCK_RW_ASSERT_HELD:
+		if (lck->lck_rw_want_write ||
+		    lck->lck_rw_want_upgrade ||
+		    lck->lck_rw_shared_count != 0) {
+			return;
+		}
+		break;
+	default:
+		break;
+	}
+
+	panic("rw lock (%p) not held (mode=%u)\n", lck, type);
+}
+
 /*
  * Routine:	lck_mtx_alloc_init
  */
@@ -1631,20 +1583,17 @@ lck_mtx_ext_init(
 	lck_grp_t	*grp,
 	lck_attr_t	*attr)
 {
-	lck->lck_mtx.lck_mtx_ilk = 0;
-	lck->lck_mtx.lck_mtx_locked = 0;
-	lck->lck_mtx.lck_mtx_waiters = 0;
-	lck->lck_mtx.lck_mtx_pri = 0;
-	lck->lck_mtx_attr = 0;
+	bzero((void *)lck, sizeof(lck_mtx_ext_t));
 
 	if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
-		lck->lck_mtx_deb.pc = 0;
-		lck->lck_mtx_deb.thread = 0;
 		lck->lck_mtx_deb.type = MUTEX_TAG;
 		lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
 	}
 
 	lck->lck_mtx_grp = grp;
+
+	if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
+		lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
 }
 
 /*
@@ -1657,10 +1606,16 @@ lck_mtx_init(
 	lck_attr_t	*attr)
 {
 	lck_mtx_ext_t	*lck_ext;
+	lck_attr_t	*lck_attr;
+
+	if (attr != LCK_ATTR_NULL)
+		lck_attr = attr;
+	else
+		lck_attr = &LockDefaultLckAttr;
 
-	if ((attr != LCK_ATTR_NULL) && ((attr->lck_attr_val) & LCK_ATTR_DEBUG)) {
+	if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
 		if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
-			lck_mtx_ext_init(lck_ext, grp, attr);
+			lck_mtx_ext_init(lck_ext, grp, lck_attr);
 			lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
 			lck->lck_mtx_ptr = lck_ext;
 		}
@@ -1674,6 +1629,37 @@ lck_mtx_init(
 	lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
 }
 
+/*
+ * Routine:	lck_mtx_init_ext
+ */
+void
+lck_mtx_init_ext(
+	lck_mtx_t	*lck,
+	lck_mtx_ext_t	*lck_ext,
+	lck_grp_t	*grp,
+	lck_attr_t	*attr)
+{
+	lck_attr_t	*lck_attr;
+
+	if (attr != LCK_ATTR_NULL)
+		lck_attr = attr;
+	else
+		lck_attr = &LockDefaultLckAttr;
+
+	if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
+		lck_mtx_ext_init(lck_ext, grp, lck_attr);
+		lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
+		lck->lck_mtx_ptr = lck_ext;
+	} else {
+		lck->lck_mtx_ilk = 0;
+		lck->lck_mtx_locked = 0;
+		lck->lck_mtx_waiters = 0;
+		lck->lck_mtx_pri = 0;
+	}
+	lck_grp_reference(grp);
+	lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
+}
+
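The new lck_rw_assert above gives callers a cheap way to state and enforce a locking contract; note that it checks lock state, not which thread owns the hold, since shared holds are anonymous. A hedged usage sketch (the object type and helper are hypothetical):

	/* Hypothetical object whose callers must hold obj->lock exclusive. */
	struct obj_sketch {
		lck_rw_t	lock;
		int		dirty;
	};

	static void
	obj_mark_dirty(struct obj_sketch *o)
	{
		/* Panics unless a writer or upgrader holds the lock and
		 * no readers do. */
		lck_rw_assert(&o->lock, LCK_RW_ASSERT_EXCLUSIVE);
		o->dirty = 1;
	}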
 /*
  * Routine:	lck_mtx_destroy
  */
@@ -1696,31 +1682,100 @@ lck_mtx_destroy(
 }
 
 /*
- * Routine: 	lck_mtx_assert
+ * Routine: 	lck_mtx_lock_spinwait
+ *
+ * Invoked trying to acquire a mutex when there is contention but
+ * the holder is running on another processor.  We spin for up to a maximum
+ * time waiting for the lock to be released.
+ *
+ * Called with the interlock unlocked.
  */
 void
-lck_mtx_assert(
-	__unused lck_mtx_t	*lck,
-	__unused unsigned int	type)
+lck_mtx_lock_spinwait(
+	lck_mtx_t		*lck)
 {
+	thread_t		holder;
+	volatile lck_mtx_t	*mutex;
+	uint64_t		deadline;
+
+	if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)
+		mutex = lck;
+	else
+		mutex = &lck->lck_mtx_ptr->lck_mtx;
+
+	KERNEL_DEBUG(
+		MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN) | DBG_FUNC_NONE,
+		(int)lck, (int)mutex->lck_mtx_locked, 0, 0, 0);
+
+	deadline = mach_absolute_time() + MutexSpin;
+	/*
+	 * Spin while:
+	 *   - mutex is locked, and
+	 *   - it's locked as a spin lock, or
+	 *   - owner is running on another processor, and
+	 *   - owner (processor) is not idling, and
+	 *   - we haven't spun for long enough.
+	 */
+	while ((holder = (thread_t) mutex->lck_mtx_locked) != NULL) {
+		if ((holder == (thread_t)MUTEX_LOCKED_AS_SPIN) ||
+		    ((holder->machine.specFlags & OnProc) != 0 &&
+		     (holder->state & TH_IDLE) == 0 &&
+		     mach_absolute_time() < deadline)) {
+			cpu_pause();
+			continue;
+		}
+		break;
+	}
+#if	CONFIG_DTRACE
+	/*
+	 * We've already kept a count via deadline of how long we spun.
+	 * If DTrace is active, we compute backwards from the deadline
+	 * to decide how long we spun.
+	 *
+	 * Note that we record a different probe id depending on whether
+	 * this is a direct or indirect mutex.  This allows us to
+	 * penalize only lock groups that have debug/stats enabled
+	 * with dtrace processing if desired.
+	 */
+	if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) {
+		LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, lck,
+		    mach_absolute_time() - (deadline - MutexSpin));
+	} else {
+		LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, lck,
+		    mach_absolute_time() - (deadline - MutexSpin));
+	}
+	/* The lockstat acquire event is recorded by the assembly code beneath us. */
+#endif
 }
 
+/*
+ * Called from assembly code when a destroyed mutex is detected
+ * during a lock/unlock/try/convert
+ */
 
-#if	MACH_KDB
 
-void	db_show_one_lock(lock_t  *);
+void
+lck_mtx_interlock_panic(
+	lck_mtx_t	*lck)
+{
+	panic("trying to interlock destroyed mutex %p", lck);
+}
+
+
+#if	MACH_KDB
 
 void
 db_show_one_lock(
 	lock_t  *lock)
 {
 	db_printf("Read_count = 0x%x, %swant_upgrade, %swant_write, ",
-		  lock->read_count,
-		  lock->want_upgrade ? "" : "!",
-		  lock->want_write ? "" : "!");
+		  lock->lck_rw_shared_count,
+		  lock->lck_rw_want_upgrade ? "" : "!",
+		  lock->lck_rw_want_write ? "" : "!");
 	db_printf("%swaiting, %scan_sleep\n",
-		  lock->waiting ? "" : "!", lock->can_sleep ? "" : "!");
+		  (lock->lck_r_waiting || lock->lck_w_waiting) ? "" : "!",
+		  lock->lck_rw_can_sleep ? "" : "!");
 	db_printf("Interlock:\n");
-	db_show_one_simple_lock((db_expr_t)simple_lock_addr(lock->interlock),
+	db_show_one_simple_lock((db_expr_t) ((vm_offset_t)simple_lock_addr(lock->lck_rw_interlock)),
 			TRUE, (db_expr_t)0, (char *)0);
 }
 
@@ -1765,36 +1820,6 @@ mutex_free(
 	kfree(m, sizeof(mutex_t));
 }
 
-/*
- * Routine: _mutex_assert
- */
-void
-_mutex_assert (
-	mutex_t		*mutex,
-	unsigned int	what)
-{
-
-	thread_t	thread = current_thread();
-	thread_t	holder;
-
-	if (panicstr != NULL)
-		return;
-
-	holder = (thread_t) mutex->lck_mtx.lck_mtx_locked;
-
-	switch (what) {
-	case MA_OWNED:
-		if (thread != holder)
-			panic("mutex %x not owned\n", mutex);
-		break;
-
-	case MA_NOTOWNED:
-		if (thread == holder)
-			panic("mutex %x owned\n", mutex);
-		break;
-	}
-
-}
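lck_mtx_lock_spinwait is a textbook adaptive mutex: spinning only pays while the owner is making progress on another CPU, so the loop keys off owner-on-processor, owner-not-idle, and a fixed time budget, and otherwise falls back to blocking. A compressed sketch of that decision (the helper and parameter names are hypothetical):

	#include <stdbool.h>
	#include <stdint.h>

	/* Keep spinning?  Mirrors the loop above: the mutex is held as a
	 * spin lock, or the holder is on a CPU, not idling, and our spin
	 * budget has not yet expired. */
	static bool
	keep_spinning(bool locked_as_spin, bool holder_on_cpu,
	              bool holder_idle, uint64_t now, uint64_t deadline)
	{
		return locked_as_spin ||
		       (holder_on_cpu && !holder_idle && now < deadline);
	}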
 
 #if	MACH_KDB
 /*
@@ -1802,17 +1827,17 @@ _mutex_assert (
  *	fashion.
  */
 
-char *simple_lock_labels =	"ENTRY    ILK THREAD   DURATION CALLER";
-char *mutex_labels =	"ENTRY    LOCKED WAITERS   THREAD CALLER";
+const char *simple_lock_labels =	"ENTRY    ILK THREAD   DURATION CALLER";
+const char *mutex_labels =	"ENTRY    LOCKED WAITERS   THREAD CALLER";
 
 void
 db_show_one_simple_lock (
 	db_expr_t	addr,
 	boolean_t	have_addr,
-	db_expr_t	count,
-	char		* modif)
+	__unused db_expr_t	count,
+	__unused char		* modif)
 {
-	simple_lock_t	saddr = (simple_lock_t)addr;
+	simple_lock_t	saddr = (simple_lock_t) ((vm_offset_t) addr);
 
 	if (saddr == (simple_lock_t)0 || !have_addr) {
 		db_error ("No simple_lock\n");
@@ -1844,10 +1869,10 @@ void
 db_show_one_mutex (
 	db_expr_t	addr,
 	boolean_t	have_addr,
-	db_expr_t	count,
-	char		* modif)
+	__unused db_expr_t	count,
+	__unused char		* modif)
 {
-	mutex_t		* maddr = (mutex_t *)addr;
+	mutex_t		* maddr = (mutex_t *)((vm_offset_t) addr);
 
 	if (maddr == (mutex_t *)0 || !have_addr)
 		db_error ("No mutex\n");