X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/2d21ac55c334faf3a56e5634905ed6987fc787d4..fe8ab488e9161c46dd9885d58fc52996dc0249ff:/osfmk/i386/locks_i386.c?ds=sidebyside diff --git a/osfmk/i386/locks_i386.c b/osfmk/i386/locks_i386.c index 38d332b00..4dd253e01 100644 --- a/osfmk/i386/locks_i386.c +++ b/osfmk/i386/locks_i386.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -61,10 +61,8 @@ * Locking primitives implementation */ -#include #include -#include #include #include #include @@ -77,16 +75,12 @@ #include #include -#if MACH_KDB -#include -#include -#include -#include -#endif /* MACH_KDB */ - -#include +#include /* machine_timeout_suspended() */ +#include +#include #include +#include /* * We need only enough declarations from the BSD-side to be able to @@ -105,24 +99,22 @@ #define LCK_RW_LCK_SH_TO_EX1_CODE 0x104 #define LCK_RW_LCK_EX_TO_SH_CODE 0x105 -#define LCK_MTX_LCK_SPIN 0x200 +#define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106 +#define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107 +#define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108 +#define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109 +#define LCK_RW_LCK_SHARED_SPIN_CODE 0x110 +#define LCK_RW_LCK_SHARED_WAIT_CODE 0x111 +#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112 +#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113 + #define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG) unsigned int LcksOpts=0; -unsigned int lock_wait_time[2] = { (unsigned int)-1, 0 } ; /* Forwards */ -#if MACH_KDB -void db_print_simple_lock( - simple_lock_t addr); - -void db_print_mutex( - mutex_t * addr); -#endif /* MACH_KDB */ - - #if USLOCK_DEBUG /* * Perform simple lock checks. @@ -131,11 +123,9 @@ int uslock_check = 1; int max_lock_loops = 100000000; decl_simple_lock_data(extern , printf_lock) decl_simple_lock_data(extern , panic_lock) -#if MACH_KDB -decl_simple_lock_data(extern , kdb_lock) -#endif /* MACH_KDB */ #endif /* USLOCK_DEBUG */ +extern unsigned int not_in_kdp; /* * We often want to know the addresses of the callers @@ -146,7 +136,7 @@ typedef void *pc_t; #define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS) #define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS) #if ANY_LOCK_DEBUG -#define OBTAIN_PC(pc,l) ((pc) = (void *) GET_RETURN_PC(&(l))) +#define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC()) #define DECL_PC(pc) pc_t pc; #else /* ANY_LOCK_DEBUG */ #define DECL_PC(pc) @@ -154,9 +144,9 @@ typedef void *pc_t; /* * Eliminate lint complaints about unused local pc variables. 
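lck_spin_destroy() above poisons a torn-down spin lock by writing LCK_SPIN_TAG_DESTROYED into its interlock word, and lck_mtx_destroy() later in this file does the analogous thing through lck_mtx_lock_mark_destroyed(), so a later lock or unlock of freed memory is caught instead of silently corrupting state. The fragment below is an illustrative user-space analog of that poisoning idiom, not XNU code; the type, sentinel value, and names are hypothetical.

/* Illustrative analog of the LCK_*_TAG_DESTROYED poisoning idiom (not XNU code). */
#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

#define DEMO_LOCK_DESTROYED 0xDEADu          /* hypothetical sentinel value */

typedef struct { _Atomic uint32_t word; } demo_lock_t;

static void demo_lock_destroy(demo_lock_t *l)
{
    /* Idempotent: destroying an already-poisoned lock is a no-op. */
    if (atomic_load(&l->word) == DEMO_LOCK_DESTROYED)
        return;
    atomic_store(&l->word, DEMO_LOCK_DESTROYED);
}

static void demo_lock_acquire(demo_lock_t *l)
{
    uint32_t expected = 0;

    /* Using a destroyed lock is a programming error; fail loudly. */
    assert(atomic_load(&l->word) != DEMO_LOCK_DESTROYED);

    while (!atomic_compare_exchange_weak(&l->word, &expected, 1u))
        expected = 0;
}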
*/ -#define OBTAIN_PC(pc,l) ++pc +#define OBTAIN_PC(pc) ++pc #else /* lint */ -#define OBTAIN_PC(pc,l) +#define OBTAIN_PC(pc) #endif /* lint */ #endif /* USLOCK_DEBUG */ @@ -178,6 +168,12 @@ int usld_lock_common_checks(usimple_lock_t, char *); #define USLDBG(stmt) #endif /* USLOCK_DEBUG */ + +extern int lck_rw_grab_want(lck_rw_t *lck); +extern int lck_rw_grab_shared(lck_rw_t *lck); +extern int lck_rw_held_read_or_upgrade(lck_rw_t *lck); + + /* * Forward definitions */ @@ -185,9 +181,26 @@ int usld_lock_common_checks(usimple_lock_t, char *); void lck_rw_lock_shared_gen( lck_rw_t *lck); -lck_rw_type_t lck_rw_done_gen( +void lck_rw_lock_exclusive_gen( + lck_rw_t *lck); + +boolean_t lck_rw_lock_shared_to_exclusive_success( lck_rw_t *lck); +boolean_t lck_rw_lock_shared_to_exclusive_failure( + lck_rw_t *lck, + int prior_lock_state); + +void lck_rw_lock_exclusive_to_shared_gen( + lck_rw_t *lck, + int prior_lock_state); + +lck_rw_type_t lck_rw_done_gen( + lck_rw_t *lck, + int prior_lock_state); + +void lck_rw_clear_promotions_x86(thread_t thread); + /* * Routine: lck_spin_alloc_init */ @@ -238,9 +251,9 @@ lck_spin_destroy( lck_spin_t *lck, lck_grp_t *grp) { - if (lck->lck_spin_data[0] == LCK_SPIN_TAG_DESTROYED) + if (lck->interlock == LCK_SPIN_TAG_DESTROYED) return; - lck->lck_spin_data[0] = LCK_SPIN_TAG_DESTROYED; + lck->interlock = LCK_SPIN_TAG_DESTROYED; lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN); lck_grp_deallocate(grp); return; @@ -277,6 +290,19 @@ lck_spin_try_lock( return((boolean_t)usimple_lock_try((usimple_lock_t) lck)); } +/* + * Routine: lck_spin_is_acquired + * NOT SAFE: To be used only by kernel debugger to avoid deadlock. + * Returns: TRUE if lock is acquired. + */ +boolean_t +lck_spin_is_acquired(lck_spin_t *lck) { + if (not_in_kdp) { + panic("panic: spinlock acquired check done outside of kernel debugger"); + } + return (lck->interlock != 0)? TRUE : FALSE; +} + /* * Initialize a usimple_lock. * @@ -295,6 +321,29 @@ usimple_lock_init( #endif } +volatile uint32_t spinlock_owner_cpu = ~0; +volatile usimple_lock_t spinlock_timed_out; + +uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) { + uint64_t deadline; + uint32_t i; + + for (i = 0; i < real_ncpus; i++) { + if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) { + spinlock_owner_cpu = i; + if ((uint32_t) cpu_number() == i) + break; + cpu_datap(i)->cpu_NMI_acknowledged = FALSE; + cpu_NMI_interrupt(i); + deadline = mach_absolute_time() + (LockTimeOut * 2); + while (mach_absolute_time() < deadline && cpu_datap(i)->cpu_NMI_acknowledged == FALSE) + cpu_pause(); + break; + } + } + + return spinlock_owner_cpu; +} /* * Acquire a usimple_lock. 
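The new spinlock_timeout_NMI() above, together with the timeout path added to usimple_lock() just below, turns an unbounded spin into a bounded one: spin against a TSC deadline, and if it expires, record the apparent owner, NMI the CPU it is running on, and panic with the lock and owner state. The following is a minimal user-space analog of that "spin with a deadline, then report the recorded owner" shape; the names are hypothetical, it is not the kernel implementation, and it cannot deliver an NMI, so it simply prints and aborts.

/* Illustrative analog (not XNU code) of the timed spin-acquire path. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

typedef struct {
    atomic_uintptr_t owner;          /* 0 when free, owning thread id when held */
} demo_spinlock_t;

static uint64_t demo_now_ns(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

static void demo_spin_lock(demo_spinlock_t *l, uintptr_t self, uint64_t timeout_ns)
{
    uint64_t deadline = demo_now_ns() + timeout_ns;
    uintptr_t expected = 0;

    while (!atomic_compare_exchange_weak(&l->owner, &expected, self)) {
        expected = 0;
        if (demo_now_ns() > deadline) {
            /* This is where usimple_lock() would call spinlock_timeout_NMI()
             * to interrupt the owner's CPU, then panic with the state. */
            fprintf(stderr, "spinlock timeout: lock=%p owner=0x%lx\n",
                    (void *)l, (unsigned long)atomic_load(&l->owner));
            abort();
        }
    }
}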
@@ -310,12 +359,25 @@ usimple_lock( #ifndef MACHINE_SIMPLE_LOCK DECL_PC(pc); - OBTAIN_PC(pc, l); + OBTAIN_PC(pc); USLDBG(usld_lock_pre(l, pc)); - if(!hw_lock_to(&l->interlock, LockTimeOutTSC)) /* Try to get the lock with a timeout */ - panic("simple lock deadlock detection: lock=%p, cpu=%d, owning thread=0x%x", l, cpu_number(), l->interlock.lock_data); + if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0)) { + boolean_t uslock_acquired = FALSE; + while (machine_timeout_suspended()) { + enable_preemption(); + if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC))) + break; + } + if (uslock_acquired == FALSE) { + uint32_t lock_cpu; + uintptr_t lowner = (uintptr_t)l->interlock.lock_data; + spinlock_timed_out = l; + lock_cpu = spinlock_timeout_NMI(lowner); + panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data); + } + } USLDBG(usld_lock_post(l, pc)); #else simple_lock((simple_lock_t)l); @@ -337,7 +399,7 @@ usimple_unlock( #ifndef MACHINE_SIMPLE_LOCK DECL_PC(pc); - OBTAIN_PC(pc, l); + OBTAIN_PC(pc); USLDBG(usld_unlock(l, pc)); hw_lock_unlock(&l->interlock); #else @@ -366,7 +428,7 @@ usimple_lock_try( unsigned int success; DECL_PC(pc); - OBTAIN_PC(pc, l); + OBTAIN_PC(pc); USLDBG(usld_lock_try_pre(l, pc)); if ((success = hw_lock_try(&l->interlock))) { USLDBG(usld_lock_try_post(l, pc)); @@ -430,10 +492,9 @@ usld_lock_common_checks( if (l == USIMPLE_LOCK_NULL) panic("%s: null lock pointer", caller); if (l->lock_type != USLOCK_TAG) - panic("%s: 0x%x is not a usimple lock", caller, (integer_t) l); + panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type); if (!(l->debug.state & USLOCK_INIT)) - panic("%s: 0x%x is not an initialized lock", - caller, (integer_t) l); + panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state); return USLOCK_CHECKING(l); } @@ -495,11 +556,11 @@ usld_lock_post( return; if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) - panic("%s: lock 0x%x became uninitialized", - caller, (integer_t) l); + panic("%s: lock %p became uninitialized", + caller, l); if ((l->debug.state & USLOCK_TAKEN)) - panic("%s: lock 0x%x became TAKEN by someone else", - caller, (integer_t) l); + panic("%s: lock 0x%p became TAKEN by someone else", + caller, l); mycpu = cpu_number(); l->debug.lock_thread = (void *)current_thread(); @@ -534,14 +595,14 @@ usld_unlock( mycpu = cpu_number(); if (!(l->debug.state & USLOCK_TAKEN)) - panic("%s: lock 0x%x hasn't been taken", - caller, (integer_t) l); + panic("%s: lock 0x%p hasn't been taken", + caller, l); if (l->debug.lock_thread != (void *) current_thread()) - panic("%s: unlocking lock 0x%x, owned by thread %p", - caller, (integer_t) l, l->debug.lock_thread); + panic("%s: unlocking lock 0x%p, owned by thread %p", + caller, l, l->debug.lock_thread); if (l->debug.lock_cpu != mycpu) { - printf("%s: unlocking lock 0x%x on cpu 0x%x", - caller, (integer_t) l, mycpu); + printf("%s: unlocking lock 0x%p on cpu 0x%x", + caller, l, mycpu); printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu); panic("%s", caller); } @@ -596,11 +657,11 @@ usld_lock_try_post( return; if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) - panic("%s: lock 0x%x became uninitialized", - caller, (integer_t) l); + panic("%s: lock 0x%p became uninitialized", + caller, l); if ((l->debug.state & USLOCK_TAKEN)) - panic("%s: lock 0x%x became TAKEN by someone 
else", - caller, (integer_t) l); + panic("%s: lock 0x%p became TAKEN by someone else", + caller, l); mycpu = cpu_number(); l->debug.lock_thread = (void *) current_thread(); @@ -631,8 +692,8 @@ usl_trace( if (traced_lock == l) { XPR(XPR_SLOCK, "seq %d, cpu %d, %s @ %x\n", - (integer_t) lock_seq, (integer_t) mycpu, - (integer_t) op_name, (integer_t) pc, 0); + (uintptr_t) lock_seq, (uintptr_t) mycpu, + (uintptr_t) op_name, (uintptr_t) pc, 0); lock_seq++; } } @@ -640,125 +701,6 @@ usl_trace( #endif /* USLOCK_DEBUG */ -/* - * Routine: lock_alloc - * Function: - * Allocate a lock for external users who cannot - * hard-code the structure definition into their - * objects. - * For now just use kalloc, but a zone is probably - * warranted. - */ -lock_t * -lock_alloc( - boolean_t can_sleep, - unsigned short tag, - unsigned short tag1) -{ - lock_t *l; - - if ((l = (lock_t *)kalloc(sizeof(lock_t))) != 0) - lock_init(l, can_sleep, tag, tag1); - return(l); -} - -/* - * Routine: lock_free - * Function: - * Free a lock allocated for external users. - * For now just use kfree, but a zone is probably - * warranted. - */ -void -lock_free( - lock_t *l) -{ - kfree(l, sizeof(lock_t)); -} - - -/* - * Routine: lock_init - * Function: - * Initialize a lock; required before use. - * Note that clients declare the "struct lock" - * variables and then initialize them, rather - * than getting a new one from this module. - */ -void -lock_init( - lock_t *l, - boolean_t can_sleep, - __unused unsigned short tag, - __unused unsigned short tag1) -{ - hw_lock_byte_init(&l->lck_rw_interlock); - l->lck_rw_want_write = FALSE; - l->lck_rw_want_upgrade = FALSE; - l->lck_rw_shared_count = 0; - l->lck_rw_can_sleep = can_sleep; - l->lck_rw_tag = tag; - l->lck_rw_priv_excl = 1; -} - - -/* - * Sleep locks. These use the same data structure and algorithm - * as the spin locks, but the process sleeps while it is waiting - * for the lock. These work on uniprocessor systems. - */ - -#define DECREMENTER_TIMEOUT 1000000 - -void -lock_write( - register lock_t * l) -{ - lck_rw_lock_exclusive(l); -} - -void -lock_done( - register lock_t * l) -{ - (void) lck_rw_done(l); -} - -void -lock_read( - register lock_t * l) -{ - lck_rw_lock_shared(l); -} - - -/* - * Routine: lock_read_to_write - * Function: - * Improves a read-only lock to one with - * write permission. If another reader has - * already requested an upgrade to a write lock, - * no lock is held upon return. - * - * Returns FALSE if the upgrade *failed*. 
- */ - -boolean_t -lock_read_to_write( - register lock_t * l) -{ - return lck_rw_lock_shared_to_exclusive(l); -} - -void -lock_write_to_read( - register lock_t * l) -{ - lck_rw_lock_exclusive_to_shared(l); -} - - - /* * Routine: lck_rw_alloc_init */ @@ -768,9 +710,11 @@ lck_rw_alloc_init( lck_attr_t *attr) { lck_rw_t *lck; - if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) + if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) { + bzero(lck, sizeof(lck_rw_t)); lck_rw_init(lck, grp, attr); - + } + return(lck); } @@ -802,6 +746,7 @@ lck_rw_init( lck->lck_rw_want_upgrade = FALSE; lck->lck_rw_shared_count = 0; lck->lck_rw_can_sleep = TRUE; + lck->lck_r_waiting = lck->lck_w_waiting = 0; lck->lck_rw_tag = 0; lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val & LCK_ATTR_RW_SHARED_PRIORITY) == 0); @@ -816,9 +761,13 @@ lck_rw_init( void lck_rw_destroy( lck_rw_t *lck, - lck_grp_t *grp) { + lck_grp_t *grp) +{ if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) return; +#if MACH_LDEBUG + lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD); +#endif lck->lck_rw_tag = LCK_RW_TAG_DESTROYED; lck_grp_lckcnt_decr(grp, LCK_TYPE_RW); lck_grp_deallocate(grp); @@ -840,8 +789,8 @@ lck_rw_destroy( ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8)))) /* - * We need to disable interrupts while holding the mutex interlock - * to prevent an IPI intervening. + * We disable interrupts while holding the RW interlock to prevent an + * interrupt from exacerbating hold time. * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock(). */ static boolean_t @@ -876,92 +825,132 @@ lck_rw_lock_pause(boolean_t interrupts_enabled) cpu_pause(); } + +/* + * compute the deadline to spin against when + * waiting for a change of state on a lck_rw_t + */ +static inline uint64_t +lck_rw_deadline_for_spin(lck_rw_t *lck) +{ + if (lck->lck_rw_can_sleep) { + if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) { + /* + * there are already threads waiting on this lock... this + * implies that they have spun beyond their deadlines waiting for + * the desired state to show up so we will not bother spinning at this time... + * or + * the current number of threads sharing this lock exceeds our capacity to run them + * concurrently and since all states we're going to spin for require the rw_shared_count + * to be at 0, we'll not bother spinning since the latency for this to happen is + * unpredictable... + */ + return (mach_absolute_time()); + } + return (mach_absolute_time() + MutexSpin); + } else + return (mach_absolute_time() + (100000LL * 1000000000LL)); +} + + /* * Routine: lck_rw_lock_exclusive */ void -lck_rw_lock_exclusive( +lck_rw_lock_exclusive_gen( lck_rw_t *lck) { - int i; - wait_result_t res; -#if MACH_LDEBUG - int decrementer; -#endif /* MACH_LDEBUG */ - boolean_t istate; -#if CONFIG_DTRACE - uint64_t wait_interval = 0; - int slept = 0; - int readers_at_sleep; -#endif + uint64_t deadline = 0; + int slept = 0; + int gotlock = 0; + int lockheld = 0; + wait_result_t res = 0; + boolean_t istate = -1; - istate = lck_interlock_lock(lck); #if CONFIG_DTRACE - readers_at_sleep = lck->lck_rw_shared_count; + boolean_t dtrace_ls_initialized = FALSE; + boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE; + uint64_t wait_interval = 0; + int readers_at_sleep = 0; #endif -#if MACH_LDEBUG - decrementer = DECREMENTER_TIMEOUT; -#endif /* MACH_LDEBUG */ - /* * Try to acquire the lck_rw_want_write bit. 
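lck_rw_lock_exclusive_gen() (continuing below) and the other _gen slow paths all follow the same spin-then-block discipline: spin outside the interlock until the deadline from lck_rw_deadline_for_spin() expires, then re-check the same condition under the interlock before assert_wait()/thread_block(), so a wakeup posted between the check and the sleep cannot be lost. Here is a compact user-space analog of that shape, with a pthread mutex/condvar standing in for the interlock and wait event; the names are hypothetical and this is not the kernel code. Only the acquire side is shown; a release path would clear want_write and signal writer_cv under the interlock.

/* Illustrative analog (not XNU code) of the spin-then-block acquire shape. */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

struct demo_rw {
    pthread_mutex_t interlock;    /* stands in for lck_interlock_lock() */
    pthread_cond_t  writer_cv;    /* stands in for RW_LOCK_WRITER_EVENT() */
    _Atomic bool    want_write;   /* stands in for lck_rw_want_write */
};

/* now() and spin_ns play the role of mach_absolute_time() and the
 * deadline computed by lck_rw_deadline_for_spin(). */
static void demo_grab_want(struct demo_rw *l, uint64_t (*now)(void), uint64_t spin_ns)
{
    for (;;) {
        uint64_t deadline = now() + spin_ns;

        /* Spin phase: cheap polling, no interlock held. */
        while (l->want_write && now() < deadline)
            ;                                    /* cpu_pause() in the kernel */

        /* Block phase: re-check under the interlock before sleeping. */
        pthread_mutex_lock(&l->interlock);
        if (!l->want_write) {
            l->want_write = true;                /* acquired the want bit */
            pthread_mutex_unlock(&l->interlock);
            return;
        }
        pthread_cond_wait(&l->writer_cv, &l->interlock);  /* release path signals */
        pthread_mutex_unlock(&l->interlock);
    }
}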
*/ - while (lck->lck_rw_want_write) { + while ( !lck_rw_grab_want(lck)) { - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0); - /* - * Either sleeping or spinning is happening, start - * a timing of our delay interval now. - */ #if CONFIG_DTRACE - if ((lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK]) && wait_interval == 0) { - wait_interval = mach_absolute_time(); - } else { - wait_interval = -1; + if (dtrace_ls_initialized == FALSE) { + dtrace_ls_initialized = TRUE; + dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0); + dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0); + dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block; + if (dtrace_ls_enabled) { + /* + * Either sleeping or spinning is happening, + * start a timing of our delay interval now. + */ + readers_at_sleep = lck->lck_rw_shared_count; + wait_interval = mach_absolute_time(); + } } #endif + if (istate == -1) + istate = ml_get_interrupts_enabled(); + + deadline = lck_rw_deadline_for_spin(lck); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0); + + while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline) + lck_rw_lock_pause(istate); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, gotlock, 0); + + if (gotlock) + break; + /* + * if we get here, the deadline has expired w/o us + * being able to grab the lock exclusively + * check to see if we're allowed to do a thread_block + */ + if (lck->lck_rw_can_sleep) { - i = lock_wait_time[lck->lck_rw_can_sleep ? 1 : 0]; - if (i != 0) { - lck_interlock_unlock(lck, istate); -#if MACH_LDEBUG - if (!--decrementer) - Debugger("timeout - lck_rw_want_write"); -#endif /* MACH_LDEBUG */ - while (--i != 0 && lck->lck_rw_want_write) - lck_rw_lock_pause(istate); istate = lck_interlock_lock(lck); - } - if (lck->lck_rw_can_sleep && lck->lck_rw_want_write) { - lck->lck_w_waiting = TRUE; - res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); - if (res == THREAD_WAITING) { - lck_interlock_unlock(lck, istate); - res = thread_block(THREAD_CONTINUE_NULL); -#if CONFIG_DTRACE - slept = 1; -#endif - istate = lck_interlock_lock(lck); - } - } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_END, (int)lck, res, 0, 0, 0); - } - lck->lck_rw_want_write = TRUE; + if (lck->lck_rw_want_write) { - /* Wait for readers (and upgrades) to finish */ + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0); -#if MACH_LDEBUG - decrementer = DECREMENTER_TIMEOUT; -#endif /* MACH_LDEBUG */ - while ((lck->lck_rw_shared_count != 0) || lck->lck_rw_want_upgrade) { + lck->lck_w_waiting = TRUE; - i = lock_wait_time[lck->lck_rw_can_sleep ? 
1 : 0]; + res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); + lck_interlock_unlock(lck, istate); - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_START, - (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, i, 0); + if (res == THREAD_WAITING) { + res = thread_block(THREAD_CONTINUE_NULL); + slept++; + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0); + } else { + lck->lck_rw_want_write = TRUE; + lck_interlock_unlock(lck, istate); + break; + } + } + } + /* + * Wait for readers (and upgrades) to finish... + * the test for these conditions must be done simultaneously with + * a check of the interlock not being held since + * the rw_shared_count will drop to 0 first and then want_upgrade + * will be set to 1 in the shared_to_exclusive scenario... those + * adjustments are done behind the interlock and represent an + * atomic change in state and must be considered as such + * however, once we see the read count at 0, the want_upgrade not set + * and the interlock not held, we are safe to proceed + */ + while (lck_rw_held_read_or_upgrade(lck)) { #if CONFIG_DTRACE /* @@ -970,42 +959,69 @@ lck_rw_lock_exclusive( * to -1 we don't have accurate data so we cannot later * decide to record a dtrace spin or sleep event. */ - if ((lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK]) && wait_interval == 0) { - wait_interval = mach_absolute_time(); - } else { - wait_interval = (unsigned) -1; + if (dtrace_ls_initialized == FALSE) { + dtrace_ls_initialized = TRUE; + dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0); + dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0); + dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block; + if (dtrace_ls_enabled) { + /* + * Either sleeping or spinning is happening, + * start a timing of our delay interval now. 
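The CONFIG_DTRACE blocks being added throughout these contention loops share one lazy pattern: on the first contended pass they check whether the relevant lockstat probes are enabled and only then snapshot readers_at_sleep and a start timestamp, so probe-disabled and uncontended acquisitions pay nothing. A stripped-down analog of that gating follows; the demo_* names are hypothetical stand-ins, and the real code additionally distinguishes the spin probe from the block probe with a slept counter.

/* Illustrative analog (not XNU code) of lazily-enabled lockstat timing. */
#include <stdbool.h>
#include <stdint.h>

enum { DEMO_SPIN_PROBE = 1, DEMO_BLOCK_PROBE = 2 };

/* Trivial stand-ins for lockstat_probemap[] and LOCKSTAT_RECORD(). */
static bool demo_probe_enabled(int probe_id) { (void)probe_id; return true; }
static void demo_record(int probe_id, uint64_t elapsed) { (void)probe_id; (void)elapsed; }
static uint64_t demo_now(void) { static uint64_t fake_clock; return ++fake_clock; }

static void demo_contended_acquire(bool (*try_acquire)(void), void (*wait_once)(void))
{
    bool probes_checked = false, probes_on = false;
    uint64_t start = 0;

    while (!try_acquire()) {
        if (!probes_checked) {              /* first contended pass only */
            probes_checked = true;
            probes_on = demo_probe_enabled(DEMO_SPIN_PROBE) ||
                        demo_probe_enabled(DEMO_BLOCK_PROBE);
            if (probes_on)
                start = demo_now();         /* start timing only if someone listens */
        }
        wait_once();                        /* one round of spinning or blocking */
    }
    if (probes_on)
        demo_record(DEMO_SPIN_PROBE, demo_now() - start);
}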
+ */ + readers_at_sleep = lck->lck_rw_shared_count; + wait_interval = mach_absolute_time(); + } } #endif + if (istate == -1) + istate = ml_get_interrupts_enabled(); + + deadline = lck_rw_deadline_for_spin(lck); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0); + + while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline) + lck_rw_lock_pause(istate); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, lockheld, 0); + + if ( !lockheld) + break; + /* + * if we get here, the deadline has expired w/o us + * being able to grab the lock exclusively + * check to see if we're allowed to do a thread_block + */ + if (lck->lck_rw_can_sleep) { - if (i != 0) { - lck_interlock_unlock(lck, istate); -#if MACH_LDEBUG - if (!--decrementer) - Debugger("timeout - wait for readers"); -#endif /* MACH_LDEBUG */ - while (--i != 0 && (lck->lck_rw_shared_count != 0 || - lck->lck_rw_want_upgrade)) - lck_rw_lock_pause(istate); istate = lck_interlock_lock(lck); - } - if (lck->lck_rw_can_sleep && (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade)) { - lck->lck_w_waiting = TRUE; - res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); - if (res == THREAD_WAITING) { + if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) { + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0); + + lck->lck_w_waiting = TRUE; + + res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); lck_interlock_unlock(lck, istate); - res = thread_block(THREAD_CONTINUE_NULL); -#if CONFIG_DTRACE - slept = 1; -#endif - istate = lck_interlock_lock(lck); + + if (res == THREAD_WAITING) { + res = thread_block(THREAD_CONTINUE_NULL); + slept++; + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0); + } else { + lck_interlock_unlock(lck, istate); + /* + * must own the lock now, since we checked for + * readers or upgrade owner behind the interlock + * no need for a call to 'lck_rw_held_read_or_upgrade' + */ + break; } } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_END, - (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, res, 0); } - lck_interlock_unlock(lck, istate); #if CONFIG_DTRACE /* * Decide what latencies we suffered that are Dtrace events. @@ -1016,7 +1032,7 @@ lck_rw_lock_exclusive( * If we have set wait_interval to -1, then dtrace was not enabled when we * started sleeping/spinning so we don't record this event. */ - if (wait_interval != 0 && wait_interval != (unsigned) -1) { + if (dtrace_ls_enabled == TRUE) { if (slept == 0) { LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 1); @@ -1039,67 +1055,72 @@ lck_rw_lock_exclusive( /* * Routine: lck_rw_done_gen + * + * called from the assembly language wrapper... + * prior_lock_state is the value in the 1st + * word of the lock at the time of a successful + * atomic compare and exchange with the new value... + * it represents the state of the lock before we + * decremented the rw_shared_count or cleared either + * rw_want_upgrade or rw_want_write and + * the lck_x_waiting bits... since the wrapper + * routine has already changed the state atomically, + * we just need to decide if we should + * wake up anyone and what value to return... 
we do + * this by examining the state of the lock before + * we changed it */ lck_rw_type_t lck_rw_done_gen( - lck_rw_t *lck) + lck_rw_t *lck, + int prior_lock_state) { - boolean_t wakeup_readers = FALSE; - boolean_t wakeup_writers = FALSE; - lck_rw_type_t lck_rw_type; - boolean_t istate; - - istate = lck_interlock_lock(lck); - - if (lck->lck_rw_shared_count != 0) { - lck_rw_type = LCK_RW_TYPE_SHARED; - lck->lck_rw_shared_count--; - } - else { - lck_rw_type = LCK_RW_TYPE_EXCLUSIVE; - if (lck->lck_rw_want_upgrade) - lck->lck_rw_want_upgrade = FALSE; - else - lck->lck_rw_want_write = FALSE; - } + lck_rw_t *fake_lck; + lck_rw_type_t lock_type; + thread_t thread; + uint32_t rwlock_count; /* - * There is no reason to wakeup a waiting thread - * if the read-count is non-zero. Consider: - * we must be dropping a read lock - * threads are waiting only if one wants a write lock - * if there are still readers, they can't proceed + * prior_lock state is a snapshot of the 1st word of the + * lock in question... we'll fake up a pointer to it + * and carefully not access anything beyond whats defined + * in the first word of a lck_rw_t */ + fake_lck = (lck_rw_t *)&prior_lock_state; - if (lck->lck_rw_shared_count == 0) { - if (lck->lck_w_waiting) { - lck->lck_w_waiting = FALSE; - wakeup_writers = TRUE; - } - if (!(lck->lck_rw_priv_excl && wakeup_writers == TRUE) && - lck->lck_r_waiting) { - lck->lck_r_waiting = FALSE; - wakeup_readers = TRUE; - } - } - - lck_interlock_unlock(lck, istate); + if (fake_lck->lck_rw_shared_count <= 1) { + if (fake_lck->lck_w_waiting) + thread_wakeup(RW_LOCK_WRITER_EVENT(lck)); - if (wakeup_readers) - thread_wakeup(RW_LOCK_READER_EVENT(lck)); - if (wakeup_writers) - thread_wakeup(RW_LOCK_WRITER_EVENT(lck)); + if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) + thread_wakeup(RW_LOCK_READER_EVENT(lck)); + } + if (fake_lck->lck_rw_shared_count) + lock_type = LCK_RW_TYPE_SHARED; + else + lock_type = LCK_RW_TYPE_EXCLUSIVE; + + /* Check if dropping the lock means that we need to unpromote */ + thread = current_thread(); + rwlock_count = thread->rwlock_count--; +#if MACH_LDEBUG + if (rwlock_count == 0) { + panic("rw lock count underflow for thread %p", thread); + } +#endif + if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { + /* sched_flags checked without lock, but will be rechecked while clearing */ + lck_rw_clear_promotion(thread); + } #if CONFIG_DTRACE - LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE ? 1 : 0)); + LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1); #endif - return(lck_rw_type); + return(lock_type); } - - /* * Routine: lck_rw_unlock */ @@ -1168,82 +1189,98 @@ lck_rw_lock( /* * Routine: lck_rw_lock_shared_gen + * Function: + * assembly fast path code has determined that this lock + * is held exclusively... 
this is where we spin/block + * until we can acquire the lock in the shared mode */ void lck_rw_lock_shared_gen( lck_rw_t *lck) { - int i; - wait_result_t res; -#if MACH_LDEBUG - int decrementer; -#endif /* MACH_LDEBUG */ - boolean_t istate; + uint64_t deadline = 0; + int gotlock = 0; + int slept = 0; + wait_result_t res = 0; + boolean_t istate = -1; + #if CONFIG_DTRACE uint64_t wait_interval = 0; - int slept = 0; - int readers_at_sleep; + int readers_at_sleep = 0; + boolean_t dtrace_ls_initialized = FALSE; + boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE; #endif - istate = lck_interlock_lock(lck); + while ( !lck_rw_grab_shared(lck)) { + #if CONFIG_DTRACE - readers_at_sleep = lck->lck_rw_shared_count; + if (dtrace_ls_initialized == FALSE) { + dtrace_ls_initialized = TRUE; + dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0); + dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0); + dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block; + if (dtrace_ls_enabled) { + /* + * Either sleeping or spinning is happening, + * start a timing of our delay interval now. + */ + readers_at_sleep = lck->lck_rw_shared_count; + wait_interval = mach_absolute_time(); + } + } #endif + if (istate == -1) + istate = ml_get_interrupts_enabled(); -#if MACH_LDEBUG - decrementer = DECREMENTER_TIMEOUT; -#endif /* MACH_LDEBUG */ - while ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) && - ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) { + deadline = lck_rw_deadline_for_spin(lck); - i = lock_wait_time[lck->lck_rw_can_sleep ? 1 : 0]; + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START, + (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_START, - (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, i, 0); -#if CONFIG_DTRACE - if ((lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK]) && wait_interval == 0) { - wait_interval = mach_absolute_time(); - } else { - wait_interval = -1; - } -#endif + while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline) + lck_rw_lock_pause(istate); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END, + (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0); + + if (gotlock) + break; + /* + * if we get here, the deadline has expired w/o us + * being able to grab the lock for read + * check to see if we're allowed to do a thread_block + */ + if (lck->lck_rw_can_sleep) { - if (i != 0) { - lck_interlock_unlock(lck, istate); -#if MACH_LDEBUG - if (!--decrementer) - Debugger("timeout - wait no writers"); -#endif /* MACH_LDEBUG */ - while (--i != 0 && - (lck->lck_rw_want_write || lck->lck_rw_want_upgrade) && - ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) - lck_rw_lock_pause(istate); istate = lck_interlock_lock(lck); - } - if (lck->lck_rw_can_sleep && - (lck->lck_rw_want_write || lck->lck_rw_want_upgrade) && - ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) { - lck->lck_r_waiting = TRUE; - res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT); - if (res == THREAD_WAITING) { + if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) && + ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) { + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | 
DBG_FUNC_START, + (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); + + lck->lck_r_waiting = TRUE; + + res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT); lck_interlock_unlock(lck, istate); - res = thread_block(THREAD_CONTINUE_NULL); -#if CONFIG_DTRACE - slept = 1; -#endif - istate = lck_interlock_lock(lck); + + if (res == THREAD_WAITING) { + res = thread_block(THREAD_CONTINUE_NULL); + slept++; + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END, + (int)lck, res, slept, 0, 0); + } else { + lck->lck_rw_shared_count++; + lck_interlock_unlock(lck, istate); + break; } } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_END, - (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, res, 0); } - lck->lck_rw_shared_count++; - - lck_interlock_unlock(lck, istate); #if CONFIG_DTRACE - if (wait_interval != 0 && wait_interval != (unsigned) -1) { + if (dtrace_ls_enabled == TRUE) { if (slept == 0) { LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0); } else { @@ -1258,114 +1295,151 @@ lck_rw_lock_shared_gen( /* - * Routine: lck_rw_lock_shared_to_exclusive + * Routine: lck_rw_lock_shared_to_exclusive_failure * Function: - * Improves a read-only lock to one with - * write permission. If another reader has - * already requested an upgrade to a write lock, - * no lock is held upon return. - * - * Returns FALSE if the upgrade *failed*. + * assembly fast path code has already dropped our read + * count and determined that someone else owns 'lck_rw_want_upgrade' + * if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting' + * all we need to do here is determine if a wakeup is needed */ - boolean_t -lck_rw_lock_shared_to_exclusive( - lck_rw_t *lck) +lck_rw_lock_shared_to_exclusive_failure( + lck_rw_t *lck, + int prior_lock_state) { - int i; - boolean_t do_wakeup = FALSE; - wait_result_t res; -#if MACH_LDEBUG - int decrementer; -#endif /* MACH_LDEBUG */ - boolean_t istate; -#if CONFIG_DTRACE - uint64_t wait_interval = 0; - int slept = 0; - int readers_at_sleep = 0; + lck_rw_t *fake_lck; + thread_t thread = current_thread(); + uint32_t rwlock_count; + + /* Check if dropping the lock means that we need to unpromote */ + rwlock_count = thread->rwlock_count--; +#if MACH_LDEBUG + if (rwlock_count == 0) { + panic("rw lock count underflow for thread %p", thread); + } #endif + if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { + /* sched_flags checked without lock, but will be rechecked while clearing */ + lck_rw_clear_promotion(thread); + } - istate = lck_interlock_lock(lck); - - lck->lck_rw_shared_count--; - - if (lck->lck_rw_want_upgrade) { - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_START, - (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0); + /* + * prior_lock state is a snapshot of the 1st word of the + * lock in question... we'll fake up a pointer to it + * and carefully not access anything beyond whats defined + * in the first word of a lck_rw_t + */ + fake_lck = (lck_rw_t *)&prior_lock_state; + if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) { /* * Someone else has requested upgrade. - * Since we've released a read lock, wake - * him up. 
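lck_rw_done_gen() and lck_rw_lock_shared_to_exclusive_failure() above both receive prior_lock_state from the assembly fast path, which has already performed the atomic update, and overlay a lck_rw_t pointer on that saved word (the "fake_lck") to read the shared count and waiter bits as they were at the instant of release. The sketch below is an illustrative analog of that snapshot-then-decide idiom using C11 atomics; the bitfield packing and names are hypothetical, not the real lck_rw_t layout.

/* Illustrative analog (not XNU code) of the prior_lock_state snapshot trick. */
#include <stdatomic.h>
#include <stdint.h>

typedef union {
    uint32_t word;
    struct {
        uint32_t shared_count : 16;
        uint32_t want_write   : 1;
        uint32_t want_upgrade : 1;
        uint32_t w_waiting    : 1;
        uint32_t r_waiting    : 1;
    } bits;                        /* hypothetical packing, for illustration only */
} demo_rw_state_t;

static void demo_rw_done(_Atomic uint32_t *state_word)
{
    demo_rw_state_t prior, next;

    prior.word = atomic_load(state_word);
    do {
        next = prior;
        if (next.bits.shared_count)
            next.bits.shared_count--;      /* dropping a read hold */
        else if (next.bits.want_upgrade)
            next.bits.want_upgrade = 0;    /* dropping an upgraded hold */
        else
            next.bits.want_write = 0;      /* dropping a write hold */
        next.bits.w_waiting = 0;
        next.bits.r_waiting = 0;
    } while (!atomic_compare_exchange_weak(state_word, &prior.word, next.word));

    /*
     * 'prior' is the snapshot the real code calls prior_lock_state: the
     * wakeup decision looks at the bits as they were before the swap,
     * e.g. wake a writer only if this was the last (or only) holder.
     */
    if (prior.bits.shared_count <= 1 && prior.bits.w_waiting) {
        /* thread_wakeup(RW_LOCK_WRITER_EVENT(lck)) in the kernel */
    }
}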
+ * Since we've released the read lock, wake + * him up if he's blocked waiting */ - if (lck->lck_w_waiting && (lck->lck_rw_shared_count == 0)) { - lck->lck_w_waiting = FALSE; - do_wakeup = TRUE; - } - - lck_interlock_unlock(lck, istate); + thread_wakeup(RW_LOCK_WRITER_EVENT(lck)); + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE, + (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0); - if (do_wakeup) - thread_wakeup(RW_LOCK_WRITER_EVENT(lck)); + return (FALSE); +} - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_END, - (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0); - return (FALSE); - } +/* + * Routine: lck_rw_lock_shared_to_exclusive_failure + * Function: + * assembly fast path code has already dropped our read + * count and successfully acquired 'lck_rw_want_upgrade' + * we just need to wait for the rest of the readers to drain + * and then we can return as the exclusive holder of this lock + */ +boolean_t +lck_rw_lock_shared_to_exclusive_success( + lck_rw_t *lck) +{ + uint64_t deadline = 0; + int slept = 0; + int still_shared = 0; + wait_result_t res; + boolean_t istate = -1; - lck->lck_rw_want_upgrade = TRUE; +#if CONFIG_DTRACE + uint64_t wait_interval = 0; + int readers_at_sleep = 0; + boolean_t dtrace_ls_initialized = FALSE; + boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE; +#endif -#if MACH_LDEBUG - decrementer = DECREMENTER_TIMEOUT; -#endif /* MACH_LDEBUG */ while (lck->lck_rw_shared_count != 0) { + #if CONFIG_DTRACE - if (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] && wait_interval == 0) { - wait_interval = mach_absolute_time(); - readers_at_sleep = lck->lck_rw_shared_count; - } else { - wait_interval = -1; + if (dtrace_ls_initialized == FALSE) { + dtrace_ls_initialized = TRUE; + dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0); + dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0); + dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block; + if (dtrace_ls_enabled) { + /* + * Either sleeping or spinning is happening, + * start a timing of our delay interval now. + */ + readers_at_sleep = lck->lck_rw_shared_count; + wait_interval = mach_absolute_time(); + } } #endif - i = lock_wait_time[lck->lck_rw_can_sleep ? 
1 : 0]; - - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_START, - (int)lck, lck->lck_rw_shared_count, i, 0, 0); - - if (i != 0) { - lck_interlock_unlock(lck, istate); -#if MACH_LDEBUG - if (!--decrementer) - Debugger("timeout - lck_rw_shared_count"); -#endif /* MACH_LDEBUG */ - while (--i != 0 && lck->lck_rw_shared_count != 0) - lck_rw_lock_pause(istate); + if (istate == -1) + istate = ml_get_interrupts_enabled(); + + deadline = lck_rw_deadline_for_spin(lck); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START, + (int)lck, lck->lck_rw_shared_count, 0, 0, 0); + + while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline) + lck_rw_lock_pause(istate); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END, + (int)lck, lck->lck_rw_shared_count, 0, 0, 0); + + if ( !still_shared) + break; + /* + * if we get here, the deadline has expired w/o + * the rw_shared_count having drained to 0 + * check to see if we're allowed to do a thread_block + */ + if (lck->lck_rw_can_sleep) { + istate = lck_interlock_lock(lck); - } + + if (lck->lck_rw_shared_count != 0) { + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START, + (int)lck, lck->lck_rw_shared_count, 0, 0, 0); - if (lck->lck_rw_can_sleep && lck->lck_rw_shared_count != 0) { - lck->lck_w_waiting = TRUE; - res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); - if (res == THREAD_WAITING) { + lck->lck_w_waiting = TRUE; + + res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); lck_interlock_unlock(lck, istate); - res = thread_block(THREAD_CONTINUE_NULL); -#if CONFIG_DTRACE - slept = 1; -#endif - istate = lck_interlock_lock(lck); + + if (res == THREAD_WAITING) { + res = thread_block(THREAD_CONTINUE_NULL); + slept++; + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END, + (int)lck, res, slept, 0, 0); + } else { + lck_interlock_unlock(lck, istate); + break; } } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_END, - (int)lck, lck->lck_rw_shared_count, 0, 0, 0); } - - lck_interlock_unlock(lck, istate); #if CONFIG_DTRACE /* * We infer whether we took the sleep/spin path above by checking readers_at_sleep. */ - if (wait_interval != 0 && wait_interval != (unsigned) -1 && readers_at_sleep) { + if (dtrace_ls_enabled == TRUE) { if (slept == 0) { LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0); } else { @@ -1374,50 +1448,48 @@ lck_rw_lock_shared_to_exclusive( (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep); } } - LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1); #endif return (TRUE); } + /* * Routine: lck_rw_lock_exclusive_to_shared + * Function: + * assembly fast path has already dropped + * our exclusive state and bumped lck_rw_shared_count + * all we need to do here is determine if anyone + * needs to be awakened. 
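Both lck_rw_done_gen() earlier and lck_rw_lock_exclusive_to_shared_gen() just below apply the same wakeup policy to the snapshotted state: writers are preferred, and waiting readers are woken only when no writer is waiting or the lock was created without writer priority (lck_rw_priv_excl clear). Restated as a small predicate, with hypothetical naming:

#include <stdbool.h>

/*
 * Restatement of the reader-wakeup condition used by the rw slow paths:
 * wake readers iff a reader is waiting and it is NOT the case that
 * (writers have priority AND a writer is waiting).
 */
static bool demo_should_wake_readers(bool priv_excl, bool w_waiting, bool r_waiting)
{
    return r_waiting && !(priv_excl && w_waiting);
}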
*/ void -lck_rw_lock_exclusive_to_shared( - lck_rw_t *lck) +lck_rw_lock_exclusive_to_shared_gen( + lck_rw_t *lck, + int prior_lock_state) { - boolean_t wakeup_readers = FALSE; - boolean_t wakeup_writers = FALSE; - boolean_t istate; + lck_rw_t *fake_lck; - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START, - (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); - - istate = lck_interlock_lock(lck); - - lck->lck_rw_shared_count++; - if (lck->lck_rw_want_upgrade) - lck->lck_rw_want_upgrade = FALSE; - else - lck->lck_rw_want_write = FALSE; - - if (lck->lck_w_waiting) { - lck->lck_w_waiting = FALSE; - wakeup_writers = TRUE; - } - if (!(lck->lck_rw_priv_excl && wakeup_writers == TRUE) && - lck->lck_r_waiting) { - lck->lck_r_waiting = FALSE; - wakeup_readers = TRUE; - } + /* + * prior_lock state is a snapshot of the 1st word of the + * lock in question... we'll fake up a pointer to it + * and carefully not access anything beyond whats defined + * in the first word of a lck_rw_t + */ + fake_lck = (lck_rw_t *)&prior_lock_state; - lck_interlock_unlock(lck, istate); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START, + (int)lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0); - if (wakeup_readers) + /* + * don't wake up anyone waiting to take the lock exclusively + * since we hold a read count... when the read count drops to 0, + * the writers will be woken. + * + * wake up any waiting readers if we don't have any writers waiting, + * or the lock is NOT marked as rw_priv_excl (writers have privilege) + */ + if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) thread_wakeup(RW_LOCK_READER_EVENT(lck)); - if (wakeup_writers) - thread_wakeup(RW_LOCK_WRITER_EVENT(lck)); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END, (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0); @@ -1445,74 +1517,6 @@ lck_rw_try_lock( return(FALSE); } -/* - * Routine: lck_rw_try_lock_exclusive - * Function: - * Tries to get a write lock. - * - * Returns FALSE if the lock is not held on return. - */ - -boolean_t -lck_rw_try_lock_exclusive( - lck_rw_t *lck) -{ - boolean_t istate; - - istate = lck_interlock_lock(lck); - - if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || lck->lck_rw_shared_count) { - /* - * Can't get lock. - */ - lck_interlock_unlock(lck, istate); - return(FALSE); - } - - /* - * Have lock. - */ - - lck->lck_rw_want_write = TRUE; - - lck_interlock_unlock(lck, istate); - -#if CONFIG_DTRACE - LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lck, 1); -#endif - return(TRUE); -} - -/* - * Routine: lck_rw_try_lock_shared - * Function: - * Tries to get a read lock. - * - * Returns FALSE if the lock is not held on return. - */ - -boolean_t -lck_rw_try_lock_shared( - lck_rw_t *lck) -{ - boolean_t istate; - - istate = lck_interlock_lock(lck); -/* No reader priority check here... 
*/ - if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade) { - lck_interlock_unlock(lck, istate); - return(FALSE); - } - - lck->lck_rw_shared_count++; - - lck_interlock_unlock(lck, istate); - -#if CONFIG_DTRACE - LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lck, 0); -#endif - return(TRUE); -} void lck_rw_assert( @@ -1539,13 +1543,38 @@ lck_rw_assert( return; } break; + case LCK_RW_ASSERT_NOTHELD: + if (!(lck->lck_rw_want_write || + lck->lck_rw_want_upgrade || + lck->lck_rw_shared_count != 0)) { + return; + } + break; default: break; } - panic("rw lock (%p) not held (mode=%u)\n", lck, type); + panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck); } +/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */ +void +lck_rw_clear_promotions_x86(thread_t thread) +{ +#if MACH_LDEBUG + /* It's fatal to leave a RW lock locked and return to userspace */ + panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread); +#else + /* Paper over the issue */ + thread->rwlock_count = 0; + lck_rw_clear_promotion(thread); +#endif +} + + +#ifdef MUTEX_ZONE +extern zone_t lck_mtx_zone; +#endif /* * Routine: lck_mtx_alloc_init */ @@ -1555,10 +1584,13 @@ lck_mtx_alloc_init( lck_attr_t *attr) { lck_mtx_t *lck; - +#ifdef MUTEX_ZONE + if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0) + lck_mtx_init(lck, grp, attr); +#else if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0) lck_mtx_init(lck, grp, attr); - +#endif return(lck); } @@ -1571,7 +1603,11 @@ lck_mtx_free( lck_grp_t *grp) { lck_mtx_destroy(lck, grp); +#ifdef MUTEX_ZONE + zfree(lck_mtx_zone, lck); +#else kfree(lck, sizeof(lck_mtx_t)); +#endif } /* @@ -1593,7 +1629,10 @@ lck_mtx_ext_init( lck->lck_mtx_grp = grp; if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) - lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT; + lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT; + + lck->lck_mtx.lck_mtx_is_ext = 1; + lck->lck_mtx.lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF; } /* @@ -1620,11 +1659,10 @@ lck_mtx_init( lck->lck_mtx_ptr = lck_ext; } } else { - lck->lck_mtx_ilk = 0; - lck->lck_mtx_locked = 0; - lck->lck_mtx_waiters = 0; - lck->lck_mtx_pri = 0; + lck->lck_mtx_owner = 0; + lck->lck_mtx_state = 0; } + lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF; lck_grp_reference(grp); lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX); } @@ -1651,11 +1689,11 @@ lck_mtx_init_ext( lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT; lck->lck_mtx_ptr = lck_ext; } else { - lck->lck_mtx_ilk = 0; - lck->lck_mtx_locked = 0; - lck->lck_mtx_waiters = 0; - lck->lck_mtx_pri = 0; + lck->lck_mtx_owner = 0; + lck->lck_mtx_state = 0; } + lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF; + lck_grp_reference(grp); lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX); } @@ -1672,8 +1710,13 @@ lck_mtx_destroy( if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) return; +#if MACH_LDEBUG + lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED); +#endif lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT); - lck->lck_mtx_tag = LCK_MTX_TAG_DESTROYED; + + lck_mtx_lock_mark_destroyed(lck); + if (lck_is_indirect) kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t)); lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX); @@ -1681,51 +1724,205 @@ lck_mtx_destroy( return; } + +#define LCK_MTX_LCK_WAIT_CODE 0x20 +#define LCK_MTX_LCK_WAKEUP_CODE 0x21 +#define LCK_MTX_LCK_SPIN_CODE 0x22 +#define LCK_MTX_LCK_ACQUIRE_CODE 0x23 +#define LCK_MTX_LCK_DEMOTE_CODE 0x24 + + +/* + * Routine: lck_mtx_unlock_wakeup_x86 + * + * Invoked 
on unlock when there is + * contention (i.e. the assembly routine sees that + * that mutex->lck_mtx_waiters != 0 or + * that mutex->lck_mtx_promoted != 0... + * + * neither the mutex or interlock is held + */ +void +lck_mtx_unlock_wakeup_x86 ( + lck_mtx_t *mutex, + int prior_lock_state) +{ + lck_mtx_t fake_lck; + + /* + * prior_lock state is a snapshot of the 2nd word of the + * lock in question... we'll fake up a lock with the bits + * copied into place and carefully not access anything + * beyond whats defined in the second word of a lck_mtx_t + */ + fake_lck.lck_mtx_state = prior_lock_state; + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START, + mutex, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0); + + if (__probable(fake_lck.lck_mtx_waiters)) { + if (fake_lck.lck_mtx_waiters > 1) + thread_wakeup_one_with_pri((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)), fake_lck.lck_mtx_pri); + else + thread_wakeup_one((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int))); + } + + if (__improbable(fake_lck.lck_mtx_promoted)) { + thread_t thread = current_thread(); + + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE, + thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0); + + if (thread->promotions > 0) { + spl_t s = splsched(); + + thread_lock(thread); + + if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) { + + thread->sched_flags &= ~TH_SFLAG_PROMOTED; + + if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) { + /* Thread still has a RW lock promotion */ + } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) { + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE, + thread->sched_pri, DEPRESSPRI, 0, mutex, 0); + + set_sched_pri(thread, DEPRESSPRI); + } + else { + if (thread->priority < thread->sched_pri) { + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE, + thread->sched_pri, thread->priority, 0, mutex, 0); + + SCHED(compute_priority)(thread, FALSE); + } + } + } + thread_unlock(thread); + splx(s); + } + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END, + mutex, 0, mutex->lck_mtx_waiters, 0, 0); +} + + +/* + * Routine: lck_mtx_lock_acquire_x86 + * + * Invoked on acquiring the mutex when there is + * contention (i.e. the assembly routine sees that + * that mutex->lck_mtx_waiters != 0 or + * thread->was_promoted_on_wakeup != 0)... + * + * mutex is owned... interlock is held... 
preemption is disabled + */ +void +lck_mtx_lock_acquire_x86( + lck_mtx_t *mutex) +{ + thread_t thread; + integer_t priority; + spl_t s; + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START, + mutex, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); + + if (mutex->lck_mtx_waiters) + priority = mutex->lck_mtx_pri; + else + priority = 0; + + thread = (thread_t)mutex->lck_mtx_owner; /* faster then current_thread() */ + + if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) { + + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE, + thread->sched_pri, priority, thread->was_promoted_on_wakeup, mutex, 0); + + s = splsched(); + thread_lock(thread); + + if (thread->sched_pri < priority) { + /* Do not promote past promotion ceiling */ + assert(priority <= MAXPRI_PROMOTE); + set_sched_pri(thread, priority); + } + if (mutex->lck_mtx_promoted == 0) { + mutex->lck_mtx_promoted = 1; + + thread->promotions++; + thread->sched_flags |= TH_SFLAG_PROMOTED; + } + thread->was_promoted_on_wakeup = 0; + + thread_unlock(thread); + splx(s); + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END, + mutex, 0, mutex->lck_mtx_waiters, 0, 0); +} + + + /* - * Routine: lck_mtx_lock_spinwait + * Routine: lck_mtx_lock_spinwait_x86 * * Invoked trying to acquire a mutex when there is contention but * the holder is running on another processor. We spin for up to a maximum * time waiting for the lock to be released. * * Called with the interlock unlocked. + * returns 0 if mutex acquired + * returns 1 if we spun + * returns 2 if we didn't spin due to the holder not running */ -void -lck_mtx_lock_spinwait( - lck_mtx_t *lck) +int +lck_mtx_lock_spinwait_x86( + lck_mtx_t *mutex) { - thread_t holder; - volatile lck_mtx_t *mutex; - uint64_t deadline; + thread_t holder; + uint64_t deadline; + int retval = 1; + int loopcount = 0; - if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) - mutex = lck; - else - mutex = &lck->lck_mtx_ptr->lck_mtx; - KERNEL_DEBUG( - MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN) | DBG_FUNC_NONE, - (int)lck, (int)mutex->lck_mtx_locked, 0, 0, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START, + mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0); deadline = mach_absolute_time() + MutexSpin; + /* * Spin while: * - mutex is locked, and - * - its locked as a spin lock, or + * - its locked as a spin lock, and * - owner is running on another processor, and * - owner (processor) is not idling, and * - we haven't spun for long enough. */ - while ((holder = (thread_t) mutex->lck_mtx_locked) != NULL) { - if ((holder == (thread_t)MUTEX_LOCKED_AS_SPIN) || - ((holder->machine.specFlags & OnProc) != 0 && - (holder->state & TH_IDLE) == 0 && - mach_absolute_time() < deadline)) { - cpu_pause(); - continue; + do { + if (__probable(lck_mtx_lock_grab_mutex(mutex))) { + retval = 0; + break; } - break; - } + if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) { + + if ( !(holder->machine.specFlags & OnProc) || + (holder->state & TH_IDLE)) { + if (loopcount == 0) + retval = 2; + break; + } + } + cpu_pause(); + + loopcount++; + + } while (mach_absolute_time() < deadline); + + #if CONFIG_DTRACE /* * We've already kept a count via deadline of how long we spun. @@ -1737,165 +1934,115 @@ lck_mtx_lock_spinwait( * penalize only lock groups that have debug/stats enabled * with dtrace processing if desired. 
*/ - if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) { - LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, lck, + if (__probable(mutex->lck_mtx_is_ext == 0)) { + LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex, mach_absolute_time() - (deadline - MutexSpin)); } else { - LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, lck, + LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex, mach_absolute_time() - (deadline - MutexSpin)); } /* The lockstat acquire event is recorded by the assembly code beneath us. */ #endif -} -/* - * Called from assembly code when a destroyed mutex is detected - * during a lock/unlock/try/convert - */ + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END, + mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, retval, 0); -void -lck_mtx_interlock_panic( - lck_mtx_t *lck) -{ - panic("trying to interlock destroyed mutex %p", lck); + return retval; } -#if MACH_KDB - -void -db_show_one_lock( - lock_t *lock) -{ - db_printf("Read_count = 0x%x, %swant_upgrade, %swant_write, ", - lock->lck_rw_shared_count, - lock->lck_rw_want_upgrade ? "" : "!", - lock->lck_rw_want_write ? "" : "!"); - db_printf("%swaiting, %scan_sleep\n", - (lock->lck_r_waiting || lock->lck_w_waiting) ? "" : "!", - lock->lck_rw_can_sleep ? "" : "!"); - db_printf("Interlock:\n"); - db_show_one_simple_lock((db_expr_t) ((vm_offset_t)simple_lock_addr(lock->lck_rw_interlock)), - TRUE, (db_expr_t)0, (char *)0); -} - -#endif /* MACH_KDB */ /* - * The C portion of the mutex package. These routines are only invoked - * if the optimized assembler routines can't do the work. - */ - -/* - * Routine: lock_alloc - * Function: - * Allocate a mutex for external users who cannot - * hard-code the structure definition into their - * objects. - * For now just use kalloc, but a zone is probably - * warranted. - */ -mutex_t * -mutex_alloc( - unsigned short tag) -{ - mutex_t *m; - - if ((m = (mutex_t *)kalloc(sizeof(mutex_t))) != 0) - mutex_init(m, tag); - return(m); -} - -/* - * Routine: mutex_free - * Function: - * Free a mutex allocated for external users. - * For now just use kfree, but a zone is probably - * warranted. + * Routine: lck_mtx_lock_wait_x86 + * + * Invoked in order to wait on contention. + * + * Called with the interlock locked and + * preemption disabled... + * returns it unlocked and with preemption enabled */ void -mutex_free( - mutex_t *m) +lck_mtx_lock_wait_x86 ( + lck_mtx_t *mutex) { - kfree(m, sizeof(mutex_t)); -} + thread_t self = current_thread(); + thread_t holder; + integer_t priority; + spl_t s; +#if CONFIG_DTRACE + uint64_t sleep_start = 0; + if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) { + sleep_start = mach_absolute_time(); + } +#endif + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, + mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); -#if MACH_KDB -/* - * Routines to print out simple_locks and mutexes in a nicely-formatted - * fashion. 
- */ + priority = self->sched_pri; -const char *simple_lock_labels = "ENTRY ILK THREAD DURATION CALLER"; -const char *mutex_labels = "ENTRY LOCKED WAITERS THREAD CALLER"; + if (priority < self->priority) + priority = self->priority; + if (priority < BASEPRI_DEFAULT) + priority = BASEPRI_DEFAULT; -void -db_show_one_simple_lock ( - db_expr_t addr, - boolean_t have_addr, - __unused db_expr_t count, - __unused char * modif) -{ - simple_lock_t saddr = (simple_lock_t) ((vm_offset_t) addr); + /* Do not promote past promotion ceiling */ + priority = MIN(priority, MAXPRI_PROMOTE); - if (saddr == (simple_lock_t)0 || !have_addr) { - db_error ("No simple_lock\n"); - } -#if USLOCK_DEBUG - else if (saddr->lock_type != USLOCK_TAG) - db_error ("Not a simple_lock\n"); -#endif /* USLOCK_DEBUG */ + if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri) + mutex->lck_mtx_pri = priority; + mutex->lck_mtx_waiters++; - db_printf ("%s\n", simple_lock_labels); - db_print_simple_lock (saddr); -} - -void -db_print_simple_lock ( - simple_lock_t addr) -{ + if ( (holder = (thread_t)mutex->lck_mtx_owner) && + holder->sched_pri < mutex->lck_mtx_pri ) { + s = splsched(); + thread_lock(holder); - db_printf ("%08x %3d", addr, *hw_lock_addr(addr->interlock)); -#if USLOCK_DEBUG - db_printf (" %08x", addr->debug.lock_thread); - db_printf (" %08x ", addr->debug.duration[1]); - db_printsym ((int)addr->debug.lock_pc, DB_STGY_ANY); -#endif /* USLOCK_DEBUG */ - db_printf ("\n"); -} + /* holder priority may have been bumped by another thread + * before thread_lock was taken + */ + if (holder->sched_pri < mutex->lck_mtx_pri) { + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE, + holder->sched_pri, priority, thread_tid(holder), mutex, 0); + /* Assert that we're not altering the priority of a + * thread above the MAXPRI_PROMOTE band + */ + assert(holder->sched_pri < MAXPRI_PROMOTE); + set_sched_pri(holder, priority); + + if (mutex->lck_mtx_promoted == 0) { + holder->promotions++; + holder->sched_flags |= TH_SFLAG_PROMOTED; + + mutex->lck_mtx_promoted = 1; + } + } + thread_unlock(holder); + splx(s); + } + assert_wait((event_t)(((unsigned int*)mutex)+((sizeof(lck_mtx_t)-1)/sizeof(unsigned int))), THREAD_UNINT); -void -db_show_one_mutex ( - db_expr_t addr, - boolean_t have_addr, - __unused db_expr_t count, - __unused char * modif) -{ - mutex_t * maddr = (mutex_t *)((vm_offset_t) addr); + lck_mtx_ilk_unlock(mutex); - if (maddr == (mutex_t *)0 || !have_addr) - db_error ("No mutex\n"); -#if MACH_LDEBUG - else if (maddr->type != MUTEX_TAG) - db_error ("Not a mutex\n"); -#endif /* MACH_LDEBUG */ + thread_block(THREAD_CONTINUE_NULL); - db_printf ("%s\n", mutex_labels); - db_print_mutex (maddr); -} + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, + mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); -void -db_print_mutex ( - mutex_t * addr) -{ - db_printf ("%08x %6d %7d", - addr, *addr, addr->lck_mtx.lck_mtx_waiters); -#if MACH_LDEBUG - db_printf (" %08x ", addr->thread); - db_printsym (addr->pc, DB_STGY_ANY); -#endif /* MACH_LDEBUG */ - db_printf ("\n"); +#if CONFIG_DTRACE + /* + * Record the Dtrace lockstat probe for blocking, block time + * measured from when we were entered. 
+ */ + if (sleep_start) { + if (mutex->lck_mtx_is_ext == 0) { + LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex, + mach_absolute_time() - sleep_start); + } else { + LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex, + mach_absolute_time() - sleep_start); + } + } +#endif } - -#endif /* MACH_KDB */
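Taken together, lck_mtx_lock_acquire_x86() and lck_mtx_unlock_wakeup_x86() above form a promote/demote pair: acquiring a contended mutex may bump the owner's promotion count and set TH_SFLAG_PROMOTED, and the unlock path drops the promotion again, recomputing or depressing the priority only when the last promotion disappears. Below is a minimal analog of that counted-boost bookkeeping, with hypothetical names and none of the scheduler locking or the RW-promotion/depression interactions the real code handles.

/* Illustrative analog (not XNU code) of counted priority promotion. */
#include <assert.h>

struct demo_thread {
    int promotions;     /* counted boosts, like thread->promotions */
    int promoted;       /* like TH_SFLAG_PROMOTED */
    int sched_pri;
    int base_pri;       /* priority to fall back to when unpromoted */
};

static void demo_promote(struct demo_thread *t, int pri)
{
    if (t->sched_pri < pri)
        t->sched_pri = pri;
    t->promotions++;
    t->promoted = 1;
}

static void demo_demote(struct demo_thread *t)
{
    assert(t->promotions > 0);
    if (--t->promotions == 0 && t->promoted) {
        t->promoted = 0;
        if (t->sched_pri > t->base_pri)
            t->sched_pri = t->base_pri;   /* SCHED(compute_priority)() in XNU */
    }
}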