X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/7ddcb079202367355dddccdfa4318e57d50318be..a39ff7e25e19b3a8c3020042a3872ca9ec9659f1:/osfmk/i386/locks_i386.c diff --git a/osfmk/i386/locks_i386.c b/osfmk/i386/locks_i386.c index 048dc704d..039584749 100644 --- a/osfmk/i386/locks_i386.c +++ b/osfmk/i386/locks_i386.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -61,10 +61,8 @@ * Locking primitives implementation */ -#include #include -#include #include #include #include @@ -77,13 +75,8 @@ #include #include -#if MACH_KDB -#include -#include -#include -#include -#endif /* MACH_KDB */ #include /* machine_timeout_suspended() */ +#include #include #include @@ -98,6 +91,11 @@ #if CONFIG_DTRACE #define NEED_DTRACE_DEFS #include <../bsd/sys/lockstat.h> + +#define DTRACE_RW_SHARED 0x0 //reader +#define DTRACE_RW_EXCL 0x1 //writer +#define DTRACE_NO_FLAG 0x0 //not applicable + #endif #define LCK_RW_LCK_EXCLUSIVE_CODE 0x100 @@ -121,13 +119,11 @@ unsigned int LcksOpts=0; -/* Forwards */ - -#if MACH_KDB -void db_print_simple_lock( - simple_lock_t addr); -#endif /* MACH_KDB */ +#if DEVELOPMENT || DEBUG +unsigned int LckDisablePreemptCheck = 0; +#endif +/* Forwards */ #if USLOCK_DEBUG /* @@ -139,6 +135,7 @@ decl_simple_lock_data(extern , printf_lock) decl_simple_lock_data(extern , panic_lock) #endif /* USLOCK_DEBUG */ +extern unsigned int not_in_kdp; /* * We often want to know the addresses of the callers @@ -163,6 +160,63 @@ typedef void *pc_t; #endif /* lint */ #endif /* USLOCK_DEBUG */ +// Enforce program order of loads and stores. +#define ordered_load(target) _Generic( (target),\ + uint32_t* : __c11_atomic_load((_Atomic uint32_t* )(target), memory_order_relaxed), \ + uintptr_t*: __c11_atomic_load((_Atomic uintptr_t*)(target), memory_order_relaxed) ) +#define ordered_store(target, value) _Generic( (target),\ + uint32_t* : __c11_atomic_store((_Atomic uint32_t* )(target), (value), memory_order_relaxed), \ + uintptr_t*: __c11_atomic_store((_Atomic uintptr_t*)(target), (value), memory_order_relaxed) ) + +/* + * atomic exchange API is a low level abstraction of the operations + * to atomically read, modify, and write a pointer. This abstraction works + * for both Intel and ARMv8.1 compare and exchange atomic instructions as + * well as the ARM exclusive instructions. + * + * atomic_exchange_begin() - begin exchange and retrieve current value + * atomic_exchange_complete() - conclude an exchange + * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin() + */ +static uint32_t +atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord) +{ + uint32_t val; + + (void)ord; // Memory order not used + val = __c11_atomic_load((_Atomic uint32_t *)target, memory_order_relaxed); + *previous = val; + return val; +} + +static boolean_t +atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord) +{ + return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed); +} + +static void +atomic_exchange_abort(void) { } + +static boolean_t +atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait) +{ + uint32_t value, prev; + + for ( ; ; ) { + value = atomic_exchange_begin32(target, &prev, ord); + if (value & test_mask) { + if (wait) + cpu_pause(); + else + atomic_exchange_abort(); + return FALSE; + } + value |= set_mask; + if (atomic_exchange_complete32(target, prev, value, ord)) + return TRUE; + } +} /* * Portable lock package implementation of usimple_locks. @@ -182,35 +236,20 @@ int usld_lock_common_checks(usimple_lock_t, char *); #endif /* USLOCK_DEBUG */ -extern int lck_rw_grab_want(lck_rw_t *lck); -extern int lck_rw_grab_shared(lck_rw_t *lck); -extern int lck_rw_held_read_or_upgrade(lck_rw_t *lck); - - /* * Forward definitions */ -void lck_rw_lock_shared_gen( - lck_rw_t *lck); - -void lck_rw_lock_exclusive_gen( - lck_rw_t *lck); - -boolean_t lck_rw_lock_shared_to_exclusive_success( - lck_rw_t *lck); - -boolean_t lck_rw_lock_shared_to_exclusive_failure( - lck_rw_t *lck, - int prior_lock_state); - -void lck_rw_lock_exclusive_to_shared_gen( - lck_rw_t *lck, - int prior_lock_state); - -lck_rw_type_t lck_rw_done_gen( - lck_rw_t *lck, - int prior_lock_state); +static void lck_rw_lock_shared_gen(lck_rw_t *lck); +static void lck_rw_lock_exclusive_gen(lck_rw_t *lck); +static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck); +static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state); +static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state); +static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state); +void lck_rw_clear_promotions_x86(thread_t thread); +static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock); +static boolean_t lck_rw_grab_want(lck_rw_t *lock); +static boolean_t lck_rw_grab_shared(lck_rw_t *lock); /* * Routine: lck_spin_alloc_init @@ -298,7 +337,60 @@ boolean_t lck_spin_try_lock( lck_spin_t *lck) { - return((boolean_t)usimple_lock_try((usimple_lock_t) lck)); + boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck); +#if DEVELOPMENT || DEBUG + if (lrval) { + pltrace(FALSE); + } +#endif + return(lrval); +} + +/* + * Routine: lck_spin_assert + */ +void +lck_spin_assert(lck_spin_t *lock, unsigned int type) +{ + thread_t thread, holder; + uintptr_t state; + + if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) { + panic("lck_spin_assert(): invalid arg (%u)", type); + } + + state = lock->interlock; + holder = (thread_t)state; + thread = current_thread(); + if (type == LCK_ASSERT_OWNED) { + if (__improbable(holder == THREAD_NULL)) { + panic("Lock not owned %p = %lx", lock, state); + } + if (__improbable(holder != thread)) { + panic("Lock not owned by current thread %p = %lx", lock, state); + } + } else if (type == LCK_ASSERT_NOTOWNED) { + if (__improbable(holder != THREAD_NULL)) { + if (holder == thread) { + panic("Lock owned by current thread %p = %lx", lock, state); + } else { + panic("Lock %p owned by thread %p", lock, holder); + } + } + } +} + +/* + * Routine: kdp_lck_spin_is_acquired + * NOT SAFE: To be used only by kernel debugger to avoid deadlock. + * Returns: TRUE if lock is acquired. + */ +boolean_t +kdp_lck_spin_is_acquired(lck_spin_t *lck) { + if (not_in_kdp) { + panic("panic: spinlock acquired check done outside of kernel debugger"); + } + return (lck->interlock != 0)? TRUE : FALSE; } /* @@ -322,20 +414,16 @@ usimple_lock_init( volatile uint32_t spinlock_owner_cpu = ~0; volatile usimple_lock_t spinlock_timed_out; -static uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) { - uint64_t deadline; +uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) { uint32_t i; for (i = 0; i < real_ncpus; i++) { - if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) { + if ((cpu_data_ptr[i] != NULL) && ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr)) { spinlock_owner_cpu = i; - if ((uint32_t) cpu_number() == i) - break; - cpu_datap(i)->cpu_NMI_acknowledged = FALSE; - cpu_NMI_interrupt(i); - deadline = mach_absolute_time() + (LockTimeOut * 2); - while (mach_absolute_time() < deadline && cpu_datap(i)->cpu_NMI_acknowledged == FALSE) - cpu_pause(); + if ((uint32_t) cpu_number() != i) { + /* Cause NMI and panic on the owner's cpu */ + NMIPI_panic(cpu_to_cpumask(i), SPINLOCK_TIMEOUT); + } break; } } @@ -373,13 +461,21 @@ usimple_lock( uintptr_t lowner = (uintptr_t)l->interlock.lock_data; spinlock_timed_out = l; lock_cpu = spinlock_timeout_NMI(lowner); - panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data); + panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu", + l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time()); } } +#if DEVELOPMENT || DEBUG + pltrace(FALSE); +#endif + USLDBG(usld_lock_post(l, pc)); #else simple_lock((simple_lock_t)l); #endif +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0); +#endif } @@ -399,6 +495,9 @@ usimple_unlock( OBTAIN_PC(pc); USLDBG(usld_unlock(l, pc)); +#if DEVELOPMENT || DEBUG + pltrace(TRUE); +#endif hw_lock_unlock(&l->interlock); #else simple_unlock_rwmb((simple_lock_t)l); @@ -429,7 +528,10 @@ usimple_lock_try( OBTAIN_PC(pc); USLDBG(usld_lock_try_pre(l, pc)); if ((success = hw_lock_try(&l->interlock))) { - USLDBG(usld_lock_try_post(l, pc)); +#if DEVELOPMENT || DEBUG + pltrace(FALSE); +#endif + USLDBG(usld_lock_try_post(l, pc)); } return success; #else @@ -437,6 +539,22 @@ usimple_lock_try( #endif } +/* + * Acquire a usimple_lock while polling for pending TLB flushes + * and spinning on a lock. + * + */ +void +usimple_lock_try_lock_loop(usimple_lock_t l) +{ + boolean_t istate = ml_get_interrupts_enabled(); + while (!simple_lock_try((l))) { + if (!istate) + handle_pending_TLB_flushes(); + cpu_pause(); + } +} + #if USLOCK_DEBUG /* * States of a usimple_lock. The default when initializing @@ -546,7 +664,7 @@ usld_lock_post( usimple_lock_t l, pc_t pc) { - register int mycpu; + int mycpu; char caller[] = "successful usimple_lock"; @@ -583,7 +701,7 @@ usld_unlock( usimple_lock_t l, pc_t pc) { - register int mycpu; + int mycpu; char caller[] = "usimple_unlock"; @@ -648,7 +766,7 @@ usld_lock_try_post( usimple_lock_t l, pc_t pc) { - register int mycpu; + int mycpu; char caller[] = "successful usimple_lock_try"; if (!usld_lock_common_checks(l, caller)) @@ -699,126 +817,6 @@ usl_trace( #endif /* USLOCK_DEBUG */ -/* - * Routine: lock_alloc - * Function: - * Allocate a lock for external users who cannot - * hard-code the structure definition into their - * objects. - * For now just use kalloc, but a zone is probably - * warranted. - */ -lock_t * -lock_alloc( - boolean_t can_sleep, - unsigned short tag, - unsigned short tag1) -{ - lock_t *l; - - if ((l = (lock_t *)kalloc(sizeof(lock_t))) != 0) - lock_init(l, can_sleep, tag, tag1); - return(l); -} - -/* - * Routine: lock_free - * Function: - * Free a lock allocated for external users. - * For now just use kfree, but a zone is probably - * warranted. - */ -void -lock_free( - lock_t *l) -{ - kfree(l, sizeof(lock_t)); -} - - -/* - * Routine: lock_init - * Function: - * Initialize a lock; required before use. - * Note that clients declare the "struct lock" - * variables and then initialize them, rather - * than getting a new one from this module. - */ -void -lock_init( - lock_t *l, - boolean_t can_sleep, - __unused unsigned short tag, - __unused unsigned short tag1) -{ - hw_lock_byte_init(&l->lck_rw_interlock); - l->lck_rw_want_write = FALSE; - l->lck_rw_want_upgrade = FALSE; - l->lck_rw_shared_count = 0; - l->lck_rw_can_sleep = can_sleep; - l->lck_rw_tag = tag; - l->lck_rw_priv_excl = 1; - l->lck_r_waiting = l->lck_w_waiting = 0; -} - - -/* - * Sleep locks. These use the same data structure and algorithm - * as the spin locks, but the process sleeps while it is waiting - * for the lock. These work on uniprocessor systems. - */ - -#define DECREMENTER_TIMEOUT 1000000 - -void -lock_write( - register lock_t * l) -{ - lck_rw_lock_exclusive(l); -} - -void -lock_done( - register lock_t * l) -{ - (void) lck_rw_done(l); -} - -void -lock_read( - register lock_t * l) -{ - lck_rw_lock_shared(l); -} - - -/* - * Routine: lock_read_to_write - * Function: - * Improves a read-only lock to one with - * write permission. If another reader has - * already requested an upgrade to a write lock, - * no lock is held upon return. - * - * Returns FALSE if the upgrade *failed*. - */ - -boolean_t -lock_read_to_write( - register lock_t * l) -{ - return lck_rw_lock_shared_to_exclusive(l); -} - -void -lock_write_to_read( - register lock_t * l) -{ - lck_rw_lock_exclusive_to_shared(l); -} - - - /* * Routine: lck_rw_alloc_init */ @@ -883,6 +881,9 @@ lck_rw_destroy( { if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) return; +#if MACH_LDEBUG + lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD); +#endif lck->lck_rw_tag = LCK_RW_TAG_DESTROYED; lck_grp_lckcnt_decr(grp, LCK_TYPE_RW); lck_grp_deallocate(grp); @@ -897,29 +898,22 @@ lck_rw_destroy( #define DECREMENTER_TIMEOUT 1000000 -#define RW_LOCK_READER_EVENT(x) \ - ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_tag)))) - -#define RW_LOCK_WRITER_EVENT(x) \ - ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8)))) - /* * We disable interrupts while holding the RW interlock to prevent an * interrupt from exacerbating hold time. * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock(). */ -static boolean_t +static inline boolean_t lck_interlock_lock(lck_rw_t *lck) { boolean_t istate; istate = ml_set_interrupts_enabled(FALSE); hw_lock_byte_lock(&lck->lck_rw_interlock); - return istate; } -static void +static inline void lck_interlock_unlock(lck_rw_t *lck, boolean_t istate) { hw_lock_byte_unlock(&lck->lck_rw_interlock); @@ -940,6 +934,13 @@ lck_rw_lock_pause(boolean_t interrupts_enabled) cpu_pause(); } +static inline boolean_t +lck_rw_held_read_or_upgrade(lck_rw_t *lock) +{ + if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE)) + return TRUE; + return FALSE; +} /* * compute the deadline to spin against when @@ -968,13 +969,68 @@ lck_rw_deadline_for_spin(lck_rw_t *lck) } +/* + * Spin while interlock is held. + */ + +static inline void +lck_rw_interlock_spin(lck_rw_t *lock) +{ + while (ordered_load(&lock->data) & LCK_RW_INTERLOCK) { + cpu_pause(); + } +} + +static boolean_t +lck_rw_grab_want(lck_rw_t *lock) +{ + uint32_t data, prev; + + for ( ; ; ) { + data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed); + if ((data & LCK_RW_INTERLOCK) == 0) + break; + atomic_exchange_abort(); + lck_rw_interlock_spin(lock); + } + if (data & LCK_RW_WANT_WRITE) { + atomic_exchange_abort(); + return FALSE; + } + data |= LCK_RW_WANT_WRITE; + return atomic_exchange_complete32(&lock->data, prev, data, memory_order_relaxed); +} + +static boolean_t +lck_rw_grab_shared(lck_rw_t *lock) +{ + uint32_t data, prev; + + for ( ; ; ) { + data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); + if ((data & LCK_RW_INTERLOCK) == 0) + break; + atomic_exchange_abort(); + lck_rw_interlock_spin(lock); + } + if (data & (LCK_RW_WANT_WRITE | LCK_RW_WANT_UPGRADE)) { + if (((data & LCK_RW_SHARED_MASK) == 0) || (data & LCK_RW_PRIV_EXCL)) { + atomic_exchange_abort(); + return FALSE; + } + } + data += LCK_RW_SHARED_READER; + return atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp); +} + /* * Routine: lck_rw_lock_exclusive */ -void +static void lck_rw_lock_exclusive_gen( lck_rw_t *lck) { + __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck); uint64_t deadline = 0; int slept = 0; int gotlock = 0; @@ -1015,12 +1071,12 @@ lck_rw_lock_exclusive_gen( deadline = lck_rw_deadline_for_spin(lck); - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0); while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline) lck_rw_lock_pause(istate); - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, gotlock, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0); if (gotlock) break; @@ -1035,10 +1091,11 @@ lck_rw_lock_exclusive_gen( if (lck->lck_rw_want_write) { - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0); lck->lck_w_waiting = TRUE; + thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite); res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); lck_interlock_unlock(lck, istate); @@ -1046,7 +1103,7 @@ lck_rw_lock_exclusive_gen( res = thread_block(THREAD_CONTINUE_NULL); slept++; } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0); } else { lck->lck_rw_want_write = TRUE; lck_interlock_unlock(lck, istate); @@ -1094,12 +1151,12 @@ lck_rw_lock_exclusive_gen( deadline = lck_rw_deadline_for_spin(lck); - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0); while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline) lck_rw_lock_pause(istate); - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, lockheld, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0); if ( !lockheld) break; @@ -1113,10 +1170,11 @@ lck_rw_lock_exclusive_gen( istate = lck_interlock_lock(lck); if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) { - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0); lck->lck_w_waiting = TRUE; + thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite); res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); lck_interlock_unlock(lck, istate); @@ -1124,7 +1182,7 @@ lck_rw_lock_exclusive_gen( res = thread_block(THREAD_CONTINUE_NULL); slept++; } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0); } else { lck_interlock_unlock(lck, istate); /* @@ -1167,11 +1225,53 @@ lck_rw_lock_exclusive_gen( #endif } +/* + * Routine: lck_rw_done + */ + +lck_rw_type_t lck_rw_done(lck_rw_t *lock) +{ + uint32_t data, prev; + + for ( ; ; ) { + data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp); + if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */ + atomic_exchange_abort(); + lck_rw_interlock_spin(lock); + continue; + } + if (data & LCK_RW_SHARED_MASK) { + data -= LCK_RW_SHARED_READER; + if ((data & LCK_RW_SHARED_MASK) == 0) /* if reader count has now gone to 0, check for waiters */ + goto check_waiters; + } else { /* if reader count == 0, must be exclusive lock */ + if (data & LCK_RW_WANT_UPGRADE) { + data &= ~(LCK_RW_WANT_UPGRADE); + } else { + if (data & LCK_RW_WANT_WRITE) + data &= ~(LCK_RW_WANT_EXCL); + else /* lock is not 'owned', panic */ + panic("Releasing non-exclusive RW lock without a reader refcount!"); + } +check_waiters: + if (prev & LCK_RW_W_WAITING) { + data &= ~(LCK_RW_W_WAITING); + if ((prev & LCK_RW_PRIV_EXCL) == 0) + data &= ~(LCK_RW_R_WAITING); + } else + data &= ~(LCK_RW_R_WAITING); + } + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) + break; + cpu_pause(); + } + return lck_rw_done_gen(lock, prev); +} /* * Routine: lck_rw_done_gen * - * called from the assembly language wrapper... + * called from lck_rw_done() * prior_lock_state is the value in the 1st * word of the lock at the time of a successful * atomic compare and exchange with the new value... @@ -1185,13 +1285,15 @@ lck_rw_lock_exclusive_gen( * this by examining the state of the lock before * we changed it */ -lck_rw_type_t +static lck_rw_type_t lck_rw_done_gen( lck_rw_t *lck, - int prior_lock_state) + uint32_t prior_lock_state) { lck_rw_t *fake_lck; lck_rw_type_t lock_type; + thread_t thread; + uint32_t rwlock_count; /* * prior_lock state is a snapshot of the 1st word of the @@ -1213,6 +1315,19 @@ lck_rw_done_gen( else lock_type = LCK_RW_TYPE_EXCLUSIVE; + /* Check if dropping the lock means that we need to unpromote */ + thread = current_thread(); + rwlock_count = thread->rwlock_count--; +#if MACH_LDEBUG + if (rwlock_count == 0) { + panic("rw lock count underflow for thread %p", thread); + } +#endif + if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { + /* sched_flags checked without lock, but will be rechecked while clearing */ + lck_rw_clear_promotion(thread); + } + #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1); #endif @@ -1247,10 +1362,11 @@ lck_rw_unlock_shared( { lck_rw_type_t ret; + assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count); ret = lck_rw_done(lck); if (ret != LCK_RW_TYPE_SHARED) - panic("lck_rw_unlock(): lock held in mode: %d\n", ret); + panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret); } @@ -1286,6 +1402,32 @@ lck_rw_lock( panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type); } +/* + * Routine: lck_rw_lock_shared + */ +void +lck_rw_lock_shared(lck_rw_t *lock) +{ + uint32_t data, prev; + + current_thread()->rwlock_count++; + for ( ; ; ) { + data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); + if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) { + atomic_exchange_abort(); + lck_rw_lock_shared_gen(lock); + break; + } + data += LCK_RW_SHARED_READER; + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) + break; + cpu_pause(); + } +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED); +#endif /* CONFIG_DTRACE */ + return; +} /* * Routine: lck_rw_lock_shared_gen @@ -1294,16 +1436,17 @@ lck_rw_lock( * is held exclusively... this is where we spin/block * until we can acquire the lock in the shared mode */ -void +static void lck_rw_lock_shared_gen( lck_rw_t *lck) { + __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck); uint64_t deadline = 0; int gotlock = 0; int slept = 0; wait_result_t res = 0; boolean_t istate = -1; - + #if CONFIG_DTRACE uint64_t wait_interval = 0; int readers_at_sleep = 0; @@ -1335,13 +1478,13 @@ lck_rw_lock_shared_gen( deadline = lck_rw_deadline_for_spin(lck); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START, - (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); + trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline) lck_rw_lock_pause(istate); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END, - (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0); + trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0); if (gotlock) break; @@ -1358,10 +1501,11 @@ lck_rw_lock_shared_gen( ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) { KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START, - (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); + trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); lck->lck_r_waiting = TRUE; + thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead); res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT); lck_interlock_unlock(lck, istate); @@ -1370,7 +1514,7 @@ lck_rw_lock_shared_gen( slept++; } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END, - (int)lck, res, slept, 0, 0); + trace_lck, res, slept, 0, 0); } else { lck->lck_rw_shared_count++; lck_interlock_unlock(lck, istate); @@ -1394,6 +1538,65 @@ lck_rw_lock_shared_gen( } +/* + * Routine: lck_rw_lock_exclusive + */ + +void +lck_rw_lock_exclusive(lck_rw_t *lock) +{ + current_thread()->rwlock_count++; + if (atomic_test_and_set32(&lock->data, + (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), + LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) { +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL); +#endif /* CONFIG_DTRACE */ + } else + lck_rw_lock_exclusive_gen(lock); +} + + +/* + * Routine: lck_rw_lock_shared_to_exclusive + */ + +boolean_t +lck_rw_lock_shared_to_exclusive(lck_rw_t *lock) +{ + uint32_t data, prev; + + for ( ; ; ) { + data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); + if (data & LCK_RW_INTERLOCK) { + atomic_exchange_abort(); + lck_rw_interlock_spin(lock); + continue; + } + if (data & LCK_RW_WANT_UPGRADE) { + data -= LCK_RW_SHARED_READER; + if ((data & LCK_RW_SHARED_MASK) == 0) /* we were the last reader */ + data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */ + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) + return lck_rw_lock_shared_to_exclusive_failure(lock, prev); + } else { + data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */ + data -= LCK_RW_SHARED_READER; /* and shed our read count */ + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) + break; + } + cpu_pause(); + } + /* we now own the WANT_UPGRADE */ + if (data & LCK_RW_SHARED_MASK) /* check to see if all of the readers are drained */ + lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */ +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0); +#endif + return TRUE; +} + + /* * Routine: lck_rw_lock_shared_to_exclusive_failure * Function: @@ -1402,19 +1605,22 @@ lck_rw_lock_shared_gen( * if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting' * all we need to do here is determine if a wakeup is needed */ -boolean_t +static boolean_t lck_rw_lock_shared_to_exclusive_failure( lck_rw_t *lck, - int prior_lock_state) + uint32_t prior_lock_state) { lck_rw_t *fake_lck; - - /* - * prior_lock state is a snapshot of the 1st word of the - * lock in question... we'll fake up a pointer to it - * and carefully not access anything beyond whats defined - * in the first word of a lck_rw_t - */ + thread_t thread = current_thread(); + uint32_t rwlock_count; + + /* Check if dropping the lock means that we need to unpromote */ + rwlock_count = thread->rwlock_count--; +#if MACH_LDEBUG + if (rwlock_count == 0) { + panic("rw lock count underflow for thread %p", thread); + } +#endif fake_lck = (lck_rw_t *)&prior_lock_state; if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) { @@ -1425,8 +1631,14 @@ lck_rw_lock_shared_to_exclusive_failure( */ thread_wakeup(RW_LOCK_WRITER_EVENT(lck)); } + + if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { + /* sched_flags checked without lock, but will be rechecked while clearing */ + lck_rw_clear_promotion(thread); + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE, - (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0); + VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0); return (FALSE); } @@ -1440,10 +1652,11 @@ lck_rw_lock_shared_to_exclusive_failure( * we just need to wait for the rest of the readers to drain * and then we can return as the exclusive holder of this lock */ -boolean_t +static boolean_t lck_rw_lock_shared_to_exclusive_success( lck_rw_t *lck) { + __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck); uint64_t deadline = 0; int slept = 0; int still_shared = 0; @@ -1481,13 +1694,13 @@ lck_rw_lock_shared_to_exclusive_success( deadline = lck_rw_deadline_for_spin(lck); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START, - (int)lck, lck->lck_rw_shared_count, 0, 0, 0); + trace_lck, lck->lck_rw_shared_count, 0, 0, 0); while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline) lck_rw_lock_pause(istate); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END, - (int)lck, lck->lck_rw_shared_count, 0, 0, 0); + trace_lck, lck->lck_rw_shared_count, 0, 0, 0); if ( !still_shared) break; @@ -1502,10 +1715,11 @@ lck_rw_lock_shared_to_exclusive_success( if (lck->lck_rw_shared_count != 0) { KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START, - (int)lck, lck->lck_rw_shared_count, 0, 0, 0); + trace_lck, lck->lck_rw_shared_count, 0, 0, 0); lck->lck_w_waiting = TRUE; + thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade); res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); lck_interlock_unlock(lck, istate); @@ -1514,7 +1728,7 @@ lck_rw_lock_shared_to_exclusive_success( slept++; } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END, - (int)lck, res, slept, 0, 0); + trace_lck, res, slept, 0, 0); } else { lck_interlock_unlock(lck, istate); break; @@ -1539,32 +1753,56 @@ lck_rw_lock_shared_to_exclusive_success( return (TRUE); } +/* + * Routine: lck_rw_lock_exclusive_to_shared + */ + +void lck_rw_lock_exclusive_to_shared(lck_rw_t *lock) +{ + uint32_t data, prev; + + for ( ; ; ) { + data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp); + if (data & LCK_RW_INTERLOCK) { + atomic_exchange_abort(); + lck_rw_interlock_spin(lock); /* wait for interlock to clear */ + continue; + } + data += LCK_RW_SHARED_READER; + if (data & LCK_RW_WANT_UPGRADE) + data &= ~(LCK_RW_WANT_UPGRADE); + else + data &= ~(LCK_RW_WANT_EXCL); + if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) + data &= ~(LCK_RW_W_WAITING); + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) + break; + cpu_pause(); + } + return lck_rw_lock_exclusive_to_shared_gen(lock, prev); +} + /* - * Routine: lck_rw_lock_exclusive_to_shared + * Routine: lck_rw_lock_exclusive_to_shared_gen * Function: * assembly fast path has already dropped * our exclusive state and bumped lck_rw_shared_count * all we need to do here is determine if anyone * needs to be awakened. */ -void +static void lck_rw_lock_exclusive_to_shared_gen( lck_rw_t *lck, - int prior_lock_state) + uint32_t prior_lock_state) { - lck_rw_t *fake_lck; + __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck); + lck_rw_t *fake_lck; - /* - * prior_lock state is a snapshot of the 1st word of the - * lock in question... we'll fake up a pointer to it - * and carefully not access anything beyond whats defined - * in the first word of a lck_rw_t - */ fake_lck = (lck_rw_t *)&prior_lock_state; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START, - (int)lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0); + trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0); /* * don't wake up anyone waiting to take the lock exclusively @@ -1578,7 +1816,7 @@ lck_rw_lock_exclusive_to_shared_gen( thread_wakeup(RW_LOCK_READER_EVENT(lck)); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END, - (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0); + trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0); #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0); @@ -1603,6 +1841,71 @@ lck_rw_try_lock( return(FALSE); } +/* + * Routine: lck_rw_try_lock_shared + */ + +boolean_t lck_rw_try_lock_shared(lck_rw_t *lock) +{ + uint32_t data, prev; + + for ( ; ; ) { + data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); + if (data & LCK_RW_INTERLOCK) { + atomic_exchange_abort(); + lck_rw_interlock_spin(lock); + continue; + } + if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) { + atomic_exchange_abort(); + return FALSE; /* lock is busy */ + } + data += LCK_RW_SHARED_READER; /* Increment reader refcount */ + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) + break; + cpu_pause(); + } + current_thread()->rwlock_count++; + /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */ +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED); +#endif /* CONFIG_DTRACE */ + return TRUE; +} + + +/* + * Routine: lck_rw_try_lock_exclusive + */ + +boolean_t lck_rw_try_lock_exclusive(lck_rw_t *lock) +{ + uint32_t data, prev; + + for ( ; ; ) { + data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); + if (data & LCK_RW_INTERLOCK) { + atomic_exchange_abort(); + lck_rw_interlock_spin(lock); + continue; + } + if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) { + atomic_exchange_abort(); + return FALSE; /* can't get it */ + } + data |= LCK_RW_WANT_EXCL; + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) + break; + cpu_pause(); + } + + current_thread()->rwlock_count++; +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL); +#endif /* CONFIG_DTRACE */ + return TRUE; +} + void lck_rw_assert( @@ -1629,13 +1932,62 @@ lck_rw_assert( return; } break; + case LCK_RW_ASSERT_NOTHELD: + if (!(lck->lck_rw_want_write || + lck->lck_rw_want_upgrade || + lck->lck_rw_shared_count != 0)) { + return; + } + break; default: break; } - panic("rw lock (%p) not held (mode=%u), first word %08x\n", lck, type, *(uint32_t *)lck); + panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck); +} + +/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */ +void +lck_rw_clear_promotions_x86(thread_t thread) +{ +#if MACH_LDEBUG + /* It's fatal to leave a RW lock locked and return to userspace */ + panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread); +#else + /* Paper over the issue */ + thread->rwlock_count = 0; + lck_rw_clear_promotion(thread); +#endif +} + +boolean_t +lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield) +{ + lck_rw_assert(lck, LCK_RW_ASSERT_SHARED); + + if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || force_yield) { + lck_rw_unlock_shared(lck); + mutex_pause(2); + lck_rw_lock_shared(lck); + return TRUE; + } + + return FALSE; +} + +/* + * Routine: kdp_lck_rw_lock_is_acquired_exclusive + * NOT SAFE: To be used only by kernel debugger to avoid deadlock. + */ +boolean_t +kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) { + if (not_in_kdp) { + panic("panic: rw lock exclusive check done outside of kernel debugger"); + } + return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE; } + #ifdef MUTEX_ZONE extern zone_t lck_mtx_zone; #endif @@ -1696,9 +2048,7 @@ lck_mtx_ext_init( lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT; lck->lck_mtx.lck_mtx_is_ext = 1; -#if defined(__x86_64__) - lck->lck_mtx.lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF; -#endif + lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF; } /* @@ -1728,9 +2078,7 @@ lck_mtx_init( lck->lck_mtx_owner = 0; lck->lck_mtx_state = 0; } -#if defined(__x86_64__) - lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF; -#endif + lck->lck_mtx_pad32 = 0xFFFFFFFF; lck_grp_reference(grp); lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX); } @@ -1760,9 +2108,7 @@ lck_mtx_init_ext( lck->lck_mtx_owner = 0; lck->lck_mtx_state = 0; } -#if defined(__x86_64__) - lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF; -#endif + lck->lck_mtx_pad32 = 0xFFFFFFFF; lck_grp_reference(grp); lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX); @@ -1780,6 +2126,9 @@ lck_mtx_destroy( if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) return; +#if MACH_LDEBUG + lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED); +#endif lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT); lck_mtx_lock_mark_destroyed(lck); @@ -1814,7 +2163,8 @@ lck_mtx_unlock_wakeup_x86 ( lck_mtx_t *mutex, int prior_lock_state) { - lck_mtx_t fake_lck; + __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex); + lck_mtx_t fake_lck; /* * prior_lock state is a snapshot of the 2nd word of the @@ -1825,14 +2175,13 @@ lck_mtx_unlock_wakeup_x86 ( fake_lck.lck_mtx_state = prior_lock_state; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START, - mutex, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0); + trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0); if (__probable(fake_lck.lck_mtx_waiters)) { - if (fake_lck.lck_mtx_waiters > 1) - thread_wakeup_one_with_pri((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)), fake_lck.lck_mtx_pri); + thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri); else - thread_wakeup_one((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int))); + thread_wakeup_one(LCK_MTX_EVENT(mutex)); } if (__improbable(fake_lck.lck_mtx_promoted)) { @@ -1851,18 +2200,20 @@ lck_mtx_unlock_wakeup_x86 ( thread->sched_flags &= ~TH_SFLAG_PROMOTED; - if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) { + if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) { + /* Thread still has a RW lock promotion */ + } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) { KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE, - thread->sched_pri, DEPRESSPRI, 0, mutex, 0); + thread->sched_pri, DEPRESSPRI, 0, trace_lck, 0); set_sched_pri(thread, DEPRESSPRI); } else { - if (thread->priority < thread->sched_pri) { + if (thread->base_pri < thread->sched_pri) { KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE, - thread->sched_pri, thread->priority, 0, mutex, 0); + thread->sched_pri, thread->base_pri, 0, trace_lck, 0); - SCHED(compute_priority)(thread, FALSE); + thread_recompute_sched_pri(thread, FALSE); } } } @@ -1871,7 +2222,7 @@ lck_mtx_unlock_wakeup_x86 ( } } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END, - mutex, 0, mutex->lck_mtx_waiters, 0, 0); + trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); } @@ -1889,12 +2240,13 @@ void lck_mtx_lock_acquire_x86( lck_mtx_t *mutex) { - thread_t thread; - integer_t priority; - spl_t s; + __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex); + thread_t thread; + integer_t priority; + spl_t s; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START, - mutex, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); + trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); if (mutex->lck_mtx_waiters) priority = mutex->lck_mtx_pri; @@ -1906,14 +2258,16 @@ lck_mtx_lock_acquire_x86( if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) { KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE, - thread->sched_pri, priority, thread->was_promoted_on_wakeup, mutex, 0); + thread->sched_pri, priority, thread->was_promoted_on_wakeup, trace_lck, 0); s = splsched(); thread_lock(thread); - if (thread->sched_pri < priority) + if (thread->sched_pri < priority) { + /* Do not promote past promotion ceiling */ + assert(priority <= MAXPRI_PROMOTE); set_sched_pri(thread, priority); - + } if (mutex->lck_mtx_promoted == 0) { mutex->lck_mtx_promoted = 1; @@ -1926,9 +2280,30 @@ lck_mtx_lock_acquire_x86( splx(s); } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END, - mutex, 0, mutex->lck_mtx_waiters, 0, 0); + trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); +} + + +static int +lck_mtx_interlock_try_lock(lck_mtx_t *mutex, boolean_t *istate) +{ + int retval; + + *istate = ml_set_interrupts_enabled(FALSE); + retval = lck_mtx_ilk_try_lock(mutex); + + if (retval == 0) + ml_set_interrupts_enabled(*istate); + + return retval; } +static void +lck_mtx_interlock_unlock(lck_mtx_t *mutex, boolean_t istate) +{ + lck_mtx_ilk_unlock(mutex); + ml_set_interrupts_enabled(istate); +} /* @@ -1947,16 +2322,20 @@ int lck_mtx_lock_spinwait_x86( lck_mtx_t *mutex) { + __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex); thread_t holder; - uint64_t deadline; + uint64_t overall_deadline; + uint64_t check_owner_deadline; + uint64_t cur_time; int retval = 1; int loopcount = 0; - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START, - mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0); + trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0); - deadline = mach_absolute_time() + MutexSpin; + cur_time = mach_absolute_time(); + overall_deadline = cur_time + MutexSpin; + check_owner_deadline = cur_time; /* * Spin while: @@ -1971,25 +2350,42 @@ lck_mtx_lock_spinwait_x86( retval = 0; break; } - if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) { + cur_time = mach_absolute_time(); - if ( !(holder->machine.specFlags & OnProc) || - (holder->state & TH_IDLE)) { - if (loopcount == 0) - retval = 2; - break; + if (cur_time >= overall_deadline) + break; + + if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) { + boolean_t istate; + + if (lck_mtx_interlock_try_lock(mutex, &istate)) { + + if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) { + + if ( !(holder->machine.specFlags & OnProc) || + (holder->state & TH_IDLE)) { + + lck_mtx_interlock_unlock(mutex, istate); + + if (loopcount == 0) + retval = 2; + break; + } + } + lck_mtx_interlock_unlock(mutex, istate); + + check_owner_deadline = cur_time + (MutexSpin / 4); } } cpu_pause(); loopcount++; - } while (mach_absolute_time() < deadline); - + } while (TRUE); #if CONFIG_DTRACE /* - * We've already kept a count via deadline of how long we spun. + * We've already kept a count via overall_deadline of how long we spun. * If dtrace is active, then we compute backwards to decide how * long we spun. * @@ -2000,16 +2396,16 @@ lck_mtx_lock_spinwait_x86( */ if (__probable(mutex->lck_mtx_is_ext == 0)) { LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex, - mach_absolute_time() - (deadline - MutexSpin)); + mach_absolute_time() - (overall_deadline - MutexSpin)); } else { LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex, - mach_absolute_time() - (deadline - MutexSpin)); + mach_absolute_time() - (overall_deadline - MutexSpin)); } /* The lockstat acquire event is recorded by the assembly code beneath us. */ #endif KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END, - mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, retval, 0); + trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0); return retval; } @@ -2029,6 +2425,7 @@ void lck_mtx_lock_wait_x86 ( lck_mtx_t *mutex) { + __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex); thread_t self = current_thread(); thread_t holder; integer_t priority; @@ -2041,30 +2438,38 @@ lck_mtx_lock_wait_x86 ( } #endif KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, - mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); + trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); priority = self->sched_pri; - if (priority < self->priority) - priority = self->priority; + if (priority < self->base_pri) + priority = self->base_pri; if (priority < BASEPRI_DEFAULT) priority = BASEPRI_DEFAULT; + /* Do not promote past promotion ceiling */ + priority = MIN(priority, MAXPRI_PROMOTE); + if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri) mutex->lck_mtx_pri = priority; mutex->lck_mtx_waiters++; if ( (holder = (thread_t)mutex->lck_mtx_owner) && holder->sched_pri < mutex->lck_mtx_pri ) { - s = splsched(); thread_lock(holder); + /* holder priority may have been bumped by another thread + * before thread_lock was taken + */ if (holder->sched_pri < mutex->lck_mtx_pri) { KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE, - holder->sched_pri, priority, thread_tid(holder), mutex, 0); - + holder->sched_pri, priority, thread_tid(holder), trace_lck, 0); + /* Assert that we're not altering the priority of a + * thread above the MAXPRI_PROMOTE band + */ + assert(holder->sched_pri < MAXPRI_PROMOTE); set_sched_pri(holder, priority); if (mutex->lck_mtx_promoted == 0) { @@ -2077,14 +2482,15 @@ lck_mtx_lock_wait_x86 ( thread_unlock(holder); splx(s); } - assert_wait((event_t)(((unsigned int*)mutex)+((sizeof(lck_mtx_t)-1)/sizeof(unsigned int))), THREAD_UNINT); + thread_set_pending_block_hint(self, kThreadWaitKernelMutex); + assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT); lck_mtx_ilk_unlock(mutex); thread_block(THREAD_CONTINUE_NULL); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, - mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); + trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); #if CONFIG_DTRACE /* @@ -2103,65 +2509,50 @@ lck_mtx_lock_wait_x86 ( #endif } - -#if MACH_KDB - -void -db_show_one_lock( - lock_t *lock) -{ - db_printf("Read_count = 0x%x, %swant_upgrade, %swant_write, ", - lock->lck_rw_shared_count, - lock->lck_rw_want_upgrade ? "" : "!", - lock->lck_rw_want_write ? "" : "!"); - db_printf("%swaiting, %scan_sleep\n", - (lock->lck_r_waiting || lock->lck_w_waiting) ? "" : "!", - lock->lck_rw_can_sleep ? "" : "!"); - db_printf("Interlock:\n"); - db_show_one_simple_lock((db_expr_t) ((vm_offset_t)simple_lock_addr(lock->lck_rw_interlock)), - TRUE, (db_expr_t)0, (char *)0); -} - /* - * Routines to print out simple_locks and mutexes in a nicely-formatted - * fashion. + * Routine: kdp_lck_mtx_lock_spin_is_acquired + * NOT SAFE: To be used only by kernel debugger to avoid deadlock. + * Returns: TRUE if lock is acquired. */ - -const char *simple_lock_labels = "ENTRY ILK THREAD DURATION CALLER"; - -void -db_show_one_simple_lock ( - db_expr_t addr, - boolean_t have_addr, - __unused db_expr_t count, - __unused char * modif) +boolean_t +kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck) { - simple_lock_t saddr = (simple_lock_t) ((vm_offset_t) addr); + if (not_in_kdp) { + panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger"); + } - if (saddr == (simple_lock_t)0 || !have_addr) { - db_error ("No simple_lock\n"); + if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) { + return TRUE; } -#if USLOCK_DEBUG - else if (saddr->lock_type != USLOCK_TAG) - db_error ("Not a simple_lock\n"); -#endif /* USLOCK_DEBUG */ - db_printf ("%s\n", simple_lock_labels); - db_print_simple_lock (saddr); + return FALSE; } void -db_print_simple_lock ( - simple_lock_t addr) +kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo) { - - db_printf ("%08x %3d", addr, *hw_lock_addr(addr->interlock)); -#if USLOCK_DEBUG - db_printf (" %08x", addr->debug.lock_thread); - db_printf (" %08x ", addr->debug.duration[1]); - db_printsym ((int)addr->debug.lock_pc, DB_STGY_ANY); -#endif /* USLOCK_DEBUG */ - db_printf ("\n"); + lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event); + waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex); + thread_t holder = (thread_t)mutex->lck_mtx_owner; + waitinfo->owner = thread_tid(holder); } -#endif /* MACH_KDB */ +void +kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo) +{ + lck_rw_t *rwlck = NULL; + switch(waitinfo->wait_type) { + case kThreadWaitKernelRWLockRead: + rwlck = READ_EVENT_TO_RWLOCK(event); + break; + case kThreadWaitKernelRWLockWrite: + case kThreadWaitKernelRWLockUpgrade: + rwlck = WRITE_EVENT_TO_RWLOCK(event); + break; + default: + panic("%s was called with an invalid blocking type", __FUNCTION__); + break; + } + waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck); + waitinfo->owner = 0; +}