X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/0a7de7458d150b5d4dffc935ba399be265ef0a1a..HEAD:/osfmk/arm/locks_arm.c diff --git a/osfmk/arm/locks_arm.c b/osfmk/arm/locks_arm.c index 5b6917ac3..8246489dc 100644 --- a/osfmk/arm/locks_arm.c +++ b/osfmk/arm/locks_arm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2017 Apple Inc. All rights reserved. + * Copyright (c) 2007-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -59,22 +59,23 @@ * Locking primitives implementation */ -#define ATOMIC_PRIVATE 1 #define LOCK_PRIVATE 1 #include -#include +#include #include #include #include #include #include #include -#include #include #include #include +#include +#include +#include #include #include @@ -104,42 +105,28 @@ // These are undesirable when in a panic or a debugger is runnning. #define LOCK_CORRECTNESS_PANIC() (kernel_debugger_entry_count == 0) -unsigned int LcksOpts = 0; - #define ADAPTIVE_SPIN_ENABLE 0x1 -#if __SMP__ int lck_mtx_adaptive_spin_mode = ADAPTIVE_SPIN_ENABLE; -#else /* __SMP__ */ -int lck_mtx_adaptive_spin_mode = 0; -#endif /* __SMP__ */ #define SPINWAIT_OWNER_CHECK_COUNT 4 typedef enum { SPINWAIT_ACQUIRED, /* Got the lock. */ SPINWAIT_INTERLOCK, /* Got the interlock, no owner, but caller must finish acquiring the lock. */ - SPINWAIT_DID_SPIN, /* Got the interlock, spun, but failed to get the lock. */ + SPINWAIT_DID_SPIN_HIGH_THR, /* Got the interlock, spun, but failed to get the lock. */ + SPINWAIT_DID_SPIN_OWNER_NOT_CORE, /* Got the interlock, spun, but failed to get the lock. */ + SPINWAIT_DID_SPIN_NO_WINDOW_CONTENTION, /* Got the interlock, spun, but failed to get the lock. */ + SPINWAIT_DID_SPIN_SLIDING_THR,/* Got the interlock, spun, but failed to get the lock. */ SPINWAIT_DID_NOT_SPIN, /* Got the interlock, did not spin. */ } spinwait_result_t; -#if CONFIG_DTRACE && __SMP__ +#if CONFIG_DTRACE extern uint64_t dtrace_spin_threshold; #endif /* Forwards */ - -#if USLOCK_DEBUG -/* - * Perform simple lock checks. - */ -int uslock_check = 1; -int max_lock_loops = 100000000; -decl_simple_lock_data(extern, printf_lock) -decl_simple_lock_data(extern, panic_lock) -#endif /* USLOCK_DEBUG */ - extern unsigned int not_in_kdp; /* @@ -165,19 +152,6 @@ typedef void *pc_t; * Portable lock package implementation of usimple_locks. */ -#if USLOCK_DEBUG -#define USLDBG(stmt) stmt -void usld_lock_init(usimple_lock_t, unsigned short); -void usld_lock_pre(usimple_lock_t, pc_t); -void usld_lock_post(usimple_lock_t, pc_t); -void usld_unlock(usimple_lock_t, pc_t); -void usld_lock_try_pre(usimple_lock_t, pc_t); -void usld_lock_try_post(usimple_lock_t, pc_t); -int usld_lock_common_checks(usimple_lock_t, const char *); -#else /* USLOCK_DEBUG */ -#define USLDBG(stmt) -#endif /* USLOCK_DEBUG */ - /* * Owner thread pointer when lock held in spin mode */ @@ -190,26 +164,24 @@ int usld_lock_common_checks(usimple_lock_t, const char *); #define lck_rw_ilk_lock(lock) hw_lock_bit ((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT, LCK_GRP_NULL) #define lck_rw_ilk_unlock(lock) hw_unlock_bit((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT) -#define memory_barrier() __c11_atomic_thread_fence(memory_order_acq_rel_smp) -#define load_memory_barrier() __c11_atomic_thread_fence(memory_order_acquire_smp) -#define store_memory_barrier() __c11_atomic_thread_fence(memory_order_release_smp) +#define load_memory_barrier() os_atomic_thread_fence(acquire) // Enforce program order of loads and stores. 
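(Editor's aside, not part of the diff: a minimal sketch of the pattern load_memory_barrier() supports. A relaxed load followed by os_atomic_thread_fence(acquire) behaves like a load-acquire, which is all the lock paths below rely on; the helper name here is hypothetical.)

	static inline uintptr_t
	example_mtx_state_acquire(lck_mtx_t *lock)	/* hypothetical illustration */
	{
		uintptr_t state = ordered_load_mtx(lock);	/* relaxed load of lck_mtx_data */
		load_memory_barrier();	/* acquire fence: later accesses cannot be hoisted above the load */
		return state;
	}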
-#define ordered_load(target, type) \ - __c11_atomic_load((_Atomic type *)(target), memory_order_relaxed) -#define ordered_store(target, type, value) \ - __c11_atomic_store((_Atomic type *)(target), value, memory_order_relaxed) - -#define ordered_load_mtx(lock) ordered_load(&(lock)->lck_mtx_data, uintptr_t) -#define ordered_store_mtx(lock, value) ordered_store(&(lock)->lck_mtx_data, uintptr_t, (value)) -#define ordered_load_rw(lock) ordered_load(&(lock)->lck_rw_data, uint32_t) -#define ordered_store_rw(lock, value) ordered_store(&(lock)->lck_rw_data, uint32_t, (value)) -#define ordered_load_rw_owner(lock) ordered_load(&(lock)->lck_rw_owner, thread_t) -#define ordered_store_rw_owner(lock, value) ordered_store(&(lock)->lck_rw_owner, thread_t, (value)) -#define ordered_load_hw(lock) ordered_load(&(lock)->lock_data, uintptr_t) -#define ordered_store_hw(lock, value) ordered_store(&(lock)->lock_data, uintptr_t, (value)) -#define ordered_load_bit(lock) ordered_load((lock), uint32_t) -#define ordered_store_bit(lock, value) ordered_store((lock), uint32_t, (value)) +#define ordered_load(target) \ + os_atomic_load(target, compiler_acq_rel) +#define ordered_store(target, value) \ + os_atomic_store(target, value, compiler_acq_rel) + +#define ordered_load_mtx(lock) ordered_load(&(lock)->lck_mtx_data) +#define ordered_store_mtx(lock, value) ordered_store(&(lock)->lck_mtx_data, (value)) +#define ordered_load_rw(lock) ordered_load(&(lock)->lck_rw_data) +#define ordered_store_rw(lock, value) ordered_store(&(lock)->lck_rw_data, (value)) +#define ordered_load_rw_owner(lock) ordered_load(&(lock)->lck_rw_owner) +#define ordered_store_rw_owner(lock, value) ordered_store(&(lock)->lck_rw_owner, (value)) +#define ordered_load_hw(lock) ordered_load(&(lock)->lock_data) +#define ordered_store_hw(lock, value) ordered_store(&(lock)->lock_data, (value)) +#define ordered_load_bit(lock) ordered_load((lock)) +#define ordered_store_bit(lock, value) ordered_store((lock), (value)) // Prevent the compiler from reordering memory operations around this @@ -231,6 +203,18 @@ int usld_lock_common_checks(usimple_lock_t, const char *); #define enable_interrupts() __asm__ volatile ("cpsie if" ::: "memory"); #endif +ZONE_VIEW_DEFINE(ZV_LCK_SPIN, "lck_spin", + KHEAP_ID_DEFAULT, sizeof(lck_spin_t)); + +ZONE_VIEW_DEFINE(ZV_LCK_MTX, "lck_mtx", + KHEAP_ID_DEFAULT, sizeof(lck_mtx_t)); + +ZONE_VIEW_DEFINE(ZV_LCK_MTX_EXT, "lck_mtx_ext", + KHEAP_ID_DEFAULT, sizeof(lck_mtx_ext_t)); + +ZONE_VIEW_DEFINE(ZV_LCK_RW, "lck_rw", + KHEAP_ID_DEFAULT, sizeof(lck_rw_t)); + /* * Forward declarations */ @@ -253,11 +237,56 @@ static boolean_t lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait); * atomic_exchange_complete() - conclude an exchange * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin() */ +__unused static uint32_t +load_exclusive32(uint32_t *target, enum memory_order ord) +{ + uint32_t value; + +#if __arm__ + if (_os_atomic_mo_has_release(ord)) { + // Pre-load release barrier + atomic_thread_fence(memory_order_release); + } + value = __builtin_arm_ldrex(target); +#else + if (_os_atomic_mo_has_acquire(ord)) { + value = __builtin_arm_ldaex(target); // ldaxr + } else { + value = __builtin_arm_ldrex(target); // ldxr + } +#endif // __arm__ + return value; +} + +__unused static boolean_t +store_exclusive32(uint32_t *target, uint32_t value, enum memory_order ord) +{ + boolean_t err; + +#if __arm__ + err = __builtin_arm_strex(value, target); + if (_os_atomic_mo_has_acquire(ord)) { + // Post-store acquire barrier + 
atomic_thread_fence(memory_order_acquire); + } +#else + if (_os_atomic_mo_has_release(ord)) { + err = __builtin_arm_stlex(value, target); // stlxr + } else { + err = __builtin_arm_strex(value, target); // stxr + } +#endif // __arm__ + return !err; +} + static uint32_t atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord) { uint32_t val; +#if __ARM_ATOMICS_8_1 + ord = memory_order_relaxed; +#endif val = load_exclusive32(target, ord); *previous = val; return val; @@ -266,14 +295,18 @@ atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order static boolean_t atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord) { +#if __ARM_ATOMICS_8_1 + return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed); +#else (void)previous; // Previous not needed, monitor is held return store_exclusive32(target, newval, ord); +#endif } static void atomic_exchange_abort(void) { - clear_exclusive(); + os_atomic_clear_exclusive(); } static boolean_t @@ -298,287 +331,139 @@ atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, e } } +inline boolean_t +hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait) +{ + return atomic_test_and_set32(target, test_mask, set_mask, ord, wait); +} + +/* + * To help _disable_preemption() inline everywhere with LTO, + * we keep these nice non inlineable functions as the panic() + * codegen setup is quite large and for weird reasons causes a frame. + */ +__abortlike +static void +_disable_preemption_overflow(void) +{ + panic("Preemption count overflow"); +} + void _disable_preemption(void) { - thread_t thread = current_thread(); - unsigned int count; + thread_t thread = current_thread(); + unsigned int count = thread->machine.preemption_count; - count = thread->machine.preemption_count + 1; - ordered_store(&thread->machine.preemption_count, unsigned int, count); + if (__improbable(++count == 0)) { + _disable_preemption_overflow(); + } + + os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel); } -void -_enable_preemption(void) +/* + * This function checks whether an AST_URGENT has been pended. + * + * It is called once the preemption has been reenabled, which means the thread + * may have been preempted right before this was called, and when this function + * actually performs the check, we've changed CPU. + * + * This race is however benign: the point of AST_URGENT is to trigger a context + * switch, so if one happened, there's nothing left to check for, and AST_URGENT + * was cleared in the process. + * + * It follows that this check cannot have false negatives, which allows us + * to avoid fiddling with interrupt state for the vast majority of cases + * when the check will actually be negative. 
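+ *
+ * (Editor's sketch, not part of the diff: the common pairing this check
+ * serves; the critical section is hypothetical.
+ *
+ *	disable_preemption();
+ *	...			// short critical section
+ *	enable_preemption();	// count drops to 0, so this function runs
+ *				// and any pending urgent AST is taken
+ * )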
+ */ +static NOINLINE void +kernel_preempt_check(thread_t thread) { - thread_t thread = current_thread(); - long state; - unsigned int count; + cpu_data_t *cpu_data_ptr; + long state; + #if __arm__ #define INTERRUPT_MASK PSR_IRQF #else // __arm__ #define INTERRUPT_MASK DAIF_IRQF #endif // __arm__ - count = thread->machine.preemption_count; - if (count == 0) { - panic("Preemption count negative"); // Count will go negative when released - } - count--; - if (count > 0) { - goto update_count; // Preemption is still disabled, just update - } - state = get_interrupts(); // Get interrupt state - if (state & INTERRUPT_MASK) { - goto update_count; // Interrupts are already masked, can't take AST here + /* + * This check is racy and could load from another CPU's pending_ast mask, + * but as described above, this can't have false negatives. + */ + cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel); + if (__probable((cpu_data_ptr->cpu_pending_ast & AST_URGENT) == 0)) { + return; } - disable_interrupts_noread(); // Disable interrupts - ordered_store(&thread->machine.preemption_count, unsigned int, count); - if (thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT) { + + /* If interrupts are masked, we can't take an AST here */ + state = get_interrupts(); + if ((state & INTERRUPT_MASK) == 0) { + disable_interrupts_noread(); // Disable interrupts + + /* + * Reload cpu_data_ptr: a context switch would cause it to change. + * Now that interrupts are disabled, this will debounce false positives. + */ + cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel); + if (thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT) { #if __arm__ #if __ARM_USER_PROTECT__ - uintptr_t up = arm_user_protect_begin(thread); + uintptr_t up = arm_user_protect_begin(thread); #endif // __ARM_USER_PROTECT__ - enable_fiq(); + enable_fiq(); #endif // __arm__ - ast_taken_kernel(); // Handle urgent AST + ast_taken_kernel(); // Handle urgent AST #if __arm__ #if __ARM_USER_PROTECT__ - arm_user_protect_end(thread, up, TRUE); + arm_user_protect_end(thread, up, TRUE); #endif // __ARM_USER_PROTECT__ - enable_interrupts(); - return; // Return early on arm only due to FIQ enabling + enable_interrupts(); + return; // Return early on arm only due to FIQ enabling #endif // __arm__ - } - restore_interrupts(state); // Enable interrupts - return; - -update_count: - ordered_store(&thread->machine.preemption_count, unsigned int, count); - return; -} - -int -get_preemption_level(void) -{ - return current_thread()->machine.preemption_count; -} - -#if __SMP__ -static unsigned int -hw_lock_bit_to_contended(hw_lock_bit_t *lock, uint32_t mask, uint32_t timeout LCK_GRP_ARG(lck_grp_t *grp)); -#endif - -static inline unsigned int -hw_lock_bit_to_internal(hw_lock_bit_t *lock, unsigned int bit, uint32_t timeout LCK_GRP_ARG(lck_grp_t *grp)) -{ - unsigned int success = 0; - uint32_t mask = (1 << bit); -#if !__SMP__ - uint32_t state; -#endif - -#if __SMP__ - if (__improbable(!atomic_test_and_set32(lock, mask, mask, memory_order_acquire, FALSE))) { - success = hw_lock_bit_to_contended(lock, mask, timeout LCK_GRP_ARG(grp)); - } else { - success = 1; - } -#else // __SMP__ - (void)timeout; - state = ordered_load_bit(lock); - if (!(mask & state)) { - ordered_store_bit(lock, state | mask); - success = 1; - } -#endif // __SMP__ - - if (success) { - lck_grp_spin_update_held(lock LCK_GRP_ARG(grp)); - } - - return success; -} - -unsigned -int -(hw_lock_bit_to)(hw_lock_bit_t * lock, unsigned int bit, uint32_t timeout 
LCK_GRP_ARG(lck_grp_t *grp)) -{ - _disable_preemption(); - return hw_lock_bit_to_internal(lock, bit, timeout LCK_GRP_ARG(grp)); -} - -#if __SMP__ -static unsigned int NOINLINE -hw_lock_bit_to_contended(hw_lock_bit_t *lock, uint32_t mask, uint32_t timeout LCK_GRP_ARG(lck_grp_t *grp)) -{ - uint64_t end = 0; - int i; -#if CONFIG_DTRACE || LOCK_STATS - uint64_t begin = 0; - boolean_t stat_enabled = lck_grp_spin_spin_enabled(lock LCK_GRP_ARG(grp)); -#endif /* CONFIG_DTRACE || LOCK_STATS */ - -#if LOCK_STATS || CONFIG_DTRACE - if (__improbable(stat_enabled)) { - begin = mach_absolute_time(); - } -#endif /* LOCK_STATS || CONFIG_DTRACE */ - for (;;) { - for (i = 0; i < LOCK_SNOOP_SPINS; i++) { - // Always load-exclusive before wfe - // This grabs the monitor and wakes up on a release event - if (atomic_test_and_set32(lock, mask, mask, memory_order_acquire, TRUE)) { - goto end; - } - } - if (end == 0) { - end = ml_get_timebase() + timeout; - } else if (ml_get_timebase() >= end) { - break; } + restore_interrupts(state); // Enable interrupts } - return 0; -end: -#if CONFIG_DTRACE || LOCK_STATS - if (__improbable(stat_enabled)) { - lck_grp_spin_update_spin(lock LCK_GRP_ARG(grp), mach_absolute_time() - begin); - } - lck_grp_spin_update_miss(lock LCK_GRP_ARG(grp)); -#endif /* CONFIG_DTRACE || LCK_GRP_STAT */ - - return 1; -} -#endif // __SMP__ - -void -(hw_lock_bit)(hw_lock_bit_t * lock, unsigned int bit LCK_GRP_ARG(lck_grp_t *grp)) -{ - if (hw_lock_bit_to(lock, bit, LOCK_PANIC_TIMEOUT, LCK_GRP_PROBEARG(grp))) { - return; - } -#if __SMP__ - panic("hw_lock_bit(): timed out (%p)", lock); -#else - panic("hw_lock_bit(): interlock held (%p)", lock); -#endif -} - -void -(hw_lock_bit_nopreempt)(hw_lock_bit_t * lock, unsigned int bit LCK_GRP_ARG(lck_grp_t *grp)) -{ - if (__improbable(get_preemption_level() == 0)) { - panic("Attempt to take no-preempt bitlock %p in preemptible context", lock); - } - if (hw_lock_bit_to_internal(lock, bit, LOCK_PANIC_TIMEOUT LCK_GRP_ARG(grp))) { - return; - } -#if __SMP__ - panic("hw_lock_bit_nopreempt(): timed out (%p)", lock); -#else - panic("hw_lock_bit_nopreempt(): interlock held (%p)", lock); -#endif -} - -unsigned -int -(hw_lock_bit_try)(hw_lock_bit_t * lock, unsigned int bit LCK_GRP_ARG(lck_grp_t *grp)) -{ - uint32_t mask = (1 << bit); -#if !__SMP__ - uint32_t state; -#endif - boolean_t success = FALSE; - - _disable_preemption(); -#if __SMP__ - // TODO: consider weak (non-looping) atomic test-and-set - success = atomic_test_and_set32(lock, mask, mask, memory_order_acquire, FALSE); -#else - state = ordered_load_bit(lock); - if (!(mask & state)) { - ordered_store_bit(lock, state | mask); - success = TRUE; - } -#endif // __SMP__ - if (!success) { - _enable_preemption(); - } - - if (success) { - lck_grp_spin_update_held(lock LCK_GRP_ARG(grp)); - } - - return success; -} - -static inline void -hw_unlock_bit_internal(hw_lock_bit_t *lock, unsigned int bit) -{ - uint32_t mask = (1 << bit); -#if !__SMP__ - uint32_t state; -#endif - -#if __SMP__ - __c11_atomic_fetch_and((_Atomic uint32_t *)lock, ~mask, memory_order_release); - set_event(); -#else // __SMP__ - state = ordered_load_bit(lock); - ordered_store_bit(lock, state & ~mask); -#endif // __SMP__ -#if CONFIG_DTRACE - LOCKSTAT_RECORD(LS_LCK_SPIN_UNLOCK_RELEASE, lock, bit); -#endif } /* - * Routine: hw_unlock_bit - * - * Release spin-lock. The second parameter is the bit number to test and set. - * Decrement the preemption level. 
+ * To help _enable_preemption() inline everywhere with LTO, + * we keep these nice non inlineable functions as the panic() + * codegen setup is quite large and for weird reasons causes a frame. */ -void -hw_unlock_bit(hw_lock_bit_t * lock, unsigned int bit) +__abortlike +static void +_enable_preemption_underflow(void) { - hw_unlock_bit_internal(lock, bit); - _enable_preemption(); + panic("Preemption count underflow"); } void -hw_unlock_bit_nopreempt(hw_lock_bit_t * lock, unsigned int bit) +_enable_preemption(void) { - if (__improbable(get_preemption_level() == 0)) { - panic("Attempt to release no-preempt bitlock %p in preemptible context", lock); - } - hw_unlock_bit_internal(lock, bit); -} + thread_t thread = current_thread(); + unsigned int count = thread->machine.preemption_count; -#if __SMP__ -static inline boolean_t -interlock_try_disable_interrupts( - lck_mtx_t *mutex, - boolean_t *istate) -{ - *istate = ml_set_interrupts_enabled(FALSE); + if (__improbable(count == 0)) { + _enable_preemption_underflow(); + } + count -= 1; - if (interlock_try(mutex)) { - return 1; - } else { - ml_set_interrupts_enabled(*istate); - return 0; + os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel); + if (count == 0) { + kernel_preempt_check(thread); } + + os_compiler_barrier(); } -static inline void -interlock_unlock_enable_interrupts( - lck_mtx_t *mutex, - boolean_t istate) +int +get_preemption_level(void) { - interlock_unlock(mutex); - ml_set_interrupts_enabled(istate); + return current_thread()->machine.preemption_count; } -#endif /* __SMP__ */ /* * Routine: lck_spin_alloc_init @@ -588,12 +473,10 @@ lck_spin_alloc_init( lck_grp_t * grp, lck_attr_t * attr) { - lck_spin_t *lck; - - if ((lck = (lck_spin_t *) kalloc(sizeof(lck_spin_t))) != 0) { - lck_spin_init(lck, grp, attr); - } + lck_spin_t *lck; + lck = zalloc(ZV_LCK_SPIN); + lck_spin_init(lck, grp, attr); return lck; } @@ -606,7 +489,7 @@ lck_spin_free( lck_grp_t * grp) { lck_spin_destroy(lck, grp); - kfree(lck, sizeof(lck_spin_t)); + zfree(ZV_LCK_SPIN, lck); } /* @@ -618,22 +501,22 @@ lck_spin_init( lck_grp_t * grp, __unused lck_attr_t * attr) { - hw_lock_init(&lck->hwlock); lck->type = LCK_SPIN_TYPE; - lck_grp_reference(grp); - lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN); - store_memory_barrier(); + hw_lock_init(&lck->hwlock); + if (grp) { + lck_grp_reference(grp); + lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN); + } } /* * arm_usimple_lock is a lck_spin_t without a group or attributes */ -void inline +MARK_AS_HIBERNATE_TEXT void inline arm_usimple_lock_init(simple_lock_t lck, __unused unsigned short initial_value) { lck->type = LCK_SPIN_TYPE; hw_lock_init(&lck->hwlock); - store_memory_barrier(); } @@ -767,8 +650,10 @@ lck_spin_destroy( return; } lck->lck_spin_data = LCK_SPIN_TAG_DESTROYED; - lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN); - lck_grp_deallocate(grp); + if (grp) { + lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN); + lck_grp_deallocate(grp); + } } /* @@ -794,12 +679,7 @@ usimple_lock_init( usimple_lock_t l, unsigned short tag) { -#ifndef MACHINE_SIMPLE_LOCK - USLDBG(usld_lock_init(l, tag)); - hw_lock_init(&l->lck_spin_data); -#else simple_lock_init((simple_lock_t) l, tag); -#endif } @@ -815,21 +695,7 @@ void usimple_lock_t l LCK_GRP_ARG(lck_grp_t *grp)) { -#ifndef MACHINE_SIMPLE_LOCK - pc_t pc; - - OBTAIN_PC(pc, l); - USLDBG(usld_lock_pre(l, pc)); - - if (!hw_lock_to(&l->lck_spin_data, LockTimeOut, LCK_GRP_ARG(grp))) { /* Try to get the lock - * with a timeout */ - panic("simple lock deadlock detection - l=%p, cpu=%d, ret=%p", &l, 
cpu_number(), pc); - } - - USLDBG(usld_lock_post(l, pc)); -#else simple_lock((simple_lock_t) l, LCK_GRP_PROBEARG(grp)); -#endif } @@ -846,16 +712,7 @@ void (usimple_unlock)( usimple_lock_t l) { -#ifndef MACHINE_SIMPLE_LOCK - pc_t pc; - - OBTAIN_PC(pc, l); - USLDBG(usld_unlock(l, pc)); - sync(); - hw_lock_unlock(&l->lck_spin_data); -#else simple_unlock((simple_lock_t)l); -#endif } @@ -877,299 +734,9 @@ int usimple_lock_t l LCK_GRP_ARG(lck_grp_t *grp)) { -#ifndef MACHINE_SIMPLE_LOCK - pc_t pc; - unsigned int success; - - OBTAIN_PC(pc, l); - USLDBG(usld_lock_try_pre(l, pc)); - if ((success = hw_lock_try(&l->lck_spin_data LCK_GRP_ARG(grp)))) { - USLDBG(usld_lock_try_post(l, pc)); - } - return success; -#else return simple_lock_try((simple_lock_t) l, grp); -#endif -} - -#if USLOCK_DEBUG -/* - * States of a usimple_lock. The default when initializing - * a usimple_lock is setting it up for debug checking. - */ -#define USLOCK_CHECKED 0x0001 /* lock is being checked */ -#define USLOCK_TAKEN 0x0002 /* lock has been taken */ -#define USLOCK_INIT 0xBAA0 /* lock has been initialized */ -#define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED) -#define USLOCK_CHECKING(l) (uslock_check && \ - ((l)->debug.state & USLOCK_CHECKED)) - -/* - * Trace activities of a particularly interesting lock. - */ -void usl_trace(usimple_lock_t, int, pc_t, const char *); - - -/* - * Initialize the debugging information contained - * in a usimple_lock. - */ -void -usld_lock_init( - usimple_lock_t l, - __unused unsigned short tag) -{ - if (l == USIMPLE_LOCK_NULL) { - panic("lock initialization: null lock pointer"); - } - l->lock_type = USLOCK_TAG; - l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0; - l->debug.lock_cpu = l->debug.unlock_cpu = 0; - l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC; - l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD; - l->debug.duration[0] = l->debug.duration[1] = 0; - l->debug.unlock_cpu = l->debug.unlock_cpu = 0; - l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC; - l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD; -} - - -/* - * These checks apply to all usimple_locks, not just - * those with USLOCK_CHECKED turned on. - */ -int -usld_lock_common_checks( - usimple_lock_t l, - const char *caller) -{ - if (l == USIMPLE_LOCK_NULL) { - panic("%s: null lock pointer", caller); - } - if (l->lock_type != USLOCK_TAG) { - panic("%s: 0x%x is not a usimple lock", caller, (integer_t) l); - } - if (!(l->debug.state & USLOCK_INIT)) { - panic("%s: 0x%x is not an initialized lock", - caller, (integer_t) l); - } - return USLOCK_CHECKING(l); -} - - -/* - * Debug checks on a usimple_lock just before attempting - * to acquire it. - */ -/* ARGSUSED */ -void -usld_lock_pre( - usimple_lock_t l, - pc_t pc) -{ - const char *caller = "usimple_lock"; - - - if (!usld_lock_common_checks(l, caller)) { - return; - } - - /* - * Note that we have a weird case where we are getting a lock when we are] - * in the process of putting the system to sleep. We are running with no - * current threads, therefore we can't tell if we are trying to retake a lock - * we have or someone on the other processor has it. Therefore we just - * ignore this test if the locking thread is 0. 
- */ - - if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread && - l->debug.lock_thread == (void *) current_thread()) { - printf("%s: lock 0x%x already locked (at %p) by", - caller, (integer_t) l, l->debug.lock_pc); - printf(" current thread %p (new attempt at pc %p)\n", - l->debug.lock_thread, pc); - panic("%s", caller); - } - mp_disable_preemption(); - usl_trace(l, cpu_number(), pc, caller); - mp_enable_preemption(); -} - - -/* - * Debug checks on a usimple_lock just after acquiring it. - * - * Pre-emption has been disabled at this point, - * so we are safe in using cpu_number. - */ -void -usld_lock_post( - usimple_lock_t l, - pc_t pc) -{ - int mycpu; - const char *caller = "successful usimple_lock"; - - - if (!usld_lock_common_checks(l, caller)) { - return; - } - - if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) { - panic("%s: lock 0x%x became uninitialized", - caller, (integer_t) l); - } - if ((l->debug.state & USLOCK_TAKEN)) { - panic("%s: lock 0x%x became TAKEN by someone else", - caller, (integer_t) l); - } - - mycpu = cpu_number(); - l->debug.lock_thread = (void *) current_thread(); - l->debug.state |= USLOCK_TAKEN; - l->debug.lock_pc = pc; - l->debug.lock_cpu = mycpu; - - usl_trace(l, mycpu, pc, caller); -} - - -/* - * Debug checks on a usimple_lock just before - * releasing it. Note that the caller has not - * yet released the hardware lock. - * - * Preemption is still disabled, so there's - * no problem using cpu_number. - */ -void -usld_unlock( - usimple_lock_t l, - pc_t pc) -{ - int mycpu; - const char *caller = "usimple_unlock"; - - - if (!usld_lock_common_checks(l, caller)) { - return; - } - - mycpu = cpu_number(); - - if (!(l->debug.state & USLOCK_TAKEN)) { - panic("%s: lock 0x%x hasn't been taken", - caller, (integer_t) l); - } - if (l->debug.lock_thread != (void *) current_thread()) { - panic("%s: unlocking lock 0x%x, owned by thread %p", - caller, (integer_t) l, l->debug.lock_thread); - } - if (l->debug.lock_cpu != mycpu) { - printf("%s: unlocking lock 0x%x on cpu 0x%x", - caller, (integer_t) l, mycpu); - printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu); - panic("%s", caller); - } - usl_trace(l, mycpu, pc, caller); - - l->debug.unlock_thread = l->debug.lock_thread; - l->debug.lock_thread = INVALID_PC; - l->debug.state &= ~USLOCK_TAKEN; - l->debug.unlock_pc = pc; - l->debug.unlock_cpu = mycpu; -} - - -/* - * Debug checks on a usimple_lock just before - * attempting to acquire it. - * - * Preemption isn't guaranteed to be disabled. - */ -void -usld_lock_try_pre( - usimple_lock_t l, - pc_t pc) -{ - const char *caller = "usimple_lock_try"; - - if (!usld_lock_common_checks(l, caller)) { - return; - } - mp_disable_preemption(); - usl_trace(l, cpu_number(), pc, caller); - mp_enable_preemption(); } - -/* - * Debug checks on a usimple_lock just after - * successfully attempting to acquire it. - * - * Preemption has been disabled by the - * lock acquisition attempt, so it's safe - * to use cpu_number. 
- */ -void -usld_lock_try_post( - usimple_lock_t l, - pc_t pc) -{ - int mycpu; - const char *caller = "successful usimple_lock_try"; - - if (!usld_lock_common_checks(l, caller)) { - return; - } - - if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) { - panic("%s: lock 0x%x became uninitialized", - caller, (integer_t) l); - } - if ((l->debug.state & USLOCK_TAKEN)) { - panic("%s: lock 0x%x became TAKEN by someone else", - caller, (integer_t) l); - } - - mycpu = cpu_number(); - l->debug.lock_thread = (void *) current_thread(); - l->debug.state |= USLOCK_TAKEN; - l->debug.lock_pc = pc; - l->debug.lock_cpu = mycpu; - - usl_trace(l, mycpu, pc, caller); -} - - -/* - * For very special cases, set traced_lock to point to a - * specific lock of interest. The result is a series of - * XPRs showing lock operations on that lock. The lock_seq - * value is used to show the order of those operations. - */ -usimple_lock_t traced_lock; -unsigned int lock_seq; - -void -usl_trace( - usimple_lock_t l, - int mycpu, - pc_t pc, - const char *op_name) -{ - if (traced_lock == l) { - XPR(XPR_SLOCK, - "seq %d, cpu %d, %s @ %x\n", - (integer_t) lock_seq, (integer_t) mycpu, - (integer_t) op_name, (integer_t) pc, 0); - lock_seq++; - } -} - - -#endif /* USLOCK_DEBUG */ - /* * The C portion of the shared/exclusive locks package. */ @@ -1178,7 +745,6 @@ usl_trace( * compute the deadline to spin against when * waiting for a change of state on a lck_rw_t */ -#if __SMP__ static inline uint64_t lck_rw_deadline_for_spin(lck_rw_t *lck) { @@ -1204,12 +770,10 @@ lck_rw_deadline_for_spin(lck_rw_t *lck) return mach_absolute_time() + (100000LL * 1000000000LL); } } -#endif // __SMP__ static boolean_t lck_rw_drain_status(lck_rw_t *lock, uint32_t status_mask, boolean_t wait __unused) { -#if __SMP__ uint64_t deadline = 0; uint32_t data; @@ -1225,24 +789,14 @@ lck_rw_drain_status(lck_rw_t *lock, uint32_t status_mask, boolean_t wait __unuse if (wait) { wait_for_event(); } else { - clear_exclusive(); + os_atomic_clear_exclusive(); } if (!wait || (mach_absolute_time() >= deadline)) { return FALSE; } } - clear_exclusive(); + os_atomic_clear_exclusive(); return TRUE; -#else - uint32_t data; - - data = ordered_load_rw(lock); - if ((data & status_mask) == 0) { - return TRUE; - } else { - return FALSE; - } -#endif // __SMP__ } /* @@ -1251,7 +805,6 @@ lck_rw_drain_status(lck_rw_t *lock, uint32_t status_mask, boolean_t wait __unuse static inline void lck_rw_interlock_spin(lck_rw_t *lock) { -#if __SMP__ uint32_t data; for (;;) { @@ -1259,13 +812,10 @@ lck_rw_interlock_spin(lck_rw_t *lock) if (data & LCK_RW_INTERLOCK) { wait_for_event(); } else { - clear_exclusive(); + os_atomic_clear_exclusive(); return; } } -#else - panic("lck_rw_interlock_spin(): Interlock locked %p %x", lock, lock->lck_rw_data); -#endif } /* @@ -1301,13 +851,9 @@ lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait) uint32_t data, prev; boolean_t do_exch; -#if __SMP__ if (wait) { deadline = lck_rw_deadline_for_spin(lock); } -#else - wait = FALSE; // Don't spin on UP systems -#endif for (;;) { data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp); @@ -1355,12 +901,10 @@ lck_rw_alloc_init( lck_grp_t *grp, lck_attr_t *attr) { - lck_rw_t *lck; - - if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) { - lck_rw_init(lck, grp, attr); - } + lck_rw_t *lck; + lck = zalloc_flags(ZV_LCK_RW, Z_WAITOK | Z_ZERO); + lck_rw_init(lck, grp, attr); return lck; } @@ -1373,7 +917,7 @@ lck_rw_free( lck_grp_t *grp) { lck_rw_destroy(lck, grp); - kfree(lck, 
sizeof(lck_rw_t)); + zfree(ZV_LCK_RW, lck); } /* @@ -1436,6 +980,40 @@ lck_rw_lock( } } +#define LCK_RW_LOCK_EXCLUSIVE_TAS(lck) (atomic_test_and_set32(&(lck)->lck_rw_data, \ + (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), \ + LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) + +/* + * Routine: lck_rw_lock_exclusive_check_contended + */ +bool +lck_rw_lock_exclusive_check_contended(lck_rw_t *lock) +{ + thread_t thread = current_thread(); + bool contended = false; + + if (lock->lck_rw_can_sleep) { + thread->rwlock_count++; + } else if (get_preemption_level() == 0) { + panic("Taking non-sleepable RW lock with preemption enabled"); + } + if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) { +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL); +#endif /* CONFIG_DTRACE */ + } else { + contended = true; + lck_rw_lock_exclusive_gen(lock); + } +#if MACH_ASSERT + thread_t owner = ordered_load_rw_owner(lock); + assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner); +#endif + ordered_store_rw_owner(lock, thread); + return contended; +} + /* * Routine: lck_rw_lock_exclusive */ @@ -1444,10 +1022,12 @@ lck_rw_lock_exclusive(lck_rw_t *lock) { thread_t thread = current_thread(); - thread->rwlock_count++; - if (atomic_test_and_set32(&lock->lck_rw_data, - (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), - LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) { + if (lock->lck_rw_can_sleep) { + thread->rwlock_count++; + } else if (get_preemption_level() == 0) { + panic("Taking non-sleepable RW lock with preemption enabled"); + } + if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) { #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL); #endif /* CONFIG_DTRACE */ @@ -1469,7 +1049,11 @@ lck_rw_lock_shared(lck_rw_t *lock) { uint32_t data, prev; - current_thread()->rwlock_count++; + if (lock->lck_rw_can_sleep) { + current_thread()->rwlock_count++; + } else if (get_preemption_level() == 0) { + panic("Taking non-sleepable RW lock with preemption enabled"); + } for (;;) { data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp); if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) { @@ -1495,6 +1079,8 @@ lck_rw_lock_shared(lck_rw_t *lock) /* * Routine: lck_rw_lock_shared_to_exclusive + * + * False returned upon failure, in this case the shared lock is dropped. 
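+ *
+ * Illustrative caller pattern (editor's sketch, not part of this diff; the
+ * lock variable is hypothetical). On failure the caller holds nothing and
+ * typically re-takes the lock exclusively before re-validating state:
+ *
+ *	lck_rw_lock_shared(&obj_lock);
+ *	...
+ *	if (!lck_rw_lock_shared_to_exclusive(&obj_lock)) {
+ *		lck_rw_lock_exclusive(&obj_lock);
+ *		// anything read under the shared hold must be re-checked
+ *	}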
*/ boolean_t lck_rw_lock_shared_to_exclusive(lck_rw_t *lock) @@ -1558,7 +1144,11 @@ lck_rw_lock_shared_to_exclusive_failure( uint32_t rwlock_count; /* Check if dropping the lock means that we need to unpromote */ - rwlock_count = thread->rwlock_count--; + if (lck->lck_rw_can_sleep) { + rwlock_count = thread->rwlock_count--; + } else { + rwlock_count = UINT32_MAX; + } #if MACH_LDEBUG if (rwlock_count == 0) { panic("rw lock count underflow for thread %p", thread); @@ -1708,13 +1298,9 @@ lck_rw_lock_exclusive_to_shared(lck_rw_t *lock) for (;;) { data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp); if (data & LCK_RW_INTERLOCK) { -#if __SMP__ atomic_exchange_abort(); lck_rw_interlock_spin(lock); /* wait for interlock to clear */ continue; -#else - panic("lck_rw_lock_exclusive_to_shared(): Interlock locked (%p): %x", lock, data); -#endif // __SMP__ } data += LCK_RW_SHARED_READER; if (data & LCK_RW_WANT_UPGRADE) { @@ -1811,13 +1397,9 @@ lck_rw_try_lock_shared(lck_rw_t *lock) for (;;) { data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp); if (data & LCK_RW_INTERLOCK) { -#if __SMP__ atomic_exchange_abort(); lck_rw_interlock_spin(lock); continue; -#else - panic("lck_rw_try_lock_shared(): Interlock locked (%p): %x", lock, data); -#endif } if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) { atomic_exchange_abort(); @@ -1833,7 +1415,13 @@ lck_rw_try_lock_shared(lck_rw_t *lock) thread_t owner = ordered_load_rw_owner(lock); assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner); #endif - current_thread()->rwlock_count++; + + if (lock->lck_rw_can_sleep) { + current_thread()->rwlock_count++; + } else if (get_preemption_level() == 0) { + panic("Taking non-sleepable RW lock with preemption enabled"); + } + #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED); #endif /* CONFIG_DTRACE */ @@ -1854,13 +1442,9 @@ lck_rw_try_lock_exclusive(lck_rw_t *lock) for (;;) { data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp); if (data & LCK_RW_INTERLOCK) { -#if __SMP__ atomic_exchange_abort(); lck_rw_interlock_spin(lock); continue; -#else - panic("lck_rw_try_lock_exclusive(): Interlock locked (%p): %x", lock, data); -#endif } if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) { atomic_exchange_abort(); @@ -1873,7 +1457,11 @@ lck_rw_try_lock_exclusive(lck_rw_t *lock) cpu_pause(); } thread = current_thread(); - thread->rwlock_count++; + if (lock->lck_rw_can_sleep) { + thread->rwlock_count++; + } else if (get_preemption_level() == 0) { + panic("Taking non-sleepable RW lock with preemption enabled"); + } #if MACH_ASSERT thread_t owner = ordered_load_rw_owner(lock); assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner); @@ -2144,13 +1732,9 @@ lck_rw_done(lck_rw_t *lock) for (;;) { data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp); if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */ -#if __SMP__ atomic_exchange_abort(); lck_rw_interlock_spin(lock); continue; -#else - panic("lck_rw_done(): Interlock locked (%p): %x", lock, data); -#endif // __SMP__ } if (data & LCK_RW_SHARED_MASK) { /* lock is held shared */ assertf(lock->lck_rw_owner == THREAD_NULL, "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner); @@ -2251,7 +1835,11 @@ lck_rw_done_gen( /* Check if dropping the lock means that we need to unpromote */ thread = current_thread(); - rwlock_count = 
thread->rwlock_count--; + if (fake_lck.can_sleep) { + rwlock_count = thread->rwlock_count--; + } else { + rwlock_count = UINT32_MAX; + } #if MACH_LDEBUG if (rwlock_count == 0) { panic("rw lock count underflow for thread %p", thread); @@ -2372,7 +1960,10 @@ lck_rw_lock_shared_gen( #endif /* CONFIG_DTRACE */ } - +/* + * Required to verify thread ownership for exclusive locks by virtue of PPL + * usage + */ void lck_rw_assert( lck_rw_t *lck, @@ -2453,10 +2044,8 @@ lck_mtx_alloc_init( { lck_mtx_t *lck; - if ((lck = (lck_mtx_t *) kalloc(sizeof(lck_mtx_t))) != 0) { - lck_mtx_init(lck, grp, attr); - } - + lck = zalloc(ZV_LCK_MTX); + lck_mtx_init(lck, grp, attr); return lck; } @@ -2469,7 +2058,7 @@ lck_mtx_free( lck_grp_t * grp) { lck_mtx_destroy(lck, grp); - kfree(lck, sizeof(lck_mtx_t)); + zfree(ZV_LCK_MTX, lck); } /* @@ -2494,18 +2083,16 @@ lck_mtx_init( #ifdef BER_XXX if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) { - if ((lck_ext = (lck_mtx_ext_t *) kalloc(sizeof(lck_mtx_ext_t))) != 0) { - lck_mtx_ext_init(lck_ext, grp, lck_attr); - lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT; - lck->lck_mtx_ptr = lck_ext; - lck->lck_mtx_type = LCK_MTX_TYPE; - } + lck_ext = zalloc(ZV_LCK_MTX_EXT); + lck_mtx_ext_init(lck_ext, grp, lck_attr); + lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT; + lck->lck_mtx_ptr = lck_ext; + lck->lck_mtx_type = LCK_MTX_TYPE; } else #endif { lck->lck_mtx_ptr = NULL; // Clear any padding in the union fields below lck->lck_mtx_waiters = 0; - lck->lck_mtx_pri = 0; lck->lck_mtx_type = LCK_MTX_TYPE; ordered_store_mtx(lck, 0); } @@ -2538,7 +2125,6 @@ lck_mtx_init_ext( lck->lck_mtx_type = LCK_MTX_TYPE; } else { lck->lck_mtx_waiters = 0; - lck->lck_mtx_pri = 0; lck->lck_mtx_type = LCK_MTX_TYPE; ordered_store_mtx(lck, 0); } @@ -2606,6 +2192,10 @@ static inline void lck_mtx_check_preemption(lck_mtx_t *lock) { #if DEVELOPMENT || DEBUG + if (current_cpu_datap()->cpu_hibernate) { + return; + } + int pl = get_preemption_level(); if (pl != 0) { @@ -2627,8 +2217,8 @@ lck_mtx_lock(lck_mtx_t *lock) lck_mtx_verify(lock); lck_mtx_check_preemption(lock); thread = current_thread(); - if (atomic_compare_exchange(&lock->lck_mtx_data, 0, LCK_MTX_THREAD_TO_STATE(thread), - memory_order_acquire_smp, FALSE)) { + if (os_atomic_cmpxchg(&lock->lck_mtx_data, + 0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) { #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0); #endif /* CONFIG_DTRACE */ @@ -2647,6 +2237,7 @@ lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked) uintptr_t state; int waiters = 0; spinwait_result_t sw_res; + struct turnstile *ts = NULL; /* Loop waiting until I see that the mutex is unowned */ for (;;) { @@ -2655,6 +2246,11 @@ lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked) switch (sw_res) { case SPINWAIT_ACQUIRED: + if (ts != NULL) { + interlock_lock(lock); + turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX); + interlock_unlock(lock); + } goto done; case SPINWAIT_INTERLOCK: goto set_owner; @@ -2668,7 +2264,7 @@ lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked) break; } ordered_store_mtx(lock, (state | LCK_ILOCK | ARM_LCK_WAITERS)); // Set waiters bit and wait - lck_mtx_lock_wait(lock, holding_thread); + lck_mtx_lock_wait(lock, holding_thread, &ts); /* returns interlock unlocked */ } @@ -2678,25 +2274,34 @@ set_owner: if (state & ARM_LCK_WAITERS) { /* Skip lck_mtx_lock_acquire if there are no waiters. 
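		 *
		 * Editor's note (not part of the diff): whichever branch runs below
		 * must retire the turnstile taken in lck_mtx_lock_wait(), if any:
		 * lck_mtx_lock_acquire() calls turnstile_complete() when there are
		 * waiters, otherwise it is called here directly, and
		 * turnstile_cleanup() is deferred until after the interlock has
		 * been dropped.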
*/ - waiters = lck_mtx_lock_acquire(lock); + waiters = lck_mtx_lock_acquire(lock, ts); + /* + * lck_mtx_lock_acquire will call + * turnstile_complete + */ + } else { + if (ts != NULL) { + turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX); + } } state = LCK_MTX_THREAD_TO_STATE(thread); if (waiters != 0) { state |= ARM_LCK_WAITERS; } -#if __SMP__ state |= LCK_ILOCK; // Preserve interlock ordered_store_mtx(lock, state); // Set ownership interlock_unlock(lock); // Release interlock, enable preemption -#else - ordered_store_mtx(lock, state); // Set ownership - enable_preemption(); -#endif done: load_memory_barrier(); + assert(thread->turnstile != NULL); + + if (ts != NULL) { + turnstile_cleanup(); + } + #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0); #endif /* CONFIG_DTRACE */ @@ -2713,16 +2318,16 @@ static spinwait_result_t lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t interlocked) { int has_interlock = (int)interlocked; -#if __SMP__ __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock); - thread_t holder; - uint64_t overall_deadline; - uint64_t check_owner_deadline; - uint64_t cur_time; - spinwait_result_t retval = SPINWAIT_DID_SPIN; - int loopcount = 0; - uintptr_t state; - boolean_t istate; + thread_t owner, prev_owner; + uint64_t window_deadline, sliding_deadline, high_deadline; + uint64_t start_time, cur_time, avg_hold_time, bias, delta; + int loopcount = 0; + uint i, prev_owner_cpu; + int total_hold_time_samples, window_hold_time_samples, unfairness; + bool owner_on_core, adjust; + uintptr_t state, new_state, waiters; + spinwait_result_t retval = SPINWAIT_DID_SPIN_HIGH_THR; if (__improbable(!(lck_mtx_adaptive_spin_mode & ADAPTIVE_SPIN_ENABLE))) { if (!has_interlock) { @@ -2732,101 +2337,289 @@ lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t return SPINWAIT_DID_NOT_SPIN; } - state = ordered_load_mtx(lock); - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START, trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(LCK_MTX_STATE_TO_THREAD(state)), lock->lck_mtx_waiters, 0, 0); - cur_time = mach_absolute_time(); - overall_deadline = cur_time + MutexSpin; - check_owner_deadline = cur_time; - - if (has_interlock) { - istate = ml_get_interrupts_enabled(); + start_time = mach_absolute_time(); + /* + * window_deadline represents the "learning" phase. + * The thread collects statistics about the lock during + * window_deadline and then it makes a decision on whether to spin more + * or block according to the concurrency behavior + * observed. + * + * Every thread can spin at least low_MutexSpin. + */ + window_deadline = start_time + low_MutexSpin; + /* + * Sliding_deadline is the adjusted spin deadline + * computed after the "learning" phase. + */ + sliding_deadline = window_deadline; + /* + * High_deadline is a hard deadline. No thread + * can spin more than this deadline. + */ + if (high_MutexSpin >= 0) { + high_deadline = start_time + high_MutexSpin; + } else { + high_deadline = start_time + low_MutexSpin * real_ncpus; } + /* + * Do not know yet which is the owner cpu. + * Initialize prev_owner_cpu with next cpu. 
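+	 *
+	 * Editor's summary (not part of the diff): a spinner therefore spins at
+	 * least low_MutexSpin and never past high_MutexSpin (or
+	 * low_MutexSpin * real_ncpus when high_MutexSpin is negative), and in
+	 * between keeps spinning only while the owner stays on core and was
+	 * seen changing during the last window, or until the sliding deadline
+	 * computed from the observed hold times expires.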
+ */ + prev_owner_cpu = (cpu_number() + 1) % real_ncpus; + total_hold_time_samples = 0; + window_hold_time_samples = 0; + avg_hold_time = 0; + adjust = TRUE; + bias = (os_hash_kernel_pointer(lock) + cpu_number()) % real_ncpus; + /* Snoop the lock state */ state = ordered_load_mtx(lock); + owner = LCK_MTX_STATE_TO_THREAD(state); + prev_owner = owner; + + if (has_interlock) { + if (owner == NULL) { + retval = SPINWAIT_INTERLOCK; + goto done_spinning; + } else { + /* + * We are holding the interlock, so + * we can safely dereference owner. + */ + if (!machine_thread_on_core(owner) || (owner->state & TH_IDLE)) { + retval = SPINWAIT_DID_NOT_SPIN; + goto done_spinning; + } + } + interlock_unlock(lock); + has_interlock = 0; + } /* * Spin while: * - mutex is locked, and * - it's locked as a spin lock, and * - owner is running on another processor, and - * - owner (processor) is not idling, and * - we haven't spun for long enough. */ do { - if (!(state & LCK_ILOCK) || has_interlock) { - if (!has_interlock) { - has_interlock = interlock_try_disable_interrupts(lock, &istate); + /* + * Try to acquire the lock. + */ + owner = LCK_MTX_STATE_TO_THREAD(state); + if (owner == NULL) { + waiters = state & ARM_LCK_WAITERS; + if (waiters) { + /* + * preserve the waiter bit + * and try acquire the interlock. + * Note: we will successfully acquire + * the interlock only if we can also + * acquire the lock. + */ + new_state = ARM_LCK_WAITERS | LCK_ILOCK; + has_interlock = 1; + retval = SPINWAIT_INTERLOCK; + disable_preemption(); + } else { + new_state = LCK_MTX_THREAD_TO_STATE(thread); + retval = SPINWAIT_ACQUIRED; } - if (has_interlock) { - state = ordered_load_mtx(lock); - holder = LCK_MTX_STATE_TO_THREAD(state); + /* + * The cmpxchg will succed only if the lock + * is not owned (doesn't have an owner set) + * and it is not interlocked. + * It will not fail if there are waiters. + */ + if (os_atomic_cmpxchgv(&lock->lck_mtx_data, + waiters, new_state, &state, acquire)) { + goto done_spinning; + } else { + if (waiters) { + has_interlock = 0; + enable_preemption(); + } + } + } - if (holder == NULL) { - retval = SPINWAIT_INTERLOCK; + cur_time = mach_absolute_time(); - if (istate) { - ml_set_interrupts_enabled(istate); - } + /* + * Never spin past high_deadline. + */ + if (cur_time >= high_deadline) { + retval = SPINWAIT_DID_SPIN_HIGH_THR; + break; + } - break; - } + /* + * Check if owner is on core. If not block. + */ + owner = LCK_MTX_STATE_TO_THREAD(state); + if (owner) { + i = prev_owner_cpu; + owner_on_core = FALSE; - if (!(holder->machine.machine_thread_flags & MACHINE_THREAD_FLAGS_ON_CPU) || - (holder->state & TH_IDLE)) { - if (loopcount == 0) { - retval = SPINWAIT_DID_NOT_SPIN; - } + disable_preemption(); + state = ordered_load_mtx(lock); + owner = LCK_MTX_STATE_TO_THREAD(state); - if (istate) { - ml_set_interrupts_enabled(istate); + /* + * For scalability we want to check if the owner is on core + * without locking the mutex interlock. + * If we do not lock the mutex interlock, the owner that we see might be + * invalid, so we cannot dereference it. Therefore we cannot check + * any field of the thread to tell us if it is on core. + * Check if the thread that is running on the other cpus matches the owner. 
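+			 *
+			 * (Editor's note, not part of the diff: the scan below walks
+			 * CpuDataEntries[] starting at prev_owner_cpu and compares each
+			 * cpu_active_thread against the snapshotted owner pointer, so it
+			 * costs at most real_ncpus probes and never dereferences the
+			 * possibly stale owner thread.)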
+ */ + if (owner) { + do { + cpu_data_t *cpu_data_ptr = CpuDataEntries[i].cpu_data_vaddr; + if ((cpu_data_ptr != NULL) && (cpu_data_ptr->cpu_active_thread == owner)) { + owner_on_core = TRUE; + break; } - - break; + if (++i >= real_ncpus) { + i = 0; + } + } while (i != prev_owner_cpu); + enable_preemption(); + + if (owner_on_core) { + prev_owner_cpu = i; + } else { + prev_owner = owner; + state = ordered_load_mtx(lock); + owner = LCK_MTX_STATE_TO_THREAD(state); + if (owner == prev_owner) { + /* + * Owner is not on core. + * Stop spinning. + */ + if (loopcount == 0) { + retval = SPINWAIT_DID_NOT_SPIN; + } else { + retval = SPINWAIT_DID_SPIN_OWNER_NOT_CORE; + } + break; + } + /* + * Fall through if the owner changed while we were scanning. + * The new owner could potentially be on core, so loop + * again. + */ } - - interlock_unlock_enable_interrupts(lock, istate); - has_interlock = 0; + } else { + enable_preemption(); } } - cur_time = mach_absolute_time(); - - if (cur_time >= overall_deadline) { - break; + /* + * Save how many times we see the owner changing. + * We can roughly estimate the the mutex hold + * time and the fairness with that. + */ + if (owner != prev_owner) { + prev_owner = owner; + total_hold_time_samples++; + window_hold_time_samples++; } - check_owner_deadline = cur_time + (MutexSpin / SPINWAIT_OWNER_CHECK_COUNT); + /* + * Learning window expired. + * Try to adjust the sliding_deadline. + */ + if (cur_time >= window_deadline) { + /* + * If there was not contention during the window + * stop spinning. + */ + if (window_hold_time_samples < 1) { + retval = SPINWAIT_DID_SPIN_NO_WINDOW_CONTENTION; + break; + } + + if (adjust) { + /* + * For a fair lock, we'd wait for at most (NCPU-1) periods, + * but the lock is unfair, so let's try to estimate by how much. + */ + unfairness = total_hold_time_samples / real_ncpus; + + if (unfairness == 0) { + /* + * We observed the owner changing `total_hold_time_samples` times which + * let us estimate the average hold time of this mutex for the duration + * of the spin time. + * avg_hold_time = (cur_time - start_time) / total_hold_time_samples; + * + * In this case spin at max avg_hold_time * (real_ncpus - 1) + */ + delta = cur_time - start_time; + sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples; + } else { + /* + * In this case at least one of the other cpus was able to get the lock twice + * while I was spinning. + * We could spin longer but it won't necessarily help if the system is unfair. + * Try to randomize the wait to reduce contention. + * + * We compute how much time we could potentially spin + * and distribute it over the cpus. + * + * bias is an integer between 0 and real_ncpus. + * distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias + */ + delta = high_deadline - cur_time; + sliding_deadline = cur_time + ((delta * bias) / real_ncpus); + adjust = FALSE; + } + } - if (cur_time < check_owner_deadline) { - machine_delay_until(check_owner_deadline - cur_time, check_owner_deadline); + window_deadline += low_MutexSpin; + window_hold_time_samples = 0; } - /* Snoop the lock state */ - state = ordered_load_mtx(lock); + /* + * Stop spinning if we past + * the adjusted deadline. + */ + if (cur_time >= sliding_deadline) { + retval = SPINWAIT_DID_SPIN_SLIDING_THR; + break; + } - if (state == 0) { - /* Try to grab the lock. 
*/ - if (os_atomic_cmpxchg(&lock->lck_mtx_data, - 0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) { - retval = SPINWAIT_ACQUIRED; - break; - } + /* + * We want to arm the monitor for wfe, + * so load exclusively the lock. + * + * NOTE: + * we rely on the fact that wfe will + * eventually return even if the cache line + * is not modified. This way we will keep + * looping and checking if the deadlines expired. + */ + state = os_atomic_load_exclusive(&lock->lck_mtx_data, relaxed); + owner = LCK_MTX_STATE_TO_THREAD(state); + if (owner != NULL) { + wait_for_event(); + state = ordered_load_mtx(lock); + } else { + atomic_exchange_abort(); } loopcount++; } while (TRUE); +done_spinning: #if CONFIG_DTRACE /* - * We've already kept a count via overall_deadline of how long we spun. - * If dtrace is active, then we compute backwards to decide how - * long we spun. - * * Note that we record a different probe id depending on whether * this is a direct or indirect mutex. This allows us to * penalize only lock groups that have debug/stats enabled @@ -2834,10 +2627,10 @@ lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t */ if (__probable(lock->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)) { LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, lock, - mach_absolute_time() - (overall_deadline - MutexSpin)); + mach_absolute_time() - start_time); } else { LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, lock, - mach_absolute_time() - (overall_deadline - MutexSpin)); + mach_absolute_time() - start_time); } /* The lockstat acquire event is recorded by the caller. */ #endif @@ -2846,11 +2639,6 @@ lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END, trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(LCK_MTX_STATE_TO_THREAD(state)), lock->lck_mtx_waiters, retval, 0); -#else /* __SMP__ */ - /* Spinwaiting is not useful on UP systems. */ -#pragma unused(lock, thread) - int retval = SPINWAIT_DID_NOT_SPIN; -#endif /* __SMP__ */ if ((!has_interlock) && (retval != SPINWAIT_ACQUIRED)) { /* We must own either the lock or the interlock on return. 
*/ interlock_lock(lock); @@ -2859,6 +2647,7 @@ lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t return retval; } + /* * Common code for mutex locking as spinlock */ @@ -2918,8 +2707,8 @@ lck_mtx_try_lock(lck_mtx_t *lock) thread_t thread = current_thread(); lck_mtx_verify(lock); - if (atomic_compare_exchange(&lock->lck_mtx_data, 0, LCK_MTX_THREAD_TO_STATE(thread), - memory_order_acquire_smp, FALSE)) { + if (os_atomic_cmpxchg(&lock->lck_mtx_data, + 0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) { #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_ACQUIRE, lock, 0); #endif /* CONFIG_DTRACE */ @@ -2935,7 +2724,6 @@ lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread) uintptr_t state; int waiters; -#if __SMP__ interlock_lock(lock); state = ordered_load_mtx(lock); holding_thread = LCK_MTX_STATE_TO_THREAD(state); @@ -2943,34 +2731,18 @@ lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread) interlock_unlock(lock); return FALSE; } -#else - disable_preemption_for_thread(thread); - state = ordered_load_mtx(lock); - if (state & LCK_ILOCK) { - panic("Unexpected interlock set (%p)", lock); - } - holding_thread = LCK_MTX_STATE_TO_THREAD(state); - if (holding_thread) { - enable_preemption(); - return FALSE; - } - state |= LCK_ILOCK; - ordered_store_mtx(lock, state); -#endif // __SMP__ - waiters = lck_mtx_lock_acquire(lock); + waiters = lck_mtx_lock_acquire(lock, NULL); state = LCK_MTX_THREAD_TO_STATE(thread); if (waiters != 0) { state |= ARM_LCK_WAITERS; } -#if __SMP__ state |= LCK_ILOCK; // Preserve interlock ordered_store_mtx(lock, state); // Set ownership interlock_unlock(lock); // Release interlock, enable preemption -#else - ordered_store_mtx(lock, state); // Set ownership - enable_preemption(); -#endif load_memory_barrier(); + + turnstile_cleanup(); + return TRUE; } @@ -3046,8 +2818,8 @@ lck_mtx_unlock(lck_mtx_t *lock) goto slow_case; } // Locked as a mutex - if (atomic_compare_exchange(&lock->lck_mtx_data, LCK_MTX_THREAD_TO_STATE(thread), 0, - memory_order_release_smp, FALSE)) { + if (os_atomic_cmpxchg(&lock->lck_mtx_data, + LCK_MTX_THREAD_TO_STATE(thread), 0, release)) { #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0); #endif /* CONFIG_DTRACE */ @@ -3061,44 +2833,41 @@ static void NOINLINE lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t ilk_held) { uintptr_t state; + boolean_t cleanup = FALSE; if (ilk_held) { state = ordered_load_mtx(lock); } else { -#if __SMP__ interlock_lock(lock); state = ordered_load_mtx(lock); if (thread != LCK_MTX_STATE_TO_THREAD(state)) { panic("lck_mtx_unlock(): Attempt to release lock not owned by thread (%p)", lock); } -#else - disable_preemption_for_thread(thread); - state = ordered_load_mtx(lock); - if (state & LCK_ILOCK) { - panic("lck_mtx_unlock(): Unexpected interlock set (%p)", lock); - } - if (thread != LCK_MTX_STATE_TO_THREAD(state)) { - panic("lck_mtx_unlock(): Attempt to release lock not owned by thread (%p)", lock); - } - state |= LCK_ILOCK; - ordered_store_mtx(lock, state); -#endif if (state & ARM_LCK_WAITERS) { - lck_mtx_unlock_wakeup(lock, thread); - state = ordered_load_mtx(lock); - } else { - assertf(lock->lck_mtx_pri == 0, "pri=0x%x", lock->lck_mtx_pri); + if (lck_mtx_unlock_wakeup(lock, thread)) { + state = ARM_LCK_WAITERS; + } else { + state = 0; + } + cleanup = TRUE; + goto unlock; } } state &= ARM_LCK_WAITERS; /* Clear state, retain waiters bit */ -#if __SMP__ +unlock: state |= LCK_ILOCK; ordered_store_mtx(lock, state); interlock_unlock(lock); -#else - 
ordered_store_mtx(lock, state); - enable_preemption(); -#endif + if (cleanup) { + /* + * Do not do any turnstile operations outside of this block. + * lock/unlock is called at early stage of boot with single thread, + * when turnstile is not yet initialized. + * Even without contention we can come throught the slow path + * if the mutex is acquired as a spin lock. + */ + turnstile_cleanup(); + } #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0); @@ -3165,19 +2934,15 @@ lck_mtx_convert_spin(lck_mtx_t *lock) } state &= ~(LCK_MTX_THREAD_MASK); // Clear the spin tag ordered_store_mtx(lock, state); - waiters = lck_mtx_lock_acquire(lock); // Acquire to manage priority boosts + waiters = lck_mtx_lock_acquire(lock, NULL); // Acquire to manage priority boosts state = LCK_MTX_THREAD_TO_STATE(thread); if (waiters != 0) { state |= ARM_LCK_WAITERS; } -#if __SMP__ state |= LCK_ILOCK; ordered_store_mtx(lock, state); // Set ownership interlock_unlock(lock); // Release interlock, enable preemption -#else - ordered_store_mtx(lock, state); // Set ownership - enable_preemption(); -#endif + turnstile_cleanup(); } @@ -3232,13 +2997,8 @@ lck_spin_assert(lck_spin_t *lock, unsigned int type) if (holder != 0) { if (holder == thread) { panic("Lock owned by current thread %p = %lx", lock, state); - } else { - panic("Lock %p owned by thread %p", lock, holder); } } - if (state & LCK_ILOCK) { - panic("Lock bit set %p = %lx", lock, state); - } } else { panic("lck_spin_assert(): invalid arg (%u)", type); }
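Editor's addendum (not part of the diff): a minimal caller-side sketch of the assertion interface touched in the final hunk; the lock variable is hypothetical, and LCK_ASSERT_OWNED / LCK_ASSERT_NOTOWNED are the existing assertion types.

	lck_spin_lock(&my_spinlock);
	lck_spin_assert(&my_spinlock, LCK_ASSERT_OWNED);	/* passes: held by this thread */
	lck_spin_unlock(&my_spinlock);
	lck_spin_assert(&my_spinlock, LCK_ASSERT_NOTOWNED);	/* passes: this thread no longer owns it */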