X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/8f6c56a50524aa785f7e596d52dddfb331e18961..cb3231590a3c94ab4375e2228bd5e86b0cf1ad7e:/osfmk/i386/locks_i386.c diff --git a/osfmk/i386/locks_i386.c b/osfmk/i386/locks_i386.c index 846038183..c5b0d3037 100644 --- a/osfmk/i386/locks_i386.c +++ b/osfmk/i386/locks_i386.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,34 +22,34 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * @OSF_COPYRIGHT@ */ -/* +/* * Mach Operating System * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University * All Rights Reserved. 
- * + * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. - * + * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * + * * Carnegie Mellon requests users of this software to return to - * + * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 - * + * * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. */ @@ -61,10 +61,11 @@ * Locking primitives implementation */ -#include +#define LOCK_PRIVATE 1 + #include -#include +#include #include #include #include @@ -73,22 +74,22 @@ #include #include #include -#include #include #include -#if MACH_KDB -#include -#include -#include -#include -#endif /* MACH_KDB */ - -#ifdef __ppc__ -#include -#endif - +#include /* machine_timeout_suspended() */ +#include +#include +#include +#include #include +#include + +#if CONFIG_DTRACE +#define DTRACE_RW_SHARED 0x0 //reader +#define DTRACE_RW_EXCL 0x1 //writer +#define DTRACE_NO_FLAG 0x0 //not applicable +#endif /* CONFIG_DTRACE */ #define LCK_RW_LCK_EXCLUSIVE_CODE 0x100 #define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101 @@ -97,22 +98,25 @@ #define LCK_RW_LCK_SH_TO_EX1_CODE 0x104 #define LCK_RW_LCK_EX_TO_SH_CODE 0x105 +#define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106 +#define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107 +#define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108 +#define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109 +#define LCK_RW_LCK_SHARED_SPIN_CODE 0x110 +#define LCK_RW_LCK_SHARED_WAIT_CODE 0x111 +#define 
LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112 +#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113 + #define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG) unsigned int LcksOpts=0; -unsigned int lock_wait_time[2] = { (unsigned int)-1, 100 } ; -/* Forwards */ - -#if MACH_KDB -void db_print_simple_lock( - simple_lock_t addr); - -void db_print_mutex( - mutex_t * addr); -#endif /* MACH_KDB */ +#if DEVELOPMENT || DEBUG +unsigned int LckDisablePreemptCheck = 0; +#endif +/* Forwards */ #if USLOCK_DEBUG /* @@ -120,13 +124,11 @@ void db_print_mutex( */ int uslock_check = 1; int max_lock_loops = 100000000; -decl_simple_lock_data(extern , printf_lock) -decl_simple_lock_data(extern , panic_lock) -#if MACH_KDB -decl_simple_lock_data(extern , kdb_lock) -#endif /* MACH_KDB */ +decl_simple_lock_data(extern , printf_lock); +decl_simple_lock_data(extern , panic_lock); #endif /* USLOCK_DEBUG */ +extern unsigned int not_in_kdp; /* * We often want to know the addresses of the callers @@ -137,7 +139,7 @@ typedef void *pc_t; #define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS) #define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS) #if ANY_LOCK_DEBUG -#define OBTAIN_PC(pc,l) ((pc) = (void *) GET_RETURN_PC(&(l))) +#define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC()) #define DECL_PC(pc) pc_t pc; #else /* ANY_LOCK_DEBUG */ #define DECL_PC(pc) @@ -145,12 +147,67 @@ typedef void *pc_t; /* * Eliminate lint complaints about unused local pc variables. */ -#define OBTAIN_PC(pc,l) ++pc +#define OBTAIN_PC(pc) ++pc #else /* lint */ -#define OBTAIN_PC(pc,l) +#define OBTAIN_PC(pc) #endif /* lint */ #endif /* USLOCK_DEBUG */ +/* + * atomic exchange API is a low level abstraction of the operations + * to atomically read, modify, and write a pointer. This abstraction works + * for both Intel and ARMv8.1 compare and exchange atomic instructions as + * well as the ARM exclusive instructions. 
+ * + * atomic_exchange_begin() - begin exchange and retrieve current value + * atomic_exchange_complete() - conclude an exchange + * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin() + */ +static uint32_t +atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord) +{ + uint32_t val; + + (void)ord; // Memory order not used + val = os_atomic_load(target, relaxed); + *previous = val; + return val; +} + +static boolean_t +atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord) +{ + return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed); +} + +static void +atomic_exchange_abort(void) { } + +static boolean_t +atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait) +{ + uint32_t value, prev; + + for ( ; ; ) { + value = atomic_exchange_begin32(target, &prev, ord); + if (value & test_mask) { + if (wait) + cpu_pause(); + else + atomic_exchange_abort(); + return FALSE; + } + value |= set_mask; + if (atomic_exchange_complete32(target, prev, value, ord)) + return TRUE; + } +} + +inline boolean_t +hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait) +{ + return atomic_test_and_set32(target, test_mask, set_mask, ord, wait); +} /* * Portable lock package implementation of usimple_locks. 
@@ -169,6 +226,29 @@ int usld_lock_common_checks(usimple_lock_t, char *); #define USLDBG(stmt) #endif /* USLOCK_DEBUG */ +/* + * Forward definitions + */ + +static void lck_rw_lock_shared_gen(lck_rw_t *lck); +static void lck_rw_lock_exclusive_gen(lck_rw_t *lck); +static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck); +static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state); +static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state); +static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state); +void lck_rw_clear_promotions_x86(thread_t thread); +static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock); +static boolean_t lck_rw_grab_want(lck_rw_t *lock); +static boolean_t lck_rw_grab_shared(lck_rw_t *lock); +static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, uint32_t state, boolean_t indirect); +static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state); +static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state); +static int lck_mtx_interlock_try_lock(lck_mtx_t *mutex, uint32_t *new_state); +static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state); +static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state); +static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state); + + /* * Routine: lck_spin_alloc_init */ @@ -207,8 +287,10 @@ lck_spin_init( __unused lck_attr_t *attr) { usimple_lock_init((usimple_lock_t) lck, 0); - lck_grp_reference(grp); - lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN); + if (grp) { + lck_grp_reference(grp); + lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN); + } } /* @@ -219,22 +301,33 @@ lck_spin_destroy( lck_spin_t *lck, lck_grp_t *grp) { - if (lck->lck_spin_data[0] == LCK_SPIN_TAG_DESTROYED) + if (lck->interlock == LCK_SPIN_TAG_DESTROYED) return; - 
lck->lck_spin_data[0] = LCK_SPIN_TAG_DESTROYED; - lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN); - lck_grp_deallocate(grp); + lck->interlock = LCK_SPIN_TAG_DESTROYED; + if (grp) { + lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN); + lck_grp_deallocate(grp); + } return; } /* * Routine: lck_spin_lock */ +void +lck_spin_lock_grp( + lck_spin_t *lck, + lck_grp_t *grp) +{ +#pragma unused(grp) + usimple_lock((usimple_lock_t) lck, grp); +} + void lck_spin_lock( lck_spin_t *lck) { - usimple_lock((usimple_lock_t) lck); + usimple_lock((usimple_lock_t) lck, NULL); } /* @@ -247,6 +340,21 @@ lck_spin_unlock( usimple_unlock((usimple_lock_t) lck); } +boolean_t +lck_spin_try_lock_grp( + lck_spin_t *lck, + lck_grp_t *grp) +{ +#pragma unused(grp) + boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, grp); +#if DEVELOPMENT || DEBUG + if (lrval) { + pltrace(FALSE); + } +#endif + return(lrval); +} + /* * Routine: lck_spin_try_lock @@ -255,7 +363,58 @@ boolean_t lck_spin_try_lock( lck_spin_t *lck) { - usimple_lock_try((usimple_lock_t) lck); + boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, LCK_GRP_NULL); +#if DEVELOPMENT || DEBUG + if (lrval) { + pltrace(FALSE); + } +#endif + return(lrval); +} + +/* + * Routine: lck_spin_assert + */ +void +lck_spin_assert(lck_spin_t *lock, unsigned int type) +{ + thread_t thread, holder; + uintptr_t state; + + if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) { + panic("lck_spin_assert(): invalid arg (%u)", type); + } + + state = lock->interlock; + holder = (thread_t)state; + thread = current_thread(); + if (type == LCK_ASSERT_OWNED) { + if (__improbable(holder == THREAD_NULL)) { + panic("Lock not owned %p = %lx", lock, state); + } + if (__improbable(holder != thread)) { + panic("Lock not owned by current thread %p = %lx", lock, state); + } + } else if (type == LCK_ASSERT_NOTOWNED) { + if (__improbable(holder != THREAD_NULL)) { + if (holder == thread) { + panic("Lock owned by current thread %p = %lx", lock, 
state); + } + } + } +} + +/* + * Routine: kdp_lck_spin_is_acquired + * NOT SAFE: To be used only by kernel debugger to avoid deadlock. + * Returns: TRUE if lock is acquired. + */ +boolean_t +kdp_lck_spin_is_acquired(lck_spin_t *lck) { + if (not_in_kdp) { + panic("panic: spinlock acquired check done outside of kernel debugger"); + } + return (lck->interlock != 0)? TRUE : FALSE; } /* @@ -276,6 +435,25 @@ usimple_lock_init( #endif } +volatile uint32_t spinlock_owner_cpu = ~0; +volatile usimple_lock_t spinlock_timed_out; + +uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) { + uint32_t i; + + for (i = 0; i < real_ncpus; i++) { + if ((cpu_data_ptr[i] != NULL) && ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr)) { + spinlock_owner_cpu = i; + if ((uint32_t) cpu_number() != i) { + /* Cause NMI and panic on the owner's cpu */ + NMIPI_panic(cpu_to_cpumask(i), SPINLOCK_TIMEOUT); + } + break; + } + } + + return spinlock_owner_cpu; +} /* * Acquire a usimple_lock. @@ -285,21 +463,43 @@ usimple_lock_init( * maintaining preemption state. 
*/ void -usimple_lock( - usimple_lock_t l) +(usimple_lock)( + usimple_lock_t l + LCK_GRP_ARG(lck_grp_t *grp)) { #ifndef MACHINE_SIMPLE_LOCK - pc_t pc = NULL; + DECL_PC(pc); - OBTAIN_PC(pc, l); + OBTAIN_PC(pc); USLDBG(usld_lock_pre(l, pc)); - if(!hw_lock_to(&l->interlock, LockTimeOut)) /* Try to get the lock with a timeout */ - panic("simple lock deadlock detection - l=%08X, cpu=%d, ret=%08X", l, cpu_number(), pc); + if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) { + boolean_t uslock_acquired = FALSE; + while (machine_timeout_suspended()) { + enable_preemption(); + if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC, grp))) + break; + } + + if (uslock_acquired == FALSE) { + uint32_t lock_cpu; + uintptr_t lowner = (uintptr_t)l->interlock.lock_data; + spinlock_timed_out = l; + lock_cpu = spinlock_timeout_NMI(lowner); + panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu", + l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time()); + } + } +#if DEVELOPMENT || DEBUG + pltrace(FALSE); +#endif USLDBG(usld_lock_post(l, pc)); #else - simple_lock((simple_lock_t)l); + simple_lock((simple_lock_t)l, grp); +#endif +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp)); #endif } @@ -318,8 +518,11 @@ usimple_unlock( #ifndef MACHINE_SIMPLE_LOCK DECL_PC(pc); - OBTAIN_PC(pc, l); + OBTAIN_PC(pc); USLDBG(usld_unlock(l, pc)); +#if DEVELOPMENT || DEBUG + pltrace(TRUE); +#endif hw_lock_unlock(&l->interlock); #else simple_unlock_rwmb((simple_lock_t)l); @@ -341,23 +544,83 @@ usimple_unlock( */ unsigned int usimple_lock_try( - usimple_lock_t l) + usimple_lock_t l, + lck_grp_t *grp) { #ifndef MACHINE_SIMPLE_LOCK - DECL_PC(pc); unsigned int success; + DECL_PC(pc); - OBTAIN_PC(pc, l); + OBTAIN_PC(pc); USLDBG(usld_lock_try_pre(l, pc)); - if ((success = 
hw_lock_try(&l->interlock))) { - USLDBG(usld_lock_try_post(l, pc)); + if ((success = hw_lock_try(&l->interlock, grp))) { +#if DEVELOPMENT || DEBUG + pltrace(FALSE); +#endif + USLDBG(usld_lock_try_post(l, pc)); } return success; #else - return(simple_lock_try((simple_lock_t)l)); + return(simple_lock_try((simple_lock_t)l, grp)); #endif } +/* + * Acquire a usimple_lock while polling for pending cpu signals + * and spinning on a lock. + * + */ +unsigned int +(usimple_lock_try_lock_mp_signal_safe_loop_deadline)(usimple_lock_t l, + uint64_t deadline + LCK_GRP_ARG(lck_grp_t *grp)) +{ + boolean_t istate = ml_get_interrupts_enabled(); + + if (deadline < mach_absolute_time()) { + return 0; + } + + while (!simple_lock_try(l, grp)) { + if (!istate) + cpu_signal_handler(NULL); + + if (deadline < mach_absolute_time()) { + return 0; + } + + cpu_pause(); + } + + return 1; +} + +void +(usimple_lock_try_lock_loop)(usimple_lock_t l + LCK_GRP_ARG(lck_grp_t *grp)) +{ + usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ULLONG_MAX, grp); +} + +unsigned int +(usimple_lock_try_lock_mp_signal_safe_loop_duration)(usimple_lock_t l, + uint64_t duration + LCK_GRP_ARG(lck_grp_t *grp)) +{ + uint64_t deadline; + uint64_t base_at = mach_absolute_time(); + uint64_t duration_at; + + nanoseconds_to_absolutetime(duration, &duration_at); + deadline = base_at + duration_at; + if (deadline < base_at) { + /* deadline has overflowed, make it saturate */ + deadline = ULLONG_MAX; + } + + return usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, deadline, grp); +} + #if USLOCK_DEBUG /* * States of a usimple_lock. The default when initializing @@ -370,12 +633,6 @@ usimple_lock_try( #define USLOCK_CHECKING(l) (uslock_check && \ ((l)->debug.state & USLOCK_CHECKED)) -/* - * Trace activities of a particularly interesting lock. - */ -void usl_trace(usimple_lock_t, int, pc_t, const char *); - - /* * Initialize the debugging information contained * in a usimple_lock. 
@@ -411,10 +668,9 @@ usld_lock_common_checks( if (l == USIMPLE_LOCK_NULL) panic("%s: null lock pointer", caller); if (l->lock_type != USLOCK_TAG) - panic("%s: 0x%x is not a usimple lock", caller, (integer_t) l); + panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type); if (!(l->debug.state & USLOCK_INIT)) - panic("%s: 0x%x is not an initialized lock", - caller, (integer_t) l); + panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state); return USLOCK_CHECKING(l); } @@ -445,14 +701,13 @@ usld_lock_pre( if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread && l->debug.lock_thread == (void *) current_thread()) { - printf("%s: lock 0x%x already locked (at 0x%x) by", - caller, (integer_t) l, l->debug.lock_pc); - printf(" current thread 0x%x (new attempt at pc 0x%x)\n", + printf("%s: lock %p already locked (at %p) by", + caller, l, l->debug.lock_pc); + printf(" current thread %p (new attempt at pc %p)\n", l->debug.lock_thread, pc); - panic(caller); + panic("%s", caller); } mp_disable_preemption(); - usl_trace(l, cpu_number(), pc, caller); mp_enable_preemption(); } @@ -468,7 +723,7 @@ usld_lock_post( usimple_lock_t l, pc_t pc) { - register int mycpu; + int mycpu; char caller[] = "successful usimple_lock"; @@ -476,19 +731,17 @@ usld_lock_post( return; if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) - panic("%s: lock 0x%x became uninitialized", - caller, (integer_t) l); + panic("%s: lock %p became uninitialized", + caller, l); if ((l->debug.state & USLOCK_TAKEN)) - panic("%s: lock 0x%x became TAKEN by someone else", - caller, (integer_t) l); + panic("%s: lock 0x%p became TAKEN by someone else", + caller, l); mycpu = cpu_number(); l->debug.lock_thread = (void *)current_thread(); l->debug.state |= USLOCK_TAKEN; l->debug.lock_pc = pc; l->debug.lock_cpu = mycpu; - - usl_trace(l, mycpu, pc, caller); } @@ -505,7 +758,7 @@ usld_unlock( usimple_lock_t l, pc_t pc) { - register int mycpu; + int mycpu; char caller[] = 
"usimple_unlock"; @@ -515,18 +768,17 @@ usld_unlock( mycpu = cpu_number(); if (!(l->debug.state & USLOCK_TAKEN)) - panic("%s: lock 0x%x hasn't been taken", - caller, (integer_t) l); + panic("%s: lock 0x%p hasn't been taken", + caller, l); if (l->debug.lock_thread != (void *) current_thread()) - panic("%s: unlocking lock 0x%x, owned by thread 0x%x", - caller, (integer_t) l, l->debug.lock_thread); + panic("%s: unlocking lock 0x%p, owned by thread %p", + caller, l, l->debug.lock_thread); if (l->debug.lock_cpu != mycpu) { - printf("%s: unlocking lock 0x%x on cpu 0x%x", - caller, (integer_t) l, mycpu); + printf("%s: unlocking lock 0x%p on cpu 0x%x", + caller, l, mycpu); printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu); - panic(caller); + panic("%s", caller); } - usl_trace(l, mycpu, pc, caller); l->debug.unlock_thread = l->debug.lock_thread; l->debug.lock_thread = INVALID_PC; @@ -545,15 +797,12 @@ usld_unlock( void usld_lock_try_pre( usimple_lock_t l, - pc_t pc) + __unused pc_t pc) { char caller[] = "usimple_lock_try"; if (!usld_lock_common_checks(l, caller)) return; - mp_disable_preemption(); - usl_trace(l, cpu_number(), pc, caller); - mp_enable_preemption(); } @@ -570,119 +819,100 @@ usld_lock_try_post( usimple_lock_t l, pc_t pc) { - register int mycpu; + int mycpu; char caller[] = "successful usimple_lock_try"; if (!usld_lock_common_checks(l, caller)) return; if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) - panic("%s: lock 0x%x became uninitialized", - caller, (integer_t) l); + panic("%s: lock 0x%p became uninitialized", + caller, l); if ((l->debug.state & USLOCK_TAKEN)) - panic("%s: lock 0x%x became TAKEN by someone else", - caller, (integer_t) l); + panic("%s: lock 0x%p became TAKEN by someone else", + caller, l); mycpu = cpu_number(); l->debug.lock_thread = (void *) current_thread(); l->debug.state |= USLOCK_TAKEN; l->debug.lock_pc = pc; l->debug.lock_cpu = mycpu; - - usl_trace(l, mycpu, pc, caller); } - +#endif /* USLOCK_DEBUG */ /* - * For 
very special cases, set traced_lock to point to a - * specific lock of interest. The result is a series of - * XPRs showing lock operations on that lock. The lock_seq - * value is used to show the order of those operations. + * Routine: lck_rw_alloc_init */ -usimple_lock_t traced_lock; -unsigned int lock_seq; +lck_rw_t * +lck_rw_alloc_init( + lck_grp_t *grp, + lck_attr_t *attr) { + lck_rw_t *lck; -void -usl_trace( - usimple_lock_t l, - int mycpu, - pc_t pc, - const char * op_name) -{ - if (traced_lock == l) { - XPR(XPR_SLOCK, - "seq %d, cpu %d, %s @ %x\n", - (integer_t) lock_seq, (integer_t) mycpu, - (integer_t) op_name, (integer_t) pc, 0); - lock_seq++; + if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) { + bzero(lck, sizeof(lck_rw_t)); + lck_rw_init(lck, grp, attr); } -} - -#endif /* USLOCK_DEBUG */ + return(lck); +} /* - * Routine: lock_alloc - * Function: - * Allocate a lock for external users who cannot - * hard-code the structure definition into their - * objects. - * For now just use kalloc, but a zone is probably - * warranted. + * Routine: lck_rw_free */ -lock_t * -lock_alloc( - boolean_t can_sleep, - unsigned short tag, - unsigned short tag1) -{ - lock_t *l; - - if ((l = (lock_t *)kalloc(sizeof(lock_t))) != 0) - lock_init(l, can_sleep, tag, tag1); - return(l); +void +lck_rw_free( + lck_rw_t *lck, + lck_grp_t *grp) { + lck_rw_destroy(lck, grp); + kfree(lck, sizeof(lck_rw_t)); } /* - * Routine: lock_free - * Function: - * Free a lock allocated for external users. - * For now just use kfree, but a zone is probably - * warranted. + * Routine: lck_rw_init */ void -lock_free( - lock_t *l) +lck_rw_init( + lck_rw_t *lck, + lck_grp_t *grp, + lck_attr_t *attr) { - kfree(l, sizeof(lock_t)); + lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ? 
+ attr : &LockDefaultLckAttr; + + hw_lock_byte_init(&lck->lck_rw_interlock); + lck->lck_rw_want_write = FALSE; + lck->lck_rw_want_upgrade = FALSE; + lck->lck_rw_shared_count = 0; + lck->lck_rw_can_sleep = TRUE; + lck->lck_r_waiting = lck->lck_w_waiting = 0; + lck->lck_rw_tag = 0; + lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val & + LCK_ATTR_RW_SHARED_PRIORITY) == 0); + + lck_grp_reference(grp); + lck_grp_lckcnt_incr(grp, LCK_TYPE_RW); } - /* - * Routine: lock_init - * Function: - * Initialize a lock; required before use. - * Note that clients declare the "struct lock" - * variables and then initialize them, rather - * than getting a new one from this module. + * Routine: lck_rw_destroy */ void -lock_init( - lock_t *l, - boolean_t can_sleep, - __unused unsigned short tag, - unsigned short tag1) +lck_rw_destroy( + lck_rw_t *lck, + lck_grp_t *grp) { - (void) memset((void *) l, 0, sizeof(lock_t)); - - simple_lock_init(&l->interlock, tag1); - l->want_write = FALSE; - l->want_upgrade = FALSE; - l->read_count = 0; - l->can_sleep = can_sleep; + if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) + return; +#if MACH_LDEBUG + lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD); +#endif + lck->lck_rw_tag = LCK_RW_TAG_DESTROYED; + lck_grp_lckcnt_decr(grp, LCK_TYPE_RW); + lck_grp_deallocate(grp); + return; } - /* * Sleep locks. These use the same data structure and algorithm * as the spin locks, but the process sleeps while it is waiting @@ -691,579 +921,452 @@ lock_init( #define DECREMENTER_TIMEOUT 1000000 -void -lock_write( - register lock_t * l) +/* + * We disable interrupts while holding the RW interlock to prevent an + * interrupt from exacerbating hold time. + * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock(). 
+ */ +static inline boolean_t +lck_interlock_lock(lck_rw_t *lck) { - register int i; - boolean_t lock_miss = FALSE; -#if MACH_LDEBUG - int decrementer; -#endif /* MACH_LDEBUG */ + boolean_t istate; - simple_lock(&l->interlock); + istate = ml_set_interrupts_enabled(FALSE); + hw_lock_byte_lock(&lck->lck_rw_interlock); + return istate; +} -#if MACH_LDEBUG - decrementer = DECREMENTER_TIMEOUT; -#endif /* MACH_LDEBUG */ +static inline void +lck_interlock_unlock(lck_rw_t *lck, boolean_t istate) +{ + hw_lock_byte_unlock(&lck->lck_rw_interlock); + ml_set_interrupts_enabled(istate); +} - /* - * Try to acquire the want_write bit. - */ - while (l->want_write) { - if (!lock_miss) { - lock_miss = TRUE; - } +/* + * This inline is used when busy-waiting for an rw lock. + * If interrupts were disabled when the lock primitive was called, + * we poll the IPI handler for pending tlb flushes. + * XXX This is a hack to avoid deadlocking on the pmap_system_lock. + */ +static inline void +lck_rw_lock_pause(boolean_t interrupts_enabled) +{ + if (!interrupts_enabled) + handle_pending_TLB_flushes(); + cpu_pause(); +} - i = lock_wait_time[l->can_sleep ? 
1 : 0]; - if (i != 0) { - simple_unlock(&l->interlock); -#if MACH_LDEBUG - if (!--decrementer) - Debugger("timeout - want_write"); -#endif /* MACH_LDEBUG */ - while (--i != 0 && l->want_write) - continue; - simple_lock(&l->interlock); - } +static inline boolean_t +lck_rw_held_read_or_upgrade(lck_rw_t *lock) +{ + if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE)) + return TRUE; + return FALSE; +} - if (l->can_sleep && l->want_write) { - l->waiting = TRUE; - thread_sleep_simple_lock((event_t) l, - simple_lock_addr(l->interlock), - THREAD_UNINT); - /* interlock relocked */ +/* + * compute the deadline to spin against when + * waiting for a change of state on a lck_rw_t + */ +static inline uint64_t +lck_rw_deadline_for_spin(lck_rw_t *lck) +{ + if (lck->lck_rw_can_sleep) { + if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) { + /* + * there are already threads waiting on this lock... this + * implies that they have spun beyond their deadlines waiting for + * the desired state to show up so we will not bother spinning at this time... + * or + * the current number of threads sharing this lock exceeds our capacity to run them + * concurrently and since all states we're going to spin for require the rw_shared_count + * to be at 0, we'll not bother spinning since the latency for this to happen is + * unpredictable... + */ + return (mach_absolute_time()); } - } - l->want_write = TRUE; - - /* Wait for readers (and upgrades) to finish */ + return (mach_absolute_time() + MutexSpin); + } else + return (mach_absolute_time() + (100000LL * 1000000000LL)); +} -#if MACH_LDEBUG - decrementer = DECREMENTER_TIMEOUT; -#endif /* MACH_LDEBUG */ - while ((l->read_count != 0) || l->want_upgrade) { - if (!lock_miss) { - lock_miss = TRUE; - } - i = lock_wait_time[l->can_sleep ? 
1 : 0]; - if (i != 0) { - simple_unlock(&l->interlock); -#if MACH_LDEBUG - if (!--decrementer) - Debugger("timeout - wait for readers"); -#endif /* MACH_LDEBUG */ - while (--i != 0 && (l->read_count != 0 || - l->want_upgrade)) - continue; - simple_lock(&l->interlock); - } +/* + * Spin while interlock is held. + */ - if (l->can_sleep && (l->read_count != 0 || l->want_upgrade)) { - l->waiting = TRUE; - thread_sleep_simple_lock((event_t) l, - simple_lock_addr(l->interlock), - THREAD_UNINT); - /* interlock relocked */ - } +static inline void +lck_rw_interlock_spin(lck_rw_t *lock) +{ + while (ordered_load(&lock->data) & LCK_RW_INTERLOCK) { + cpu_pause(); } - - simple_unlock(&l->interlock); } -void -lock_done( - register lock_t * l) +static boolean_t +lck_rw_grab_want(lck_rw_t *lock) { - boolean_t do_wakeup = FALSE; - - - simple_lock(&l->interlock); - - if (l->read_count != 0) { - l->read_count--; + uint32_t data, prev; + + for ( ; ; ) { + data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed); + if ((data & LCK_RW_INTERLOCK) == 0) + break; + atomic_exchange_abort(); + lck_rw_interlock_spin(lock); } - else - if (l->want_upgrade) { - l->want_upgrade = FALSE; - } - else { - l->want_write = FALSE; + if (data & LCK_RW_WANT_WRITE) { + atomic_exchange_abort(); + return FALSE; } + data |= LCK_RW_WANT_WRITE; + return atomic_exchange_complete32(&lock->data, prev, data, memory_order_relaxed); +} - /* - * There is no reason to wakeup a waiting thread - * if the read-count is non-zero. 
Consider: - * we must be dropping a read lock - * threads are waiting only if one wants a write lock - * if there are still readers, they can't proceed - */ - - if (l->waiting && (l->read_count == 0)) { - l->waiting = FALSE; - do_wakeup = TRUE; +static boolean_t +lck_rw_grab_shared(lck_rw_t *lock) +{ + uint32_t data, prev; + + for ( ; ; ) { + data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); + if ((data & LCK_RW_INTERLOCK) == 0) + break; + atomic_exchange_abort(); + lck_rw_interlock_spin(lock); } - - simple_unlock(&l->interlock); - - if (do_wakeup) - thread_wakeup((event_t) l); + if (data & (LCK_RW_WANT_WRITE | LCK_RW_WANT_UPGRADE)) { + if (((data & LCK_RW_SHARED_MASK) == 0) || (data & LCK_RW_PRIV_EXCL)) { + atomic_exchange_abort(); + return FALSE; + } + } + data += LCK_RW_SHARED_READER; + return atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp); } -void -lock_read( - register lock_t * l) +/* + * Routine: lck_rw_lock_exclusive + */ +static void +lck_rw_lock_exclusive_gen( + lck_rw_t *lck) { - register int i; -#if MACH_LDEBUG - int decrementer; -#endif /* MACH_LDEBUG */ - - simple_lock(&l->interlock); - -#if MACH_LDEBUG - decrementer = DECREMENTER_TIMEOUT; -#endif /* MACH_LDEBUG */ - while (l->want_write || l->want_upgrade) { - i = lock_wait_time[l->can_sleep ? 
1 : 0]; - - if (i != 0) { - simple_unlock(&l->interlock); -#if MACH_LDEBUG - if (!--decrementer) - Debugger("timeout - wait no writers"); -#endif /* MACH_LDEBUG */ - while (--i != 0 && (l->want_write || l->want_upgrade)) - continue; - simple_lock(&l->interlock); - } + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); + uint64_t deadline = 0; + int slept = 0; + int gotlock = 0; + int lockheld = 0; + wait_result_t res = 0; + boolean_t istate = -1; + +#if CONFIG_DTRACE + boolean_t dtrace_ls_initialized = FALSE; + boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE; + uint64_t wait_interval = 0; + int readers_at_sleep = 0; +#endif - if (l->can_sleep && (l->want_write || l->want_upgrade)) { - l->waiting = TRUE; - thread_sleep_simple_lock((event_t) l, - simple_lock_addr(l->interlock), - THREAD_UNINT); - /* interlock relocked */ + /* + * Try to acquire the lck_rw_want_write bit. + */ + while ( !lck_rw_grab_want(lck)) { + +#if CONFIG_DTRACE + if (dtrace_ls_initialized == FALSE) { + dtrace_ls_initialized = TRUE; + dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0); + dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0); + dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block; + if (dtrace_ls_enabled) { + /* + * Either sleeping or spinning is happening, + * start a timing of our delay interval now. + */ + readers_at_sleep = lck->lck_rw_shared_count; + wait_interval = mach_absolute_time(); + } } - } - - l->read_count++; - - simple_unlock(&l->interlock); -} - +#endif + if (istate == -1) + istate = ml_get_interrupts_enabled(); -/* - * Routine: lock_read_to_write - * Function: - * Improves a read-only lock to one with - * write permission. If another reader has - * already requested an upgrade to a write lock, - * no lock is held upon return. - * - * Returns TRUE if the upgrade *failed*. 
- */ + deadline = lck_rw_deadline_for_spin(lck); -boolean_t -lock_read_to_write( - register lock_t * l) -{ - register int i; - boolean_t do_wakeup = FALSE; -#if MACH_LDEBUG - int decrementer; -#endif /* MACH_LDEBUG */ + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0); - simple_lock(&l->interlock); + while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline) + lck_rw_lock_pause(istate); - l->read_count--; + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0); - if (l->want_upgrade) { + if (gotlock) + break; /* - * Someone else has requested upgrade. - * Since we've released a read lock, wake - * him up. + * if we get here, the deadline has expired w/o us + * being able to grab the lock exclusively + * check to see if we're allowed to do a thread_block */ - if (l->waiting && (l->read_count == 0)) { - l->waiting = FALSE; - do_wakeup = TRUE; - } + if (lck->lck_rw_can_sleep) { - simple_unlock(&l->interlock); + istate = lck_interlock_lock(lck); - if (do_wakeup) - thread_wakeup((event_t) l); - return (TRUE); - } + if (lck->lck_rw_want_write) { - l->want_upgrade = TRUE; + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0); -#if MACH_LDEBUG - decrementer = DECREMENTER_TIMEOUT; -#endif /* MACH_LDEBUG */ - while (l->read_count != 0) { - i = lock_wait_time[l->can_sleep ? 
1 : 0]; + lck->lck_w_waiting = TRUE; - if (i != 0) { - simple_unlock(&l->interlock); -#if MACH_LDEBUG - if (!--decrementer) - Debugger("timeout - read_count"); -#endif /* MACH_LDEBUG */ - while (--i != 0 && l->read_count != 0) - continue; - simple_lock(&l->interlock); - } + thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite); + res = assert_wait(RW_LOCK_WRITER_EVENT(lck), + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); + lck_interlock_unlock(lck, istate); - if (l->can_sleep && l->read_count != 0) { - l->waiting = TRUE; - thread_sleep_simple_lock((event_t) l, - simple_lock_addr(l->interlock), - THREAD_UNINT); - /* interlock relocked */ + if (res == THREAD_WAITING) { + res = thread_block(THREAD_CONTINUE_NULL); + slept++; + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0); + } else { + lck->lck_rw_want_write = TRUE; + lck_interlock_unlock(lck, istate); + break; + } } } + /* + * Wait for readers (and upgrades) to finish... + * the test for these conditions must be done simultaneously with + * a check of the interlock not being held since + * the rw_shared_count will drop to 0 first and then want_upgrade + * will be set to 1 in the shared_to_exclusive scenario... those + * adjustments are done behind the interlock and represent an + * atomic change in state and must be considered as such + * however, once we see the read count at 0, the want_upgrade not set + * and the interlock not held, we are safe to proceed + */ + while (lck_rw_held_read_or_upgrade(lck)) { - simple_unlock(&l->interlock); - - return (FALSE); -} - -void -lock_write_to_read( - register lock_t * l) -{ - boolean_t do_wakeup = FALSE; +#if CONFIG_DTRACE + /* + * Either sleeping or spinning is happening, start + * a timing of our delay interval now. If we set it + * to -1 we don't have accurate data so we cannot later + * decide to record a dtrace spin or sleep event. 
+ */ + if (dtrace_ls_initialized == FALSE) { + dtrace_ls_initialized = TRUE; + dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0); + dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0); + dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block; + if (dtrace_ls_enabled) { + /* + * Either sleeping or spinning is happening, + * start a timing of our delay interval now. + */ + readers_at_sleep = lck->lck_rw_shared_count; + wait_interval = mach_absolute_time(); + } + } +#endif + if (istate == -1) + istate = ml_get_interrupts_enabled(); - simple_lock(&l->interlock); + deadline = lck_rw_deadline_for_spin(lck); - l->read_count++; - if (l->want_upgrade) - l->want_upgrade = FALSE; - else - l->want_write = FALSE; + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0); - if (l->waiting) { - l->waiting = FALSE; - do_wakeup = TRUE; - } + while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline) + lck_rw_lock_pause(istate); - simple_unlock(&l->interlock); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0); - if (do_wakeup) - thread_wakeup((event_t) l); -} + if ( !lockheld) + break; + /* + * if we get here, the deadline has expired w/o us + * being able to grab the lock exclusively + * check to see if we're allowed to do a thread_block + */ + if (lck->lck_rw_can_sleep) { + istate = lck_interlock_lock(lck); -#if 0 /* Unused */ -/* - * Routine: lock_try_write - * Function: - * Tries to get a write lock. - * - * Returns FALSE if the lock is not held on return. 
- */ + if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) { + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0); -boolean_t -lock_try_write( - register lock_t * l) -{ - pc_t pc; + lck->lck_w_waiting = TRUE; - simple_lock(&l->interlock); + thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite); + res = assert_wait(RW_LOCK_WRITER_EVENT(lck), + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); + lck_interlock_unlock(lck, istate); - if (l->want_write || l->want_upgrade || l->read_count) { - /* - * Can't get lock. - */ - simple_unlock(&l->interlock); - return(FALSE); + if (res == THREAD_WAITING) { + res = thread_block(THREAD_CONTINUE_NULL); + slept++; + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0); + } else { + lck_interlock_unlock(lck, istate); + /* + * must own the lock now, since we checked for + * readers or upgrade owner behind the interlock + * no need for a call to 'lck_rw_held_read_or_upgrade' + */ + break; + } + } } +#if CONFIG_DTRACE /* - * Have lock. + * Decide what latencies we suffered that are Dtrace events. + * If we have set wait_interval, then we either spun or slept. + * At least we get out from under the interlock before we record + * which is the best we can do here to minimize the impact + * of the tracing. + * If we have set wait_interval to -1, then dtrace was not enabled when we + * started sleeping/spinning so we don't record this event. */ - - l->want_write = TRUE; - - simple_unlock(&l->interlock); - - return(TRUE); -} - -/* - * Routine: lock_try_read - * Function: - * Tries to get a read lock. - * - * Returns FALSE if the lock is not held on return. 
- */ - -boolean_t -lock_try_read( - register lock_t * l) -{ - pc_t pc; - - simple_lock(&l->interlock); - - if (l->want_write || l->want_upgrade) { - simple_unlock(&l->interlock); - return(FALSE); + if (dtrace_ls_enabled == TRUE) { + if (slept == 0) { + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lck, + mach_absolute_time() - wait_interval, 1); + } else { + /* + * For the blocking case, we also record if when we blocked + * it was held for read or write, and how many readers. + * Notice that above we recorded this before we dropped + * the interlock so the count is accurate. + */ + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lck, + mach_absolute_time() - wait_interval, 1, + (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep); + } } - - l->read_count++; - - simple_unlock(&l->interlock); - - return(TRUE); -} -#endif /* Unused */ - - -/* - * Routine: lck_rw_alloc_init - */ -lck_rw_t * -lck_rw_alloc_init( - lck_grp_t *grp, - lck_attr_t *attr) { - lck_rw_t *lck; - - if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) - lck_rw_init(lck, grp, attr); - - return(lck); -} - -/* - * Routine: lck_rw_free - */ -void -lck_rw_free( - lck_rw_t *lck, - lck_grp_t *grp) { - lck_rw_destroy(lck, grp); - kfree(lck, sizeof(lck_rw_t)); -} - -/* - * Routine: lck_rw_init - */ -void -lck_rw_init( - lck_rw_t *lck, - lck_grp_t *grp, - __unused lck_attr_t *attr) { - - hw_lock_init(&lck->interlock); - lck->want_write = FALSE; - lck->want_upgrade = FALSE; - lck->read_count = 0; - lck->can_sleep = TRUE; - lck->lck_rw_tag = 0; - - lck_grp_reference(grp); - lck_grp_lckcnt_incr(grp, LCK_TYPE_RW); -} - -/* - * Routine: lck_rw_destroy - */ -void -lck_rw_destroy( - lck_rw_t *lck, - lck_grp_t *grp) { - if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) - return; - lck->lck_rw_tag = LCK_RW_TAG_DESTROYED; - lck_grp_lckcnt_decr(grp, LCK_TYPE_RW); - lck_grp_deallocate(grp); - return; + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1); +#endif } /* - * Sleep locks. 
These use the same data structure and algorithm - * as the spin locks, but the process sleeps while it is waiting - * for the lock. These work on uniprocessor systems. + * Routine: lck_rw_done */ -#define DECREMENTER_TIMEOUT 1000000 - - -/* - * We need to disable interrupts while holding the mutex interlock - * to prevent an IPI intervening. - * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock(). - */ -static boolean_t -lck_interlock_lock(lck_rw_t *lck) +lck_rw_type_t lck_rw_done(lck_rw_t *lock) { - boolean_t istate; - - istate = ml_set_interrupts_enabled(FALSE); - hw_lock_lock(&lck->interlock); - - return istate; -} - -static void -lck_interlock_unlock(lck_rw_t *lck, boolean_t istate) -{ - hw_lock_unlock(&lck->interlock); - ml_set_interrupts_enabled(istate); + uint32_t data, prev; + + for ( ; ; ) { + data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp); + if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */ + atomic_exchange_abort(); + lck_rw_interlock_spin(lock); + continue; + } + if (data & LCK_RW_SHARED_MASK) { + data -= LCK_RW_SHARED_READER; + if ((data & LCK_RW_SHARED_MASK) == 0) /* if reader count has now gone to 0, check for waiters */ + goto check_waiters; + } else { /* if reader count == 0, must be exclusive lock */ + if (data & LCK_RW_WANT_UPGRADE) { + data &= ~(LCK_RW_WANT_UPGRADE); + } else { + if (data & LCK_RW_WANT_WRITE) + data &= ~(LCK_RW_WANT_EXCL); + else /* lock is not 'owned', panic */ + panic("Releasing non-exclusive RW lock without a reader refcount!"); + } +check_waiters: + if (prev & LCK_RW_W_WAITING) { + data &= ~(LCK_RW_W_WAITING); + if ((prev & LCK_RW_PRIV_EXCL) == 0) + data &= ~(LCK_RW_R_WAITING); + } else + data &= ~(LCK_RW_R_WAITING); + } + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) + break; + cpu_pause(); + } + return lck_rw_done_gen(lock, prev); } /* - * Routine: lck_rw_lock_exclusive - */ -void -lck_rw_lock_exclusive( - lck_rw_t *lck) 
+ * Routine: lck_rw_done_gen + * + * called from lck_rw_done() + * prior_lock_state is the value in the 1st + * word of the lock at the time of a successful + * atomic compare and exchange with the new value... + * it represents the state of the lock before we + * decremented the rw_shared_count or cleared either + * rw_want_upgrade or rw_want_write and + * the lck_x_waiting bits... since the wrapper + * routine has already changed the state atomically, + * we just need to decide if we should + * wake up anyone and what value to return... we do + * this by examining the state of the lock before + * we changed it + */ +static lck_rw_type_t +lck_rw_done_gen( + lck_rw_t *lck, + uint32_t prior_lock_state) { - int i; - boolean_t lock_miss = FALSE; - wait_result_t res; -#if MACH_LDEBUG - int decrementer; -#endif /* MACH_LDEBUG */ - boolean_t istate; - - istate = lck_interlock_lock(lck); - -#if MACH_LDEBUG - decrementer = DECREMENTER_TIMEOUT; -#endif /* MACH_LDEBUG */ - - /* - * Try to acquire the want_write bit. - */ - while (lck->want_write) { - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0); + lck_rw_t *fake_lck; + lck_rw_type_t lock_type; + thread_t thread; + uint32_t rwlock_count; - if (!lock_miss) { - lock_miss = TRUE; - } + thread = current_thread(); + rwlock_count = thread->rwlock_count--; + fake_lck = (lck_rw_t *)&prior_lock_state; - i = lock_wait_time[lck->can_sleep ? 1 : 0]; - if (i != 0) { - lck_interlock_unlock(lck, istate); -#if MACH_LDEBUG - if (!--decrementer) - Debugger("timeout - want_write"); -#endif /* MACH_LDEBUG */ - while (--i != 0 && lck->want_write) - continue; - istate = lck_interlock_lock(lck); - } + if (lck->lck_rw_can_sleep) { + /* + * prior_lock state is a snapshot of the 1st word of the + * lock in question... 
we'll fake up a pointer to it + * and carefully not access anything beyond whats defined + * in the first word of a lck_rw_t + */ - if (lck->can_sleep && lck->want_write) { - lck->waiting = TRUE; - res = assert_wait((event_t) lck, THREAD_UNINT); - if (res == THREAD_WAITING) { - lck_interlock_unlock(lck, istate); - res = thread_block(THREAD_CONTINUE_NULL); - istate = lck_interlock_lock(lck); + if (fake_lck->lck_rw_shared_count <= 1) { + if (fake_lck->lck_w_waiting) { + thread_wakeup(RW_LOCK_WRITER_EVENT(lck)); } - } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_END, (int)lck, res, 0, 0, 0); - } - lck->want_write = TRUE; - - /* Wait for readers (and upgrades) to finish */ -#if MACH_LDEBUG - decrementer = DECREMENTER_TIMEOUT; -#endif /* MACH_LDEBUG */ - while ((lck->read_count != 0) || lck->want_upgrade) { - if (!lock_miss) { - lock_miss = TRUE; + if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) { + thread_wakeup(RW_LOCK_READER_EVENT(lck)); + } } - - i = lock_wait_time[lck->can_sleep ? 
1 : 0]; - - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_START, - (int)lck, lck->read_count, lck->want_upgrade, i, 0); - - if (i != 0) { - lck_interlock_unlock(lck, istate); -#if MACH_LDEBUG - if (!--decrementer) - Debugger("timeout - wait for readers"); -#endif /* MACH_LDEBUG */ - while (--i != 0 && (lck->read_count != 0 || - lck->want_upgrade)) - continue; - istate = lck_interlock_lock(lck); +#if MACH_LDEBUG + if (rwlock_count == 0) { + panic("rw lock count underflow for thread %p", thread); } +#endif + /* Check if dropping the lock means that we need to unpromote */ - if (lck->can_sleep && (lck->read_count != 0 || lck->want_upgrade)) { - lck->waiting = TRUE; - res = assert_wait((event_t) lck, THREAD_UNINT); - if (res == THREAD_WAITING) { - lck_interlock_unlock(lck, istate); - res = thread_block(THREAD_CONTINUE_NULL); - istate = lck_interlock_lock(lck); - } + if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { + /* sched_flags checked without lock, but will be rechecked while clearing */ + lck_rw_clear_promotion(thread, unslide_for_kdebug(lck)); } - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_END, - (int)lck, lck->read_count, lck->want_upgrade, res, 0); - } - - lck_interlock_unlock(lck, istate); -} - - -/* - * Routine: lck_rw_done - */ -lck_rw_type_t -lck_rw_done( - lck_rw_t *lck) -{ - boolean_t do_wakeup = FALSE; - lck_rw_type_t lck_rw_type; - boolean_t istate; - - - istate = lck_interlock_lock(lck); - - if (lck->read_count != 0) { - lck_rw_type = LCK_RW_TYPE_SHARED; - lck->read_count--; - } - else { - lck_rw_type = LCK_RW_TYPE_EXCLUSIVE; - if (lck->want_upgrade) - lck->want_upgrade = FALSE; - else - lck->want_write = FALSE; } - - /* - * There is no reason to wakeup a waiting thread - * if the read-count is non-zero. 
Consider: - * we must be dropping a read lock - * threads are waiting only if one wants a write lock - * if there are still readers, they can't proceed - */ - - if (lck->waiting && (lck->read_count == 0)) { - lck->waiting = FALSE; - do_wakeup = TRUE; + if (fake_lck->lck_rw_shared_count) { + lock_type = LCK_RW_TYPE_SHARED; + } else { + lock_type = LCK_RW_TYPE_EXCLUSIVE; } - lck_interlock_unlock(lck, istate); +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1); +#endif - if (do_wakeup) - thread_wakeup((event_t) lck); - return(lck_rw_type); + return lock_type; } - - /* * Routine: lck_rw_unlock */ @@ -1290,10 +1393,11 @@ lck_rw_unlock_shared( { lck_rw_type_t ret; + assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count); ret = lck_rw_done(lck); if (ret != LCK_RW_TYPE_SHARED) - panic("lck_rw_unlock(): lock held in mode: %d\n", ret); + panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret); } @@ -1329,188 +1433,434 @@ lck_rw_lock( panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type); } - /* * Routine: lck_rw_lock_shared */ void -lck_rw_lock_shared( - lck_rw_t *lck) +lck_rw_lock_shared(lck_rw_t *lock) { - int i; - wait_result_t res; -#if MACH_LDEBUG - int decrementer; -#endif /* MACH_LDEBUG */ - boolean_t istate; - - istate = lck_interlock_lock(lck); - -#if MACH_LDEBUG - decrementer = DECREMENTER_TIMEOUT; -#endif /* MACH_LDEBUG */ - while (lck->want_write || lck->want_upgrade) { - i = lock_wait_time[lck->can_sleep ? 
1 : 0]; - - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_START, - (int)lck, lck->want_write, lck->want_upgrade, i, 0); - - if (i != 0) { - lck_interlock_unlock(lck, istate); -#if MACH_LDEBUG - if (!--decrementer) - Debugger("timeout - wait no writers"); -#endif /* MACH_LDEBUG */ - while (--i != 0 && (lck->want_write || lck->want_upgrade)) + uint32_t data, prev; + + current_thread()->rwlock_count++; + for ( ; ; ) { + data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); + if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) { + atomic_exchange_abort(); + if (lock->lck_rw_can_sleep) { + lck_rw_lock_shared_gen(lock); + } else { + cpu_pause(); continue; - istate = lck_interlock_lock(lck); + } + break; + } + data += LCK_RW_SHARED_READER; + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) + break; + cpu_pause(); + } +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED); +#endif /* CONFIG_DTRACE */ + return; +} + +/* + * Routine: lck_rw_lock_shared_gen + * Function: + * assembly fast path code has determined that this lock + * is held exclusively... 
this is where we spin/block + * until we can acquire the lock in the shared mode + */ +static void +lck_rw_lock_shared_gen( + lck_rw_t *lck) +{ + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); + uint64_t deadline = 0; + int gotlock = 0; + int slept = 0; + wait_result_t res = 0; + boolean_t istate = -1; + +#if CONFIG_DTRACE + uint64_t wait_interval = 0; + int readers_at_sleep = 0; + boolean_t dtrace_ls_initialized = FALSE; + boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE; +#endif + + while ( !lck_rw_grab_shared(lck)) { + +#if CONFIG_DTRACE + if (dtrace_ls_initialized == FALSE) { + dtrace_ls_initialized = TRUE; + dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0); + dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0); + dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block; + if (dtrace_ls_enabled) { + /* + * Either sleeping or spinning is happening, + * start a timing of our delay interval now. 
+ */ + readers_at_sleep = lck->lck_rw_shared_count; + wait_interval = mach_absolute_time(); + } } +#endif + if (istate == -1) + istate = ml_get_interrupts_enabled(); - if (lck->can_sleep && (lck->want_write || lck->want_upgrade)) { - lck->waiting = TRUE; - res = assert_wait((event_t) lck, THREAD_UNINT); - if (res == THREAD_WAITING) { + deadline = lck_rw_deadline_for_spin(lck); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START, + trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); + + while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline) + lck_rw_lock_pause(istate); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END, + trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0); + + if (gotlock) + break; + /* + * if we get here, the deadline has expired w/o us + * being able to grab the lock for read + * check to see if we're allowed to do a thread_block + */ + if (lck->lck_rw_can_sleep) { + + istate = lck_interlock_lock(lck); + + if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) && + ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) { + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START, + trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); + + lck->lck_r_waiting = TRUE; + + thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead); + res = assert_wait(RW_LOCK_READER_EVENT(lck), + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_interlock_unlock(lck, istate); - res = thread_block(THREAD_CONTINUE_NULL); - istate = lck_interlock_lock(lck); + + if (res == THREAD_WAITING) { + res = thread_block(THREAD_CONTINUE_NULL); + slept++; + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END, + trace_lck, res, slept, 0, 0); + } else { + lck->lck_rw_shared_count++; + lck_interlock_unlock(lck, istate); + break; } } - 
KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_END, - (int)lck, lck->want_write, lck->want_upgrade, res, 0); } - lck->read_count++; +#if CONFIG_DTRACE + if (dtrace_ls_enabled == TRUE) { + if (slept == 0) { + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0); + } else { + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck, + mach_absolute_time() - wait_interval, 0, + (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep); + } + } + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0); +#endif +} + + +/* + * Routine: lck_rw_lock_exclusive + */ - lck_interlock_unlock(lck, istate); +void +lck_rw_lock_exclusive(lck_rw_t *lock) +{ + current_thread()->rwlock_count++; + if (atomic_test_and_set32(&lock->data, + (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), + LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) { +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL); +#endif /* CONFIG_DTRACE */ + } else + lck_rw_lock_exclusive_gen(lock); } /* * Routine: lck_rw_lock_shared_to_exclusive - * Function: - * Improves a read-only lock to one with - * write permission. If another reader has - * already requested an upgrade to a write lock, - * no lock is held upon return. * - * Returns TRUE if the upgrade *failed*. + * False returned upon failure, in this case the shared lock is dropped. 
*/ boolean_t -lck_rw_lock_shared_to_exclusive( - lck_rw_t *lck) +lck_rw_lock_shared_to_exclusive(lck_rw_t *lock) { - int i; - boolean_t do_wakeup = FALSE; - wait_result_t res; -#if MACH_LDEBUG - int decrementer; -#endif /* MACH_LDEBUG */ - boolean_t istate; + uint32_t data, prev; + + for ( ; ; ) { + data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); + if (data & LCK_RW_INTERLOCK) { + atomic_exchange_abort(); + lck_rw_interlock_spin(lock); + continue; + } + if (data & LCK_RW_WANT_UPGRADE) { + data -= LCK_RW_SHARED_READER; + if ((data & LCK_RW_SHARED_MASK) == 0) /* we were the last reader */ + data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */ + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) + return lck_rw_lock_shared_to_exclusive_failure(lock, prev); + } else { + data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */ + data -= LCK_RW_SHARED_READER; /* and shed our read count */ + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) + break; + } + cpu_pause(); + } + /* we now own the WANT_UPGRADE */ + if (data & LCK_RW_SHARED_MASK) /* check to see if all of the readers are drained */ + lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */ +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0); +#endif + return TRUE; +} - istate = lck_interlock_lock(lck); - lck->read_count--; +/* + * Routine: lck_rw_lock_shared_to_exclusive_failure + * Function: + * assembly fast path code has already dropped our read + * count and determined that someone else owns 'lck_rw_want_upgrade' + * if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting' + * all we need to do here is determine if a wakeup is needed + */ +static boolean_t +lck_rw_lock_shared_to_exclusive_failure( + lck_rw_t *lck, + uint32_t prior_lock_state) +{ + lck_rw_t *fake_lck; + thread_t thread = current_thread(); + uint32_t rwlock_count; - if 
(lck->want_upgrade) { - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_START, - (int)lck, lck->read_count, lck->want_upgrade, 0, 0); + /* Check if dropping the lock means that we need to unpromote */ + rwlock_count = thread->rwlock_count--; +#if MACH_LDEBUG + if (rwlock_count == 0) { + panic("rw lock count underflow for thread %p", thread); + } +#endif + fake_lck = (lck_rw_t *)&prior_lock_state; + if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) { /* * Someone else has requested upgrade. - * Since we've released a read lock, wake - * him up. + * Since we've released the read lock, wake + * him up if he's blocked waiting */ - if (lck->waiting && (lck->read_count == 0)) { - lck->waiting = FALSE; - do_wakeup = TRUE; - } + thread_wakeup(RW_LOCK_WRITER_EVENT(lck)); + } + + if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { + /* sched_flags checked without lock, but will be rechecked while clearing */ + lck_rw_clear_promotion(thread, unslide_for_kdebug(lck)); + } + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE, + VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0); - lck_interlock_unlock(lck, istate); + return (FALSE); +} - if (do_wakeup) - thread_wakeup((event_t) lck); - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_END, - (int)lck, lck->read_count, lck->want_upgrade, 0, 0); +/* + * Routine: lck_rw_lock_shared_to_exclusive_failure + * Function: + * assembly fast path code has already dropped our read + * count and successfully acquired 'lck_rw_want_upgrade' + * we just need to wait for the rest of the readers to drain + * and then we can return as the exclusive holder of this lock + */ +static boolean_t +lck_rw_lock_shared_to_exclusive_success( + lck_rw_t *lck) +{ + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); + uint64_t deadline = 0; + int slept = 0; + 
int still_shared = 0; + wait_result_t res; + boolean_t istate = -1; - return (TRUE); - } +#if CONFIG_DTRACE + uint64_t wait_interval = 0; + int readers_at_sleep = 0; + boolean_t dtrace_ls_initialized = FALSE; + boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE; +#endif - lck->want_upgrade = TRUE; + while (lck->lck_rw_shared_count != 0) { + +#if CONFIG_DTRACE + if (dtrace_ls_initialized == FALSE) { + dtrace_ls_initialized = TRUE; + dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0); + dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0); + dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block; + if (dtrace_ls_enabled) { + /* + * Either sleeping or spinning is happening, + * start a timing of our delay interval now. + */ + readers_at_sleep = lck->lck_rw_shared_count; + wait_interval = mach_absolute_time(); + } + } +#endif + if (istate == -1) + istate = ml_get_interrupts_enabled(); -#if MACH_LDEBUG - decrementer = DECREMENTER_TIMEOUT; -#endif /* MACH_LDEBUG */ - while (lck->read_count != 0) { - i = lock_wait_time[lck->can_sleep ? 
1 : 0]; + deadline = lck_rw_deadline_for_spin(lck); - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_START, - (int)lck, lck->read_count, i, 0, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START, + trace_lck, lck->lck_rw_shared_count, 0, 0, 0); + + while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline) + lck_rw_lock_pause(istate); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END, + trace_lck, lck->lck_rw_shared_count, 0, 0, 0); + + if ( !still_shared) + break; + /* + * if we get here, the deadline has expired w/o + * the rw_shared_count having drained to 0 + * check to see if we're allowed to do a thread_block + */ + if (lck->lck_rw_can_sleep) { - if (i != 0) { - lck_interlock_unlock(lck, istate); -#if MACH_LDEBUG - if (!--decrementer) - Debugger("timeout - read_count"); -#endif /* MACH_LDEBUG */ - while (--i != 0 && lck->read_count != 0) - continue; istate = lck_interlock_lock(lck); - } - if (lck->can_sleep && lck->read_count != 0) { - lck->waiting = TRUE; - res = assert_wait((event_t) lck, THREAD_UNINT); - if (res == THREAD_WAITING) { + if (lck->lck_rw_shared_count != 0) { + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START, + trace_lck, lck->lck_rw_shared_count, 0, 0, 0); + + lck->lck_w_waiting = TRUE; + + thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade); + res = assert_wait(RW_LOCK_WRITER_EVENT(lck), + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); + lck_interlock_unlock(lck, istate); + + if (res == THREAD_WAITING) { + res = thread_block(THREAD_CONTINUE_NULL); + slept++; + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END, + trace_lck, res, slept, 0, 0); + } else { lck_interlock_unlock(lck, istate); - res = thread_block(THREAD_CONTINUE_NULL); - istate = lck_interlock_lock(lck); + break; } } - 
KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_END, - (int)lck, lck->read_count, 0, 0, 0); } - - lck_interlock_unlock(lck, istate); - - return (FALSE); +#if CONFIG_DTRACE + /* + * We infer whether we took the sleep/spin path above by checking readers_at_sleep. + */ + if (dtrace_ls_enabled == TRUE) { + if (slept == 0) { + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0); + } else { + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck, + mach_absolute_time() - wait_interval, 1, + (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep); + } + } + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1); +#endif + return (TRUE); } /* - * Routine: lck_rw_lock_exclusive_to_shared + * Routine: lck_rw_lock_exclusive_to_shared */ -void -lck_rw_lock_exclusive_to_shared( - lck_rw_t *lck) -{ - boolean_t do_wakeup = FALSE; - boolean_t istate; - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START, - (int)lck, lck->want_write, lck->want_upgrade, 0, 0); +void lck_rw_lock_exclusive_to_shared(lck_rw_t *lock) +{ + uint32_t data, prev; + + for ( ; ; ) { + data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp); + if (data & LCK_RW_INTERLOCK) { + atomic_exchange_abort(); + lck_rw_interlock_spin(lock); /* wait for interlock to clear */ + continue; + } + data += LCK_RW_SHARED_READER; + if (data & LCK_RW_WANT_UPGRADE) + data &= ~(LCK_RW_WANT_UPGRADE); + else + data &= ~(LCK_RW_WANT_EXCL); + if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) + data &= ~(LCK_RW_W_WAITING); + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) + break; + cpu_pause(); + } + return lck_rw_lock_exclusive_to_shared_gen(lock, prev); +} - istate = lck_interlock_lock(lck); - lck->read_count++; - if (lck->want_upgrade) - lck->want_upgrade = FALSE; - else - lck->want_write = FALSE; +/* + * Routine: lck_rw_lock_exclusive_to_shared_gen 
+ * Function: + * assembly fast path has already dropped + * our exclusive state and bumped lck_rw_shared_count + * all we need to do here is determine if anyone + * needs to be awakened. + */ +static void +lck_rw_lock_exclusive_to_shared_gen( + lck_rw_t *lck, + uint32_t prior_lock_state) +{ + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); + lck_rw_t *fake_lck; - if (lck->waiting) { - lck->waiting = FALSE; - do_wakeup = TRUE; - } + fake_lck = (lck_rw_t *)&prior_lock_state; - lck_interlock_unlock(lck, istate); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START, + trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0); - if (do_wakeup) - thread_wakeup((event_t) lck); + /* + * don't wake up anyone waiting to take the lock exclusively + * since we hold a read count... when the read count drops to 0, + * the writers will be woken. + * + * wake up any waiting readers if we don't have any writers waiting, + * or the lock is NOT marked as rw_priv_excl (writers have privilege) + */ + if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) + thread_wakeup(RW_LOCK_READER_EVENT(lck)); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END, - (int)lck, lck->want_write, lck->want_upgrade, lck->read_count, 0); + trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0); +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0); +#endif } @@ -1532,67 +1882,199 @@ lck_rw_try_lock( } /* - * Routine: lck_rw_try_lock_exclusive - * Function: - * Tries to get a write lock. - * - * Returns FALSE if the lock is not held on return. 
+ * Routine: lck_rw_try_lock_shared */ -boolean_t -lck_rw_try_lock_exclusive( - lck_rw_t *lck) +boolean_t lck_rw_try_lock_shared(lck_rw_t *lock) { - boolean_t istate; + uint32_t data, prev; + + for ( ; ; ) { + data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); + if (data & LCK_RW_INTERLOCK) { + atomic_exchange_abort(); + lck_rw_interlock_spin(lock); + continue; + } + if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) { + atomic_exchange_abort(); + return FALSE; /* lock is busy */ + } + data += LCK_RW_SHARED_READER; /* Increment reader refcount */ + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) + break; + cpu_pause(); + } + current_thread()->rwlock_count++; + /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */ +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED); +#endif /* CONFIG_DTRACE */ + return TRUE; +} - istate = lck_interlock_lock(lck); - if (lck->want_write || lck->want_upgrade || lck->read_count) { - /* - * Can't get lock. - */ - lck_interlock_unlock(lck, istate); - return(FALSE); +/* + * Routine: lck_rw_try_lock_exclusive + */ + +boolean_t lck_rw_try_lock_exclusive(lck_rw_t *lock) +{ + uint32_t data, prev; + + for ( ; ; ) { + data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); + if (data & LCK_RW_INTERLOCK) { + atomic_exchange_abort(); + lck_rw_interlock_spin(lock); + continue; + } + if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) { + atomic_exchange_abort(); + return FALSE; /* can't get it */ + } + data |= LCK_RW_WANT_EXCL; + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) + break; + cpu_pause(); } - /* - * Have lock. 
- */ + current_thread()->rwlock_count++; +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL); +#endif /* CONFIG_DTRACE */ + return TRUE; +} - lck->want_write = TRUE; - lck_interlock_unlock(lck, istate); +void +lck_rw_assert( + lck_rw_t *lck, + unsigned int type) +{ + switch (type) { + case LCK_RW_ASSERT_SHARED: + if (lck->lck_rw_shared_count != 0) { + return; + } + break; + case LCK_RW_ASSERT_EXCLUSIVE: + if ((lck->lck_rw_want_write || + lck->lck_rw_want_upgrade) && + lck->lck_rw_shared_count == 0) { + return; + } + break; + case LCK_RW_ASSERT_HELD: + if (lck->lck_rw_want_write || + lck->lck_rw_want_upgrade || + lck->lck_rw_shared_count != 0) { + return; + } + break; + case LCK_RW_ASSERT_NOTHELD: + if (!(lck->lck_rw_want_write || + lck->lck_rw_want_upgrade || + lck->lck_rw_shared_count != 0)) { + return; + } + break; + default: + break; + } - return(TRUE); + panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck); } -/* - * Routine: lck_rw_try_lock_shared - * Function: - * Tries to get a read lock. - * - * Returns FALSE if the lock is not held on return. 
- */ +/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */ +#if MACH_LDEBUG +__dead2 +#endif +void +lck_rw_clear_promotions_x86(thread_t thread) +{ +#if MACH_LDEBUG + /* It's fatal to leave a RW lock locked and return to userspace */ + panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread); +#else + /* Paper over the issue */ + thread->rwlock_count = 0; + lck_rw_clear_promotion(thread, 0); +#endif +} boolean_t -lck_rw_try_lock_shared( - lck_rw_t *lck) +lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield) { - boolean_t istate; + lck_rw_assert(lck, LCK_RW_ASSERT_SHARED); + + if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || force_yield) { + lck_rw_unlock_shared(lck); + mutex_pause(2); + lck_rw_lock_shared(lck); + return TRUE; + } - istate = lck_interlock_lock(lck); + return FALSE; +} - if (lck->want_write || lck->want_upgrade) { - lck_interlock_unlock(lck, istate); - return(FALSE); +/* + * Routine: kdp_lck_rw_lock_is_acquired_exclusive + * NOT SAFE: To be used only by kernel debugger to avoid deadlock. + */ +boolean_t +kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) { + if (not_in_kdp) { + panic("panic: rw lock exclusive check done outside of kernel debugger"); } + return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE; +} - lck->read_count++; +/* + * Slow path routines for lck_mtx locking and unlocking functions. + * + * These functions were previously implemented in x86 assembly, + * and some optimizations are in place in this c code to obtain a compiled code + * as performant and compact as the assembly version. + * + * To avoid to inline these functions on the fast path, all functions directly called by + * the fast paths have the __attribute__((noinline)) specified. Also they are all implemented + * in such a way the fast path can tail call into them. 
In this way the return address + * does not need to be pushed on the caller stack and stack optimization can happen on the caller. + * + * Slow path code is structured in such a way that there are no calls to functions that will return + * in the context of the caller function, i.e. all functions called are either tail call functions + * or inline functions. The number of arguments of the tail call functions is less than six, + * so that they can be passed over registers and do not need to be pushed on stack. + * This allows the compiler to not create a stack frame for the functions. + * + * __improbable and __probable are used to compile the slow path code in such a way that + * the fast path case will be on a sequence of instructions with as few jumps as possible, + * to make this case the most optimized even if falling through the slow path. + */ - lck_interlock_unlock(lck, istate); +/* + * Intel lock invariants: + * + * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue + * + * The lock owner is promoted to the max priority of all its waiters only if it + * was a lower priority when it acquired or was an owner when a waiter waited. + * Max priority is capped at MAXPRI_PROMOTE. + * + * The last waiter will not be promoted as it is woken up, but the last + * lock owner may not have been the last thread to have been woken up depending on the + * luck of the draw. Therefore a last-owner may still have the promoted-on-wakeup + * flag set. + * + * TODO: Figure out an algorithm for stopping a lock holder which is already at the right + * priority from dropping priority in the future without having to take thread lock + * on acquire.
+ */ - return(TRUE); -} +#ifdef MUTEX_ZONE +extern zone_t lck_mtx_zone; +#endif /* * Routine: lck_mtx_alloc_init @@ -1603,10 +2085,13 @@ lck_mtx_alloc_init( lck_attr_t *attr) { lck_mtx_t *lck; - +#ifdef MUTEX_ZONE + if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0) + lck_mtx_init(lck, grp, attr); +#else if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0) lck_mtx_init(lck, grp, attr); - +#endif return(lck); } @@ -1619,7 +2104,11 @@ lck_mtx_free( lck_grp_t *grp) { lck_mtx_destroy(lck, grp); +#ifdef MUTEX_ZONE + zfree(lck_mtx_zone, lck); +#else kfree(lck, sizeof(lck_mtx_t)); +#endif } /* @@ -1631,20 +2120,20 @@ lck_mtx_ext_init( lck_grp_t *grp, lck_attr_t *attr) { - lck->lck_mtx.lck_mtx_ilk = 0; - lck->lck_mtx.lck_mtx_locked = 0; - lck->lck_mtx.lck_mtx_waiters = 0; - lck->lck_mtx.lck_mtx_pri = 0; - lck->lck_mtx_attr = 0; + bzero((void *)lck, sizeof(lck_mtx_ext_t)); if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) { - lck->lck_mtx_deb.pc = 0; - lck->lck_mtx_deb.thread = 0; lck->lck_mtx_deb.type = MUTEX_TAG; lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG; } lck->lck_mtx_grp = grp; + + if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) + lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT; + + lck->lck_mtx.lck_mtx_is_ext = 1; + lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF; } /* @@ -1657,23 +2146,80 @@ lck_mtx_init( lck_attr_t *attr) { lck_mtx_ext_t *lck_ext; + lck_attr_t *lck_attr; + + if (attr != LCK_ATTR_NULL) + lck_attr = attr; + else + lck_attr = &LockDefaultLckAttr; - if ((attr != LCK_ATTR_NULL) && ((attr->lck_attr_val) & LCK_ATTR_DEBUG)) { + if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) { if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) { - lck_mtx_ext_init(lck_ext, grp, attr); + lck_mtx_ext_init(lck_ext, grp, lck_attr); lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT; lck->lck_mtx_ptr = lck_ext; } } else { - lck->lck_mtx_ilk = 0; - lck->lck_mtx_locked = 0; - lck->lck_mtx_waiters = 0; - lck->lck_mtx_pri = 0; + lck->lck_mtx_owner = 0; + lck->lck_mtx_state = 0; + } + lck->lck_mtx_pad32 
= 0xFFFFFFFF; + lck_grp_reference(grp); + lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX); +} + +/* + * Routine: lck_mtx_init_ext + */ +void +lck_mtx_init_ext( + lck_mtx_t *lck, + lck_mtx_ext_t *lck_ext, + lck_grp_t *grp, + lck_attr_t *attr) +{ + lck_attr_t *lck_attr; + + if (attr != LCK_ATTR_NULL) + lck_attr = attr; + else + lck_attr = &LockDefaultLckAttr; + + if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) { + lck_mtx_ext_init(lck_ext, grp, lck_attr); + lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT; + lck->lck_mtx_ptr = lck_ext; + } else { + lck->lck_mtx_owner = 0; + lck->lck_mtx_state = 0; } + lck->lck_mtx_pad32 = 0xFFFFFFFF; + lck_grp_reference(grp); lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX); } +static void +lck_mtx_lock_mark_destroyed( + lck_mtx_t *mutex, + boolean_t indirect) +{ + uint32_t state; + + if (indirect) { + /* convert to destroyed state */ + ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED); + return; + } + + state = ordered_load_mtx_state(mutex); + lck_mtx_interlock_lock(mutex, &state); + + ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED); + + enable_preemption(); +} + /* * Routine: lck_mtx_destroy */ @@ -1682,195 +2228,1226 @@ lck_mtx_destroy( lck_mtx_t *lck, lck_grp_t *grp) { - boolean_t lck_is_indirect; - + boolean_t indirect; + if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) return; - lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT); - lck->lck_mtx_tag = LCK_MTX_TAG_DESTROYED; - if (lck_is_indirect) +#if MACH_LDEBUG + lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED); +#endif + indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT); + + lck_mtx_lock_mark_destroyed(lck, indirect); + + if (indirect) kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t)); lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX); lck_grp_deallocate(grp); return; } -/* - * Routine: lck_mtx_assert - */ + +#if DEVELOPMENT | DEBUG +__attribute__((noinline)) void -lck_mtx_assert( - __unused lck_mtx_t *lck, - __unused unsigned int type) +lck_mtx_owner_check_panic( + lck_mtx_t *lock) { 
+ thread_t owner = (thread_t)lock->lck_mtx_owner; + panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p", owner, lock); } +#endif -#if MACH_KDB - -void db_show_one_lock(lock_t *); - -void -db_show_one_lock( - lock_t *lock) +__attribute__((always_inline)) +static boolean_t +get_indirect_mutex( + lck_mtx_t **lock, + uint32_t *state) { - db_printf("Read_count = 0x%x, %swant_upgrade, %swant_write, ", - lock->read_count, - lock->want_upgrade ? "" : "!", - lock->want_write ? "" : "!"); - db_printf("%swaiting, %scan_sleep\n", - lock->waiting ? "" : "!", lock->can_sleep ? "" : "!"); - db_printf("Interlock:\n"); - db_show_one_simple_lock((db_expr_t)simple_lock_addr(lock->interlock), - TRUE, (db_expr_t)0, (char *)0); + *lock = &((*lock)->lck_mtx_ptr->lck_mtx); + *state = ordered_load_mtx_state(*lock); + return TRUE; } -#endif /* MACH_KDB */ - -/* - * The C portion of the mutex package. These routines are only invoked - * if the optimized assembler routines can't do the work. - */ - /* - * Routine: lock_alloc - * Function: - * Allocate a mutex for external users who cannot - * hard-code the structure definition into their - * objects. - * For now just use kalloc, but a zone is probably - * warranted. + * Routine: lck_mtx_unlock_slow + * + * Unlocks a mutex held by current thread. + * + * It will wake up waiters if necessary. + * + * Interlock can be held. */ -mutex_t * -mutex_alloc( - unsigned short tag) +__attribute__((noinline)) +void +lck_mtx_unlock_slow( + lck_mtx_t *lock) { - mutex_t *m; + thread_t thread; + uint32_t state, prev; + boolean_t indirect = FALSE; - if ((m = (mutex_t *)kalloc(sizeof(mutex_t))) != 0) - mutex_init(m, tag); - return(m); -} + state = ordered_load_mtx_state(lock); -/* - * Routine: mutex_free - * Function: - * Free a mutex allocated for external users. - * For now just use kfree, but a zone is probably - * warranted. + /* Is this an indirect mutex? 
*/ + if (__improbable(state == LCK_MTX_TAG_INDIRECT)) { + indirect = get_indirect_mutex(&lock, &state); + } + + thread = current_thread(); + +#if DEVELOPMENT | DEBUG + thread_t owner = (thread_t)lock->lck_mtx_owner; + if(__improbable(owner != thread)) + lck_mtx_owner_check_panic(lock); +#endif + + /* check if it is held as a spinlock */ + if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) + goto unlock; + + lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state); + +unlock: + /* preemption disabled, interlock held and mutex not held */ + + /* clear owner */ + ordered_store_mtx_owner(lock, 0); + /* keep original state in prev for later evaluation */ + prev = state; + + if (__improbable(state & LCK_MTX_WAITERS_MSK)) { +#if MACH_LDEBUG + if (thread) + thread->mutex_count--; +#endif + return lck_mtx_unlock_wakeup_tail(lock, state, indirect); + } + + /* release interlock, promotion and clear spin flag */ + state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK)); + ordered_store_mtx_state_release(lock, state); /* since I own the interlock, I don't need an atomic update */ + +#if MACH_LDEBUG + /* perform lock statistics after drop to prevent delay */ + if (thread) + thread->mutex_count--; /* lock statistic */ +#endif /* MACH_LDEBUG */ + + /* re-enable preemption */ + lck_mtx_unlock_finish_inline(lock, FALSE); + + return; +} + +#define LCK_MTX_LCK_WAIT_CODE 0x20 +#define LCK_MTX_LCK_WAKEUP_CODE 0x21 +#define LCK_MTX_LCK_SPIN_CODE 0x22 +#define LCK_MTX_LCK_ACQUIRE_CODE 0x23 +#define LCK_MTX_LCK_DEMOTE_CODE 0x24 + +/* + * Routine: lck_mtx_unlock_wakeup_tail + * + * Invoked on unlock when there is + * contention, i.e. the assembly routine sees + * that mutex->lck_mtx_waiters != 0 + * + * neither the mutex or interlock is held + * + * Note that this routine might not be called if there are pending + * waiters which have previously been woken up, and they didn't + * end up boosting the old owner. 
+ * + * assembly routine previously did the following to mutex: + * (after saving the state in prior_lock_state) + * decremented lck_mtx_waiters if nonzero + * + * This function needs to be called as a tail call + * to optimize the compiled code. */ -void -mutex_free( - mutex_t *m) +__attribute__((noinline)) +static void +lck_mtx_unlock_wakeup_tail ( + lck_mtx_t *mutex, + uint32_t state, + boolean_t indirect) { - kfree(m, sizeof(mutex_t)); + struct turnstile *ts; + + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); + kern_return_t did_wake; + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START, + trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); + + ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX); + + if (mutex->lck_mtx_waiters > 1) { + /* WAITQ_PROMOTE_ON_WAKE will call turnstile_update_inheritor on the wokenup thread */ + did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_PROMOTE_ON_WAKE); + } else { + did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES); + turnstile_update_inheritor(ts, NULL, TURNSTILE_IMMEDIATE_UPDATE); + } + assert(did_wake == KERN_SUCCESS); + + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); + turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX); + + state -= LCK_MTX_WAITER; + state &= (~(LCK_MTX_SPIN_MSK | LCK_MTX_ILOCKED_MSK)); + ordered_store_mtx_state_release(mutex, state); + + assert(current_thread()->turnstile != NULL); + + turnstile_cleanup(); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END, + trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); + + lck_mtx_unlock_finish_inline(mutex, indirect); } /* - * Routine: _mutex_assert + * Routine: lck_mtx_lock_acquire_x86 + * + * Invoked on acquiring the mutex when there is + * contention (i.e. 
the assembly routine sees that + * that mutex->lck_mtx_waiters != 0 + * + * mutex is owned... interlock is held... preemption is disabled */ +__attribute__((always_inline)) +static void +lck_mtx_lock_acquire_inline( + lck_mtx_t *mutex, + struct turnstile *ts) +{ + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START, + trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); + + thread_t thread = (thread_t)mutex->lck_mtx_owner; /* faster than current_thread() */ + assert(thread->waiting_for_mutex == NULL); + + if (mutex->lck_mtx_waiters > 0) { + if (ts == NULL) { + ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX); + } + + turnstile_update_inheritor(ts, thread, (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD)); + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); + } + + if (ts != NULL) { + turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX); + } + + assert(current_thread()->turnstile != NULL); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END, + trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); +} + void -_mutex_assert ( - mutex_t *mutex, - unsigned int what) +lck_mtx_lock_acquire_x86( + lck_mtx_t *mutex) { + return lck_mtx_lock_acquire_inline(mutex, NULL); +} - thread_t thread = current_thread(); - thread_t holder; +/* + * Tail call helpers for lock functions that perform + * lck_mtx_lock_acquire followed by the caller's finish routine, to optimize + * the caller's compiled code. 
+ */ + +__attribute__((noinline)) +static void +lck_mtx_lock_acquire_tail( + lck_mtx_t *mutex, + boolean_t indirect, + struct turnstile *ts) +{ + lck_mtx_lock_acquire_inline(mutex, ts); + lck_mtx_lock_finish_inline_with_cleanup(mutex, ordered_load_mtx_state(mutex), indirect); +} + +__attribute__((noinline)) +static boolean_t +lck_mtx_try_lock_acquire_tail( + lck_mtx_t *mutex) +{ + lck_mtx_lock_acquire_inline(mutex, NULL); + lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex)); + + return TRUE; +} + +__attribute__((noinline)) +static void +lck_mtx_convert_spin_acquire_tail( + lck_mtx_t *mutex) +{ + lck_mtx_lock_acquire_inline(mutex, NULL); + lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex)); +} + +boolean_t +lck_mtx_ilk_unlock( + lck_mtx_t *mutex) +{ + lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex)); + return TRUE; +} + +static inline void +lck_mtx_interlock_lock_set_and_clear_flags( + lck_mtx_t *mutex, + uint32_t xor_flags, + uint32_t and_flags, + uint32_t *new_state) +{ + uint32_t state, prev; + state = *new_state; + + for ( ; ; ) { + /* have to wait for interlock to clear */ + while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) { + cpu_pause(); + state = ordered_load_mtx_state(mutex); + } + prev = state; /* prev contains snapshot for exchange */ + state |= LCK_MTX_ILOCKED_MSK | xor_flags; /* pick up interlock */ + state &= ~and_flags; /* clear flags */ + + disable_preemption(); + if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) + break; + enable_preemption(); + cpu_pause(); + state = ordered_load_mtx_state(mutex); + } + *new_state = state; + return; +} + +static inline void +lck_mtx_interlock_lock_clear_flags( + lck_mtx_t *mutex, + uint32_t and_flags, + uint32_t *new_state) +{ + return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state); +} + +static inline void +lck_mtx_interlock_lock( + lck_mtx_t *mutex, + uint32_t *new_state) +{ + return 
lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state); +} + +static inline int +lck_mtx_interlock_try_lock_set_flags( + lck_mtx_t *mutex, + uint32_t or_flags, + uint32_t *new_state) +{ + uint32_t state, prev; + state = *new_state; + + /* have to wait for interlock to clear */ + if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) { + return 0; + } + prev = state; /* prev contains snapshot for exchange */ + state |= LCK_MTX_ILOCKED_MSK | or_flags; /* pick up interlock */ + disable_preemption(); + if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) { + *new_state = state; + return 1; + } + + enable_preemption(); + return 0; +} + +static inline int +lck_mtx_interlock_try_lock( + lck_mtx_t *mutex, + uint32_t *new_state) +{ + return lck_mtx_interlock_try_lock_set_flags(mutex, 0, new_state); +} + +static inline int +lck_mtx_interlock_try_lock_disable_interrupts( + lck_mtx_t *mutex, + boolean_t *istate) +{ + uint32_t state; + + *istate = ml_set_interrupts_enabled(FALSE); + state = ordered_load_mtx_state(mutex); - if (panicstr != NULL) - return; + if (lck_mtx_interlock_try_lock(mutex, &state)) { + return 1; + } else { + ml_set_interrupts_enabled(*istate); + return 0; + } +} - holder = (thread_t) mutex->lck_mtx.lck_mtx_locked; +static inline void +lck_mtx_interlock_unlock_enable_interrupts( + lck_mtx_t *mutex, + boolean_t istate) +{ + lck_mtx_ilk_unlock(mutex); + ml_set_interrupts_enabled(istate); +} + +__attribute__((noinline)) +static void +lck_mtx_lock_contended( + lck_mtx_t *lock, + boolean_t indirect, + boolean_t *first_miss) +{ + lck_mtx_spinwait_ret_type_t ret; + uint32_t state; + thread_t thread; + struct turnstile *ts = NULL; + +try_again: + + if (indirect) { + lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, first_miss); + } + + ret = lck_mtx_lock_spinwait_x86(lock); + state = ordered_load_mtx_state(lock); + switch (ret) { + case LCK_MTX_SPINWAIT_NO_SPIN: + /* + * owner not on core, lck_mtx_lock_spinwait_x86 didn't even + * try to spin. 
+ */ + if (indirect) { + lck_grp_mtx_update_direct_wait((struct _lck_mtx_ext_*)lock); + } + + /* just fall through case LCK_MTX_SPINWAIT_SPUN */ + case LCK_MTX_SPINWAIT_SPUN: + /* + * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin + * interlock not held + */ + lck_mtx_interlock_lock(lock, &state); + assert(state & LCK_MTX_ILOCKED_MSK); + + if (state & LCK_MTX_MLOCKED_MSK) { + if (indirect) { + lck_grp_mtx_update_wait((struct _lck_mtx_ext_*)lock, first_miss); + } + lck_mtx_lock_wait_x86(lock, &ts); + /* + * interlock is not held here. + */ + goto try_again; + } else { + + /* grab the mutex */ + state |= LCK_MTX_MLOCKED_MSK; + ordered_store_mtx_state_release(lock, state); + thread = current_thread(); + ordered_store_mtx_owner(lock, (uintptr_t)thread); +#if MACH_LDEBUG + if (thread) { + thread->mutex_count++; + } +#endif /* MACH_LDEBUG */ + } - switch (what) { - case MA_OWNED: - if (thread != holder) - panic("mutex %x not owned\n", mutex); break; + case LCK_MTX_SPINWAIT_ACQUIRED: + /* + * mutex has been acquired by lck_mtx_lock_spinwait_x86 + * interlock is held and preemption disabled + * owner is set and mutex marked as locked + * statistics updated too + */ + break; + default: + panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p\n", ret, lock); + } + + /* + * interlock is already acquired here + */ + + /* mutex has been acquired */ + thread = (thread_t)lock->lck_mtx_owner; + if (state & LCK_MTX_WAITERS_MSK) { + /* + * lck_mtx_lock_acquire_tail will call + * turnstile_complete. 
+ */ + return lck_mtx_lock_acquire_tail(lock, indirect, ts); + } - case MA_NOTOWNED: - if (thread == holder) - panic("mutex %x owned\n", mutex); - break; + if (ts != NULL) { + turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX); } + assert(current_thread()->turnstile != NULL); + + /* release the interlock */ + lck_mtx_lock_finish_inline_with_cleanup(lock, ordered_load_mtx_state(lock), indirect); } -#if MACH_KDB /* - * Routines to print out simple_locks and mutexes in a nicely-formatted - * fashion. + * Helper noinline functions for calling + * panic to optimize compiled code. */ -char *simple_lock_labels = "ENTRY ILK THREAD DURATION CALLER"; -char *mutex_labels = "ENTRY LOCKED WAITERS THREAD CALLER"; +__attribute__((noinline)) __abortlike +static void +lck_mtx_destroyed( + lck_mtx_t *lock) +{ + panic("trying to interlock destroyed mutex (%p)", lock); +} + +__attribute__((noinline)) +static boolean_t +lck_mtx_try_destroyed( + lck_mtx_t *lock) +{ + panic("trying to interlock destroyed mutex (%p)", lock); + return FALSE; +} + +__attribute__((always_inline)) +static boolean_t +lck_mtx_lock_wait_interlock_to_clear( + lck_mtx_t *lock, + uint32_t* new_state) +{ + uint32_t state; + + for ( ; ; ) { + cpu_pause(); + state = ordered_load_mtx_state(lock); + if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) { + *new_state = state; + return TRUE; + } + if (state & LCK_MTX_MLOCKED_MSK) { + /* if it is held as mutex, just fail */ + return FALSE; + } + } +} + +__attribute__((always_inline)) +static boolean_t +lck_mtx_try_lock_wait_interlock_to_clear( + lck_mtx_t *lock, + uint32_t* new_state) +{ + uint32_t state; + + for ( ; ; ) { + cpu_pause(); + state = ordered_load_mtx_state(lock); + if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) { + /* if it is held as mutex or spin, just fail */ + return FALSE; + } + if (!(state & LCK_MTX_ILOCKED_MSK)) { + *new_state = state; + return TRUE; + } + } +} +/* + * Routine: lck_mtx_lock_slow + * + * Locks a mutex 
for current thread. + * If the lock is contended this function might + * sleep. + * + * Called with interlock not held. + */ +__attribute__((noinline)) void -db_show_one_simple_lock ( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif) +lck_mtx_lock_slow( + lck_mtx_t *lock) { - simple_lock_t saddr = (simple_lock_t)addr; + boolean_t indirect = FALSE; + uint32_t state; + int first_miss = 0; + + state = ordered_load_mtx_state(lock); + + /* is the interlock or mutex held */ + if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) { + /* + * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT + * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK + * set in state (state == lck_mtx_tag) + */ + + + /* is the mutex already held and not indirect */ + if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){ + /* no, must have been the mutex */ + return lck_mtx_lock_contended(lock, indirect, &first_miss); + } + + /* check to see if it is marked destroyed */ + if (__improbable(state == LCK_MTX_TAG_DESTROYED)) { + lck_mtx_destroyed(lock); + } + + /* Is this an indirect mutex? 
*/ + if (__improbable(state == LCK_MTX_TAG_INDIRECT)) { + indirect = get_indirect_mutex(&lock, &state); + + first_miss = 0; + lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock); - if (saddr == (simple_lock_t)0 || !have_addr) { - db_error ("No simple_lock\n"); + if (state & LCK_MTX_SPIN_MSK) { + /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */ + assert(state & LCK_MTX_ILOCKED_MSK); + lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); + } + } + + if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) { + return lck_mtx_lock_contended(lock, indirect, &first_miss); + } } -#if USLOCK_DEBUG - else if (saddr->lock_type != USLOCK_TAG) - db_error ("Not a simple_lock\n"); -#endif /* USLOCK_DEBUG */ - db_printf ("%s\n", simple_lock_labels); - db_print_simple_lock (saddr); + /* no - can't be INDIRECT, DESTROYED or locked */ + while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) { + if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) { + return lck_mtx_lock_contended(lock, indirect, &first_miss); + } + } + + /* lock and interlock acquired */ + + thread_t thread = current_thread(); + /* record owner of mutex */ + ordered_store_mtx_owner(lock, (uintptr_t)thread); + +#if MACH_LDEBUG + if (thread) { + thread->mutex_count++; /* lock statistic */ + } +#endif + /* + * Check if there are waiters to + * inherit their priority. 
+ */ + if (__improbable(state & LCK_MTX_WAITERS_MSK)) { + return lck_mtx_lock_acquire_tail(lock, indirect, NULL); + } + + /* release the interlock */ + lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect); + + return; +} + +__attribute__((noinline)) +boolean_t +lck_mtx_try_lock_slow( + lck_mtx_t *lock) +{ + boolean_t indirect = FALSE; + uint32_t state; + int first_miss = 0; + + state = ordered_load_mtx_state(lock); + + /* is the interlock or mutex held */ + if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) { + /* + * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT + * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK + * set in state (state == lck_mtx_tag) + */ + + /* is the mutex already held and not indirect */ + if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){ + return FALSE; + } + + /* check to see if it is marked destroyed */ + if (__improbable(state == LCK_MTX_TAG_DESTROYED)) { + lck_mtx_try_destroyed(lock); + } + + /* Is this an indirect mutex? 
*/ + if (__improbable(state == LCK_MTX_TAG_INDIRECT)) { + indirect = get_indirect_mutex(&lock, &state); + + first_miss = 0; + lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock); + } + + if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) { + if (indirect) + lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); + return FALSE; + } + } + + /* no - can't be INDIRECT, DESTROYED or locked */ + while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) { + if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) { + if (indirect) + lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); + return FALSE; + } + } + + /* lock and interlock acquired */ + + thread_t thread = current_thread(); + /* record owner of mutex */ + ordered_store_mtx_owner(lock, (uintptr_t)thread); + +#if MACH_LDEBUG + if (thread) { + thread->mutex_count++; /* lock statistic */ + } +#endif + /* + * Check if there are waiters to + * inherit their priority. 
+ */ + if (__improbable(state & LCK_MTX_WAITERS_MSK)) { + return lck_mtx_try_lock_acquire_tail(lock); + } + + /* release the interlock */ + lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock)); + + return TRUE; + } +__attribute__((noinline)) void -db_print_simple_lock ( - simple_lock_t addr) +lck_mtx_lock_spin_slow( + lck_mtx_t *lock) { + boolean_t indirect = FALSE; + uint32_t state; + int first_miss = 0; + + state = ordered_load_mtx_state(lock); + + /* is the interlock or mutex held */ + if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) { + /* + * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT + * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK + * set in state (state == lck_mtx_tag) + */ + + + /* is the mutex already held and not indirect */ + if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){ + /* no, must have been the mutex */ + return lck_mtx_lock_contended(lock, indirect, &first_miss); + } + + /* check to see if it is marked destroyed */ + if (__improbable(state == LCK_MTX_TAG_DESTROYED)) { + lck_mtx_destroyed(lock); + } + + /* Is this an indirect mutex? 
*/ + if (__improbable(state == LCK_MTX_TAG_INDIRECT)) { + indirect = get_indirect_mutex(&lock, &state); + + first_miss = 0; + lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock); + + if (state & LCK_MTX_SPIN_MSK) { + /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */ + assert(state & LCK_MTX_ILOCKED_MSK); + lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); + } + } + + if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) { + return lck_mtx_lock_contended(lock, indirect, &first_miss); + } + } + + /* no - can't be INDIRECT, DESTROYED or locked */ + while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state) )) { + if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) { + return lck_mtx_lock_contended(lock, indirect, &first_miss); + } + } + + /* lock as spinlock and interlock acquired */ + + thread_t thread = current_thread(); + /* record owner of mutex */ + ordered_store_mtx_owner(lock, (uintptr_t)thread); + +#if MACH_LDEBUG + if (thread) { + thread->mutex_count++; /* lock statistic */ + } +#endif + +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0); +#endif + /* return with the interlock held and preemption disabled */ + return; +} + +__attribute__((noinline)) +boolean_t +lck_mtx_try_lock_spin_slow( + lck_mtx_t *lock) +{ + boolean_t indirect = FALSE; + uint32_t state; + int first_miss = 0; + + state = ordered_load_mtx_state(lock); + + /* is the interlock or mutex held */ + if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) { + /* + * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT + * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK + * set in state (state == lck_mtx_tag) + */ + + /* is the mutex already held and not indirect */ + if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){ + return FALSE; + } + + /* check to see if it is marked destroyed */ + if (__improbable(state == LCK_MTX_TAG_DESTROYED)) { + lck_mtx_try_destroyed(lock); + } + + /* 
Is this an indirect mutex? */ + if (__improbable(state == LCK_MTX_TAG_INDIRECT)) { + indirect = get_indirect_mutex(&lock, &state); + + first_miss = 0; + lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock); + } + + if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) { + if (indirect) + lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); + return FALSE; + } + } + + /* no - can't be INDIRECT, DESTROYED or locked */ + while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) { + if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) { + if (indirect) + lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); + return FALSE; + } + } + + /* lock and interlock acquired */ + + thread_t thread = current_thread(); + /* record owner of mutex */ + ordered_store_mtx_owner(lock, (uintptr_t)thread); + +#if MACH_LDEBUG + if (thread) { + thread->mutex_count++; /* lock statistic */ + } +#endif + +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0); +#endif + return TRUE; - db_printf ("%08x %3d", addr, *hw_lock_addr(addr->interlock)); -#if USLOCK_DEBUG - db_printf (" %08x", addr->debug.lock_thread); - db_printf (" %08x ", addr->debug.duration[1]); - db_printsym ((int)addr->debug.lock_pc, DB_STGY_ANY); -#endif /* USLOCK_DEBUG */ - db_printf ("\n"); } +__attribute__((noinline)) void -db_show_one_mutex ( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif) +lck_mtx_convert_spin( + lck_mtx_t *lock) { - mutex_t * maddr = (mutex_t *)addr; + uint32_t state; - if (maddr == (mutex_t *)0 || !have_addr) - db_error ("No mutex\n"); -#if MACH_LDEBUG - else if (maddr->type != MUTEX_TAG) - db_error ("Not a mutex\n"); -#endif /* MACH_LDEBUG */ + state = ordered_load_mtx_state(lock); + + /* Is this an indirect mutex? 
*/ + if (__improbable(state == LCK_MTX_TAG_INDIRECT)) { + /* If so, take indirection */ + get_indirect_mutex(&lock, &state); + } + + assertf((thread_t)lock->lck_mtx_owner == current_thread(), "lock %p not owned by thread %p (current owner %p)", lock, current_thread(), (thread_t)lock->lck_mtx_owner ); + + if (__improbable(state & LCK_MTX_MLOCKED_MSK)) { + /* already owned as a mutex, just return */ + return; + } + + assert(get_preemption_level() > 0); + assert(state & LCK_MTX_ILOCKED_MSK); + assert(state & LCK_MTX_SPIN_MSK); + + /* + * Check if there are waiters to + * inherit their priority. + */ + if (__improbable(state & LCK_MTX_WAITERS_MSK)) { + return lck_mtx_convert_spin_acquire_tail(lock); + } + + lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock)); + + return; +} + +static inline boolean_t +lck_mtx_lock_grab_mutex( + lck_mtx_t *lock) +{ + uint32_t state; + + state = ordered_load_mtx_state(lock); + + if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) { + return FALSE; + } + + /* lock and interlock acquired */ - db_printf ("%s\n", mutex_labels); - db_print_mutex (maddr); + thread_t thread = current_thread(); + /* record owner of mutex */ + ordered_store_mtx_owner(lock, (uintptr_t)thread); + +#if MACH_LDEBUG + if (thread) { + thread->mutex_count++; /* lock statistic */ + } +#endif + return TRUE; } +__attribute__((noinline)) void -db_print_mutex ( - mutex_t * addr) +lck_mtx_assert( + lck_mtx_t *lock, + unsigned int type) { - db_printf ("%08x %6d %7d", - addr, *addr, addr->lck_mtx.lck_mtx_waiters); -#if MACH_LDEBUG - db_printf (" %08x ", addr->thread); - db_printsym (addr->pc, DB_STGY_ANY); -#endif /* MACH_LDEBUG */ - db_printf ("\n"); + thread_t thread, owner; + uint32_t state; + + thread = current_thread(); + state = ordered_load_mtx_state(lock); + + if (state == LCK_MTX_TAG_INDIRECT) { + get_indirect_mutex(&lock, &state); + } + + owner = (thread_t)lock->lck_mtx_owner; + + if (type == LCK_MTX_ASSERT_OWNED) { + if 
(owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) + panic("mutex (%p) not owned\n", lock); + } else { + assert (type == LCK_MTX_ASSERT_NOTOWNED); + if (owner == thread) + panic("mutex (%p) owned\n", lock); + } +} + +/* + * Routine: lck_mtx_lock_spinwait_x86 + * + * Invoked trying to acquire a mutex when there is contention but + * the holder is running on another processor. We spin for up to a maximum + * time waiting for the lock to be released. + * + * Called with the interlock unlocked. + * returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired + * returns LCK_MTX_SPINWAIT_SPUN if we spun + * returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running + */ +__attribute__((noinline)) +lck_mtx_spinwait_ret_type_t +lck_mtx_lock_spinwait_x86( + lck_mtx_t *mutex) +{ + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); + thread_t holder; + uint64_t overall_deadline; + uint64_t check_owner_deadline; + uint64_t cur_time; + lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN; + int loopcount = 0; + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START, + trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0); + + cur_time = mach_absolute_time(); + overall_deadline = cur_time + MutexSpin; + check_owner_deadline = cur_time; + + /* + * Spin while: + * - mutex is locked, and + * - its locked as a spin lock, and + * - owner is running on another processor, and + * - owner (processor) is not idling, and + * - we haven't spun for long enough. + */ + do { + if (__probable(lck_mtx_lock_grab_mutex(mutex))) { + retval = LCK_MTX_SPINWAIT_ACQUIRED; + break; + } + cur_time = mach_absolute_time(); + + if (cur_time >= overall_deadline) + break; + + if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) { + boolean_t istate; + + /* + * We will repeatedly peek at the state of the lock while spinning, + * and we will acquire the interlock to do so. 
+ * The thread that will unlock the mutex will also need to acquire + * the interlock, and we want to avoid to slow it down. + * To avoid to get an interrupt while holding the interlock + * and increase the time we are holding it, we + * will try to acquire the interlock with interrupts disabled. + * This is safe because it is a "try_lock", if we can't acquire + * the interlock we re-enable the interrupts and fail, so it is + * ok to call it even if the interlock was already held. + */ + if (lck_mtx_interlock_try_lock_disable_interrupts(mutex, &istate)) { + + if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) { + + if ( !(holder->machine.specFlags & OnProc) || + (holder->state & TH_IDLE)) { + + lck_mtx_interlock_unlock_enable_interrupts(mutex, istate); + + if (loopcount == 0) + retval = LCK_MTX_SPINWAIT_NO_SPIN; + break; + } + } + lck_mtx_interlock_unlock_enable_interrupts(mutex, istate); + + check_owner_deadline = cur_time + (MutexSpin / 4); + } + } + cpu_pause(); + + loopcount++; + + } while (TRUE); + +#if CONFIG_DTRACE + /* + * We've already kept a count via overall_deadline of how long we spun. + * If dtrace is active, then we compute backwards to decide how + * long we spun. + * + * Note that we record a different probe id depending on whether + * this is a direct or indirect mutex. This allows us to + * penalize only lock groups that have debug/stats enabled + * with dtrace processing if desired. + */ + if (__probable(mutex->lck_mtx_is_ext == 0)) { + LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex, + mach_absolute_time() - (overall_deadline - MutexSpin)); + } else { + LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex, + mach_absolute_time() - (overall_deadline - MutexSpin)); + } + /* The lockstat acquire event is recorded by the assembly code beneath us. 
*/ +#endif + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END, + trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0); + + return retval; +} + + + +/* + * Routine: lck_mtx_lock_wait_x86 + * + * Invoked in order to wait on contention. + * + * Called with the interlock locked and + * preemption disabled... + * returns it unlocked and with preemption enabled + * + * lck_mtx_waiters is 1:1 with a wakeup needing to occur. + * A runnable waiter can exist between wait and acquire + * without a waiters count being set. + * This allows us to never make a spurious wakeup call. + * + * Priority: + * This avoids taking the thread lock if the owning thread is the same priority. + * This optimizes the case of same-priority threads contending on a lock. + * However, that allows the owning thread to drop in priority while holding the lock, + * because there is no state that the priority change can notice that + * says that the targeted thread holds a contended mutex. + * + * One possible solution: priority changes could look for some atomic tag + * on the thread saying 'holding contended lock', and then set up a promotion. + * Needs a story for dropping that promotion - the last contended unlock + * has to notice that this has happened. 
+ */ +__attribute__((noinline)) +void +lck_mtx_lock_wait_x86 ( + lck_mtx_t *mutex, + struct turnstile **ts) +{ + thread_t self = current_thread(); + +#if CONFIG_DTRACE + uint64_t sleep_start = 0; + + if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) { + sleep_start = mach_absolute_time(); + } +#endif + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, + trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), + mutex->lck_mtx_waiters, 0, 0); + + assert(self->waiting_for_mutex == NULL); + self->waiting_for_mutex = mutex; + mutex->lck_mtx_waiters++; + + thread_t holder = (thread_t)mutex->lck_mtx_owner; + assert(holder != NULL); + + /* + * lck_mtx_lock_wait_x86 might be called on a loop. Call prepare just once and reuse + * the same turnstile while looping, the matching turnstile compleate will be called + * by lck_mtx_lock_contended when finally acquiring the lock. + */ + if (*ts == NULL) { + *ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX); + } + + struct turnstile *turnstile = *ts; + thread_set_pending_block_hint(self, kThreadWaitKernelMutex); + turnstile_update_inheritor(turnstile, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD)); + + waitq_assert_wait64(&turnstile->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER); + + lck_mtx_ilk_unlock(mutex); + + turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_NOT_HELD); + + thread_block(THREAD_CONTINUE_NULL); + + self->waiting_for_mutex = NULL; + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, + trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), + mutex->lck_mtx_waiters, 0, 0); + +#if CONFIG_DTRACE + /* + * Record the Dtrace lockstat probe for blocking, block time + * measured from when we were entered. 
+ */ + if (sleep_start) { + if (mutex->lck_mtx_is_ext == 0) { + LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex, + mach_absolute_time() - sleep_start); + } else { + LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex, + mach_absolute_time() - sleep_start); + } + } +#endif } -#endif /* MACH_KDB */ +/* + * Routine: kdp_lck_mtx_lock_spin_is_acquired + * NOT SAFE: To be used only by kernel debugger to avoid deadlock. + * Returns: TRUE if lock is acquired. + */ +boolean_t +kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck) +{ + if (not_in_kdp) { + panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger"); + } + + if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) { + return TRUE; + } + + return FALSE; +} + +void +kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo) +{ + lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event); + waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex); + thread_t holder = (thread_t)mutex->lck_mtx_owner; + waitinfo->owner = thread_tid(holder); +} + +void +kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo) +{ + lck_rw_t *rwlck = NULL; + switch(waitinfo->wait_type) { + case kThreadWaitKernelRWLockRead: + rwlck = READ_EVENT_TO_RWLOCK(event); + break; + case kThreadWaitKernelRWLockWrite: + case kThreadWaitKernelRWLockUpgrade: + rwlck = WRITE_EVENT_TO_RWLOCK(event); + break; + default: + panic("%s was called with an invalid blocking type", __FUNCTION__); + break; + } + waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck); + waitinfo->owner = 0; +}