/*
 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 * Author: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Locking primitives implementation
 */
#define LOCK_PRIVATE 1

#include <mach_ldebug.h>

#include <kern/lock_stat.h>
#include <kern/locks.h>
#include <kern/kalloc.h>
#include <kern/misc_protos.h>
#include <kern/thread.h>
#include <kern/processor.h>
#include <kern/cpu_data.h>
#include <kern/cpu_number.h>
#include <kern/sched_prim.h>
#include <kern/debug.h>

#include <i386/machine_routines.h> /* machine_timeout_suspended() */
#include <machine/atomic.h>
#include <machine/machine_cpu.h>

#include <machine/atomic.h>
#include <sys/kdebug.h>
#include <i386/locks_i386_inlines.h>
#include <kern/cpu_number.h>

#if CONFIG_DTRACE
#define DTRACE_RW_SHARED        0x0     //reader
#define DTRACE_RW_EXCL          0x1     //writer
#define DTRACE_NO_FLAG          0x0     //not applicable
#endif  /* CONFIG_DTRACE */

#define LCK_RW_LCK_EXCLUSIVE_CODE       0x100
#define LCK_RW_LCK_EXCLUSIVE1_CODE      0x101
#define LCK_RW_LCK_SHARED_CODE          0x102
#define LCK_RW_LCK_SH_TO_EX_CODE        0x103
#define LCK_RW_LCK_SH_TO_EX1_CODE       0x104
#define LCK_RW_LCK_EX_TO_SH_CODE        0x105

#define LCK_RW_LCK_EX_WRITER_SPIN_CODE  0x106
#define LCK_RW_LCK_EX_WRITER_WAIT_CODE  0x107
#define LCK_RW_LCK_EX_READER_SPIN_CODE  0x108
#define LCK_RW_LCK_EX_READER_WAIT_CODE  0x109
#define LCK_RW_LCK_SHARED_SPIN_CODE     0x110
#define LCK_RW_LCK_SHARED_WAIT_CODE     0x111
#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE   0x112
#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE   0x113

#define ANY_LOCK_DEBUG  (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
unsigned int LcksOpts = 0;

#if DEVELOPMENT || DEBUG
unsigned int LckDisablePreemptCheck = 0;
#endif

#if USLOCK_DEBUG
/*
 * Perform simple lock checks.
 */
int     uslock_check = 1;
int     max_lock_loops = 100000000;
decl_simple_lock_data(extern, printf_lock);
decl_simple_lock_data(extern, panic_lock);
#endif  /* USLOCK_DEBUG */

extern unsigned int not_in_kdp;

/*
 * We often want to know the addresses of the callers
 * of the various lock routines.  However, this information
 * is only used for debugging and statistics.
 */
#define INVALID_PC      ((void *) VM_MAX_KERNEL_ADDRESS)
#define INVALID_THREAD  ((void *) VM_MAX_KERNEL_ADDRESS)

#if ANY_LOCK_DEBUG
#define OBTAIN_PC(pc)   ((pc) = GET_RETURN_PC())
#define DECL_PC(pc)     pc_t pc;
#else   /* ANY_LOCK_DEBUG */
#define DECL_PC(pc)
#ifdef lint
/*
 * Eliminate lint complaints about unused local pc variables.
 */
#define OBTAIN_PC(pc)   ++pc
#else   /* lint */
#define OBTAIN_PC(pc)
#endif  /* lint */
#endif  /* USLOCK_DEBUG */
/*
 * atomic exchange API is a low level abstraction of the operations
 * to atomically read, modify, and write a pointer.  This abstraction works
 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
 * well as the ARM exclusive instructions.
 *
 * atomic_exchange_begin() - begin exchange and retrieve current value
 * atomic_exchange_complete() - conclude an exchange
 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
 */
static uint32_t
atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
{
	uint32_t        val;

	(void)ord;                      // Memory order not used
	val = os_atomic_load(target, relaxed);
	*previous = val;
	return val;
}

static boolean_t
atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
{
	return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
}

static void
atomic_exchange_abort(void)
{
}

static boolean_t
atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
{
	uint32_t        value, prev;

	for (;;) {
		value = atomic_exchange_begin32(target, &prev, ord);
		if (value & test_mask) {
			if (wait) {
				cpu_pause();
			} else {
				atomic_exchange_abort();
				return FALSE;
			}
		} else {
			value |= set_mask;
			if (atomic_exchange_complete32(target, prev, value, ord)) {
				return TRUE;
			}
		}
	}
}

inline boolean_t
hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
{
	return atomic_test_and_set32(target, test_mask, set_mask, ord, wait);
}
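
/*
 * Illustrative usage sketch (not a kernel API contract): a typical caller
 * retries the begin/complete pair until the compare-and-exchange succeeds,
 * and calls atomic_exchange_abort() when it decides not to publish a new
 * value.  This mirrors the pattern used by lck_rw_grab_want() below; the
 * names `word' and `flag' are hypothetical.
 *
 *	uint32_t data, prev;
 *
 *	for (;;) {
 *		data = atomic_exchange_begin32(&word, &prev, memory_order_relaxed);
 *		if (data & flag) {
 *			atomic_exchange_abort();	// give up, publish nothing
 *			return FALSE;
 *		}
 *		data |= flag;
 *		if (atomic_exchange_complete32(&word, prev, data, memory_order_relaxed)) {
 *			return TRUE;			// CAS succeeded
 *		}
 *		// CAS failed: another CPU changed the word, retry
 *	}
 */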
/*
 * Portable lock package implementation of usimple_locks.
 */

#if USLOCK_DEBUG
#define USLDBG(stmt)    stmt
void            usld_lock_init(usimple_lock_t, unsigned short);
void            usld_lock_pre(usimple_lock_t, pc_t);
void            usld_lock_post(usimple_lock_t, pc_t);
void            usld_unlock(usimple_lock_t, pc_t);
void            usld_lock_try_pre(usimple_lock_t, pc_t);
void            usld_lock_try_post(usimple_lock_t, pc_t);
int             usld_lock_common_checks(usimple_lock_t, char *);
#else   /* USLOCK_DEBUG */
#define USLDBG(stmt)
#endif  /* USLOCK_DEBUG */

/*
 * Forward definitions
 */

static void lck_rw_lock_shared_gen(lck_rw_t *lck);
static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
void lck_rw_clear_promotions_x86(thread_t thread);
static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
static boolean_t lck_rw_grab_want(lck_rw_t *lock);
static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, uint32_t state, boolean_t indirect);
static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state);
static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state);
static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state);
static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
/*
 * Routine: lck_spin_alloc_init
 */
lck_spin_t *
lck_spin_alloc_init(
	lck_grp_t       *grp,
	lck_attr_t      *attr)
{
	lck_spin_t      *lck;

	if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0) {
		lck_spin_init(lck, grp, attr);
	}

	return lck;
}

/*
 * Routine: lck_spin_free
 */
void
lck_spin_free(
	lck_spin_t      *lck,
	lck_grp_t       *grp)
{
	lck_spin_destroy(lck, grp);
	kfree(lck, sizeof(lck_spin_t));
}

/*
 * Routine: lck_spin_init
 */
void
lck_spin_init(
	lck_spin_t      *lck,
	lck_grp_t       *grp,
	__unused lck_attr_t     *attr)
{
	usimple_lock_init((usimple_lock_t) lck, 0);
	if (grp) {
		lck_grp_reference(grp);
		lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
	}
}

/*
 * Routine: lck_spin_destroy
 */
void
lck_spin_destroy(
	lck_spin_t      *lck,
	lck_grp_t       *grp)
{
	if (lck->interlock == LCK_SPIN_TAG_DESTROYED) {
		return;
	}
	lck->interlock = LCK_SPIN_TAG_DESTROYED;
	if (grp) {
		lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
		lck_grp_deallocate(grp);
	}
	return;
}

/*
 * Routine: lck_spin_lock
 */
void
lck_spin_lock_grp(
	lck_spin_t      *lck,
	lck_grp_t       *grp)
{
	usimple_lock((usimple_lock_t) lck, grp);
}

void
lck_spin_lock(
	lck_spin_t      *lck)
{
	usimple_lock((usimple_lock_t) lck, NULL);
}

/*
 * Routine: lck_spin_unlock
 */
void
lck_spin_unlock(
	lck_spin_t      *lck)
{
	usimple_unlock((usimple_lock_t) lck);
}

boolean_t
lck_spin_try_lock_grp(
	lck_spin_t      *lck,
	lck_grp_t       *grp)
{
	boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, grp);
#if DEVELOPMENT || DEBUG
	if (lrval) {
		pltrace(FALSE);
	}
#endif
	return lrval;
}

/*
 * Routine: lck_spin_try_lock
 */
boolean_t
lck_spin_try_lock(
	lck_spin_t      *lck)
{
	boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, LCK_GRP_NULL);
#if DEVELOPMENT || DEBUG
	if (lrval) {
		pltrace(FALSE);
	}
#endif
	return lrval;
}

/*
 * Routine: lck_spin_assert
 */
void
lck_spin_assert(lck_spin_t *lock, unsigned int type)
{
	thread_t thread, holder;
	uintptr_t state;

	if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
		panic("lck_spin_assert(): invalid arg (%u)", type);
	}

	state = lock->interlock;
	holder = (thread_t)state;
	thread = current_thread();
	if (type == LCK_ASSERT_OWNED) {
		if (__improbable(holder == THREAD_NULL)) {
			panic("Lock not owned %p = %lx", lock, state);
		}
		if (__improbable(holder != thread)) {
			panic("Lock not owned by current thread %p = %lx", lock, state);
		}
	} else if (type == LCK_ASSERT_NOTOWNED) {
		if (__improbable(holder != THREAD_NULL)) {
			if (holder == thread) {
				panic("Lock owned by current thread %p = %lx", lock, state);
			}
		}
	}
}

/*
 * Routine: kdp_lck_spin_is_acquired
 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 * Returns: TRUE if lock is acquired.
 */
boolean_t
kdp_lck_spin_is_acquired(lck_spin_t *lck)
{
	if (not_in_kdp) {
		panic("panic: spinlock acquired check done outside of kernel debugger");
	}
	return (lck->interlock != 0)? TRUE : FALSE;
}
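
/*
 * Illustrative usage sketch (hypothetical caller, not part of this file):
 * spin locks allocated through this interface are taken and dropped with
 * the routines above; preemption stays disabled for as long as the lock
 * is held, so the critical section must be short and non-blocking.
 *
 *	lck_grp_t  *my_grp  = lck_grp_alloc_init("my-subsystem", LCK_GRP_ATTR_NULL);
 *	lck_spin_t *my_lock = lck_spin_alloc_init(my_grp, LCK_ATTR_NULL);
 *
 *	lck_spin_lock(my_lock);
 *	// ... short critical section, no blocking ...
 *	lck_spin_unlock(my_lock);
 *
 *	lck_spin_free(my_lock, my_grp);
 */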
/*
 * Initialize a usimple_lock.
 *
 * No change in preemption state.
 */
void
usimple_lock_init(
	usimple_lock_t  l,
	__unused unsigned short tag)
{
#ifndef MACHINE_SIMPLE_LOCK
	USLDBG(usld_lock_init(l, tag));
	hw_lock_init(&l->interlock);
#else
	simple_lock_init((simple_lock_t)l, tag);
#endif
}

volatile uint32_t spinlock_owner_cpu = ~0;
volatile usimple_lock_t spinlock_timed_out;

uint32_t
spinlock_timeout_NMI(uintptr_t thread_addr)
{
	uint32_t i;

	for (i = 0; i < real_ncpus; i++) {
		if ((cpu_data_ptr[i] != NULL) && ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr)) {
			spinlock_owner_cpu = i;
			if ((uint32_t) cpu_number() != i) {
				/* Cause NMI and panic on the owner's cpu */
				NMIPI_panic(cpu_to_cpumask(i), SPINLOCK_TIMEOUT);
			}
			break;
		}
	}

	return spinlock_owner_cpu;
}

/*
 * Acquire a usimple_lock.
 *
 * Returns with preemption disabled.  Note
 * that the hw_lock routines are responsible for
 * maintaining preemption state.
 */
void
(usimple_lock)(
	usimple_lock_t  l
	LCK_GRP_ARG(lck_grp_t *grp))
{
#ifndef MACHINE_SIMPLE_LOCK
	DECL_PC(pc);

	OBTAIN_PC(pc);
	USLDBG(usld_lock_pre(l, pc));

	if (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) {
		boolean_t uslock_acquired = FALSE;
		while (machine_timeout_suspended()) {
			enable_preemption();
			if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC, grp))) {
				break;
			}
		}

		if (uslock_acquired == FALSE) {
			uint32_t lock_cpu;
			uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
			spinlock_timed_out = l;
			lock_cpu = spinlock_timeout_NMI(lowner);
			panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
			    l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time());
		}
	}
#if DEVELOPMENT || DEBUG
	pltrace(FALSE);
#endif

	USLDBG(usld_lock_post(l, pc));
#else
	simple_lock((simple_lock_t)l, grp);
#endif
#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
#endif
}

/*
 * Release a usimple_lock.
 *
 * Returns with preemption enabled.  Note
 * that the hw_lock routines are responsible for
 * maintaining preemption state.
 */
void
(usimple_unlock)(
	usimple_lock_t  l)
{
#ifndef MACHINE_SIMPLE_LOCK
	DECL_PC(pc);

	OBTAIN_PC(pc);
	USLDBG(usld_unlock(l, pc));
#if DEVELOPMENT || DEBUG
	pltrace(TRUE);
#endif
	hw_lock_unlock(&l->interlock);
#else
	simple_unlock_rwmb((simple_lock_t)l);
#endif
}

/*
 * Conditionally acquire a usimple_lock.
 *
 * On success, returns with preemption disabled.
 * On failure, returns with preemption in the same state
 * as when first invoked.  Note that the hw_lock routines
 * are responsible for maintaining preemption state.
 *
 * XXX No stats are gathered on a miss; I preserved this
 * behavior from the original assembly-language code, but
 * doesn't it make sense to log misses?  XXX
 */
unsigned int
usimple_lock_try(
	usimple_lock_t  l,
	lck_grp_t       *grp)
{
#ifndef MACHINE_SIMPLE_LOCK
	unsigned int    success;
	DECL_PC(pc);

	OBTAIN_PC(pc);
	USLDBG(usld_lock_try_pre(l, pc));
	if ((success = hw_lock_try(&l->interlock, grp))) {
#if DEVELOPMENT || DEBUG
		pltrace(FALSE);
#endif
		USLDBG(usld_lock_try_post(l, pc));
	}
	return success;
#else
	return simple_lock_try((simple_lock_t)l, grp);
#endif
}

/*
 * Acquire a usimple_lock while polling for pending cpu signals
 * and spinning on a lock.
 */
unsigned int
(usimple_lock_try_lock_mp_signal_safe_loop_deadline)(usimple_lock_t l,
    uint64_t deadline
    LCK_GRP_ARG(lck_grp_t *grp))
{
	boolean_t istate = ml_get_interrupts_enabled();

	if (deadline < mach_absolute_time()) {
		return 0;
	}

	while (!simple_lock_try(l, grp)) {
		if (!istate) {
			cpu_signal_handler(NULL);
		}

		if (deadline < mach_absolute_time()) {
			return 0;
		}

		cpu_pause();
	}

	return 1;
}

void
(usimple_lock_try_lock_loop)(usimple_lock_t l
    LCK_GRP_ARG(lck_grp_t *grp))
{
	usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ULLONG_MAX, grp);
}

unsigned int
(usimple_lock_try_lock_mp_signal_safe_loop_duration)(usimple_lock_t l,
    uint64_t duration
    LCK_GRP_ARG(lck_grp_t *grp))
{
	uint64_t deadline;
	uint64_t base_at = mach_absolute_time();
	uint64_t duration_at;

	nanoseconds_to_absolutetime(duration, &duration_at);
	deadline = base_at + duration_at;
	if (deadline < base_at) {
		/* deadline has overflowed, make it saturate */
		deadline = ULLONG_MAX;
	}

	return usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, deadline, grp);
}
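
/*
 * Illustrative usage sketch (hypothetical caller): the loop variants above
 * keep polling for pending cross-CPU signals while spinning, so they are
 * safe to use when interrupts are disabled.  A caller that wants to give
 * up after roughly 100us of spinning might do:
 *
 *	if (!usimple_lock_try_lock_mp_signal_safe_loop_duration(l, 100 * NSEC_PER_USEC, grp)) {
 *		// timed out: lock still busy, fall back to some other strategy
 *	}
 */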
#if USLOCK_DEBUG
/*
 * States of a usimple_lock.  The default when initializing
 * a usimple_lock is setting it up for debug checking.
 */
#define USLOCK_CHECKED          0x0001          /* lock is being checked */
#define USLOCK_TAKEN            0x0002          /* lock has been taken */
#define USLOCK_INIT             0xBAA0          /* lock has been initialized */
#define USLOCK_INITIALIZED      (USLOCK_INIT|USLOCK_CHECKED)
#define USLOCK_CHECKING(l)      (uslock_check && \
	                         ((l)->debug.state & USLOCK_CHECKED))

/*
 * Initialize the debugging information contained
 * in a usimple_lock.
 */
void
usld_lock_init(
	usimple_lock_t  l,
	__unused unsigned short tag)
{
	if (l == USIMPLE_LOCK_NULL) {
		panic("lock initialization: null lock pointer");
	}
	l->lock_type = USLOCK_TAG;
	l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
	l->debug.lock_cpu = l->debug.unlock_cpu = 0;
	l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
	l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
	l->debug.duration[0] = l->debug.duration[1] = 0;
	l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
	l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
	l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
}

/*
 * These checks apply to all usimple_locks, not just
 * those with USLOCK_CHECKED turned on.
 */
int
usld_lock_common_checks(
	usimple_lock_t  l,
	char            *caller)
{
	if (l == USIMPLE_LOCK_NULL) {
		panic("%s: null lock pointer", caller);
	}
	if (l->lock_type != USLOCK_TAG) {
		panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
	}
	if (!(l->debug.state & USLOCK_INIT)) {
		panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
	}
	return USLOCK_CHECKING(l);
}

/*
 * Debug checks on a usimple_lock just before attempting
 * to acquire it.
 */
void
usld_lock_pre(
	usimple_lock_t  l,
	pc_t            pc)
{
	char    caller[] = "usimple_lock";

	if (!usld_lock_common_checks(l, caller)) {
		return;
	}

	/*
	 * Note that we have a weird case where we are getting a lock when we are
	 * in the process of putting the system to sleep. We are running with no
	 * current threads, therefore we can't tell if we are trying to retake a lock
	 * we have or someone on the other processor has it.  Therefore we just
	 * ignore this test if the locking thread is 0.
	 */
	if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
	    l->debug.lock_thread == (void *) current_thread()) {
		printf("%s: lock %p already locked (at %p) by",
		    caller, l, l->debug.lock_pc);
		printf(" current thread %p (new attempt at pc %p)\n",
		    l->debug.lock_thread, pc);
	}
	mp_disable_preemption();
	mp_enable_preemption();
}

/*
 * Debug checks on a usimple_lock just after acquiring it.
 *
 * Pre-emption has been disabled at this point,
 * so we are safe in using cpu_number.
 */
void
usld_lock_post(
	usimple_lock_t  l,
	pc_t            pc)
{
	int     mycpu;
	char    caller[] = "successful usimple_lock";

	if (!usld_lock_common_checks(l, caller)) {
		return;
	}

	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
		panic("%s: lock %p became uninitialized",
		    caller, l);
	}
	if ((l->debug.state & USLOCK_TAKEN)) {
		panic("%s: lock 0x%p became TAKEN by someone else",
		    caller, l);
	}

	mycpu = cpu_number();
	l->debug.lock_thread = (void *)current_thread();
	l->debug.state |= USLOCK_TAKEN;
	l->debug.lock_pc = pc;
	l->debug.lock_cpu = mycpu;
}

/*
 * Debug checks on a usimple_lock just before
 * releasing it.  Note that the caller has not
 * yet released the hardware lock.
 *
 * Preemption is still disabled, so there's
 * no problem using cpu_number.
 */
void
usld_unlock(
	usimple_lock_t  l,
	pc_t            pc)
{
	int     mycpu;
	char    caller[] = "usimple_unlock";

	if (!usld_lock_common_checks(l, caller)) {
		return;
	}

	mycpu = cpu_number();

	if (!(l->debug.state & USLOCK_TAKEN)) {
		panic("%s: lock 0x%p hasn't been taken",
		    caller, l);
	}
	if (l->debug.lock_thread != (void *) current_thread()) {
		panic("%s: unlocking lock 0x%p, owned by thread %p",
		    caller, l, l->debug.lock_thread);
	}
	if (l->debug.lock_cpu != mycpu) {
		printf("%s: unlocking lock 0x%p on cpu 0x%x",
		    caller, l, mycpu);
		printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
	}

	l->debug.unlock_thread = l->debug.lock_thread;
	l->debug.lock_thread = INVALID_PC;
	l->debug.state &= ~USLOCK_TAKEN;
	l->debug.unlock_pc = pc;
	l->debug.unlock_cpu = mycpu;
}

/*
 * Debug checks on a usimple_lock just before
 * attempting to acquire it.
 *
 * Preemption isn't guaranteed to be disabled.
 */
void
usld_lock_try_pre(
	usimple_lock_t  l,
	pc_t            pc)
{
	char    caller[] = "usimple_lock_try";

	if (!usld_lock_common_checks(l, caller)) {
		return;
	}
}

/*
 * Debug checks on a usimple_lock just after
 * successfully attempting to acquire it.
 *
 * Preemption has been disabled by the
 * lock acquisition attempt, so it's safe
 * to use cpu_number.
 */
void
usld_lock_try_post(
	usimple_lock_t  l,
	pc_t            pc)
{
	int     mycpu;
	char    caller[] = "successful usimple_lock_try";

	if (!usld_lock_common_checks(l, caller)) {
		return;
	}

	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
		panic("%s: lock 0x%p became uninitialized",
		    caller, l);
	}
	if ((l->debug.state & USLOCK_TAKEN)) {
		panic("%s: lock 0x%p became TAKEN by someone else",
		    caller, l);
	}

	mycpu = cpu_number();
	l->debug.lock_thread = (void *) current_thread();
	l->debug.state |= USLOCK_TAKEN;
	l->debug.lock_pc = pc;
	l->debug.lock_cpu = mycpu;
}
#endif  /* USLOCK_DEBUG */
/*
 * Routine: lck_rw_alloc_init
 */
lck_rw_t *
lck_rw_alloc_init(
	lck_grp_t       *grp,
	lck_attr_t      *attr)
{
	lck_rw_t        *lck;

	if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
		bzero(lck, sizeof(lck_rw_t));
		lck_rw_init(lck, grp, attr);
	}

	return lck;
}

/*
 * Routine: lck_rw_free
 */
void
lck_rw_free(
	lck_rw_t        *lck,
	lck_grp_t       *grp)
{
	lck_rw_destroy(lck, grp);
	kfree(lck, sizeof(lck_rw_t));
}

/*
 * Routine: lck_rw_init
 */
void
lck_rw_init(
	lck_rw_t        *lck,
	lck_grp_t       *grp,
	lck_attr_t      *attr)
{
	lck_attr_t      *lck_attr = (attr != LCK_ATTR_NULL) ?
	    attr : &LockDefaultLckAttr;

	hw_lock_byte_init(&lck->lck_rw_interlock);
	lck->lck_rw_want_write = FALSE;
	lck->lck_rw_want_upgrade = FALSE;
	lck->lck_rw_shared_count = 0;
	lck->lck_rw_can_sleep = TRUE;
	lck->lck_r_waiting = lck->lck_w_waiting = 0;
	lck->lck_rw_tag = 0;
	lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
	    LCK_ATTR_RW_SHARED_PRIORITY) == 0);

	lck_grp_reference(grp);
	lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
}

/*
 * Routine: lck_rw_destroy
 */
void
lck_rw_destroy(
	lck_rw_t        *lck,
	lck_grp_t       *grp)
{
	if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
		return;
	}
#if MACH_LDEBUG
	lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
#endif
	lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
	lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
	lck_grp_deallocate(grp);
	return;
}

/*
 * Sleep locks.  These use the same data structure and algorithm
 * as the spin locks, but the process sleeps while it is waiting
 * for the lock.  These work on uniprocessor systems.
 */

#define DECREMENTER_TIMEOUT 1000000

/*
 * We disable interrupts while holding the RW interlock to prevent an
 * interrupt from exacerbating hold time.
 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
 */
static inline boolean_t
lck_interlock_lock(lck_rw_t *lck)
{
	boolean_t       istate;

	istate = ml_set_interrupts_enabled(FALSE);
	hw_lock_byte_lock(&lck->lck_rw_interlock);
	return istate;
}

static inline void
lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
{
	hw_lock_byte_unlock(&lck->lck_rw_interlock);
	ml_set_interrupts_enabled(istate);
}
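
/*
 * Canonical interlock pattern used throughout the rw-lock slow paths below
 * (sketch only): the saved interrupt state returned by lck_interlock_lock()
 * must be handed back to lck_interlock_unlock() when the interlock is
 * dropped.
 *
 *	boolean_t istate = lck_interlock_lock(lck);
 *	// ... examine / update the lck_rw_t fields under the interlock ...
 *	lck_interlock_unlock(lck, istate);
 */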
/*
 * This inline is used when busy-waiting for an rw lock.
 * If interrupts were disabled when the lock primitive was called,
 * we poll the IPI handler for pending tlb flushes.
 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
 */
static inline void
lck_rw_lock_pause(boolean_t interrupts_enabled)
{
	if (!interrupts_enabled) {
		handle_pending_TLB_flushes();
	}
	cpu_pause();
}

static inline boolean_t
lck_rw_held_read_or_upgrade(lck_rw_t *lock)
{
	if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE)) {
		return TRUE;
	}
	return FALSE;
}

/*
 * compute the deadline to spin against when
 * waiting for a change of state on a lck_rw_t
 */
static inline uint64_t
lck_rw_deadline_for_spin(lck_rw_t *lck)
{
	if (lck->lck_rw_can_sleep) {
		if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
			/*
			 * there are already threads waiting on this lock... this
			 * implies that they have spun beyond their deadlines waiting for
			 * the desired state to show up so we will not bother spinning at this time...
			 *   or
			 * the current number of threads sharing this lock exceeds our capacity to run them
			 * concurrently and since all states we're going to spin for require the rw_shared_count
			 * to be at 0, we'll not bother spinning since the latency for this to happen is
			 * unpredictable...
			 */
			return mach_absolute_time();
		}
		return mach_absolute_time() + MutexSpin;
	} else {
		return mach_absolute_time() + (100000LL * 1000000000LL);
	}
}
/*
 * Spin while interlock is held.
 */
static void
lck_rw_interlock_spin(lck_rw_t *lock)
{
	while (ordered_load(&lock->data) & LCK_RW_INTERLOCK) {
		cpu_pause();
	}
}

static boolean_t
lck_rw_grab_want(lck_rw_t *lock)
{
	uint32_t        data, prev;

	for (;;) {
		data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed);
		if ((data & LCK_RW_INTERLOCK) == 0) {
			break;
		}
		atomic_exchange_abort();
		lck_rw_interlock_spin(lock);
	}
	if (data & LCK_RW_WANT_WRITE) {
		atomic_exchange_abort();
		return FALSE;
	}
	data |= LCK_RW_WANT_WRITE;
	return atomic_exchange_complete32(&lock->data, prev, data, memory_order_relaxed);
}

static boolean_t
lck_rw_grab_shared(lck_rw_t *lock)
{
	uint32_t        data, prev;

	for (;;) {
		data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
		if ((data & LCK_RW_INTERLOCK) == 0) {
			break;
		}
		atomic_exchange_abort();
		lck_rw_interlock_spin(lock);
	}
	if (data & (LCK_RW_WANT_WRITE | LCK_RW_WANT_UPGRADE)) {
		if (((data & LCK_RW_SHARED_MASK) == 0) || (data & LCK_RW_PRIV_EXCL)) {
			atomic_exchange_abort();
			return FALSE;
		}
	}
	data += LCK_RW_SHARED_READER;
	return atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp);
}
/*
 * Routine: lck_rw_lock_exclusive
 */
static void
lck_rw_lock_exclusive_gen(
	lck_rw_t        *lck)
{
	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
	uint64_t        deadline = 0;
	int             slept = 0;
	int             gotlock = 0;
	int             lockheld = 0;
	wait_result_t   res = 0;
	boolean_t       istate = -1;

#if CONFIG_DTRACE
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
#endif

	/*
	 * Try to acquire the lck_rw_want_write bit.
	 */
	while (!lck_rw_grab_want(lck)) {
#if CONFIG_DTRACE
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 * start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif
		if (istate == -1) {
			istate = ml_get_interrupts_enabled();
		}

		deadline = lck_rw_deadline_for_spin(lck);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

		while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline) {
			lck_rw_lock_pause(istate);
		}

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);

		if (gotlock) {
			break;
		}
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock exclusively
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {
			istate = lck_interlock_lock(lck);

			if (lck->lck_rw_want_write) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

				lck->lck_w_waiting = TRUE;

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
				res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
			} else {
				lck->lck_rw_want_write = TRUE;
				lck_interlock_unlock(lck, istate);
				break;
			}
		}
	}
	/*
	 * Wait for readers (and upgrades) to finish...
	 * the test for these conditions must be done simultaneously with
	 * a check of the interlock not being held since
	 * the rw_shared_count will drop to 0 first and then want_upgrade
	 * will be set to 1 in the shared_to_exclusive scenario... those
	 * adjustments are done behind the interlock and represent an
	 * atomic change in state and must be considered as such
	 * however, once we see the read count at 0, the want_upgrade not set
	 * and the interlock not held, we are safe to proceed
	 */
	while (lck_rw_held_read_or_upgrade(lck)) {
#if CONFIG_DTRACE
		/*
		 * Either sleeping or spinning is happening, start
		 * a timing of our delay interval now.  If we set it
		 * to -1 we don't have accurate data so we cannot later
		 * decide to record a dtrace spin or sleep event.
		 */
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 * start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif
		if (istate == -1) {
			istate = ml_get_interrupts_enabled();
		}

		deadline = lck_rw_deadline_for_spin(lck);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

		while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline) {
			lck_rw_lock_pause(istate);
		}

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);

		if (!lockheld) {
			break;
		}
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock exclusively
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {
			istate = lck_interlock_lock(lck);

			if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

				lck->lck_w_waiting = TRUE;

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
				res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
			} else {
				lck_interlock_unlock(lck, istate);
				/*
				 * must own the lock now, since we checked for
				 * readers or upgrade owner behind the interlock
				 * no need for a call to 'lck_rw_held_read_or_upgrade'
				 */
				break;
			}
		}
	}
#if CONFIG_DTRACE
	/*
	 * Decide what latencies we suffered that are Dtrace events.
	 * If we have set wait_interval, then we either spun or slept.
	 * At least we get out from under the interlock before we record
	 * which is the best we can do here to minimize the impact
	 *
	 * If we have set wait_interval to -1, then dtrace was not enabled when we
	 * started sleeping/spinning so we don't record this event.
	 */
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
			    mach_absolute_time() - wait_interval, 1);
		} else {
			/*
			 * For the blocking case, we also record if when we blocked
			 * it was held for read or write, and how many readers.
			 * Notice that above we recorded this before we dropped
			 * the interlock so the count is accurate.
			 */
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
			    mach_absolute_time() - wait_interval, 1,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
#endif  /* CONFIG_DTRACE */
}
/*
 * Routine: lck_rw_done
 */
lck_rw_type_t
lck_rw_done(lck_rw_t *lock)
{
	uint32_t        data, prev;

	for (;;) {
		data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
		if (data & LCK_RW_INTERLOCK) {                  /* wait for interlock to clear */
			atomic_exchange_abort();
			lck_rw_interlock_spin(lock);
			continue;
		}
		if (data & LCK_RW_SHARED_MASK) {
			data -= LCK_RW_SHARED_READER;
			if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
				goto check_waiters;
			}
		} else {                                        /* if reader count == 0, must be exclusive lock */
			if (data & LCK_RW_WANT_UPGRADE) {
				data &= ~(LCK_RW_WANT_UPGRADE);
			} else {
				if (data & LCK_RW_WANT_WRITE) {
					data &= ~(LCK_RW_WANT_EXCL);
				} else {                        /* lock is not 'owned', panic */
					panic("Releasing non-exclusive RW lock without a reader refcount!");
				}
			}
check_waiters:
			if (prev & LCK_RW_W_WAITING) {
				data &= ~(LCK_RW_W_WAITING);
				if ((prev & LCK_RW_PRIV_EXCL) == 0) {
					data &= ~(LCK_RW_R_WAITING);
				}
			} else {
				data &= ~(LCK_RW_R_WAITING);
			}
		}
		if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
			break;
		}
		cpu_pause();
	}
	return lck_rw_done_gen(lock, prev);
}

/*
 * Routine: lck_rw_done_gen
 *
 * called from lck_rw_done()
 * prior_lock_state is the value in the 1st
 * word of the lock at the time of a successful
 * atomic compare and exchange with the new value...
 * it represents the state of the lock before we
 * decremented the rw_shared_count or cleared either
 * rw_want_upgrade or rw_want_write and
 * the lck_x_waiting bits... since the wrapper
 * routine has already changed the state atomically,
 * we just need to decide if we should
 * wake up anyone and what value to return... we do
 * this by examining the state of the lock before
 * we changed it
 */
static lck_rw_type_t
lck_rw_done_gen(
	lck_rw_t        *lck,
	uint32_t        prior_lock_state)
{
	lck_rw_t        *fake_lck;
	lck_rw_type_t   lock_type;
	thread_t        thread;
	uint32_t        rwlock_count;

	thread = current_thread();
	rwlock_count = thread->rwlock_count--;
	fake_lck = (lck_rw_t *)&prior_lock_state;

	if (lck->lck_rw_can_sleep) {
		/*
		 * prior_lock state is a snapshot of the 1st word of the
		 * lock in question... we'll fake up a pointer to it
		 * and carefully not access anything beyond whats defined
		 * in the first word of a lck_rw_t
		 */
		if (fake_lck->lck_rw_shared_count <= 1) {
			if (fake_lck->lck_w_waiting) {
				thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
			}

			if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
				thread_wakeup(RW_LOCK_READER_EVENT(lck));
			}
		}
	}

	if (rwlock_count == 0) {
		panic("rw lock count underflow for thread %p", thread);
	}

	/* Check if dropping the lock means that we need to unpromote */
	if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
		/* sched_flags checked without lock, but will be rechecked while clearing */
		lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
	}

	if (fake_lck->lck_rw_shared_count) {
		lock_type = LCK_RW_TYPE_SHARED;
	} else {
		lock_type = LCK_RW_TYPE_EXCLUSIVE;
	}

#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
#endif

	return lock_type;
}
/*
 * Routine: lck_rw_unlock
 */
void
lck_rw_unlock(
	lck_rw_t        *lck,
	lck_rw_type_t   lck_rw_type)
{
	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
		lck_rw_unlock_shared(lck);
	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
		lck_rw_unlock_exclusive(lck);
	} else {
		panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
	}
}

/*
 * Routine: lck_rw_unlock_shared
 */
void
lck_rw_unlock_shared(
	lck_rw_t        *lck)
{
	lck_rw_type_t   ret;

	assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count);
	ret = lck_rw_done(lck);

	if (ret != LCK_RW_TYPE_SHARED) {
		panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
	}
}

/*
 * Routine: lck_rw_unlock_exclusive
 */
void
lck_rw_unlock_exclusive(
	lck_rw_t        *lck)
{
	lck_rw_type_t   ret;

	ret = lck_rw_done(lck);

	if (ret != LCK_RW_TYPE_EXCLUSIVE) {
		panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
	}
}

/*
 * Routine: lck_rw_lock
 */
void
lck_rw_lock(
	lck_rw_t        *lck,
	lck_rw_type_t   lck_rw_type)
{
	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
		lck_rw_lock_shared(lck);
	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
		lck_rw_lock_exclusive(lck);
	} else {
		panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
	}
}

/*
 * Routine: lck_rw_lock_shared
 */
void
lck_rw_lock_shared(lck_rw_t *lock)
{
	uint32_t        data, prev;

	current_thread()->rwlock_count++;
	for (;;) {
		data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
		if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
			atomic_exchange_abort();
			if (lock->lck_rw_can_sleep) {
				lck_rw_lock_shared_gen(lock);
			} else {
				cpu_pause();
				continue;
			}
			break;
		}
		data += LCK_RW_SHARED_READER;
		if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
			break;
		}
		cpu_pause();
	}
#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
#endif  /* CONFIG_DTRACE */
	return;
}
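
/*
 * Illustrative usage sketch (hypothetical caller): a reader takes and drops
 * the lock in shared mode; lck_rw_done() works for either mode and reports
 * which one was released.
 *
 *	lck_rw_lock_shared(lock);
 *	// ... read-only access to the protected data ...
 *	lck_rw_unlock_shared(lock);		// or: (void) lck_rw_done(lock);
 */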
/*
 * Routine: lck_rw_lock_shared_gen
 * Function:
 *      assembly fast path code has determined that this lock
 *      is held exclusively... this is where we spin/block
 *      until we can acquire the lock in the shared mode
 */
static void
lck_rw_lock_shared_gen(
	lck_rw_t        *lck)
{
	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
	uint64_t        deadline = 0;
	int             gotlock = 0;
	int             slept = 0;
	wait_result_t   res = 0;
	boolean_t       istate = -1;

#if CONFIG_DTRACE
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
#endif

	while (!lck_rw_grab_shared(lck)) {
#if CONFIG_DTRACE
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
			dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 * start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif
		if (istate == -1) {
			istate = ml_get_interrupts_enabled();
		}

		deadline = lck_rw_deadline_for_spin(lck);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
		    trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);

		while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline) {
			lck_rw_lock_pause(istate);
		}

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
		    trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);

		if (gotlock) {
			break;
		}
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock for read
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {
			istate = lck_interlock_lock(lck);

			if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
			    ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
				    trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);

				lck->lck_r_waiting = TRUE;

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
				res = assert_wait(RW_LOCK_READER_EVENT(lck),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
				    trace_lck, res, slept, 0, 0);
			} else {
				lck->lck_rw_shared_count++;
				lck_interlock_unlock(lck, istate);
				break;
			}
		}
	}

#if CONFIG_DTRACE
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
		} else {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
			    mach_absolute_time() - wait_interval, 0,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
#endif  /* CONFIG_DTRACE */
}
/*
 * Routine: lck_rw_lock_exclusive
 */
void
lck_rw_lock_exclusive(lck_rw_t *lock)
{
	current_thread()->rwlock_count++;
	if (atomic_test_and_set32(&lock->data,
	    (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK),
	    LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) {
#if CONFIG_DTRACE
		LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
#endif  /* CONFIG_DTRACE */
	} else {
		lck_rw_lock_exclusive_gen(lock);
	}
}
/*
 * Routine: lck_rw_lock_shared_to_exclusive
 *
 * False returned upon failure, in this case the shared lock is dropped.
 */
boolean_t
lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
{
	uint32_t        data, prev;

	for (;;) {
		data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
		if (data & LCK_RW_INTERLOCK) {
			atomic_exchange_abort();
			lck_rw_interlock_spin(lock);
			continue;
		}
		if (data & LCK_RW_WANT_UPGRADE) {
			data -= LCK_RW_SHARED_READER;
			if ((data & LCK_RW_SHARED_MASK) == 0) {         /* we were the last reader */
				data &= ~(LCK_RW_W_WAITING);            /* so clear the wait indicator */
			}
			if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
				return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
			}
		} else {
			data |= LCK_RW_WANT_UPGRADE;            /* ask for WANT_UPGRADE */
			data -= LCK_RW_SHARED_READER;           /* and shed our read count */
			if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
				break;
			}
		}
		cpu_pause();
	}
	/* we now own the WANT_UPGRADE */
	if (data & LCK_RW_SHARED_MASK) {        /* check to see if all of the readers are drained */
		lck_rw_lock_shared_to_exclusive_success(lock);  /* if not, we need to go wait */
	}
#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
#endif
	return TRUE;
}
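
/*
 * Illustrative usage sketch (hypothetical caller): on a failed upgrade the
 * shared hold has already been dropped, so the caller must start over
 * rather than assume it still holds the lock.
 *
 *	lck_rw_lock_shared(lock);
 *	if (!lck_rw_lock_shared_to_exclusive(lock)) {
 *		// shared hold was dropped for us; reacquire and revalidate
 *		lck_rw_lock_exclusive(lock);
 *		// ... recheck the state observed under the shared hold ...
 *	}
 *	// exclusive here in either case
 *	lck_rw_unlock_exclusive(lock);
 */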
/*
 * Routine: lck_rw_lock_shared_to_exclusive_failure
 * Function:
 *      assembly fast path code has already dropped our read
 *      count and determined that someone else owns 'lck_rw_want_upgrade'
 *      if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
 *      all we need to do here is determine if a wakeup is needed
 */
static boolean_t
lck_rw_lock_shared_to_exclusive_failure(
	lck_rw_t        *lck,
	uint32_t        prior_lock_state)
{
	lck_rw_t        *fake_lck;
	thread_t        thread = current_thread();
	uint32_t        rwlock_count;

	/* Check if dropping the lock means that we need to unpromote */
	rwlock_count = thread->rwlock_count--;

	if (rwlock_count == 0) {
		panic("rw lock count underflow for thread %p", thread);
	}

	fake_lck = (lck_rw_t *)&prior_lock_state;

	if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
		/*
		 * Someone else has requested upgrade.
		 * Since we've released the read lock, wake
		 * him up if he's blocked waiting
		 */
		thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
	}

	if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
		/* sched_flags checked without lock, but will be rechecked while clearing */
		lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
	}

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);

	return FALSE;
}
/*
 * Routine: lck_rw_lock_shared_to_exclusive_success
 * Function:
 *      assembly fast path code has already dropped our read
 *      count and successfully acquired 'lck_rw_want_upgrade'
 *      we just need to wait for the rest of the readers to drain
 *      and then we can return as the exclusive holder of this lock
 */
static boolean_t
lck_rw_lock_shared_to_exclusive_success(
	lck_rw_t        *lck)
{
	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
	uint64_t        deadline = 0;
	int             slept = 0;
	int             still_shared = 0;
	wait_result_t   res;
	boolean_t       istate = -1;

#if CONFIG_DTRACE
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
#endif

	while (lck->lck_rw_shared_count != 0) {
#if CONFIG_DTRACE
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
			dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 * start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif
		if (istate == -1) {
			istate = ml_get_interrupts_enabled();
		}

		deadline = lck_rw_deadline_for_spin(lck);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
		    trace_lck, lck->lck_rw_shared_count, 0, 0, 0);

		while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline) {
			lck_rw_lock_pause(istate);
		}

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
		    trace_lck, lck->lck_rw_shared_count, 0, 0, 0);

		if (!still_shared) {
			break;
		}
		/*
		 * if we get here, the deadline has expired w/o
		 * the rw_shared_count having drained to 0
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {
			istate = lck_interlock_lock(lck);

			if (lck->lck_rw_shared_count != 0) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
				    trace_lck, lck->lck_rw_shared_count, 0, 0, 0);

				lck->lck_w_waiting = TRUE;

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
				res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
				    trace_lck, res, slept, 0, 0);
			} else {
				lck_interlock_unlock(lck, istate);
				break;
			}
		}
	}
#if CONFIG_DTRACE
	/*
	 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
	 */
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
		} else {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
			    mach_absolute_time() - wait_interval, 1,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
#endif
	return TRUE;
}
/*
 * Routine: lck_rw_lock_exclusive_to_shared
 */
void
lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
{
	uint32_t        data, prev;

	for (;;) {
		data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
		if (data & LCK_RW_INTERLOCK) {
			atomic_exchange_abort();
			lck_rw_interlock_spin(lock);    /* wait for interlock to clear */
			continue;
		}
		data += LCK_RW_SHARED_READER;
		if (data & LCK_RW_WANT_UPGRADE) {
			data &= ~(LCK_RW_WANT_UPGRADE);
		} else {
			data &= ~(LCK_RW_WANT_EXCL);
		}
		if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
			data &= ~(LCK_RW_W_WAITING);
		}
		if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
			break;
		}
		cpu_pause();
	}
	return lck_rw_lock_exclusive_to_shared_gen(lock, prev);
}

/*
 * Routine: lck_rw_lock_exclusive_to_shared_gen
 * Function:
 *      assembly fast path has already dropped
 *      our exclusive state and bumped lck_rw_shared_count
 *      all we need to do here is determine if anyone
 *      needs to be awakened.
 */
static void
lck_rw_lock_exclusive_to_shared_gen(
	lck_rw_t        *lck,
	uint32_t        prior_lock_state)
{
	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
	lck_rw_t        *fake_lck;

	fake_lck = (lck_rw_t *)&prior_lock_state;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
	    trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);

	/*
	 * don't wake up anyone waiting to take the lock exclusively
	 * since we hold a read count... when the read count drops to 0,
	 * the writers will be woken.
	 *
	 * wake up any waiting readers if we don't have any writers waiting,
	 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
	 */
	if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
		thread_wakeup(RW_LOCK_READER_EVENT(lck));
	}

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
	    trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);

#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
#endif
}
/*
 * Routine: lck_rw_try_lock
 */
boolean_t
lck_rw_try_lock(
	lck_rw_t        *lck,
	lck_rw_type_t   lck_rw_type)
{
	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
		return lck_rw_try_lock_shared(lck);
	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
		return lck_rw_try_lock_exclusive(lck);
	} else {
		panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
	}
	return FALSE;
}

/*
 * Routine: lck_rw_try_lock_shared
 */
boolean_t
lck_rw_try_lock_shared(lck_rw_t *lock)
{
	uint32_t        data, prev;

	for (;;) {
		data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
		if (data & LCK_RW_INTERLOCK) {
			atomic_exchange_abort();
			lck_rw_interlock_spin(lock);
			continue;
		}
		if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
			atomic_exchange_abort();
			return FALSE;                   /* lock is busy */
		}
		data += LCK_RW_SHARED_READER;           /* Increment reader refcount */
		if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
			break;
		}
		cpu_pause();
	}
	current_thread()->rwlock_count++;
	/* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */
#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
#endif  /* CONFIG_DTRACE */
	return TRUE;
}

/*
 * Routine: lck_rw_try_lock_exclusive
 */
boolean_t
lck_rw_try_lock_exclusive(lck_rw_t *lock)
{
	uint32_t        data, prev;

	for (;;) {
		data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
		if (data & LCK_RW_INTERLOCK) {
			atomic_exchange_abort();
			lck_rw_interlock_spin(lock);
			continue;
		}
		if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
			atomic_exchange_abort();
			return FALSE;                   /* can't get it */
		}
		data |= LCK_RW_WANT_EXCL;
		if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
			break;
		}
		cpu_pause();
	}
	current_thread()->rwlock_count++;
#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
#endif  /* CONFIG_DTRACE */
	return TRUE;
}
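
/*
 * Illustrative usage sketch (hypothetical caller): the try variants never
 * block or spin on a busy lock, so they suit callers that can fall back
 * to other work.
 *
 *	if (lck_rw_try_lock_exclusive(lock)) {
 *		// ... update the protected data ...
 *		lck_rw_unlock_exclusive(lock);
 *	} else {
 *		// lock busy: defer the update
 *	}
 */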
/*
 * Routine: lck_rw_assert
 */
void
lck_rw_assert(
	lck_rw_t        *lck,
	unsigned int    type)
{
	switch (type) {
	case LCK_RW_ASSERT_SHARED:
		if (lck->lck_rw_shared_count != 0) {
			return;
		}
		break;
	case LCK_RW_ASSERT_EXCLUSIVE:
		if ((lck->lck_rw_want_write ||
		    lck->lck_rw_want_upgrade) &&
		    lck->lck_rw_shared_count == 0) {
			return;
		}
		break;
	case LCK_RW_ASSERT_HELD:
		if (lck->lck_rw_want_write ||
		    lck->lck_rw_want_upgrade ||
		    lck->lck_rw_shared_count != 0) {
			return;
		}
		break;
	case LCK_RW_ASSERT_NOTHELD:
		if (!(lck->lck_rw_want_write ||
		    lck->lck_rw_want_upgrade ||
		    lck->lck_rw_shared_count != 0)) {
			return;
		}
		break;
	default:
		break;
	}

	panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
}

/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
#if MACH_LDEBUG
__dead2
#endif
void
lck_rw_clear_promotions_x86(thread_t thread)
{
#if MACH_LDEBUG
	/* It's fatal to leave a RW lock locked and return to userspace */
	panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
#else
	/* Paper over the issue */
	thread->rwlock_count = 0;
	lck_rw_clear_promotion(thread, 0);
#endif
}

boolean_t
lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
{
	lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);

	if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || force_yield) {
		lck_rw_unlock_shared(lck);
		mutex_pause(2);
		lck_rw_lock_shared(lck);
		return TRUE;
	}

	return FALSE;
}
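
/*
 * Illustrative usage sketch (hypothetical caller): a long iteration under a
 * shared hold can call the yield routine periodically so that pending
 * writers and upgraders are not starved.
 *
 *	lck_rw_lock_shared(lck);
 *	for (each element) {
 *		// ... examine element ...
 *		if (lck_rw_lock_yield_shared(lck, FALSE)) {
 *			// the lock was dropped and retaken; revalidate state
 *		}
 *	}
 *	lck_rw_unlock_shared(lck);
 */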
/*
 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 */
boolean_t
kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck)
{
	if (not_in_kdp) {
		panic("panic: rw lock exclusive check done outside of kernel debugger");
	}
	return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
}

/*
 * Slow path routines for lck_mtx locking and unlocking functions.
 *
 * These functions were previously implemented in x86 assembly,
 * and some optimizations are in place in this C code to obtain compiled code
 * as performant and compact as the assembly version.
 *
 * To avoid inlining these functions on the fast path, all functions directly called by
 * the fast paths have the __attribute__((noinline)) specified. Also they are all implemented
 * in such a way that the fast path can tail call into them. In this way the return address
 * does not need to be pushed on the caller stack and stack optimization can happen on the caller.
 *
 * Slow path code is structured in such a way that there are no calls to functions that will return
 * on the context of the caller function, i.e. all functions called are either tail-call functions
 * or inline functions. The number of arguments of the tail-call functions is fewer than six,
 * so that they can be passed over registers and do not need to be pushed on stack.
 * This allows the compiler to not create a stack frame for the functions.
 *
 * __improbable and __probable are used to compile the slow path code in such a way that
 * the fast path case will be on a sequence of instructions with as few jumps as possible,
 * to make this case the most optimized even if falling through the slow path.
 */
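
/*
 * Structural sketch of the fast-path/slow-path split described above
 * (hypothetical names, not code from this file): the fast path stays tiny
 * and tail calls into a noinline slow path, so no extra frame is built and
 * the slow path returns directly to the fast path's caller.
 *
 *	__attribute__((noinline)) static void example_lock_slow(uint32_t *word);
 *
 *	static inline void
 *	example_lock(uint32_t *word)
 *	{
 *		if (__probable(atomic_test_and_set32(word, 1, 1, memory_order_acquire, FALSE))) {
 *			return;				// uncontended: done
 *		}
 *		return example_lock_slow(word);		// tail call into the slow path
 *	}
 */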
/*
 * Intel lock invariants:
 *
 * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
 *
 * The lock owner is promoted to the max priority of all its waiters only if it
 * was a lower priority when it acquired or was an owner when a waiter waited.
 * Max priority is capped at MAXPRI_PROMOTE.
 *
 * The last waiter will not be promoted as it is woken up, but the last
 * lock owner may not have been the last thread to have been woken up depending on the
 * luck of the draw. Therefore a last-owner may still have the promoted-on-wakeup
 * designation.
 *
 * TODO: Figure out an algorithm for stopping a lock holder which is already at the right
 * priority from dropping priority in the future without having to take thread lock
 * on acquire.
 */

#if MUTEX_ZONE
extern zone_t lck_mtx_zone;
#endif

/*
 * Routine: lck_mtx_alloc_init
 */
lck_mtx_t *
lck_mtx_alloc_init(
	lck_grp_t       *grp,
	lck_attr_t      *attr)
{
	lck_mtx_t       *lck;
#if MUTEX_ZONE
	if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0) {
		lck_mtx_init(lck, grp, attr);
	}
#else
	if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0) {
		lck_mtx_init(lck, grp, attr);
	}
#endif
	return lck;
}

/*
 * Routine: lck_mtx_free
 */
void
lck_mtx_free(
	lck_mtx_t       *lck,
	lck_grp_t       *grp)
{
	lck_mtx_destroy(lck, grp);
#if MUTEX_ZONE
	zfree(lck_mtx_zone, lck);
#else
	kfree(lck, sizeof(lck_mtx_t));
#endif
}

/*
 * Routine: lck_mtx_ext_init
 */
void
lck_mtx_ext_init(
	lck_mtx_ext_t   *lck,
	lck_grp_t       *grp,
	lck_attr_t      *attr)
{
	bzero((void *)lck, sizeof(lck_mtx_ext_t));

	if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
		lck->lck_mtx_deb.type = MUTEX_TAG;
		lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
	}

	lck->lck_mtx_grp = grp;

	if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) {
		lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
	}

	lck->lck_mtx.lck_mtx_is_ext = 1;
	lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
}
/*
 * Routine: lck_mtx_init
 */
void
lck_mtx_init(
	lck_mtx_t       *lck,
	lck_grp_t       *grp,
	lck_attr_t      *attr)
{
	lck_mtx_ext_t   *lck_ext;
	lck_attr_t      *lck_attr;

	if (attr != LCK_ATTR_NULL) {
		lck_attr = attr;
	} else {
		lck_attr = &LockDefaultLckAttr;
	}

	if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
		if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
			lck_mtx_ext_init(lck_ext, grp, lck_attr);
			lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
			lck->lck_mtx_ptr = lck_ext;
		}
	} else {
		lck->lck_mtx_owner = 0;
		lck->lck_mtx_state = 0;
	}
	lck->lck_mtx_pad32 = 0xFFFFFFFF;
	lck_grp_reference(grp);
	lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
}

/*
 * Routine: lck_mtx_init_ext
 */
void
lck_mtx_init_ext(
	lck_mtx_t       *lck,
	lck_mtx_ext_t   *lck_ext,
	lck_grp_t       *grp,
	lck_attr_t      *attr)
{
	lck_attr_t      *lck_attr;

	if (attr != LCK_ATTR_NULL) {
		lck_attr = attr;
	} else {
		lck_attr = &LockDefaultLckAttr;
	}

	if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
		lck_mtx_ext_init(lck_ext, grp, lck_attr);
		lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
		lck->lck_mtx_ptr = lck_ext;
	} else {
		lck->lck_mtx_owner = 0;
		lck->lck_mtx_state = 0;
	}
	lck->lck_mtx_pad32 = 0xFFFFFFFF;

	lck_grp_reference(grp);
	lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
}

static void
lck_mtx_lock_mark_destroyed(
	lck_mtx_t       *mutex,
	boolean_t       indirect)
{
	uint32_t state;

	if (indirect) {
		/* convert to destroyed state */
		ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
		return;
	}

	state = ordered_load_mtx_state(mutex);
	lck_mtx_interlock_lock(mutex, &state);

	ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);

	enable_preemption();
}

/*
 * Routine: lck_mtx_destroy
 */
void
lck_mtx_destroy(
	lck_mtx_t       *lck,
	lck_grp_t       *grp)
{
	boolean_t indirect;

	if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
		return;
	}
	lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);

	indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);

	lck_mtx_lock_mark_destroyed(lck, indirect);

	if (indirect) {
		kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
	}
	lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
	lck_grp_deallocate(grp);
	return;
}
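
/*
 * Illustrative usage sketch (hypothetical caller): mutexes built by the
 * routines above are used with the standard lck_mtx entry points; unlike
 * the spin locks, the holder may block while the lock is held.
 *
 *	lck_mtx_t *m = lck_mtx_alloc_init(my_grp, LCK_ATTR_NULL);
 *
 *	lck_mtx_lock(m);
 *	// ... critical section, may block ...
 *	lck_mtx_unlock(m);
 *
 *	lck_mtx_free(m, my_grp);
 */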
2323 #if DEVELOPMENT | DEBUG
2324 __attribute__((noinline
))
2326 lck_mtx_owner_check_panic(
2329 thread_t owner
= (thread_t
)lock
->lck_mtx_owner
;
2330 panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p", owner
, lock
);
2334 __attribute__((always_inline
))
2340 *lock
= &((*lock
)->lck_mtx_ptr
->lck_mtx
);
2341 *state
= ordered_load_mtx_state(*lock
);
/*
 * Routine:     lck_mtx_unlock_slow
 *
 * Unlocks a mutex held by the current thread.
 *
 * It will wake up waiters if necessary.
 *
 * Interlock can be held.
 */
__attribute__((noinline))
void
lck_mtx_unlock_slow(
	lck_mtx_t       *lock)
{
	thread_t        thread;
	uint32_t        state, prev;
	boolean_t       indirect = FALSE;

	state = ordered_load_mtx_state(lock);

	/* Is this an indirect mutex? */
	if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
		indirect = get_indirect_mutex(&lock, &state);
	}

	thread = current_thread();

#if DEVELOPMENT | DEBUG
	thread_t owner = (thread_t)lock->lck_mtx_owner;
	if (__improbable(owner != thread)) {
		lck_mtx_owner_check_panic(lock);
	}
#endif

	/* check if it is held as a spinlock */
	if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) {
		goto unlock;
	}

	lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);

unlock:
	/* preemption disabled, interlock held and mutex not held */

	ordered_store_mtx_owner(lock, 0);
	/* keep original state in prev for later evaluation */
	prev = state;

	if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
#if MACH_LDEBUG
		if (thread) {
			thread->mutex_count--;
		}
#endif
		return lck_mtx_unlock_wakeup_tail(lock, state, indirect);
	}

	/* release interlock, promotion and clear spin flag */
	state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK));
	ordered_store_mtx_state_release(lock, state);   /* since I own the interlock, I don't need an atomic update */

#if MACH_LDEBUG
	/* perform lock statistics after drop to prevent delay */
	if (thread) {
		thread->mutex_count--;          /* lock statistic */
	}
#endif  /* MACH_LDEBUG */

	/* re-enable preemption */
	lck_mtx_unlock_finish_inline(lock, FALSE);
}

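/*
 * Worked example of the transitions above (values are illustrative): an
 * uncontended held mutex has LCK_MTX_MLOCKED_MSK set, the interlock bit clear
 * and a non-zero owner.  The slow unlock path atomically picks up
 * LCK_MTX_ILOCKED_MSK while clearing LCK_MTX_MLOCKED_MSK, zeroes the owner,
 * and then a single release store drops LCK_MTX_ILOCKED_MSK (and
 * LCK_MTX_SPIN_MSK for a spin hold), so the fully-unlocked state becomes
 * visible to other cpus in one store.
 */
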
#define LCK_MTX_LCK_WAIT_CODE           0x20
#define LCK_MTX_LCK_WAKEUP_CODE         0x21
#define LCK_MTX_LCK_SPIN_CODE           0x22
#define LCK_MTX_LCK_ACQUIRE_CODE        0x23
#define LCK_MTX_LCK_DEMOTE_CODE         0x24

/*
 * Routine:     lck_mtx_unlock_wakeup_tail
 *
 * Invoked on unlock when there is
 * contention, i.e. the assembly routine sees
 * that mutex->lck_mtx_waiters != 0
 *
 * neither the mutex nor the interlock is held
 *
 * Note that this routine might not be called if there are pending
 * waiters which have previously been woken up, and they didn't
 * end up boosting the old owner.
 *
 * assembly routine previously did the following to mutex:
 * (after saving the state in prior_lock_state)
 *      decremented lck_mtx_waiters if nonzero
 *
 * This function needs to be called as a tail call
 * to optimize the compiled code.
 */
__attribute__((noinline))
static void
lck_mtx_unlock_wakeup_tail(
	lck_mtx_t       *mutex,
	uint32_t        state,
	boolean_t       indirect)
{
	struct turnstile *ts;

	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
	kern_return_t did_wake;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
	    trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);

	ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);

	if (mutex->lck_mtx_waiters > 1) {
		/* WAITQ_PROMOTE_ON_WAKE will call turnstile_update_inheritor on the woken-up thread */
		did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_PROMOTE_ON_WAKE);
	} else {
		did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
		turnstile_update_inheritor(ts, NULL, TURNSTILE_IMMEDIATE_UPDATE);
	}
	assert(did_wake == KERN_SUCCESS);

	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
	turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);

	state -= LCK_MTX_WAITER;
	state &= (~(LCK_MTX_SPIN_MSK | LCK_MTX_ILOCKED_MSK));
	ordered_store_mtx_state_release(mutex, state);

	assert(current_thread()->turnstile != NULL);

	turnstile_cleanup();

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
	    trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);

	lck_mtx_unlock_finish_inline(mutex, indirect);
}

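/*
 * Illustration of the wakeup policy above (hypothetical numbers): if
 * lck_mtx_waiters is 3 at unlock time, waiters remain after this wakeup, so
 * the single woken thread becomes the turnstile inheritor via
 * WAITQ_PROMOTE_ON_WAKE and keeps pushing on behalf of the remaining waiters;
 * if only one waiter is left, the inheritor is cleared instead, since nobody
 * will still be blocked on the turnstile after the wakeup.
 */
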
/*
 * Routine:     lck_mtx_lock_acquire_x86
 *
 * Invoked on acquiring the mutex when there is
 * contention (i.e. the assembly routine sees that
 * mutex->lck_mtx_waiters != 0)
 *
 * mutex is owned...  interlock is held...  preemption is disabled
 */
__attribute__((always_inline))
static void
lck_mtx_lock_acquire_inline(
	lck_mtx_t       *mutex,
	struct turnstile *ts)
{
	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
	    trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);

	thread_t thread = (thread_t)mutex->lck_mtx_owner;       /* faster than current_thread() */
	assert(thread->waiting_for_mutex == NULL);

	if (mutex->lck_mtx_waiters > 0) {
		if (ts == NULL) {
			ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
		}

		turnstile_update_inheritor(ts, thread, (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
	}

	if (ts != NULL) {
		turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
	}

	assert(current_thread()->turnstile != NULL);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
	    trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
}

void
lck_mtx_lock_acquire_x86(
	lck_mtx_t       *mutex)
{
	return lck_mtx_lock_acquire_inline(mutex, NULL);
}

/*
 * Tail call helpers for lock functions that perform
 * lck_mtx_lock_acquire followed by the caller's finish routine, to optimize
 * the caller's compiled code.
 */
__attribute__((noinline))
static void
lck_mtx_lock_acquire_tail(
	lck_mtx_t       *mutex,
	boolean_t       indirect,
	struct turnstile *ts)
{
	lck_mtx_lock_acquire_inline(mutex, ts);
	lck_mtx_lock_finish_inline_with_cleanup(mutex, ordered_load_mtx_state(mutex), indirect);
}

__attribute__((noinline))
static boolean_t
lck_mtx_try_lock_acquire_tail(
	lck_mtx_t       *mutex)
{
	lck_mtx_lock_acquire_inline(mutex, NULL);
	lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));

	return TRUE;
}

__attribute__((noinline))
static void
lck_mtx_convert_spin_acquire_tail(
	lck_mtx_t       *mutex)
{
	lck_mtx_lock_acquire_inline(mutex, NULL);
	lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));
}

void
lck_mtx_ilk_unlock(
	lck_mtx_t       *mutex)
{
	lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex));
}

static uint32_t
lck_mtx_interlock_lock_set_and_clear_flags(
	lck_mtx_t       *mutex,
	uint32_t        xor_flags,
	uint32_t        and_flags,
	uint32_t        *new_state)
{
	uint32_t state, prev;
	state = *new_state;

	for (;;) {
		/* have to wait for interlock to clear */
		while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) {
			cpu_pause();
			state = ordered_load_mtx_state(mutex);
		}
		prev = state;                                   /* prev contains snapshot for exchange */
		state |= LCK_MTX_ILOCKED_MSK | xor_flags;       /* pick up interlock */
		state &= ~and_flags;                            /* clear flags */

		disable_preemption();
		if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
			break;
		}
		enable_preemption();
		cpu_pause();
		state = ordered_load_mtx_state(mutex);
	}

	*new_state = state;
	return state;
}

static uint32_t
lck_mtx_interlock_lock_clear_flags(
	lck_mtx_t       *mutex,
	uint32_t        and_flags,
	uint32_t        *new_state)
{
	return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state);
}

static uint32_t
lck_mtx_interlock_lock(
	lck_mtx_t       *mutex,
	uint32_t        *new_state)
{
	return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state);
}

static inline int
lck_mtx_interlock_try_lock_set_flags(
	lck_mtx_t       *mutex,
	uint32_t        or_flags,
	uint32_t        *new_state)
{
	uint32_t state, prev;
	state = *new_state;

	/* have to wait for interlock to clear */
	if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) {
		return 0;
	}
	prev = state;                                   /* prev contains snapshot for exchange */
	state |= LCK_MTX_ILOCKED_MSK | or_flags;        /* pick up interlock */
	disable_preemption();
	if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
		*new_state = state;
		return 1;
	}

	enable_preemption();
	return 0;
}

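/*
 * For illustration (hypothetical caller, not from this file): starting from an
 * unlocked state of 0, a sequence such as
 *
 *	uint32_t state = ordered_load_mtx_state(mutex);
 *	if (lck_mtx_interlock_try_lock_set_flags(mutex, LCK_MTX_MLOCKED_MSK, &state)) {
 *		... interlock and mutex bits were set in a single cmpxchg ...
 *	}
 *
 * attempts one compare-exchange from 0 to
 * LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK, leaving preemption disabled on
 * success; on failure nothing is changed and the caller typically waits for
 * the interlock to clear and retries (see lck_mtx_lock_slow below).
 */
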
__attribute__((noinline))
static void
lck_mtx_lock_contended(
	lck_mtx_t       *lock,
	boolean_t       indirect,
	boolean_t       *first_miss)
{
	lck_mtx_spinwait_ret_type_t ret;
	uint32_t state;
	thread_t thread;
	struct turnstile *ts = NULL;

try_again:

	if (indirect) {
		lck_grp_mtx_update_miss((struct _lck_mtx_ext_ *)lock, first_miss);
	}

	ret = lck_mtx_lock_spinwait_x86(lock);
	state = ordered_load_mtx_state(lock);
	switch (ret) {
	case LCK_MTX_SPINWAIT_NO_SPIN:
		/*
		 * owner not on core, lck_mtx_lock_spinwait_x86 didn't even
		 * try to spin.
		 */
		if (indirect) {
			lck_grp_mtx_update_direct_wait((struct _lck_mtx_ext_ *)lock);
		}

		/* just fall through case LCK_MTX_SPINWAIT_SPUN */
	case LCK_MTX_SPINWAIT_SPUN_HIGH_THR:
	case LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE:
	case LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION:
	case LCK_MTX_SPINWAIT_SPUN_SLIDING_THR:
		/*
		 * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
		 * interlock not held
		 */
		lck_mtx_interlock_lock(lock, &state);
		assert(state & LCK_MTX_ILOCKED_MSK);

		if (state & LCK_MTX_MLOCKED_MSK) {
			if (indirect) {
				lck_grp_mtx_update_wait((struct _lck_mtx_ext_ *)lock, first_miss);
			}
			lck_mtx_lock_wait_x86(lock, &ts);
			/*
			 * interlock is not held here.
			 */
			goto try_again;
		} else {
			/* grab the mutex */
			state |= LCK_MTX_MLOCKED_MSK;
			ordered_store_mtx_state_release(lock, state);
			thread = current_thread();
			ordered_store_mtx_owner(lock, (uintptr_t)thread);
#if MACH_LDEBUG
			if (thread) {
				thread->mutex_count++;
			}
#endif  /* MACH_LDEBUG */
		}
		break;
	case LCK_MTX_SPINWAIT_ACQUIRED:
		/*
		 * mutex has been acquired by lck_mtx_lock_spinwait_x86
		 * interlock is held and preemption disabled
		 * owner is set and mutex marked as locked
		 * statistics updated too
		 */
		break;
	default:
		panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p\n", ret, lock);
	}

	/*
	 * interlock is already acquired here
	 */

	/* mutex has been acquired */
	thread = (thread_t)lock->lck_mtx_owner;
	if (state & LCK_MTX_WAITERS_MSK) {
		/*
		 * lck_mtx_lock_acquire_tail will call
		 * turnstile_complete.
		 */
		return lck_mtx_lock_acquire_tail(lock, indirect, ts);
	}

	if (ts != NULL) {
		turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
	}

	assert(current_thread()->turnstile != NULL);

	/* release the interlock */
	lck_mtx_lock_finish_inline_with_cleanup(lock, ordered_load_mtx_state(lock), indirect);
}

/*
 * Helper noinline functions for calling
 * panic to optimize compiled code.
 */
__attribute__((noinline)) __abortlike
static void
lck_mtx_destroyed(
	lck_mtx_t       *lock)
{
	panic("trying to interlock destroyed mutex (%p)", lock);
}

__attribute__((noinline))
static boolean_t
lck_mtx_try_destroyed(
	lck_mtx_t       *lock)
{
	panic("trying to interlock destroyed mutex (%p)", lock);
	return FALSE;
}

__attribute__((always_inline))
static boolean_t
lck_mtx_lock_wait_interlock_to_clear(
	lck_mtx_t       *lock,
	uint32_t        *new_state)
{
	uint32_t state;

	for (;;) {
		cpu_pause();
		state = ordered_load_mtx_state(lock);
		if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
			*new_state = state;
			return TRUE;
		}
		if (state & LCK_MTX_MLOCKED_MSK) {
			/* if it is held as mutex, just fail */
			return FALSE;
		}
	}
}

__attribute__((always_inline))
static boolean_t
lck_mtx_try_lock_wait_interlock_to_clear(
	lck_mtx_t       *lock,
	uint32_t        *new_state)
{
	uint32_t state;

	for (;;) {
		cpu_pause();
		state = ordered_load_mtx_state(lock);
		if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) {
			/* if it is held as mutex or spin, just fail */
			return FALSE;
		}
		if (!(state & LCK_MTX_ILOCKED_MSK)) {
			*new_state = state;
			return TRUE;
		}
	}
}

/*
 * Routine: lck_mtx_lock_slow
 *
 * Locks a mutex for current thread.
 * If the lock is contended this function might
 * sleep.
 *
 * Called with interlock not held.
 */
__attribute__((noinline))
void
lck_mtx_lock_slow(
	lck_mtx_t       *lock)
{
	boolean_t       indirect = FALSE;
	uint32_t        state;
	int             first_miss = 0;

	state = ordered_load_mtx_state(lock);

	/* is the interlock or mutex held */
	if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
		/*
		 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
		 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
		 * set in state (state == lck_mtx_tag)
		 */

		/* is the mutex already held and not indirect */
		if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
			/* no, must have been the mutex */
			return lck_mtx_lock_contended(lock, indirect, &first_miss);
		}

		/* check to see if it is marked destroyed */
		if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
			lck_mtx_destroyed(lock);
		}

		/* Is this an indirect mutex? */
		if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
			indirect = get_indirect_mutex(&lock, &state);

			first_miss = 0;
			lck_grp_mtx_update_held((struct _lck_mtx_ext_ *)lock);

			if (state & LCK_MTX_SPIN_MSK) {
				/* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
				assert(state & LCK_MTX_ILOCKED_MSK);
				lck_grp_mtx_update_miss((struct _lck_mtx_ext_ *)lock, &first_miss);
			}
		}

		if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
			return lck_mtx_lock_contended(lock, indirect, &first_miss);
		}
	}

	/* no - can't be INDIRECT, DESTROYED or locked */
	while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
		if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
			return lck_mtx_lock_contended(lock, indirect, &first_miss);
		}
	}

	/* lock and interlock acquired */

	thread_t thread = current_thread();
	/* record owner of mutex */
	ordered_store_mtx_owner(lock, (uintptr_t)thread);

#if MACH_LDEBUG
	if (thread) {
		thread->mutex_count++;          /* lock statistic */
	}
#endif
	/*
	 * Check if there are waiters to
	 * inherit their priority.
	 */
	if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
		return lck_mtx_lock_acquire_tail(lock, indirect, NULL);
	}

	/* release the interlock */
	lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
}

__attribute__((noinline))
boolean_t
lck_mtx_try_lock_slow(
	lck_mtx_t       *lock)
{
	boolean_t       indirect = FALSE;
	uint32_t        state;
	int             first_miss = 0;

	state = ordered_load_mtx_state(lock);

	/* is the interlock or mutex held */
	if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
		/*
		 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
		 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
		 * set in state (state == lck_mtx_tag)
		 */

		/* is the mutex already held and not indirect */
		if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
			return FALSE;
		}

		/* check to see if it is marked destroyed */
		if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
			lck_mtx_try_destroyed(lock);
		}

		/* Is this an indirect mutex? */
		if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
			indirect = get_indirect_mutex(&lock, &state);

			first_miss = 0;
			lck_grp_mtx_update_held((struct _lck_mtx_ext_ *)lock);
		}

		if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
			if (indirect) {
				lck_grp_mtx_update_miss((struct _lck_mtx_ext_ *)lock, &first_miss);
			}
			return FALSE;
		}
	}

	/* no - can't be INDIRECT, DESTROYED or locked */
	while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
		if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
			if (indirect) {
				lck_grp_mtx_update_miss((struct _lck_mtx_ext_ *)lock, &first_miss);
			}
			return FALSE;
		}
	}

	/* lock and interlock acquired */

	thread_t thread = current_thread();
	/* record owner of mutex */
	ordered_store_mtx_owner(lock, (uintptr_t)thread);

#if MACH_LDEBUG
	if (thread) {
		thread->mutex_count++;          /* lock statistic */
	}
#endif
	/*
	 * Check if there are waiters to
	 * inherit their priority.
	 */
	if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
		return lck_mtx_try_lock_acquire_tail(lock);
	}

	/* release the interlock */
	lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));

	return TRUE;
}

__attribute__((noinline))
void
lck_mtx_lock_spin_slow(
	lck_mtx_t       *lock)
{
	boolean_t       indirect = FALSE;
	uint32_t        state;
	int             first_miss = 0;

	state = ordered_load_mtx_state(lock);

	/* is the interlock or mutex held */
	if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
		/*
		 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
		 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
		 * set in state (state == lck_mtx_tag)
		 */

		/* is the mutex already held and not indirect */
		if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
			/* no, must have been the mutex */
			return lck_mtx_lock_contended(lock, indirect, &first_miss);
		}

		/* check to see if it is marked destroyed */
		if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
			lck_mtx_destroyed(lock);
		}

		/* Is this an indirect mutex? */
		if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
			indirect = get_indirect_mutex(&lock, &state);

			first_miss = 0;
			lck_grp_mtx_update_held((struct _lck_mtx_ext_ *)lock);

			if (state & LCK_MTX_SPIN_MSK) {
				/* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
				assert(state & LCK_MTX_ILOCKED_MSK);
				lck_grp_mtx_update_miss((struct _lck_mtx_ext_ *)lock, &first_miss);
			}
		}

		if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
			return lck_mtx_lock_contended(lock, indirect, &first_miss);
		}
	}

	/* no - can't be INDIRECT, DESTROYED or locked */
	while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
		if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
			return lck_mtx_lock_contended(lock, indirect, &first_miss);
		}
	}

	/* lock as spinlock and interlock acquired */

	thread_t thread = current_thread();
	/* record owner of mutex */
	ordered_store_mtx_owner(lock, (uintptr_t)thread);

#if MACH_LDEBUG
	if (thread) {
		thread->mutex_count++;          /* lock statistic */
	}
#endif

#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
#endif
	/* return with the interlock held and preemption disabled */
	return;
}

__attribute__((noinline))
boolean_t
lck_mtx_try_lock_spin_slow(
	lck_mtx_t       *lock)
{
	boolean_t       indirect = FALSE;
	uint32_t        state;
	int             first_miss = 0;

	state = ordered_load_mtx_state(lock);

	/* is the interlock or mutex held */
	if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
		/*
		 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
		 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
		 * set in state (state == lck_mtx_tag)
		 */

		/* is the mutex already held and not indirect */
		if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
			return FALSE;
		}

		/* check to see if it is marked destroyed */
		if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
			lck_mtx_try_destroyed(lock);
		}

		/* Is this an indirect mutex? */
		if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
			indirect = get_indirect_mutex(&lock, &state);

			first_miss = 0;
			lck_grp_mtx_update_held((struct _lck_mtx_ext_ *)lock);
		}

		if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
			if (indirect) {
				lck_grp_mtx_update_miss((struct _lck_mtx_ext_ *)lock, &first_miss);
			}
			return FALSE;
		}
	}

	/* no - can't be INDIRECT, DESTROYED or locked */
	while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
		if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
			if (indirect) {
				lck_grp_mtx_update_miss((struct _lck_mtx_ext_ *)lock, &first_miss);
			}
			return FALSE;
		}
	}

	/* lock and interlock acquired */

	thread_t thread = current_thread();
	/* record owner of mutex */
	ordered_store_mtx_owner(lock, (uintptr_t)thread);

#if MACH_LDEBUG
	if (thread) {
		thread->mutex_count++;          /* lock statistic */
	}
#endif

#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
#endif

	/* return with the interlock held and preemption disabled */
	return TRUE;
}

__attribute__((noinline))
void
lck_mtx_convert_spin(
	lck_mtx_t       *lock)
{
	uint32_t state;

	state = ordered_load_mtx_state(lock);

	/* Is this an indirect mutex? */
	if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
		/* If so, take indirection */
		get_indirect_mutex(&lock, &state);
	}

	assertf((thread_t)lock->lck_mtx_owner == current_thread(),
	    "lock %p not owned by thread %p (current owner %p)",
	    lock, current_thread(), (thread_t)lock->lck_mtx_owner);

	if (__improbable(state & LCK_MTX_MLOCKED_MSK)) {
		/* already owned as a mutex, just return */
		return;
	}
	assert(get_preemption_level() > 0);
	assert(state & LCK_MTX_ILOCKED_MSK);
	assert(state & LCK_MTX_SPIN_MSK);

	/*
	 * Check if there are waiters to
	 * inherit their priority.
	 */
	if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
		return lck_mtx_convert_spin_acquire_tail(lock);
	}

	lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock));
}

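/*
 * Usage sketch (illustrative, assuming only the generic lck_mtx entry points):
 * a caller that starts out non-blocking and later needs to block typically does
 *
 *	lck_mtx_lock_spin(mtx);        // held as a spinlock, interlock kept
 *	...
 *	lck_mtx_convert_spin(mtx);     // promote to a full mutex via the path above
 *	...                            // may now block while holding the lock
 *	lck_mtx_unlock(mtx);
 */
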
static inline boolean_t
lck_mtx_lock_grab_mutex(
	lck_mtx_t       *lock)
{
	uint32_t state;

	state = ordered_load_mtx_state(lock);

	if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) {
		return FALSE;
	}

	/* lock and interlock acquired */

	thread_t thread = current_thread();
	/* record owner of mutex */
	ordered_store_mtx_owner(lock, (uintptr_t)thread);

#if MACH_LDEBUG
	if (thread) {
		thread->mutex_count++;          /* lock statistic */
	}
#endif
	return TRUE;
}

__attribute__((noinline))
void
lck_mtx_assert(
	lck_mtx_t       *lock,
	unsigned int    type)
{
	thread_t thread, owner;
	uint32_t state;

	thread = current_thread();
	state = ordered_load_mtx_state(lock);

	if (state == LCK_MTX_TAG_INDIRECT) {
		get_indirect_mutex(&lock, &state);
	}

	owner = (thread_t)lock->lck_mtx_owner;

	if (type == LCK_MTX_ASSERT_OWNED) {
		if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
			panic("mutex (%p) not owned\n", lock);
		}
	} else {
		assert(type == LCK_MTX_ASSERT_NOTOWNED);
		if (owner == thread) {
			panic("mutex (%p) owned\n", lock);
		}
	}
}

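/*
 * Example (hypothetical lock name): subsystems commonly guard their internal
 * invariants with
 *
 *	lck_mtx_assert(&sc_lock, LCK_MTX_ASSERT_OWNED);
 *
 * which panics via the path above if the calling thread does not hold the lock.
 */
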
/*
 * Routine:     lck_mtx_lock_spinwait_x86
 *
 * Invoked trying to acquire a mutex when there is contention but
 * the holder is running on another processor. We spin for up to a maximum
 * time waiting for the lock to be released.
 *
 * Called with the interlock unlocked.
 * returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired
 * returns LCK_MTX_SPINWAIT_SPUN if we spun
 * returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running
 */
__attribute__((noinline))
lck_mtx_spinwait_ret_type_t
lck_mtx_lock_spinwait_x86(
	lck_mtx_t       *mutex)
{
	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
	thread_t        owner, prev_owner;
	uint64_t        window_deadline, sliding_deadline, high_deadline;
	uint64_t        start_time, cur_time, avg_hold_time, bias, delta;
	lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
	int             loopcount = 0;
	int             total_hold_time_samples, window_hold_time_samples, unfairness;
	uint            i, prev_owner_cpu;
	bool            owner_on_core, adjust;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
	    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);

	start_time = mach_absolute_time();
	/*
	 * window_deadline represents the "learning" phase.
	 * The thread collects statistics about the lock during
	 * window_deadline and then it makes a decision on whether to spin more
	 * or block according to the concurrency behavior
	 * observed.
	 *
	 * Every thread can spin at least low_MutexSpin.
	 */
	window_deadline = start_time + low_MutexSpin;
	/*
	 * Sliding_deadline is the adjusted spin deadline
	 * computed after the "learning" phase.
	 */
	sliding_deadline = window_deadline;
	/*
	 * High_deadline is a hard deadline. No thread
	 * can spin more than this deadline.
	 */
	if (high_MutexSpin >= 0) {
		high_deadline = start_time + high_MutexSpin;
	} else {
		high_deadline = start_time + low_MutexSpin * real_ncpus;
	}

	/*
	 * Do not know yet which is the owner cpu.
	 * Initialize prev_owner_cpu with next cpu.
	 */
	prev_owner_cpu = (cpu_number() + 1) % real_ncpus;
	total_hold_time_samples = 0;
	window_hold_time_samples = 0;
	avg_hold_time = 0;
	adjust = TRUE;
	bias = (os_hash_kernel_pointer(mutex) + cpu_number()) % real_ncpus;

	/* Snoop the lock state */
	prev_owner = (thread_t) mutex->lck_mtx_owner;

	/*
	 * Spin while:
	 *   - mutex is locked, and
	 *   - it's locked as a spin lock, and
	 *   - owner is running on another processor, and
	 *   - we haven't spun for long enough.
	 */
	do {
		/*
		 * Try to acquire the lock.
		 */
		if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
			retval = LCK_MTX_SPINWAIT_ACQUIRED;
			break;
		}

		cur_time = mach_absolute_time();

		/*
		 * Never spin past high_deadline.
		 */
		if (cur_time >= high_deadline) {
			retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
			break;
		}

		/*
		 * Check if owner is on core. If not block.
		 */
		owner = (thread_t) mutex->lck_mtx_owner;
		if (owner) {
			i = prev_owner_cpu;
			owner_on_core = FALSE;

			disable_preemption();
			owner = (thread_t) mutex->lck_mtx_owner;

			/*
			 * For scalability we want to check if the owner is on core
			 * without locking the mutex interlock.
			 * If we do not lock the mutex interlock, the owner that we see might be
			 * invalid, so we cannot dereference it. Therefore we cannot check
			 * any field of the thread to tell us if it is on core.
			 * Check if the thread that is running on the other cpus matches the owner.
			 */
			if (owner) {
				do {
					if ((cpu_data_ptr[i] != NULL) && (cpu_data_ptr[i]->cpu_active_thread == owner)) {
						owner_on_core = TRUE;
						break;
					}
					if (++i >= real_ncpus) {
						i = 0;
					}
				} while (i != prev_owner_cpu);
				enable_preemption();

				if (owner_on_core) {
					prev_owner = owner;
					prev_owner_cpu = i;
				} else {
					owner = (thread_t) mutex->lck_mtx_owner;
					if (owner == prev_owner) {
						/*
						 * Owner is not on core.
						 * Stop spinning.
						 */
						if (loopcount == 0) {
							retval = LCK_MTX_SPINWAIT_NO_SPIN;
						} else {
							retval = LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE;
						}
						break;
					}
					/*
					 * Fall through if the owner changed while we were scanning.
					 * The new owner could potentially be on core, so loop
					 * again.
					 */
				}
			} else {
				enable_preemption();
			}
		}

		/*
		 * Save how many times we see the owner changing.
		 * We can roughly estimate the mutex hold
		 * time and the fairness with that.
		 */
		if (owner != prev_owner) {
			prev_owner = owner;
			total_hold_time_samples++;
			window_hold_time_samples++;
		}

		/*
		 * Learning window expired.
		 * Try to adjust the sliding_deadline.
		 */
		if (cur_time >= window_deadline) {
			/*
			 * If there was not contention during the window
			 * stop spinning.
			 */
			if (window_hold_time_samples < 1) {
				retval = LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION;
				break;
			}

			if (adjust) {
				/*
				 * For a fair lock, we'd wait for at most (NCPU-1) periods,
				 * but the lock is unfair, so let's try to estimate by how much.
				 */
				unfairness = total_hold_time_samples / real_ncpus;

				if (unfairness == 0) {
					/*
					 * We observed the owner changing `total_hold_time_samples` times which
					 * let us estimate the average hold time of this mutex for the duration
					 * of the spin time.
					 * avg_hold_time = (cur_time - start_time) / total_hold_time_samples;
					 *
					 * In this case spin at max avg_hold_time * (real_ncpus - 1)
					 */
					delta = cur_time - start_time;
					sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples;
				} else {
					/*
					 * In this case at least one of the other cpus was able to get the lock twice
					 * while I was spinning.
					 * We could spin longer but it won't necessarily help if the system is unfair.
					 * Try to randomize the wait to reduce contention.
					 *
					 * We compute how much time we could potentially spin
					 * and distribute it over the cpus.
					 *
					 * bias is an integer between 0 and real_ncpus.
					 * distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias
					 */
					delta = high_deadline - cur_time;
					sliding_deadline = cur_time + ((delta * bias) / real_ncpus);
					adjust = FALSE;
				}
			}

			window_deadline += low_MutexSpin;
			window_hold_time_samples = 0;
		}

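		/*
		 * Worked example with illustrative numbers: on an 8-cpu system, if
		 * the owner was observed changing total_hold_time_samples = 4 times
		 * over delta = cur_time - start_time = 40us, the average hold time
		 * is about 10us, unfairness = 4 / 8 = 0, and the sliding deadline
		 * becomes start_time + (40us * 7) / 4 = start_time + 70us, i.e.
		 * roughly seven average hold times. If instead unfairness were
		 * non-zero, the remaining spin budget (high_deadline - cur_time)
		 * would be scaled by bias/real_ncpus to spread the retries of the
		 * spinning cpus over time.
		 */
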
		/*
		 * Stop spinning if we are past
		 * the adjusted deadline.
		 */
		if (cur_time >= sliding_deadline) {
			retval = LCK_MTX_SPINWAIT_SPUN_SLIDING_THR;
			break;
		}

		if ((thread_t) mutex->lck_mtx_owner != NULL) {
			cpu_pause();
		}

		loopcount++;
	} while (TRUE);

#if CONFIG_DTRACE
	/*
	 * Note that we record a different probe id depending on whether
	 * this is a direct or indirect mutex. This allows us to
	 * penalize only lock groups that have debug/stats enabled
	 * with dtrace processing if desired.
	 */
	if (__probable(mutex->lck_mtx_is_ext == 0)) {
		LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
		    mach_absolute_time() - start_time);
	} else {
		LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
		    mach_absolute_time() - start_time);
	}
	/* The lockstat acquire event is recorded by the assembly code beneath us. */
#endif

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
	    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);

	return retval;
}

/*
 * Routine:     lck_mtx_lock_wait_x86
 *
 * Invoked in order to wait on contention.
 *
 * Called with the interlock locked and
 * preemption disabled...
 * returns it unlocked and with preemption enabled
 *
 * lck_mtx_waiters is 1:1 with a wakeup needing to occur.
 *      A runnable waiter can exist between wait and acquire
 *      without a waiters count being set.
 *      This allows us to never make a spurious wakeup call.
 *
 * Priority:
 *      This avoids taking the thread lock if the owning thread is the same priority.
 *      This optimizes the case of same-priority threads contending on a lock.
 *      However, that allows the owning thread to drop in priority while holding the lock,
 *      because there is no state that the priority change can notice that
 *      says that the targeted thread holds a contended mutex.
 *
 *      One possible solution: priority changes could look for some atomic tag
 *      on the thread saying 'holding contended lock', and then set up a promotion.
 *      Needs a story for dropping that promotion - the last contended unlock
 *      has to notice that this has happened.
 */
__attribute__((noinline))
static void
lck_mtx_lock_wait_x86(
	lck_mtx_t       *mutex,
	struct turnstile **ts)
{
	thread_t self = current_thread();

#if CONFIG_DTRACE
	uint64_t sleep_start = 0;

	if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
		sleep_start = mach_absolute_time();
	}
#endif
	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
	    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
	    mutex->lck_mtx_waiters, 0, 0);

	assert(self->waiting_for_mutex == NULL);
	self->waiting_for_mutex = mutex;
	mutex->lck_mtx_waiters++;

	thread_t holder = (thread_t)mutex->lck_mtx_owner;
	assert(holder != NULL);

	/*
	 * lck_mtx_lock_wait_x86 might be called in a loop. Call prepare just once and reuse
	 * the same turnstile while looping; the matching turnstile_complete will be called
	 * by lck_mtx_lock_contended when finally acquiring the lock.
	 */
	if (*ts == NULL) {
		*ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
	}

	struct turnstile *turnstile = *ts;
	thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
	turnstile_update_inheritor(turnstile, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));

	waitq_assert_wait64(&turnstile->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);

	lck_mtx_ilk_unlock(mutex);

	turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_NOT_HELD);

	thread_block(THREAD_CONTINUE_NULL);

	self->waiting_for_mutex = NULL;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
	    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
	    mutex->lck_mtx_waiters, 0, 0);

#if CONFIG_DTRACE
	/*
	 * Record the Dtrace lockstat probe for blocking, block time
	 * measured from when we were entered.
	 */
	if (sleep_start) {
		if (mutex->lck_mtx_is_ext == 0) {
			LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
			    mach_absolute_time() - sleep_start);
		} else {
			LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
			    mach_absolute_time() - sleep_start);
		}
	}
#endif
}

/*
 * Routine: kdp_lck_mtx_lock_spin_is_acquired
 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 * Returns: TRUE if lock is acquired.
 */
boolean_t
kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
{
	if (not_in_kdp) {
		panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
	}

	if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
		return TRUE;
	}

	return FALSE;
}

void
kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
{
	lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
	thread_t holder   = (thread_t)mutex->lck_mtx_owner;
	waitinfo->owner   = thread_tid(holder);
}

void
kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
{
	lck_rw_t        *rwlck = NULL;
	switch (waitinfo->wait_type) {
	case kThreadWaitKernelRWLockRead:
		rwlck = READ_EVENT_TO_RWLOCK(event);
		break;
	case kThreadWaitKernelRWLockWrite:
	case kThreadWaitKernelRWLockUpgrade:
		rwlck = WRITE_EVENT_TO_RWLOCK(event);
		break;
	default:
		panic("%s was called with an invalid blocking type", __FUNCTION__);
		break;
	}
	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
	waitinfo->owner = 0;
}