/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 *	Author: Avadis Tevanian, Jr., Michael Wayne Young
 *
 *	Locking primitives implementation
 */
#define LOCK_PRIVATE 1

#include <mach_ldebug.h>

#include <kern/lock_stat.h>
#include <kern/locks.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/thread.h>
#include <kern/processor.h>
#include <kern/cpu_data.h>
#include <kern/cpu_number.h>
#include <kern/sched_prim.h>
#include <kern/debug.h>

#include <i386/machine_routines.h> /* machine_timeout_suspended() */
#include <machine/atomic.h>
#include <machine/machine_cpu.h>
#include <machine/atomic.h>
#include <sys/kdebug.h>
#include <i386/locks_i386_inlines.h>
#include <kern/cpu_number.h>
#if CONFIG_DTRACE
#define DTRACE_RW_SHARED        0x0     //reader
#define DTRACE_RW_EXCL          0x1     //writer
#define DTRACE_NO_FLAG          0x0     //not applicable
#endif /* CONFIG_DTRACE */

#define LCK_RW_LCK_EXCLUSIVE_CODE       0x100
#define LCK_RW_LCK_EXCLUSIVE1_CODE      0x101
#define LCK_RW_LCK_SHARED_CODE          0x102
#define LCK_RW_LCK_SH_TO_EX_CODE        0x103
#define LCK_RW_LCK_SH_TO_EX1_CODE       0x104
#define LCK_RW_LCK_EX_TO_SH_CODE        0x105

#define LCK_RW_LCK_EX_WRITER_SPIN_CODE  0x106
#define LCK_RW_LCK_EX_WRITER_WAIT_CODE  0x107
#define LCK_RW_LCK_EX_READER_SPIN_CODE  0x108
#define LCK_RW_LCK_EX_READER_WAIT_CODE  0x109
#define LCK_RW_LCK_SHARED_SPIN_CODE     0x110
#define LCK_RW_LCK_SHARED_WAIT_CODE     0x111
#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE   0x112
#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE   0x113

#define ANY_LOCK_DEBUG  (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
#if USLOCK_DEBUG
/*
 * Perform simple lock checks.
 */
int uslock_check = 1;
int max_lock_loops = 100000000;
decl_simple_lock_data(extern, printf_lock);
decl_simple_lock_data(extern, panic_lock);
#endif  /* USLOCK_DEBUG */

extern unsigned int not_in_kdp;
#define usimple_lock_nopreempt(lck, grp) \
	usimple_lock_nopreempt(lck)
#define usimple_lock_try_nopreempt(lck, grp) \
	usimple_lock_try_nopreempt(lck)

static void usimple_lock_nopreempt(usimple_lock_t, lck_grp_t *);
static unsigned int usimple_lock_try_nopreempt(usimple_lock_t, lck_grp_t *);
/*
 * We often want to know the addresses of the callers
 * of the various lock routines.  However, this information
 * is only used for debugging and statistics.
 */
#define INVALID_PC      ((void *) VM_MAX_KERNEL_ADDRESS)
#define INVALID_THREAD  ((void *) VM_MAX_KERNEL_ADDRESS)

#if ANY_LOCK_DEBUG
#define OBTAIN_PC(pc)   ((pc) = GET_RETURN_PC())
#define DECL_PC(pc)     pc_t pc;
#else   /* ANY_LOCK_DEBUG */
#ifdef lint
/*
 * Eliminate lint complaints about unused local pc variables.
 */
#define OBTAIN_PC(pc)   ++pc
#else   /* lint */
#define OBTAIN_PC(pc)
#endif  /* lint */
#endif  /* USLOCK_DEBUG */
ZONE_VIEW_DEFINE(ZV_LCK_SPIN, "lck_spin",
    KHEAP_ID_DEFAULT, sizeof(lck_spin_t));

ZONE_VIEW_DEFINE(ZV_LCK_MTX, "lck_mtx",
    KHEAP_ID_DEFAULT, sizeof(lck_mtx_t));

ZONE_VIEW_DEFINE(ZV_LCK_MTX_EXT, "lck_mtx_ext",
    KHEAP_ID_DEFAULT, sizeof(lck_mtx_ext_t));

ZONE_VIEW_DEFINE(ZV_LCK_RW, "lck_rw",
    KHEAP_ID_DEFAULT, sizeof(lck_rw_t));
/*
 * atomic exchange API is a low level abstraction of the operations
 * to atomically read, modify, and write a pointer. This abstraction works
 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
 * well as the ARM exclusive instructions.
 *
 * atomic_exchange_begin() - begin exchange and retrieve current value
 * atomic_exchange_complete() - conclude an exchange
 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
 */
atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
	(void)ord;                      // Memory order not used
	val = os_atomic_load(target, relaxed);

atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
	return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);

atomic_exchange_abort(void)

atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
	uint32_t value, prev;

	value = atomic_exchange_begin32(target, &prev, ord);
	if (value & test_mask) {
		atomic_exchange_abort();
	if (atomic_exchange_complete32(target, prev, value, ord)) {

hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
	return atomic_test_and_set32(target, test_mask, set_mask, ord, wait);
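/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * the atomic_exchange_begin32() / atomic_exchange_complete32() /
 * atomic_exchange_abort() trio above is meant to be used in a
 * read-modify-write retry loop, as the lock code below does.
 * The "example_" names are hypothetical.
 */
#if 0   /* illustration only */
static boolean_t
example_set_flag_once(uint32_t *word, uint32_t flag)
{
	uint32_t        data, prev;

	for (;;) {
		data = atomic_exchange_begin32(word, &prev, memory_order_relaxed);
		if (data & flag) {
			atomic_exchange_abort();        /* flag already set, nothing to do */
			return FALSE;
		}
		data |= flag;
		if (atomic_exchange_complete32(word, prev, data, memory_order_relaxed)) {
			return TRUE;                    /* exchange committed */
		}
		/* another CPU raced us; re-read and retry */
	}
}
#endif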
/*
 * Portable lock package implementation of usimple_locks.
 */

#if USLOCK_DEBUG
#define USLDBG(stmt)    stmt
void    usld_lock_init(usimple_lock_t, unsigned short);
void    usld_lock_pre(usimple_lock_t, pc_t);
void    usld_lock_post(usimple_lock_t, pc_t);
void    usld_unlock(usimple_lock_t, pc_t);
void    usld_lock_try_pre(usimple_lock_t, pc_t);
void    usld_lock_try_post(usimple_lock_t, pc_t);
int     usld_lock_common_checks(usimple_lock_t, char *);
#else   /* USLOCK_DEBUG */
#endif  /* USLOCK_DEBUG */
/*
 * Forward definitions
 */

static void lck_rw_lock_shared_gen(lck_rw_t *lck);
static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
void lck_rw_clear_promotions_x86(thread_t thread);
static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
static boolean_t lck_rw_grab_want(lck_rw_t *lock);
static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, uint32_t state, boolean_t indirect);
static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state);
static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state);
static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state);
static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
/*
 * Routine:	lck_spin_alloc_init
 */
	lck = zalloc(ZV_LCK_SPIN);
	lck_spin_init(lck, grp, attr);

/*
 * Routine:	lck_spin_free
 */
	lck_spin_destroy(lck, grp);
	zfree(ZV_LCK_SPIN, lck);

/*
 * Routine:	lck_spin_init
 */
	__unused lck_attr_t *attr)

	usimple_lock_init((usimple_lock_t) lck, 0);
	lck_grp_reference(grp);
	lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);

/*
 * Routine:	lck_spin_destroy
 */
	if (lck->interlock == LCK_SPIN_TAG_DESTROYED) {
	lck->interlock = LCK_SPIN_TAG_DESTROYED;
	lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
	lck_grp_deallocate(grp);

/*
 * Routine:	lck_spin_lock
 */
	usimple_lock((usimple_lock_t) lck, grp);

	usimple_lock((usimple_lock_t) lck, NULL);

lck_spin_lock_nopreempt(
	usimple_lock_nopreempt((usimple_lock_t) lck, NULL);

lck_spin_lock_nopreempt_grp(
	usimple_lock_nopreempt((usimple_lock_t) lck, grp);

/*
 * Routine:	lck_spin_unlock
 */
	usimple_unlock((usimple_lock_t) lck);

lck_spin_unlock_nopreempt(
	usimple_unlock_nopreempt((usimple_lock_t) lck);

lck_spin_try_lock_grp(
	boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, grp);
#if DEVELOPMENT || DEBUG

/*
 * Routine:	lck_spin_try_lock
 */
	boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, LCK_GRP_NULL);
#if DEVELOPMENT || DEBUG

lck_spin_try_lock_nopreempt(
	boolean_t lrval = (boolean_t)usimple_lock_try_nopreempt((usimple_lock_t) lck, LCK_GRP_NULL);
#if DEVELOPMENT || DEBUG

lck_spin_try_lock_nopreempt_grp(
	boolean_t lrval = (boolean_t)usimple_lock_try_nopreempt((usimple_lock_t) lck, grp);
#if DEVELOPMENT || DEBUG

/*
 * Routine:	lck_spin_assert
 */
lck_spin_assert(lck_spin_t *lock, unsigned int type)
	thread_t thread, holder;

	if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
		panic("lck_spin_assert(): invalid arg (%u)", type);
	state = lock->interlock;
	holder = (thread_t)state;
	thread = current_thread();
	if (type == LCK_ASSERT_OWNED) {
		if (__improbable(holder == THREAD_NULL)) {
			panic("Lock not owned %p = %lx", lock, state);
		if (__improbable(holder != thread)) {
			panic("Lock not owned by current thread %p = %lx", lock, state);
	} else if (type == LCK_ASSERT_NOTOWNED) {
		if (__improbable(holder != THREAD_NULL)) {
			if (holder == thread) {
				panic("Lock owned by current thread %p = %lx", lock, state);

/*
 * Routine:	kdp_lck_spin_is_acquired
 *	NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 *	Returns: TRUE if lock is acquired.
 */
kdp_lck_spin_is_acquired(lck_spin_t *lck)
	panic("panic: spinlock acquired check done outside of kernel debugger");
	return (lck->interlock != 0)? TRUE : FALSE;
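/*
 * Illustrative sketch (editor's addition): typical use of the spin lock API
 * implemented above around a short, non-blocking critical section.
 * "example_lock" and "example_critical_section" are hypothetical names.
 */
#if 0   /* illustration only */
static lck_spin_t example_lock;

static void
example_critical_section(void)
{
	lck_spin_lock(&example_lock);   /* acquires the usimple_lock; preemption is disabled while held */
	/* ... short critical section, no blocking calls ... */
	lck_spin_unlock(&example_lock); /* releases the lock and restores preemption */
}
#endif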
/*
 * Initialize a usimple_lock.
 *
 * No change in preemption state.
 */
	__unused unsigned short tag)

	USLDBG(usld_lock_init(l, tag));
	hw_lock_init(&l->interlock);

volatile uint32_t spinlock_owner_cpu = ~0;
volatile usimple_lock_t spinlock_timed_out;

spinlock_timeout_NMI(uintptr_t thread_addr)
	for (i = 0; i < real_ncpus; i++) {
		if ((cpu_data_ptr[i] != NULL) && ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr)) {
			spinlock_owner_cpu = i;
			if ((uint32_t) cpu_number() != i) {
				/* Cause NMI and panic on the owner's cpu */
				NMIPI_panic(cpu_to_cpumask(i), SPINLOCK_TIMEOUT);
	return spinlock_owner_cpu;

usimple_lock_acquire_timeout_panic(usimple_lock_t l)
	uintptr_t lowner = (uintptr_t)l->interlock.lock_data;

	spinlock_timed_out = l; /* spinlock_timeout_NMI consumes this */
	lock_cpu = spinlock_timeout_NMI(lowner);
	panic("Spinlock acquisition timed out: lock=%p, "
	    "lock owner thread=0x%lx, current_thread: %p, "
	    "lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
	    l, lowner, current_thread(), lock_cpu,
	    (uintptr_t)l->interlock.lock_data, mach_absolute_time());

/*
 * Acquire a usimple_lock.
 *
 * Returns with preemption disabled.  Note
 * that the hw_lock routines are responsible for
 * maintaining preemption state.
 */
	LCK_GRP_ARG(lck_grp_t *grp))

	USLDBG(usld_lock_pre(l, pc));
	while (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) {
		if (!machine_timeout_suspended()) {
			usimple_lock_acquire_timeout_panic(l);
#if DEVELOPMENT || DEBUG
	USLDBG(usld_lock_post(l, pc));
	LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
/*
 * Acquire a usimple_lock_nopreempt
 *
 * Called and returns with preemption disabled.  Note
 * that the hw_lock routines are responsible for
 * maintaining preemption state.
 */
usimple_lock_nopreempt(
	USLDBG(usld_lock_pre(l, pc));
	while (__improbable(hw_lock_to_nopreempt(&l->interlock, LockTimeOutTSC, grp) == 0)) {
		if (!machine_timeout_suspended()) {
			usimple_lock_acquire_timeout_panic(l);
#if DEVELOPMENT || DEBUG
	USLDBG(usld_lock_post(l, pc));
	LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));

/*
 * Release a usimple_lock.
 *
 * Returns with preemption enabled.  Note
 * that the hw_lock routines are responsible for
 * maintaining preemption state.
 */
	USLDBG(usld_unlock(l, pc));
#if DEVELOPMENT || DEBUG
	hw_lock_unlock(&l->interlock);

/*
 * Release a usimple_unlock_nopreempt.
 *
 * Called and returns with preemption enabled.  Note
 * that the hw_lock routines are responsible for
 * maintaining preemption state.
 */
usimple_unlock_nopreempt(
	USLDBG(usld_unlock(l, pc));
#if DEVELOPMENT || DEBUG
	hw_lock_unlock_nopreempt(&l->interlock);

/*
 * Conditionally acquire a usimple_lock.
 *
 * On success, returns with preemption disabled.
 * On failure, returns with preemption in the same state
 * as when first invoked.  Note that the hw_lock routines
 * are responsible for maintaining preemption state.
 *
 * XXX No stats are gathered on a miss; I preserved this
 * behavior from the original assembly-language code, but
 * doesn't it make sense to log misses?  XXX
 */
	unsigned int success;

	USLDBG(usld_lock_try_pre(l, pc));
	if ((success = hw_lock_try(&l->interlock, grp))) {
#if DEVELOPMENT || DEBUG
		USLDBG(usld_lock_try_post(l, pc));

/*
 * Conditionally acquire a usimple_lock.
 *
 * Called and returns with preemption disabled.  Note
 * that the hw_lock routines are responsible for
 * maintaining preemption state.
 *
 * XXX No stats are gathered on a miss; I preserved this
 * behavior from the original assembly-language code, but
 * doesn't it make sense to log misses?  XXX
 */
usimple_lock_try_nopreempt(
	unsigned int success;

	USLDBG(usld_lock_try_pre(l, pc));
	if ((success = hw_lock_try_nopreempt(&l->interlock, grp))) {
#if DEVELOPMENT || DEBUG
		USLDBG(usld_lock_try_post(l, pc));

/*
 * Acquire a usimple_lock while polling for pending cpu signals
 * and spinning on a lock.
 */
(usimple_lock_try_lock_mp_signal_safe_loop_deadline)(usimple_lock_t l,
    LCK_GRP_ARG(lck_grp_t *grp))
	boolean_t istate = ml_get_interrupts_enabled();

	if (deadline < mach_absolute_time()) {
	while (!simple_lock_try(l, grp)) {
		cpu_signal_handler(NULL);
		if (deadline < mach_absolute_time()) {

(usimple_lock_try_lock_loop)(usimple_lock_t l
    LCK_GRP_ARG(lck_grp_t *grp))
	/* When the lock is not contended, grab the lock and go. */
	if (!simple_lock_try(l, grp)) {
		usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ULLONG_MAX, grp);

(usimple_lock_try_lock_mp_signal_safe_loop_duration)(usimple_lock_t l,
    LCK_GRP_ARG(lck_grp_t *grp))
	uint64_t duration_at;

	/* Fast track for uncontended locks */
	if (simple_lock_try(l, grp)) {
	base_at = mach_absolute_time();
	nanoseconds_to_absolutetime(duration, &duration_at);
	deadline = base_at + duration_at;
	if (deadline < base_at) {
		/* deadline has overflowed, make it saturate */
		deadline = ULLONG_MAX;
	return usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, deadline, grp);
/*
 * States of a usimple_lock.  The default when initializing
 * a usimple_lock is setting it up for debug checking.
 */
#define USLOCK_CHECKED          0x0001  /* lock is being checked */
#define USLOCK_TAKEN            0x0002  /* lock has been taken */
#define USLOCK_INIT             0xBAA0  /* lock has been initialized */
#define USLOCK_INITIALIZED      (USLOCK_INIT|USLOCK_CHECKED)
#define USLOCK_CHECKING(l)      (uslock_check && \
	((l)->debug.state & USLOCK_CHECKED))
/*
 * Initialize the debugging information contained
 * in a usimple_lock.
 */
	__unused unsigned short tag)

	if (l == USIMPLE_LOCK_NULL) {
		panic("lock initialization: null lock pointer");
	l->lock_type = USLOCK_TAG;
	l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
	l->debug.lock_cpu = l->debug.unlock_cpu = 0;
	l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
	l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
	l->debug.duration[0] = l->debug.duration[1] = 0;
	l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
	l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
	l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
/*
 * These checks apply to all usimple_locks, not just
 * those with USLOCK_CHECKED turned on.
 */
usld_lock_common_checks(
	if (l == USIMPLE_LOCK_NULL) {
		panic("%s: null lock pointer", caller);
	if (l->lock_type != USLOCK_TAG) {
		panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
	if (!(l->debug.state & USLOCK_INIT)) {
		panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
	return USLOCK_CHECKING(l);

/*
 * Debug checks on a usimple_lock just before attempting
 * to acquire it.
 */
	char caller[] = "usimple_lock";

	if (!usld_lock_common_checks(l, caller)) {

	/*
	 * Note that we have a weird case where we are getting a lock when we are
	 * in the process of putting the system to sleep. We are running with no
	 * current threads, therefore we can't tell if we are trying to retake a lock
	 * we have or someone on the other processor has it. Therefore we just
	 * ignore this test if the locking thread is 0.
	 */
	if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
	    l->debug.lock_thread == (void *) current_thread()) {
		printf("%s: lock %p already locked (at %p) by",
		    caller, l, l->debug.lock_pc);
		printf(" current thread %p (new attempt at pc %p)\n",
		    l->debug.lock_thread, pc);
		mp_disable_preemption();
		mp_enable_preemption();
/*
 * Debug checks on a usimple_lock just after acquiring it.
 *
 * Pre-emption has been disabled at this point,
 * so we are safe in using cpu_number.
 */
	char caller[] = "successful usimple_lock";

	if (!usld_lock_common_checks(l, caller)) {

	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
		panic("%s: lock %p became uninitialized",
	if ((l->debug.state & USLOCK_TAKEN)) {
		panic("%s: lock 0x%p became TAKEN by someone else",

	mycpu = (unsigned int)cpu_number();
	assert(mycpu <= UCHAR_MAX);

	l->debug.lock_thread = (void *)current_thread();
	l->debug.state |= USLOCK_TAKEN;
	l->debug.lock_pc = pc;
	l->debug.lock_cpu = (unsigned char)mycpu;
/*
 * Debug checks on a usimple_lock just before
 * releasing it.  Note that the caller has not
 * yet released the hardware lock.
 *
 * Preemption is still disabled, so there's
 * no problem using cpu_number.
 */
	char caller[] = "usimple_unlock";

	if (!usld_lock_common_checks(l, caller)) {

	mycpu = cpu_number();
	assert(mycpu <= UCHAR_MAX);

	if (!(l->debug.state & USLOCK_TAKEN)) {
		panic("%s: lock 0x%p hasn't been taken",
	if (l->debug.lock_thread != (void *) current_thread()) {
		panic("%s: unlocking lock 0x%p, owned by thread %p",
		    caller, l, l->debug.lock_thread);
	if (l->debug.lock_cpu != mycpu) {
		printf("%s: unlocking lock 0x%p on cpu 0x%x",
		printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);

	l->debug.unlock_thread = l->debug.lock_thread;
	l->debug.lock_thread = INVALID_PC;
	l->debug.state &= ~USLOCK_TAKEN;
	l->debug.unlock_pc = pc;
	l->debug.unlock_cpu = (unsigned char)mycpu;
/*
 * Debug checks on a usimple_lock just before
 * attempting to acquire it.
 *
 * Preemption isn't guaranteed to be disabled.
 */
	char caller[] = "usimple_lock_try";

	if (!usld_lock_common_checks(l, caller)) {

/*
 * Debug checks on a usimple_lock just after
 * successfully attempting to acquire it.
 *
 * Preemption has been disabled by the
 * lock acquisition attempt, so it's safe
 * to use cpu_number.
 */
	char caller[] = "successful usimple_lock_try";

	if (!usld_lock_common_checks(l, caller)) {

	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
		panic("%s: lock 0x%p became uninitialized",
	if ((l->debug.state & USLOCK_TAKEN)) {
		panic("%s: lock 0x%p became TAKEN by someone else",

	mycpu = cpu_number();
	assert(mycpu <= UCHAR_MAX);

	l->debug.lock_thread = (void *) current_thread();
	l->debug.state |= USLOCK_TAKEN;
	l->debug.lock_pc = pc;
	l->debug.lock_cpu = (unsigned char)mycpu;
#endif /* USLOCK_DEBUG */
/*
 * Routine:	lck_rw_alloc_init
 */
	lck = zalloc_flags(ZV_LCK_RW, Z_WAITOK | Z_ZERO);
	lck_rw_init(lck, grp, attr);

/*
 * Routine:	lck_rw_free
 */
	lck_rw_destroy(lck, grp);
	zfree(ZV_LCK_RW, lck);

/*
 * Routine:	lck_rw_init
 */
	lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
	    attr : &LockDefaultLckAttr;

	hw_lock_byte_init(&lck->lck_rw_interlock);
	lck->lck_rw_want_write = FALSE;
	lck->lck_rw_want_upgrade = FALSE;
	lck->lck_rw_shared_count = 0;
	lck->lck_rw_can_sleep = TRUE;
	lck->lck_r_waiting = lck->lck_w_waiting = 0;
	lck->lck_rw_tag = 0;
	lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
	    LCK_ATTR_RW_SHARED_PRIORITY) == 0);

	lck_grp_reference(grp);
	lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);

/*
 * Routine:	lck_rw_destroy
 */
	if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
	lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
	lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
	lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
	lck_grp_deallocate(grp);

/*
 * Sleep locks.  These use the same data structure and algorithm
 * as the spin locks, but the process sleeps while it is waiting
 * for the lock.  These work on uniprocessor systems.
 */
#define DECREMENTER_TIMEOUT 1000000

/*
 * We disable interrupts while holding the RW interlock to prevent an
 * interrupt from exacerbating hold time.
 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
 */
static inline boolean_t
lck_interlock_lock(lck_rw_t *lck)
	istate = ml_set_interrupts_enabled(FALSE);
	hw_lock_byte_lock(&lck->lck_rw_interlock);

lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
	hw_lock_byte_unlock(&lck->lck_rw_interlock);
	ml_set_interrupts_enabled(istate);

/*
 * This inline is used when busy-waiting for an rw lock.
 * If interrupts were disabled when the lock primitive was called,
 * we poll the IPI handler for pending tlb flushes.
 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
 */
lck_rw_lock_pause(boolean_t interrupts_enabled)
	if (!interrupts_enabled) {
		handle_pending_TLB_flushes();

static inline boolean_t
lck_rw_held_read_or_upgrade(lck_rw_t *lock)
	if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE)) {

/*
 * compute the deadline to spin against when
 * waiting for a change of state on a lck_rw_t
 */
static inline uint64_t
lck_rw_deadline_for_spin(lck_rw_t *lck)
	if (lck->lck_rw_can_sleep) {
		if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
			/*
			 * there are already threads waiting on this lock... this
			 * implies that they have spun beyond their deadlines waiting for
			 * the desired state to show up so we will not bother spinning at this time...
			 * or
			 * the current number of threads sharing this lock exceeds our capacity to run them
			 * concurrently and since all states we're going to spin for require the rw_shared_count
			 * to be at 0, we'll not bother spinning since the latency for this to happen is
			 * unpredictable...
			 */
			return mach_absolute_time();
		return mach_absolute_time() + MutexSpin;
	return mach_absolute_time() + (100000LL * 1000000000LL);

/*
 * Spin while interlock is held.
 */
lck_rw_interlock_spin(lck_rw_t *lock)
	while (ordered_load(&lock->data) & LCK_RW_INTERLOCK) {

lck_rw_grab_want(lck_rw_t *lock)
	uint32_t data, prev;

	data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed);
	if ((data & LCK_RW_INTERLOCK) == 0) {
	atomic_exchange_abort();
	lck_rw_interlock_spin(lock);
	if (data & LCK_RW_WANT_WRITE) {
		atomic_exchange_abort();
	data |= LCK_RW_WANT_WRITE;
	return atomic_exchange_complete32(&lock->data, prev, data, memory_order_relaxed);

lck_rw_grab_shared(lck_rw_t *lock)
	uint32_t data, prev;

	data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
	if ((data & LCK_RW_INTERLOCK) == 0) {
	atomic_exchange_abort();
	lck_rw_interlock_spin(lock);
	if (data & (LCK_RW_WANT_WRITE | LCK_RW_WANT_UPGRADE)) {
		if (((data & LCK_RW_SHARED_MASK) == 0) || (data & LCK_RW_PRIV_EXCL)) {
			atomic_exchange_abort();
	data += LCK_RW_SHARED_READER;
	return atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp);
/*
 * Routine:	lck_rw_lock_exclusive
 */
lck_rw_lock_exclusive_gen(
	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
	uint64_t deadline = 0;
	wait_result_t res = 0;
	boolean_t istate = -1;
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;

	/*
	 * Try to acquire the lck_rw_want_write bit.
	 */
	while (!lck_rw_grab_want(lck)) {
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 * start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();

		istate = ml_get_interrupts_enabled();

		deadline = lck_rw_deadline_for_spin(lck);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

		while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline) {
			lck_rw_lock_pause(istate);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);

		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock exclusively
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {
			istate = lck_interlock_lock(lck);

			if (lck->lck_rw_want_write) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

				lck->lck_w_waiting = TRUE;

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
				res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);

				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);

				lck->lck_rw_want_write = TRUE;
				lck_interlock_unlock(lck, istate);

	/*
	 * Wait for readers (and upgrades) to finish...
	 * the test for these conditions must be done simultaneously with
	 * a check of the interlock not being held since
	 * the rw_shared_count will drop to 0 first and then want_upgrade
	 * will be set to 1 in the shared_to_exclusive scenario... those
	 * adjustments are done behind the interlock and represent an
	 * atomic change in state and must be considered as such
	 * however, once we see the read count at 0, the want_upgrade not set
	 * and the interlock not held, we are safe to proceed
	 */
	while (lck_rw_held_read_or_upgrade(lck)) {
		/*
		 * Either sleeping or spinning is happening, start
		 * a timing of our delay interval now.  If we set it
		 * to -1 we don't have accurate data so we cannot later
		 * decide to record a dtrace spin or sleep event.
		 */
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 * start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();

		istate = ml_get_interrupts_enabled();

		deadline = lck_rw_deadline_for_spin(lck);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

		while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline) {
			lck_rw_lock_pause(istate);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);

		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock exclusively
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {
			istate = lck_interlock_lock(lck);

			if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

				lck->lck_w_waiting = TRUE;

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
				res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);

				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);

				lck_interlock_unlock(lck, istate);
				/*
				 * must own the lock now, since we checked for
				 * readers or upgrade owner behind the interlock
				 * no need for a call to 'lck_rw_held_read_or_upgrade'
				 */

	/*
	 * Decide what latencies we suffered that are Dtrace events.
	 * If we have set wait_interval, then we either spun or slept.
	 * At least we get out from under the interlock before we record
	 * which is the best we can do here to minimize the impact
	 * If we have set wait_interval to -1, then dtrace was not enabled when we
	 * started sleeping/spinning so we don't record this event.
	 */
	if (dtrace_ls_enabled == TRUE) {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
			    mach_absolute_time() - wait_interval, 1);
			/*
			 * For the blocking case, we also record if when we blocked
			 * it was held for read or write, and how many readers.
			 * Notice that above we recorded this before we dropped
			 * the interlock so the count is accurate.
			 */
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
			    mach_absolute_time() - wait_interval, 1,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
/*
 * Routine:	lck_rw_done
 */
lck_rw_done(lck_rw_t *lock)
	uint32_t data, prev;

	data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
	if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */
		atomic_exchange_abort();
		lck_rw_interlock_spin(lock);
	if (data & LCK_RW_SHARED_MASK) {
		data -= LCK_RW_SHARED_READER;
		if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
	} else { /* if reader count == 0, must be exclusive lock */
		if (data & LCK_RW_WANT_UPGRADE) {
			data &= ~(LCK_RW_WANT_UPGRADE);
		if (data & LCK_RW_WANT_WRITE) {
			data &= ~(LCK_RW_WANT_EXCL);
		} else { /* lock is not 'owned', panic */
			panic("Releasing non-exclusive RW lock without a reader refcount!");
	if (prev & LCK_RW_W_WAITING) {
		data &= ~(LCK_RW_W_WAITING);
		if ((prev & LCK_RW_PRIV_EXCL) == 0) {
			data &= ~(LCK_RW_R_WAITING);
		data &= ~(LCK_RW_R_WAITING);
	if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
	return lck_rw_done_gen(lock, prev);

/*
 * Routine:	lck_rw_done_gen
 *
 *	called from lck_rw_done()
 *	prior_lock_state is the value in the 1st
 *	word of the lock at the time of a successful
 *	atomic compare and exchange with the new value...
 *	it represents the state of the lock before we
 *	decremented the rw_shared_count or cleared either
 *	rw_want_upgrade or rw_want_write and
 *	the lck_x_waiting bits... since the wrapper
 *	routine has already changed the state atomically,
 *	we just need to decide if we should
 *	wake up anyone and what value to return... we do
 *	this by examining the state of the lock before
 *	we changed it
 */
static lck_rw_type_t
lck_rw_done_gen(
	lck_rw_t        *lck,
	uint32_t        prior_lock_state)
	lck_rw_type_t lock_type;
	uint32_t rwlock_count;

	thread = current_thread();
	rwlock_count = thread->rwlock_count--;
	fake_lck = (lck_rw_t *)&prior_lock_state;

	if (lck->lck_rw_can_sleep) {
		/*
		 * prior_lock state is a snapshot of the 1st word of the
		 * lock in question... we'll fake up a pointer to it
		 * and carefully not access anything beyond whats defined
		 * in the first word of a lck_rw_t
		 */
		if (fake_lck->lck_rw_shared_count <= 1) {
			if (fake_lck->lck_w_waiting) {
				thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
		if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
			thread_wakeup(RW_LOCK_READER_EVENT(lck));
	if (rwlock_count == 0) {
		panic("rw lock count underflow for thread %p", thread);
	/* Check if dropping the lock means that we need to unpromote */
	if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
		/* sched_flags checked without lock, but will be rechecked while clearing */
		lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
	if (fake_lck->lck_rw_shared_count) {
		lock_type = LCK_RW_TYPE_SHARED;
		lock_type = LCK_RW_TYPE_EXCLUSIVE;

	LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
/*
 * Routine:	lck_rw_unlock
 */
	lck_rw_type_t lck_rw_type)
	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
		lck_rw_unlock_shared(lck);
	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
		lck_rw_unlock_exclusive(lck);
		panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);

/*
 * Routine:	lck_rw_unlock_shared
 */
lck_rw_unlock_shared(
	assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count);
	ret = lck_rw_done(lck);

	if (ret != LCK_RW_TYPE_SHARED) {
		panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);

/*
 * Routine:	lck_rw_unlock_exclusive
 */
lck_rw_unlock_exclusive(
	ret = lck_rw_done(lck);

	if (ret != LCK_RW_TYPE_EXCLUSIVE) {
		panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);

/*
 * Routine:	lck_rw_lock
 */
	lck_rw_type_t lck_rw_type)
	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
		lck_rw_lock_shared(lck);
	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
		lck_rw_lock_exclusive(lck);
		panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
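/*
 * Illustrative sketch (editor's addition): the common reader-side pattern
 * built on the wrappers above.  lck_rw_done() determines and returns the
 * mode that was dropped, which the lck_rw_unlock_shared() wrapper checks.
 * "example_reader" is a hypothetical name.
 */
#if 0   /* illustration only */
static void
example_reader(lck_rw_t *lck)
{
	lck_rw_lock(lck, LCK_RW_TYPE_SHARED);   /* or lck_rw_lock_shared(lck) */
	/* ... read-only access to the protected data ... */
	lck_rw_unlock(lck, LCK_RW_TYPE_SHARED); /* panics if the lock was not held shared */
}
#endif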
/*
 * Routine:	lck_rw_lock_shared
 */
lck_rw_lock_shared(lck_rw_t *lock)
	uint32_t data, prev;

	current_thread()->rwlock_count++;
	data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
	if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
		atomic_exchange_abort();
		if (lock->lck_rw_can_sleep) {
			lck_rw_lock_shared_gen(lock);
	data += LCK_RW_SHARED_READER;
	if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
#endif /* CONFIG_DTRACE */
/*
 * Routine:	lck_rw_lock_shared_gen
 *	assembly fast path code has determined that this lock
 *	is held exclusively... this is where we spin/block
 *	until we can acquire the lock in the shared mode
 */
lck_rw_lock_shared_gen(
	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
	uint64_t deadline = 0;
	wait_result_t res = 0;
	boolean_t istate = -1;
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;

	while (!lck_rw_grab_shared(lck)) {
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
			dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 * start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();

		istate = ml_get_interrupts_enabled();

		deadline = lck_rw_deadline_for_spin(lck);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
		    trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);

		while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline) {
			lck_rw_lock_pause(istate);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
		    trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);

		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock for read
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {
			istate = lck_interlock_lock(lck);

			if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
			    ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
				    trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);

				lck->lck_r_waiting = TRUE;

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
				res = assert_wait(RW_LOCK_READER_EVENT(lck),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);

				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
				    trace_lck, res, slept, 0, 0);

				lck->lck_rw_shared_count++;
				lck_interlock_unlock(lck, istate);

	if (dtrace_ls_enabled == TRUE) {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
			    mach_absolute_time() - wait_interval, 0,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);

#define LCK_RW_LOCK_EXCLUSIVE_TAS(lck) (atomic_test_and_set32(&(lck)->data, \
	(LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), \
	LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE))
/*
 * Routine:	lck_rw_lock_exclusive_check_contended
 */
lck_rw_lock_exclusive_check_contended(lck_rw_t *lock)
	bool contended = false;
	current_thread()->rwlock_count++;
	if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
#if CONFIG_DTRACE
		LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
#endif /* CONFIG_DTRACE */
		lck_rw_lock_exclusive_gen(lock);

/*
 * Routine:	lck_rw_lock_exclusive
 */
lck_rw_lock_exclusive(lck_rw_t *lock)
	current_thread()->rwlock_count++;
	if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
#if CONFIG_DTRACE
		LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
#endif /* CONFIG_DTRACE */
		lck_rw_lock_exclusive_gen(lock);
/*
 * Routine:	lck_rw_lock_shared_to_exclusive
 *
 * False returned upon failure, in this case the shared lock is dropped.
 */
lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
	uint32_t data, prev;

	data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
	if (data & LCK_RW_INTERLOCK) {
		atomic_exchange_abort();
		lck_rw_interlock_spin(lock);
	if (data & LCK_RW_WANT_UPGRADE) {
		data -= LCK_RW_SHARED_READER;
		if ((data & LCK_RW_SHARED_MASK) == 0) { /* we were the last reader */
			data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */
		if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
			return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
	data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */
	data -= LCK_RW_SHARED_READER; /* and shed our read count */
	if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
	/* we now own the WANT_UPGRADE */
	if (data & LCK_RW_SHARED_MASK) { /* check to see if all of the readers are drained */
		lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);

/*
 * Routine:	lck_rw_lock_shared_to_exclusive_failure
 *	assembly fast path code has already dropped our read
 *	count and determined that someone else owns 'lck_rw_want_upgrade'
 *	if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
 *	all we need to do here is determine if a wakeup is needed
 */
lck_rw_lock_shared_to_exclusive_failure(
	uint32_t prior_lock_state)
	thread_t thread = current_thread();
	uint32_t rwlock_count;

	/* Check if dropping the lock means that we need to unpromote */
	rwlock_count = thread->rwlock_count--;

	if (rwlock_count == 0) {
		panic("rw lock count underflow for thread %p", thread);
	fake_lck = (lck_rw_t *)&prior_lock_state;

	if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
		/*
		 * Someone else has requested upgrade.
		 * Since we've released the read lock, wake
		 * him up if he's blocked waiting
		 */
		thread_wakeup(RW_LOCK_WRITER_EVENT(lck));

	if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
		/* sched_flags checked without lock, but will be rechecked while clearing */
		lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);

/*
 * Routine:	lck_rw_lock_shared_to_exclusive_success
 *	assembly fast path code has already dropped our read
 *	count and successfully acquired 'lck_rw_want_upgrade'
 *	we just need to wait for the rest of the readers to drain
 *	and then we can return as the exclusive holder of this lock
 */
lck_rw_lock_shared_to_exclusive_success(
	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
	uint64_t deadline = 0;
	int still_shared = 0;
	boolean_t istate = -1;
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;

	while (lck->lck_rw_shared_count != 0) {
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
			dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 * start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();

		istate = ml_get_interrupts_enabled();

		deadline = lck_rw_deadline_for_spin(lck);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
		    trace_lck, lck->lck_rw_shared_count, 0, 0, 0);

		while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline) {
			lck_rw_lock_pause(istate);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
		    trace_lck, lck->lck_rw_shared_count, 0, 0, 0);

		if (!still_shared) {
		/*
		 * if we get here, the deadline has expired w/o
		 * the rw_shared_count having drained to 0
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {
			istate = lck_interlock_lock(lck);

			if (lck->lck_rw_shared_count != 0) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
				    trace_lck, lck->lck_rw_shared_count, 0, 0, 0);

				lck->lck_w_waiting = TRUE;

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
				res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);

				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
				    trace_lck, res, slept, 0, 0);

				lck_interlock_unlock(lck, istate);

	/*
	 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
	 */
	if (dtrace_ls_enabled == TRUE) {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
			    mach_absolute_time() - wait_interval, 1,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
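/*
 * Illustrative sketch (editor's addition): because a failed
 * lck_rw_lock_shared_to_exclusive() drops the shared hold (see the routine
 * comment above), callers typically fall back to a fresh exclusive acquire
 * and re-validate.  "example_upgrade" is a hypothetical name.
 */
#if 0   /* illustration only */
static void
example_upgrade(lck_rw_t *lck)
{
	lck_rw_lock_shared(lck);
	/* ... decide that an exclusive hold is needed ... */
	if (!lck_rw_lock_shared_to_exclusive(lck)) {
		/* shared hold was dropped on failure; start over exclusively */
		lck_rw_lock_exclusive(lck);
		/* ... re-validate anything examined under the shared hold ... */
	}
	/* ... exclusive work ... */
	lck_rw_done(lck);
}
#endif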
/*
 * Routine:	lck_rw_lock_exclusive_to_shared
 */
lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
	uint32_t data, prev;

	data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
	if (data & LCK_RW_INTERLOCK) {
		atomic_exchange_abort();
		lck_rw_interlock_spin(lock); /* wait for interlock to clear */
	data += LCK_RW_SHARED_READER;
	if (data & LCK_RW_WANT_UPGRADE) {
		data &= ~(LCK_RW_WANT_UPGRADE);
		data &= ~(LCK_RW_WANT_EXCL);
	if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
		data &= ~(LCK_RW_W_WAITING);
	if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
	return lck_rw_lock_exclusive_to_shared_gen(lock, prev);

/*
 * Routine:	lck_rw_lock_exclusive_to_shared_gen
 *	assembly fast path has already dropped
 *	our exclusive state and bumped lck_rw_shared_count
 *	all we need to do here is determine if anyone
 *	needs to be awakened.
 */
lck_rw_lock_exclusive_to_shared_gen(
	uint32_t prior_lock_state)
	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);

	fake_lck = (lck_rw_t *)&prior_lock_state;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
	    trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);

	/*
	 * don't wake up anyone waiting to take the lock exclusively
	 * since we hold a read count... when the read count drops to 0,
	 * the writers will be woken.
	 *
	 * wake up any waiting readers if we don't have any writers waiting,
	 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
	 */
	if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
		thread_wakeup(RW_LOCK_READER_EVENT(lck));

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
	    trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);

	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
/*
 * Routine:	lck_rw_try_lock
 */
	lck_rw_type_t lck_rw_type)
	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
		return lck_rw_try_lock_shared(lck);
	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
		return lck_rw_try_lock_exclusive(lck);
		panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);

/*
 * Routine:	lck_rw_try_lock_shared
 */
lck_rw_try_lock_shared(lck_rw_t *lock)
	uint32_t data, prev;

	data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
	if (data & LCK_RW_INTERLOCK) {
		atomic_exchange_abort();
		lck_rw_interlock_spin(lock);
	if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
		atomic_exchange_abort();
		return FALSE; /* lock is busy */
	data += LCK_RW_SHARED_READER; /* Increment reader refcount */
	if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {

	current_thread()->rwlock_count++;
	/* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */
#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
#endif /* CONFIG_DTRACE */

/*
 * Routine:	lck_rw_try_lock_exclusive
 */
lck_rw_try_lock_exclusive(lck_rw_t *lock)
	uint32_t data, prev;

	data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
	if (data & LCK_RW_INTERLOCK) {
		atomic_exchange_abort();
		lck_rw_interlock_spin(lock);
	if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
		atomic_exchange_abort();
		return FALSE; /* can't get it */
	data |= LCK_RW_WANT_EXCL;
	if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {

	current_thread()->rwlock_count++;
#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
#endif /* CONFIG_DTRACE */
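/*
 * Illustrative sketch (editor's addition): the try-lock variants above
 * return FALSE without blocking when the lock is busy, so callers need a
 * fallback path.  "example_try_exclusive" is a hypothetical name.
 */
#if 0   /* illustration only */
static boolean_t
example_try_exclusive(lck_rw_t *lck)
{
	if (!lck_rw_try_lock_exclusive(lck)) {
		/* busy: caller retries later, or blocks via lck_rw_lock_exclusive() */
		return FALSE;
	}
	/* ... exclusive work ... */
	lck_rw_done(lck);
	return TRUE;
}
#endif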
	case LCK_RW_ASSERT_SHARED:
		if (lck->lck_rw_shared_count != 0) {

	case LCK_RW_ASSERT_EXCLUSIVE:
		if ((lck->lck_rw_want_write ||
		    lck->lck_rw_want_upgrade) &&
		    lck->lck_rw_shared_count == 0) {

	case LCK_RW_ASSERT_HELD:
		if (lck->lck_rw_want_write ||
		    lck->lck_rw_want_upgrade ||
		    lck->lck_rw_shared_count != 0) {

	case LCK_RW_ASSERT_NOTHELD:
		if (!(lck->lck_rw_want_write ||
		    lck->lck_rw_want_upgrade ||
		    lck->lck_rw_shared_count != 0)) {

	panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
lck_rw_clear_promotions_x86(thread_t thread)
	/* It's fatal to leave a RW lock locked and return to userspace */
	panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
	/* Paper over the issue */
	thread->rwlock_count = 0;
	lck_rw_clear_promotion(thread, 0);

lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
	lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);

	if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || force_yield) {
		lck_rw_unlock_shared(lck);
		lck_rw_lock_shared(lck);
/*
 * Routine:	kdp_lck_rw_lock_is_acquired_exclusive
 *	NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 */
kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck)
	panic("panic: rw lock exclusive check done outside of kernel debugger");
	return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
/*
 * Slow path routines for lck_mtx locking and unlocking functions.
 *
 * These functions were previously implemented in x86 assembly,
 * and some optimizations are in place in this c code to obtain a compiled code
 * as performant and compact as the assembly version.
 *
 * To avoid inlining these functions on the fast path, all functions directly called by
 * the fast paths have the __attribute__((noinline)) specified. Also they are all implemented
 * in such a way the fast path can tail call into them. In this way the return address
 * does not need to be pushed on the caller stack and stack optimization can happen on the caller.
 *
 * Slow path code is structured in such a way there are no calls to functions that will return
 * on the context of the caller function, i.e. all functions called are either tail call functions
 * or inline functions. The number of arguments of the tail call functions is fewer than six,
 * so that they can be passed over registers and do not need to be pushed on stack.
 * This allows the compiler to not create a stack frame for the functions.
 *
 * __improbable and __probable are used to compile the slow path code in such a way
 * the fast path case will be on a sequence of instructions with as few jumps as possible,
 * to make this case the most optimized even if falling through the slow path.
 */

/*
 * Intel lock invariants:
 *
 * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
 *
 * The lock owner is promoted to the max priority of all its waiters only if it
 * was a lower priority when it acquired or was an owner when a waiter waited.
 * Max priority is capped at MAXPRI_PROMOTE.
 *
 * The last waiter will not be promoted as it is woken up, but the last
 * lock owner may not have been the last thread to have been woken up depending on the
 * luck of the draw. Therefore a last-owner may still have the promoted-on-wakeup
 * flag set.
 *
 * TODO: Figure out an algorithm for stopping a lock holder which is already at the right
 * priority from dropping priority in the future without having to take thread lock
 * on acquire.
 */
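/*
 * Illustrative sketch (editor's addition): the fast-path/slow-path split
 * described above, where the fast path ends by tail calling a noinline
 * slow-path helper so no extra stack frame or return address is needed.
 * All "example_" names are hypothetical.
 */
#if 0   /* illustration only */
__attribute__((noinline))
static void example_lock_slow(lck_mtx_t *lock);
static boolean_t example_lock_fastpath(lck_mtx_t *lock);

static inline void
example_lock(lck_mtx_t *lock)
{
	if (__probable(example_lock_fastpath(lock))) {
		return;                 /* uncontended case, no call made */
	}
	example_lock_slow(lock);        /* compiled as a tail call into the slow path */
}
#endif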
2312 * Routine: lck_mtx_alloc_init
2321 lck
= zalloc(ZV_LCK_MTX
);
2322 lck_mtx_init(lck
, grp
, attr
);
2327 * Routine: lck_mtx_free
2334 lck_mtx_destroy(lck
, grp
);
2335 zfree(ZV_LCK_MTX
, lck
);
/*
 * Routine: lck_mtx_ext_init
 */
	bzero((void *)lck, sizeof(lck_mtx_ext_t));

	if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
		lck->lck_mtx_deb.type = MUTEX_TAG;
		lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
	}

	lck->lck_mtx_grp = grp;

	if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) {
		lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
	}

	lck->lck_mtx.lck_mtx_is_ext = 1;
	lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
/*
 * Routine: lck_mtx_init
 */
	lck_mtx_ext_t   *lck_ext;
	lck_attr_t      *lck_attr;

	if (attr != LCK_ATTR_NULL) {
		lck_attr = attr;
	} else {
		lck_attr = &LockDefaultLckAttr;
	}

	if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
		lck_ext = zalloc(ZV_LCK_MTX_EXT);
		lck_mtx_ext_init(lck_ext, grp, lck_attr);
		lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
		lck->lck_mtx_ptr = lck_ext;
	} else {
		lck->lck_mtx_owner = 0;
		lck->lck_mtx_state = 0;
	}
	lck->lck_mtx_pad32 = 0xFFFFFFFF;
	lck_grp_reference(grp);
	lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
/*
 * Routine: lck_mtx_init_ext
 */
	lck_mtx_ext_t   *lck_ext,
	lck_attr_t      *lck_attr;

	if (attr != LCK_ATTR_NULL) {
		lck_attr = attr;
	} else {
		lck_attr = &LockDefaultLckAttr;
	}

	if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
		lck_mtx_ext_init(lck_ext, grp, lck_attr);
		lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
		lck->lck_mtx_ptr = lck_ext;
	} else {
		lck->lck_mtx_owner = 0;
		lck->lck_mtx_state = 0;
	}
	lck->lck_mtx_pad32 = 0xFFFFFFFF;

	lck_grp_reference(grp);
	lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
lck_mtx_lock_mark_destroyed(
	if (indirect) {
		/* convert to destroyed state */
		ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
		return;
	}

	state = ordered_load_mtx_state(mutex);
	lck_mtx_interlock_lock(mutex, &state);

	ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);

	enable_preemption();
/*
 * Routine: lck_mtx_destroy
 */
	if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
		return;
	}

	lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);

	indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);

	lck_mtx_lock_mark_destroyed(lck, indirect);

	if (indirect) {
		zfree(ZV_LCK_MTX_EXT, lck->lck_mtx_ptr);
	}
	lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
	lck_grp_deallocate(grp);
#if DEVELOPMENT | DEBUG
__attribute__((noinline))
lck_mtx_owner_check_panic(
	thread_t owner = (thread_t)lock->lck_mtx_owner;
	panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p", owner, lock);
__attribute__((always_inline))
	*lock = &((*lock)->lck_mtx_ptr->lck_mtx);
	*state = ordered_load_mtx_state(*lock);
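/*
 * The two statements above implement the "indirect mutex" redirection used by
 * the slow paths below: when a lck_mtx_t carries LCK_MTX_TAG_INDIRECT, the real
 * lock state lives in the lck_mtx_ext_t pointed to by lck_mtx_ptr, so the
 * caller's lock pointer and cached state are swapped to the embedded lck_mtx
 * before any further work is done.
 */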
/*
 * Routine: lck_mtx_unlock_slow
 *
 * Unlocks a mutex held by the current thread.
 *
 * It will wake up waiters if necessary.
 *
 * Interlock can be held.
 */
__attribute__((noinline))
lck_mtx_unlock_slow(
	uint32_t state, prev;
	boolean_t indirect = FALSE;

	state = ordered_load_mtx_state(lock);

	/* Is this an indirect mutex? */
	if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
		indirect = get_indirect_mutex(&lock, &state);
	}

	thread = current_thread();

#if DEVELOPMENT | DEBUG
	thread_t owner = (thread_t)lock->lck_mtx_owner;
	if (__improbable(owner != thread)) {
		lck_mtx_owner_check_panic(lock);
	}
#endif

	/* check if it is held as a spinlock */
	if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) {

	lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);

	/* preemption disabled, interlock held and mutex not held */

	ordered_store_mtx_owner(lock, 0);
	/* keep original state in prev for later evaluation */

	if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
		thread->mutex_count--;
		return lck_mtx_unlock_wakeup_tail(lock, state, indirect);
	}

	/* release interlock, promotion and clear spin flag */
	state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK));
	ordered_store_mtx_state_release(lock, state);	/* since I own the interlock, I don't need an atomic update */

#if MACH_LDEBUG
	/* perform lock statistics after drop to prevent delay */
	thread->mutex_count--;		/* lock statistic */
#endif /* MACH_LDEBUG */

	/* re-enable preemption */
	lck_mtx_unlock_finish_inline(lock, FALSE);
#define LCK_MTX_LCK_WAIT_CODE           0x20
#define LCK_MTX_LCK_WAKEUP_CODE         0x21
#define LCK_MTX_LCK_SPIN_CODE           0x22
#define LCK_MTX_LCK_ACQUIRE_CODE        0x23
#define LCK_MTX_LCK_DEMOTE_CODE         0x24
/*
 * Routine: lck_mtx_unlock_wakeup_tail
 *
 * Invoked on unlock when there is
 * contention, i.e. the assembly routine sees
 * that mutex->lck_mtx_waiters != 0
 *
 * neither the mutex nor the interlock is held
 *
 * Note that this routine might not be called if there are pending
 * waiters which have previously been woken up, and they didn't
 * end up boosting the old owner.
 *
 * assembly routine previously did the following to mutex:
 * (after saving the state in prior_lock_state)
 *      decremented lck_mtx_waiters if nonzero
 *
 * This function needs to be called as a tail call
 * to optimize the compiled code.
 */
__attribute__((noinline))
lck_mtx_unlock_wakeup_tail(
	struct turnstile *ts;

	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
	kern_return_t did_wake;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
	    trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);

	ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);

	if (mutex->lck_mtx_waiters > 1) {
		/* WAITQ_PROMOTE_ON_WAKE will call turnstile_update_inheritor on the woken-up thread */
		did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_PROMOTE_ON_WAKE);
	} else {
		did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
		turnstile_update_inheritor(ts, NULL, TURNSTILE_IMMEDIATE_UPDATE);
	}
	assert(did_wake == KERN_SUCCESS);

	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
	turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);

	state -= LCK_MTX_WAITER;
	state &= (~(LCK_MTX_SPIN_MSK | LCK_MTX_ILOCKED_MSK));
	ordered_store_mtx_state_release(mutex, state);

	assert(current_thread()->turnstile != NULL);

	turnstile_cleanup();

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
	    trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);

	lck_mtx_unlock_finish_inline(mutex, indirect);
/*
 * Routine: lck_mtx_lock_acquire_x86
 *
 * Invoked on acquiring the mutex when there is
 * contention (i.e. the assembly routine sees that
 * mutex->lck_mtx_waiters != 0)
 *
 * mutex is owned...  interlock is held...  preemption is disabled
 */
__attribute__((always_inline))
lck_mtx_lock_acquire_inline(
	struct turnstile *ts)
	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
	    trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);

	thread_t thread = (thread_t)mutex->lck_mtx_owner;	/* faster than current_thread() */
	assert(thread->waiting_for_mutex == NULL);

	if (mutex->lck_mtx_waiters > 0) {
		ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);

		turnstile_update_inheritor(ts, thread, (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
	}

	turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);

	assert(current_thread()->turnstile != NULL);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
	    trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);

lck_mtx_lock_acquire_x86(
	return lck_mtx_lock_acquire_inline(mutex, NULL);
/*
 * Tail call helpers for lock functions that perform
 * lck_mtx_lock_acquire followed by the caller's finish routine, to optimize
 * the caller's compiled code.
 */

__attribute__((noinline))
lck_mtx_lock_acquire_tail(
	struct turnstile *ts)
	lck_mtx_lock_acquire_inline(mutex, ts);
	lck_mtx_lock_finish_inline_with_cleanup(mutex, ordered_load_mtx_state(mutex), indirect);

__attribute__((noinline))
lck_mtx_try_lock_acquire_tail(
	lck_mtx_lock_acquire_inline(mutex, NULL);
	lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));

__attribute__((noinline))
lck_mtx_convert_spin_acquire_tail(
	lck_mtx_lock_acquire_inline(mutex, NULL);
	lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));

	lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex));
lck_mtx_interlock_lock_set_and_clear_flags(
	uint32_t *new_state)
	uint32_t state, prev;

	/* have to wait for interlock to clear */
	while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) {
		state = ordered_load_mtx_state(mutex);
	}
	prev = state;					/* prev contains snapshot for exchange */
	state |= LCK_MTX_ILOCKED_MSK | xor_flags;	/* pick up interlock */
	state &= ~and_flags;				/* clear flags */

	disable_preemption();
	if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
		break;
	}
	enable_preemption();
	state = ordered_load_mtx_state(mutex);

lck_mtx_interlock_lock_clear_flags(
	uint32_t *new_state)
	return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state);

lck_mtx_interlock_lock(
	uint32_t *new_state)
	return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state);
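/*
 * Illustrative sketch (not part of the build) of the interlock-acquire pattern
 * used above: snapshot the state word, compute the desired new value, and
 * publish it with a single compare-and-swap, retrying if another CPU changed
 * the word in the meantime. The type and names below (example_state_t,
 * EXAMPLE_ILOCKED) are hypothetical stand-ins for lck_mtx_state and
 * LCK_MTX_ILOCKED_MSK.
 */
#if 0	/* example only, never compiled */
#include <stdatomic.h>
#include <stdint.h>

#define EXAMPLE_ILOCKED 0x00000001u

typedef _Atomic uint32_t example_state_t;

static void
example_interlock_lock(example_state_t *state_word)
{
	uint32_t prev, next;

	for (;;) {
		prev = atomic_load_explicit(state_word, memory_order_relaxed);
		if (prev & EXAMPLE_ILOCKED) {
			continue;			/* interlock busy: spin until it clears */
		}
		next = prev | EXAMPLE_ILOCKED;		/* pick up the interlock */
		if (atomic_compare_exchange_weak_explicit(state_word, &prev, next,
		    memory_order_acquire, memory_order_relaxed)) {
			return;				/* CAS published our ownership */
		}
		/* lost the race: reload and try again */
	}
}
#endif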
lck_mtx_interlock_try_lock_set_flags(
	uint32_t *new_state)
	uint32_t state, prev;

	/* have to wait for interlock to clear */
	if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) {
		return FALSE;
	}
	prev = state;					/* prev contains snapshot for exchange */
	state |= LCK_MTX_ILOCKED_MSK | or_flags;	/* pick up interlock */
	disable_preemption();
	if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
		*new_state = state;
		return TRUE;
	}
	enable_preemption();
	return FALSE;
__attribute__((noinline))
lck_mtx_lock_contended(
	boolean_t *first_miss)
	lck_mtx_spinwait_ret_type_t ret;
	struct turnstile *ts = NULL;

	lck_grp_mtx_update_miss((struct _lck_mtx_ext_ *)lock, first_miss);

	ret = lck_mtx_lock_spinwait_x86(lock);
	state = ordered_load_mtx_state(lock);

	case LCK_MTX_SPINWAIT_NO_SPIN:
		/*
		 * owner not on core, lck_mtx_lock_spinwait_x86 didn't even
		 * try to spin.
		 */
		lck_grp_mtx_update_direct_wait((struct _lck_mtx_ext_ *)lock);

		/* just fall through case LCK_MTX_SPINWAIT_SPUN */
	case LCK_MTX_SPINWAIT_SPUN_HIGH_THR:
	case LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE:
	case LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION:
	case LCK_MTX_SPINWAIT_SPUN_SLIDING_THR:
		/*
		 * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
		 * interlock not held
		 */
		lck_mtx_interlock_lock(lock, &state);
		assert(state & LCK_MTX_ILOCKED_MSK);

		if (state & LCK_MTX_MLOCKED_MSK) {
			lck_grp_mtx_update_wait((struct _lck_mtx_ext_ *)lock, first_miss);
			lck_mtx_lock_wait_x86(lock, &ts);
			/*
			 * interlock is not held here.
			 */
		}

		/* grab the mutex */
		state |= LCK_MTX_MLOCKED_MSK;
		ordered_store_mtx_state_release(lock, state);
		thread = current_thread();
		ordered_store_mtx_owner(lock, (uintptr_t)thread);
#if MACH_LDEBUG
		thread->mutex_count++;
#endif /* MACH_LDEBUG */

	case LCK_MTX_SPINWAIT_ACQUIRED:
		/*
		 * mutex has been acquired by lck_mtx_lock_spinwait_x86
		 * interlock is held and preemption disabled
		 * owner is set and mutex marked as locked
		 * statistics updated too
		 */
		break;
	default:
		panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p\n", ret, lock);
	}

	/*
	 * interlock is already acquired here
	 */

	/* mutex has been acquired */
	thread = (thread_t)lock->lck_mtx_owner;
	if (state & LCK_MTX_WAITERS_MSK) {
		/*
		 * lck_mtx_lock_acquire_tail will call
		 * turnstile_complete.
		 */
		return lck_mtx_lock_acquire_tail(lock, indirect, ts);
	}

	turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);

	assert(current_thread()->turnstile != NULL);

	/* release the interlock */
	lck_mtx_lock_finish_inline_with_cleanup(lock, ordered_load_mtx_state(lock), indirect);
/*
 * Helper noinline functions for calling
 * panic to optimize compiled code.
 */
__attribute__((noinline)) __abortlike
	panic("trying to interlock destroyed mutex (%p)", lock);

__attribute__((noinline))
lck_mtx_try_destroyed(
	panic("trying to interlock destroyed mutex (%p)", lock);
__attribute__((always_inline))
lck_mtx_lock_wait_interlock_to_clear(
	uint32_t* new_state)
	state = ordered_load_mtx_state(lock);
	if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
		*new_state = state;
		return TRUE;
	}
	if (state & LCK_MTX_MLOCKED_MSK) {
		/* if it is held as mutex, just fail */
		return FALSE;
	}

__attribute__((always_inline))
lck_mtx_try_lock_wait_interlock_to_clear(
	uint32_t* new_state)
	state = ordered_load_mtx_state(lock);
	if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) {
		/* if it is held as mutex or spin, just fail */
		return FALSE;
	}
	if (!(state & LCK_MTX_ILOCKED_MSK)) {
		*new_state = state;
		return TRUE;
	}
/*
 * Routine: lck_mtx_lock_slow
 *
 * Locks a mutex for the current thread.
 * If the lock is contended this function might sleep.
 *
 * Called with interlock not held.
 */
__attribute__((noinline))
	boolean_t indirect = FALSE;

	state = ordered_load_mtx_state(lock);

	/* is the interlock or mutex held */
	if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
		/*
		 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
		 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
		 * set in state (state == lck_mtx_tag)
		 */

		/* is the mutex already held and not indirect */
		if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
			/* no, must have been the mutex */
			return lck_mtx_lock_contended(lock, indirect, &first_miss);
		}

		/* check to see if it is marked destroyed */
		if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
			lck_mtx_destroyed(lock);
		}

		/* Is this an indirect mutex? */
		if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
			indirect = get_indirect_mutex(&lock, &state);

			lck_grp_mtx_update_held((struct _lck_mtx_ext_ *)lock);

			if (state & LCK_MTX_SPIN_MSK) {
				/* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
				assert(state & LCK_MTX_ILOCKED_MSK);
				lck_grp_mtx_update_miss((struct _lck_mtx_ext_ *)lock, &first_miss);
			}
		}

		if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
			return lck_mtx_lock_contended(lock, indirect, &first_miss);
		}
	}

	/* no - can't be INDIRECT, DESTROYED or locked */
	while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
		if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
			return lck_mtx_lock_contended(lock, indirect, &first_miss);
		}
	}

	/* lock and interlock acquired */

	thread_t thread = current_thread();
	/* record owner of mutex */
	ordered_store_mtx_owner(lock, (uintptr_t)thread);

	thread->mutex_count++;		/* lock statistic */

	/*
	 * Check if there are waiters to
	 * inherit their priority.
	 */
	if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
		return lck_mtx_lock_acquire_tail(lock, indirect, NULL);
	}

	/* release the interlock */
	lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
__attribute__((noinline))
lck_mtx_try_lock_slow(
	boolean_t indirect = FALSE;

	state = ordered_load_mtx_state(lock);

	/* is the interlock or mutex held */
	if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
		/*
		 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
		 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
		 * set in state (state == lck_mtx_tag)
		 */

		/* is the mutex already held and not indirect */
		if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
			return FALSE;
		}

		/* check to see if it is marked destroyed */
		if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
			lck_mtx_try_destroyed(lock);
		}

		/* Is this an indirect mutex? */
		if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
			indirect = get_indirect_mutex(&lock, &state);

			lck_grp_mtx_update_held((struct _lck_mtx_ext_ *)lock);
		}

		if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
			lck_grp_mtx_update_miss((struct _lck_mtx_ext_ *)lock, &first_miss);
			return FALSE;
		}
	}

	/* no - can't be INDIRECT, DESTROYED or locked */
	while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
		if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
			lck_grp_mtx_update_miss((struct _lck_mtx_ext_ *)lock, &first_miss);
			return FALSE;
		}
	}

	/* lock and interlock acquired */

	thread_t thread = current_thread();
	/* record owner of mutex */
	ordered_store_mtx_owner(lock, (uintptr_t)thread);

	thread->mutex_count++;		/* lock statistic */

	/*
	 * Check if there are waiters to
	 * inherit their priority.
	 */
	if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
		return lck_mtx_try_lock_acquire_tail(lock);
	}

	/* release the interlock */
	lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));
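/*
 * Illustrative usage sketch (not part of the build) of the try-lock slow path
 * above, as seen from a caller: lck_mtx_try_lock() either returns TRUE with the
 * mutex held or FALSE without blocking. The data structure and function names
 * are hypothetical.
 */
#if 0	/* example only, never compiled */
struct example_cache {
	lck_mtx_t       lock;
	int             hits;
};

static boolean_t
example_cache_try_bump(struct example_cache *c)
{
	if (!lck_mtx_try_lock(&c->lock)) {
		return FALSE;		/* contended: caller retries later */
	}
	c->hits++;
	lck_mtx_unlock(&c->lock);
	return TRUE;
}
#endif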
__attribute__((noinline))
lck_mtx_lock_spin_slow(
	boolean_t indirect = FALSE;

	state = ordered_load_mtx_state(lock);

	/* is the interlock or mutex held */
	if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
		/*
		 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
		 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
		 * set in state (state == lck_mtx_tag)
		 */

		/* is the mutex already held and not indirect */
		if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
			/* no, must have been the mutex */
			return lck_mtx_lock_contended(lock, indirect, &first_miss);
		}

		/* check to see if it is marked destroyed */
		if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
			lck_mtx_destroyed(lock);
		}

		/* Is this an indirect mutex? */
		if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
			indirect = get_indirect_mutex(&lock, &state);

			lck_grp_mtx_update_held((struct _lck_mtx_ext_ *)lock);

			if (state & LCK_MTX_SPIN_MSK) {
				/* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
				assert(state & LCK_MTX_ILOCKED_MSK);
				lck_grp_mtx_update_miss((struct _lck_mtx_ext_ *)lock, &first_miss);
			}
		}

		if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
			return lck_mtx_lock_contended(lock, indirect, &first_miss);
		}
	}

	/* no - can't be INDIRECT, DESTROYED or locked */
	while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
		if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
			return lck_mtx_lock_contended(lock, indirect, &first_miss);
		}
	}

	/* lock as spinlock and interlock acquired */

	thread_t thread = current_thread();
	/* record owner of mutex */
	ordered_store_mtx_owner(lock, (uintptr_t)thread);

	thread->mutex_count++;		/* lock statistic */

	LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);

	/* return with the interlock held and preemption disabled */
__attribute__((noinline))
lck_mtx_try_lock_spin_slow(
	boolean_t indirect = FALSE;

	state = ordered_load_mtx_state(lock);

	/* is the interlock or mutex held */
	if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
		/*
		 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
		 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
		 * set in state (state == lck_mtx_tag)
		 */

		/* is the mutex already held and not indirect */
		if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
			return FALSE;
		}

		/* check to see if it is marked destroyed */
		if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
			lck_mtx_try_destroyed(lock);
		}

		/* Is this an indirect mutex? */
		if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
			indirect = get_indirect_mutex(&lock, &state);

			lck_grp_mtx_update_held((struct _lck_mtx_ext_ *)lock);
		}

		if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
			lck_grp_mtx_update_miss((struct _lck_mtx_ext_ *)lock, &first_miss);
			return FALSE;
		}
	}

	/* no - can't be INDIRECT, DESTROYED or locked */
	while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
		if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
			lck_grp_mtx_update_miss((struct _lck_mtx_ext_ *)lock, &first_miss);
			return FALSE;
		}
	}

	/* lock and interlock acquired */

	thread_t thread = current_thread();
	/* record owner of mutex */
	ordered_store_mtx_owner(lock, (uintptr_t)thread);

	thread->mutex_count++;		/* lock statistic */

	LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
__attribute__((noinline))
lck_mtx_convert_spin(
	state = ordered_load_mtx_state(lock);

	/* Is this an indirect mutex? */
	if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
		/* If so, take indirection */
		get_indirect_mutex(&lock, &state);
	}

	assertf((thread_t)lock->lck_mtx_owner == current_thread(), "lock %p not owned by thread %p (current owner %p)", lock, current_thread(), (thread_t)lock->lck_mtx_owner);

	if (__improbable(state & LCK_MTX_MLOCKED_MSK)) {
		/* already owned as a mutex, just return */
		return;
	}

	assert(get_preemption_level() > 0);
	assert(state & LCK_MTX_ILOCKED_MSK);
	assert(state & LCK_MTX_SPIN_MSK);

	/*
	 * Check if there are waiters to
	 * inherit their priority.
	 */
	if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
		return lck_mtx_convert_spin_acquire_tail(lock);
	}

	lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock));
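/*
 * Illustrative usage sketch (not part of the build): the spin-then-convert
 * pattern that lck_mtx_convert_spin() supports. A caller takes the lock in spin
 * mode for a quick check and upgrades to a full (blockable) mutex hold only
 * when it decides to do more work. The surrounding function and the
 * needs_long_work()/do_long_work() helpers are hypothetical.
 */
#if 0	/* example only, never compiled */
extern boolean_t needs_long_work(void);
extern void do_long_work(void);

static void
example_spin_then_convert(lck_mtx_t *mtx)
{
	lck_mtx_lock_spin(mtx);			/* cheap, non-blocking hold */
	if (!needs_long_work()) {
		lck_mtx_unlock(mtx);		/* fast exit, never converted */
		return;
	}
	lck_mtx_convert_spin(mtx);		/* upgrade to a full mutex hold */
	do_long_work();				/* may now block while holding */
	lck_mtx_unlock(mtx);
}
#endif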
static inline boolean_t
lck_mtx_lock_grab_mutex(
	state = ordered_load_mtx_state(lock);

	if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) {
		return FALSE;
	}

	/* lock and interlock acquired */

	thread_t thread = current_thread();
	/* record owner of mutex */
	ordered_store_mtx_owner(lock, (uintptr_t)thread);

	thread->mutex_count++;		/* lock statistic */
__attribute__((noinline))
	thread_t thread, owner;

	thread = current_thread();
	state = ordered_load_mtx_state(lock);

	if (state == LCK_MTX_TAG_INDIRECT) {
		get_indirect_mutex(&lock, &state);
	}

	owner = (thread_t)lock->lck_mtx_owner;

	if (type == LCK_MTX_ASSERT_OWNED) {
		if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
			panic("mutex (%p) not owned\n", lock);
		}
	} else {
		assert(type == LCK_MTX_ASSERT_NOTOWNED);
		if (owner == thread) {
			panic("mutex (%p) owned\n", lock);
		}
	}
/*
 * Routine: lck_mtx_lock_spinwait_x86
 *
 * Invoked trying to acquire a mutex when there is contention but
 * the holder is running on another processor. We spin for up to a maximum
 * time waiting for the lock to be released.
 *
 * Called with the interlock unlocked.
 * returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired
 * returns LCK_MTX_SPINWAIT_SPUN if we spun
 * returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running
 */
__attribute__((noinline))
lck_mtx_spinwait_ret_type_t
lck_mtx_lock_spinwait_x86(
	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
	thread_t owner, prev_owner;
	uint64_t window_deadline, sliding_deadline, high_deadline;
	uint64_t start_time, cur_time, avg_hold_time, bias, delta;
	lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
	int total_hold_time_samples, window_hold_time_samples, unfairness;
	uint i, prev_owner_cpu;
	bool owner_on_core, adjust;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
	    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);

	start_time = mach_absolute_time();
	/*
	 * window_deadline represents the "learning" phase.
	 * The thread collects statistics about the lock during
	 * window_deadline and then it makes a decision on whether to spin more
	 * or block according to the concurrency behavior observed.
	 *
	 * Every thread can spin at least low_MutexSpin.
	 */
	window_deadline = start_time + low_MutexSpin;
	/*
	 * Sliding_deadline is the adjusted spin deadline
	 * computed after the "learning" phase.
	 */
	sliding_deadline = window_deadline;
	/*
	 * High_deadline is a hard deadline. No thread
	 * can spin more than this deadline.
	 */
	if (high_MutexSpin >= 0) {
		high_deadline = start_time + high_MutexSpin;
	} else {
		high_deadline = start_time + low_MutexSpin * real_ncpus;
	}

	/*
	 * Do not know yet which is the owner cpu.
	 * Initialize prev_owner_cpu with the next cpu.
	 */
	prev_owner_cpu = (cpu_number() + 1) % real_ncpus;
	total_hold_time_samples = 0;
	window_hold_time_samples = 0;

	bias = (os_hash_kernel_pointer(mutex) + cpu_number()) % real_ncpus;

	prev_owner = (thread_t) mutex->lck_mtx_owner;
	/*
	 * Spin while:
	 * - mutex is locked, and
	 * - it's locked as a spin lock, and
	 * - owner is running on another processor, and
	 * - we haven't spun for long enough.
	 */
	do {
		/*
		 * Try to acquire the lock.
		 */
		if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
			retval = LCK_MTX_SPINWAIT_ACQUIRED;
			break;
		}

		cur_time = mach_absolute_time();

		/*
		 * Never spin past high_deadline.
		 */
		if (cur_time >= high_deadline) {
			retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
			break;
		}

		/*
		 * Check if the owner is on core. If not, block.
		 */
		owner = (thread_t) mutex->lck_mtx_owner;
		owner_on_core = FALSE;

		disable_preemption();
		owner = (thread_t) mutex->lck_mtx_owner;
		if (owner) {
			/*
			 * For scalability we want to check if the owner is on core
			 * without locking the mutex interlock.
			 * If we do not lock the mutex interlock, the owner that we see might be
			 * invalid, so we cannot dereference it. Therefore we cannot check
			 * any field of the thread to tell us if it is on core.
			 * Check if the thread that is running on the other cpus matches the owner.
			 */
			i = prev_owner_cpu;
			do {
				if ((cpu_data_ptr[i] != NULL) && (cpu_data_ptr[i]->cpu_active_thread == owner)) {
					owner_on_core = TRUE;
					break;
				}
				if (++i >= real_ncpus) {
					i = 0;
				}
			} while (i != prev_owner_cpu);
			enable_preemption();

			if (owner_on_core) {
				prev_owner = owner;
			} else {
				owner = (thread_t) mutex->lck_mtx_owner;
				if (owner == prev_owner) {
					/*
					 * Owner is not on core.
					 */
					if (loopcount == 0) {
						retval = LCK_MTX_SPINWAIT_NO_SPIN;
					} else {
						retval = LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE;
					}
					break;
				}
				/*
				 * Fall through if the owner changed while we were scanning.
				 * The new owner could potentially be on core, so loop.
				 */
			}
		} else {
			enable_preemption();
		}

		/*
		 * Save how many times we see the owner changing.
		 * We can roughly estimate the mutex hold
		 * time and the fairness with that.
		 */
		if (owner != prev_owner) {
			prev_owner = owner;
			total_hold_time_samples++;
			window_hold_time_samples++;
		}

		/*
		 * Learning window expired.
		 * Try to adjust the sliding_deadline.
		 */
		if (cur_time >= window_deadline) {
			/*
			 * If there was no contention during the window,
			 * stop spinning.
			 */
			if (window_hold_time_samples < 1) {
				retval = LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION;
				break;
			}

			/*
			 * For a fair lock, we'd wait for at most (NCPU-1) periods,
			 * but the lock is unfair, so let's try to estimate by how much.
			 */
			unfairness = total_hold_time_samples / real_ncpus;

			if (unfairness == 0) {
				/*
				 * We observed the owner changing `total_hold_time_samples` times which
				 * lets us estimate the average hold time of this mutex for the duration
				 * of the spin:
				 * avg_hold_time = (cur_time - start_time) / total_hold_time_samples;
				 *
				 * In this case spin at max avg_hold_time * (real_ncpus - 1)
				 */
				delta = cur_time - start_time;
				sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples;
			} else {
				/*
				 * In this case at least one of the other cpus was able to get the lock twice
				 * while I was spinning.
				 * We could spin longer but it won't necessarily help if the system is unfair.
				 * Try to randomize the wait to reduce contention.
				 *
				 * We compute how much time we could potentially spin
				 * and distribute it over the cpus.
				 *
				 * bias is an integer between 0 and real_ncpus.
				 * distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias
				 */
				delta = high_deadline - cur_time;
				sliding_deadline = cur_time + ((delta * bias) / real_ncpus);
			}

			window_deadline += low_MutexSpin;
			window_hold_time_samples = 0;
		}
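		/*
		 * Worked example of the adjustment above (illustrative numbers only):
		 * with real_ncpus = 8, suppose the owner changed 4 times during an
		 * elapsed spin of 80us. Then unfairness = 4 / 8 = 0 and
		 * sliding_deadline = start_time + (80us * 7) / 4 = start_time + 140us,
		 * i.e. about 60us of additional spinning beyond the 80us already spent.
		 * Had the owner changed 16 times instead, unfairness would be 2 and the
		 * thread would pick a randomized deadline between cur_time and
		 * high_deadline based on its bias.
		 */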
		/*
		 * Stop spinning if we passed
		 * the adjusted deadline.
		 */
		if (cur_time >= sliding_deadline) {
			retval = LCK_MTX_SPINWAIT_SPUN_SLIDING_THR;
			break;
		}

		if ((thread_t) mutex->lck_mtx_owner != NULL) {

	/*
	 * Note that we record a different probe id depending on whether
	 * this is a direct or indirect mutex. This allows us to
	 * penalize only lock groups that have debug/stats enabled
	 * with dtrace processing if desired.
	 */
	if (__probable(mutex->lck_mtx_is_ext == 0)) {
		LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
		    mach_absolute_time() - start_time);
	} else {
		LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
		    mach_absolute_time() - start_time);
	}
	/* The lockstat acquire event is recorded by the assembly code beneath us. */

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
	    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
/*
 * Routine: lck_mtx_lock_wait_x86
 *
 * Invoked in order to wait on contention.
 *
 * Called with the interlock locked and
 * preemption disabled...
 * returns it unlocked and with preemption enabled
 *
 * lck_mtx_waiters is 1:1 with a wakeup needing to occur.
 * A runnable waiter can exist between wait and acquire
 * without a waiters count being set.
 * This allows us to never make a spurious wakeup call.
 *
 * This avoids taking the thread lock if the owning thread is the same priority.
 * This optimizes the case of same-priority threads contending on a lock.
 * However, that allows the owning thread to drop in priority while holding the lock,
 * because there is no state that the priority change can notice that
 * says that the targeted thread holds a contended mutex.
 *
 * One possible solution: priority changes could look for some atomic tag
 * on the thread saying 'holding contended lock', and then set up a promotion.
 * Needs a story for dropping that promotion - the last contended unlock
 * has to notice that this has happened.
 */
__attribute__((noinline))
lck_mtx_lock_wait_x86(
	struct turnstile **ts)
	thread_t self = current_thread();

	uint64_t sleep_start = 0;

	if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
		sleep_start = mach_absolute_time();
	}

	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
	    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
	    mutex->lck_mtx_waiters, 0, 0);

	assert(self->waiting_for_mutex == NULL);
	self->waiting_for_mutex = mutex;
	mutex->lck_mtx_waiters++;

	thread_t holder = (thread_t)mutex->lck_mtx_owner;
	assert(holder != NULL);

	/*
	 * lck_mtx_lock_wait_x86 might be called in a loop. Call prepare just once and reuse
	 * the same turnstile while looping; the matching turnstile_complete will be called
	 * by lck_mtx_lock_contended when finally acquiring the lock.
	 */
	if (*ts == NULL) {
		*ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
	}

	struct turnstile *turnstile = *ts;
	thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
	turnstile_update_inheritor(turnstile, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));

	waitq_assert_wait64(&turnstile->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);

	lck_mtx_ilk_unlock(mutex);

	turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_NOT_HELD);

	thread_block(THREAD_CONTINUE_NULL);

	self->waiting_for_mutex = NULL;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
	    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
	    mutex->lck_mtx_waiters, 0, 0);

	/*
	 * Record the Dtrace lockstat probe for blocking, block time
	 * measured from when we were entered.
	 */
	if (mutex->lck_mtx_is_ext == 0) {
		LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
		    mach_absolute_time() - sleep_start);
	} else {
		LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
		    mach_absolute_time() - sleep_start);
	}
/*
 * Routine: kdp_lck_mtx_lock_spin_is_acquired
 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 * Returns: TRUE if lock is acquired.
 */
kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
	if (not_in_kdp) {
		panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
	}

	if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
		return TRUE;
	}
	return FALSE;
kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
{
	lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
	thread_t holder   = (thread_t)mutex->lck_mtx_owner;
	waitinfo->owner   = thread_tid(holder);
}
kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
{
	lck_rw_t *rwlck = NULL;
	switch (waitinfo->wait_type) {
	case kThreadWaitKernelRWLockRead:
		rwlck = READ_EVENT_TO_RWLOCK(event);
		break;
	case kThreadWaitKernelRWLockWrite:
	case kThreadWaitKernelRWLockUpgrade:
		rwlck = WRITE_EVENT_TO_RWLOCK(event);
		break;
	default:
		panic("%s was called with an invalid blocking type", __FUNCTION__);
		break;
	}
	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
	waitinfo->owner = 0;
}