/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 *    Author: Avadis Tevanian, Jr., Michael Wayne Young
 *
 *    Locking primitives implementation
 */
#include <mach_ldebug.h>

#include <kern/locks.h>
#include <kern/kalloc.h>
#include <kern/misc_protos.h>
#include <kern/thread.h>
#include <kern/processor.h>
#include <kern/cpu_data.h>
#include <kern/cpu_number.h>
#include <kern/sched_prim.h>
#include <kern/debug.h>

#include <i386/machine_routines.h>    /* machine_timeout_suspended() */
#include <machine/machine_cpu.h>

#include <sys/kdebug.h>
#include <mach/branch_predicates.h>
#if CONFIG_DTRACE
/*
 * We need only enough declarations from the BSD-side to be able to
 * test if our probe is active, and to call __dtrace_probe().  Setting
 * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
 */
#define NEED_DTRACE_DEFS
#include <../bsd/sys/lockstat.h>
#endif
#define LCK_RW_LCK_EXCLUSIVE_CODE       0x100
#define LCK_RW_LCK_EXCLUSIVE1_CODE      0x101
#define LCK_RW_LCK_SHARED_CODE          0x102
#define LCK_RW_LCK_SH_TO_EX_CODE        0x103
#define LCK_RW_LCK_SH_TO_EX1_CODE       0x104
#define LCK_RW_LCK_EX_TO_SH_CODE        0x105

#define LCK_RW_LCK_EX_WRITER_SPIN_CODE  0x106
#define LCK_RW_LCK_EX_WRITER_WAIT_CODE  0x107
#define LCK_RW_LCK_EX_READER_SPIN_CODE  0x108
#define LCK_RW_LCK_EX_READER_WAIT_CODE  0x109
#define LCK_RW_LCK_SHARED_SPIN_CODE     0x110
#define LCK_RW_LCK_SHARED_WAIT_CODE     0x111
#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE   0x112
#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE   0x113
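/*
 * These per-event codes are combined with MACHDBG_CODE(DBG_MACH_LOCKS, ...)
 * and a DBG_FUNC_START/DBG_FUNC_END qualifier when handed to KERNEL_DEBUG
 * below, so each spin or wait phase of the rw-lock slow paths shows up as
 * a matched interval in a kdebug trace.
 */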
#define ANY_LOCK_DEBUG  (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)

unsigned int LcksOpts = 0;

#if USLOCK_DEBUG
/*
 *    Perform simple lock checks.
 */
int uslock_check = 1;
int max_lock_loops = 100000000;
decl_simple_lock_data(extern , printf_lock)
decl_simple_lock_data(extern , panic_lock)
#endif /* USLOCK_DEBUG */

extern unsigned int not_in_kdp;
/*
 *    We often want to know the addresses of the callers
 *    of the various lock routines.  However, this information
 *    is only used for debugging and statistics.
 */
typedef void    *pc_t;
#define INVALID_PC      ((void *) VM_MAX_KERNEL_ADDRESS)
#define INVALID_THREAD  ((void *) VM_MAX_KERNEL_ADDRESS)

#if ANY_LOCK_DEBUG
#define OBTAIN_PC(pc)   ((pc) = GET_RETURN_PC())
#define DECL_PC(pc)     pc_t pc;
#else /* ANY_LOCK_DEBUG */
#define DECL_PC(pc)
#ifdef lint
/*
 *    Eliminate lint complaints about unused local pc variables.
 */
#define OBTAIN_PC(pc)   ++pc
#else /* lint */
#define OBTAIN_PC(pc)
#endif /* lint */
#endif /* ANY_LOCK_DEBUG */
/*
 *    Portable lock package implementation of usimple_locks.
 */

#if USLOCK_DEBUG
#define USLDBG(stmt)    stmt
void    usld_lock_init(usimple_lock_t, unsigned short);
void    usld_lock_pre(usimple_lock_t, pc_t);
void    usld_lock_post(usimple_lock_t, pc_t);
void    usld_unlock(usimple_lock_t, pc_t);
void    usld_lock_try_pre(usimple_lock_t, pc_t);
void    usld_lock_try_post(usimple_lock_t, pc_t);
int     usld_lock_common_checks(usimple_lock_t, char *);
#else /* USLOCK_DEBUG */
#define USLDBG(stmt)
#endif /* USLOCK_DEBUG */
extern int lck_rw_grab_want(lck_rw_t *lck);
extern int lck_rw_grab_shared(lck_rw_t *lck);
extern int lck_rw_held_read_or_upgrade(lck_rw_t *lck);
/*
 * Forward definitions
 */

void lck_rw_lock_shared_gen(
    lck_rw_t    *lck);

void lck_rw_lock_exclusive_gen(
    lck_rw_t    *lck);

boolean_t lck_rw_lock_shared_to_exclusive_success(
    lck_rw_t    *lck);

boolean_t lck_rw_lock_shared_to_exclusive_failure(
    lck_rw_t    *lck,
    int         prior_lock_state);

void lck_rw_lock_exclusive_to_shared_gen(
    lck_rw_t    *lck,
    int         prior_lock_state);

lck_rw_type_t lck_rw_done_gen(
    lck_rw_t    *lck,
    int         prior_lock_state);

void lck_rw_clear_promotions_x86(thread_t thread);
/*
 *      Routine:        lck_spin_alloc_init
 */
lck_spin_t *
lck_spin_alloc_init(
    lck_grp_t   *grp,
    lck_attr_t  *attr)
{
    lck_spin_t  *lck;

    if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
        lck_spin_init(lck, grp, attr);

    return(lck);
}

/*
 *      Routine:        lck_spin_free
 */
void
lck_spin_free(
    lck_spin_t  *lck,
    lck_grp_t   *grp)
{
    lck_spin_destroy(lck, grp);
    kfree(lck, sizeof(lck_spin_t));
}

/*
 *      Routine:        lck_spin_init
 */
void
lck_spin_init(
    lck_spin_t  *lck,
    lck_grp_t   *grp,
    __unused lck_attr_t *attr)
{
    usimple_lock_init((usimple_lock_t) lck, 0);
    lck_grp_reference(grp);
    lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
}

/*
 *      Routine:        lck_spin_destroy
 */
void
lck_spin_destroy(
    lck_spin_t  *lck,
    lck_grp_t   *grp)
{
    if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
        return;
    lck->interlock = LCK_SPIN_TAG_DESTROYED;
    lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
    lck_grp_deallocate(grp);
    return;
}

/*
 *      Routine:        lck_spin_lock
 */
void
lck_spin_lock(
    lck_spin_t  *lck)
{
    usimple_lock((usimple_lock_t) lck);
}

/*
 *      Routine:        lck_spin_unlock
 */
void
lck_spin_unlock(
    lck_spin_t  *lck)
{
    usimple_unlock((usimple_lock_t) lck);
}

/*
 *      Routine:        lck_spin_try_lock
 */
boolean_t
lck_spin_try_lock(
    lck_spin_t  *lck)
{
    boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck);
#if DEVELOPMENT || DEBUG
    if (lrval) {
        pltrace(FALSE);
    }
#endif
    return(lrval);
}
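/*
 * Illustrative client sketch (not part of this file): a try-lock lets a
 * caller avoid spinning when the lock is contended, e.g.
 *
 *    if (lck_spin_try_lock(lck)) {
 *        ... brief critical section ...
 *        lck_spin_unlock(lck);
 *    } else {
 *        ... defer the work or take a slower path ...
 *    }
 *
 * On success the thread returns with preemption disabled, exactly as if
 * lck_spin_lock() had been called.
 */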
/*
 *      Routine:        lck_spin_assert
 */
void
lck_spin_assert(lck_spin_t *lock, unsigned int type)
{
    thread_t thread, holder;
    uintptr_t state;

    if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
        panic("lck_spin_assert(): invalid arg (%u)", type);
    }

    state = lock->interlock;
    holder = (thread_t)state;
    thread = current_thread();
    if (type == LCK_ASSERT_OWNED) {
        if (__improbable(holder == THREAD_NULL)) {
            panic("Lock not owned %p = %lx", lock, state);
        }
        if (__improbable(holder != thread)) {
            panic("Lock not owned by current thread %p = %lx", lock, state);
        }
    } else if (type == LCK_ASSERT_NOTOWNED) {
        if (__improbable(holder != THREAD_NULL)) {
            if (holder == thread) {
                panic("Lock owned by current thread %p = %lx", lock, state);
            } else {
                panic("Lock %p owned by thread %p", lock, holder);
            }
        }
    }
}
/*
 * Routine: kdp_lck_spin_is_acquired
 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 * Returns: TRUE if lock is acquired.
 */
boolean_t
kdp_lck_spin_is_acquired(lck_spin_t *lck) {
    if (not_in_kdp) {
        panic("panic: spinlock acquired check done outside of kernel debugger");
    }
    return (lck->interlock != 0)? TRUE : FALSE;
}
/*
 *    Initialize a usimple_lock.
 *
 *    No change in preemption state.
 */
void
usimple_lock_init(
    usimple_lock_t  l,
    __unused unsigned short tag)
{
#ifndef MACHINE_SIMPLE_LOCK
    USLDBG(usld_lock_init(l, tag));
    hw_lock_init(&l->interlock);
#else
    simple_lock_init((simple_lock_t)l, tag);
#endif
}
volatile uint32_t spinlock_owner_cpu = ~0;
volatile usimple_lock_t spinlock_timed_out;

uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
    uint64_t deadline;
    uint32_t i;

    for (i = 0; i < real_ncpus; i++) {
        if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) {
            spinlock_owner_cpu = i;
            if ((uint32_t) cpu_number() == i)
                break;
            cpu_datap(i)->cpu_NMI_acknowledged = FALSE;
            cpu_NMI_interrupt(i);
            deadline = mach_absolute_time() + (LockTimeOut * 2);
            while (mach_absolute_time() < deadline && cpu_datap(i)->cpu_NMI_acknowledged == FALSE)
                cpu_pause();
            break;
        }
    }

    return spinlock_owner_cpu;
}
/*
 *    Acquire a usimple_lock.
 *
 *    Returns with preemption disabled.  Note
 *    that the hw_lock routines are responsible for
 *    maintaining preemption state.
 */
void
usimple_lock(
    usimple_lock_t  l)
{
#ifndef MACHINE_SIMPLE_LOCK
    DECL_PC(pc);

    OBTAIN_PC(pc);
    USLDBG(usld_lock_pre(l, pc));

    if (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0)) {
        boolean_t uslock_acquired = FALSE;
        while (machine_timeout_suspended()) {
            enable_preemption();
            if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)))
                break;
        }

        if (uslock_acquired == FALSE) {
            uint32_t lock_cpu;
            uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
            spinlock_timed_out = l;
            lock_cpu = spinlock_timeout_NMI(lowner);
            panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data);
        }
    }
#if DEVELOPMENT || DEBUG
    pltrace(FALSE);
#endif

    USLDBG(usld_lock_post(l, pc));
#else
    simple_lock((simple_lock_t)l);
#endif
}
/*
 *    Release a usimple_lock.
 *
 *    Returns with preemption enabled.  Note
 *    that the hw_lock routines are responsible for
 *    maintaining preemption state.
 */
void
usimple_unlock(
    usimple_lock_t  l)
{
#ifndef MACHINE_SIMPLE_LOCK
    DECL_PC(pc);

    OBTAIN_PC(pc);
    USLDBG(usld_unlock(l, pc));
#if DEVELOPMENT || DEBUG
    pltrace(TRUE);
#endif
    hw_lock_unlock(&l->interlock);
#else
    simple_unlock_rwmb((simple_lock_t)l);
#endif
}
/*
 *    Conditionally acquire a usimple_lock.
 *
 *    On success, returns with preemption disabled.
 *    On failure, returns with preemption in the same state
 *    as when first invoked.  Note that the hw_lock routines
 *    are responsible for maintaining preemption state.
 *
 *    XXX No stats are gathered on a miss; I preserved this
 *    behavior from the original assembly-language code, but
 *    doesn't it make sense to log misses?  XXX
 */
unsigned int
usimple_lock_try(
    usimple_lock_t  l)
{
#ifndef MACHINE_SIMPLE_LOCK
    unsigned int    success;
    DECL_PC(pc);

    OBTAIN_PC(pc);
    USLDBG(usld_lock_try_pre(l, pc));
    if ((success = hw_lock_try(&l->interlock))) {
#if DEVELOPMENT || DEBUG
        pltrace(FALSE);
#endif
        USLDBG(usld_lock_try_post(l, pc));
    }
    return success;
#else
    return(simple_lock_try((simple_lock_t)l));
#endif
}
/*
 * Acquire a usimple_lock while polling for pending TLB flushes
 * and spinning on a lock.
 */
void
usimple_lock_try_lock_loop(usimple_lock_t l)
{
    boolean_t istate = ml_get_interrupts_enabled();
    while (!simple_lock_try((l))) {
        if (!istate)
            handle_pending_TLB_flushes();
        cpu_pause();
    }
}
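/*
 * The polling above matters because a CPU can spin here with interrupts
 * disabled: servicing handle_pending_TLB_flushes() between attempts keeps
 * this CPU responsive to TLB-shootdown requests that would otherwise
 * deadlock against the current lock holder.
 */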
#if USLOCK_DEBUG
/*
 *    States of a usimple_lock.  The default when initializing
 *    a usimple_lock is setting it up for debug checking.
 */
#define USLOCK_CHECKED      0x0001  /* lock is being checked */
#define USLOCK_TAKEN        0x0002  /* lock has been taken */
#define USLOCK_INIT         0xBAA0  /* lock has been initialized */
#define USLOCK_INITIALIZED  (USLOCK_INIT|USLOCK_CHECKED)
#define USLOCK_CHECKING(l)  (uslock_check && \
                             ((l)->debug.state & USLOCK_CHECKED))

/*
 *    Trace activities of a particularly interesting lock.
 */
void    usl_trace(usimple_lock_t, int, pc_t, const char *);
/*
 *    Initialize the debugging information contained
 *    in a usimple_lock.
 */
void
usld_lock_init(
    usimple_lock_t  l,
    __unused unsigned short tag)
{
    if (l == USIMPLE_LOCK_NULL)
        panic("lock initialization:  null lock pointer");
    l->lock_type = USLOCK_TAG;
    l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
    l->debug.lock_cpu = l->debug.unlock_cpu = 0;
    l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
    l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
    l->debug.duration[0] = l->debug.duration[1] = 0;
    l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
    l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
    l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
}
/*
 *    These checks apply to all usimple_locks, not just
 *    those with USLOCK_CHECKED turned on.
 */
int
usld_lock_common_checks(
    usimple_lock_t  l,
    char        *caller)
{
    if (l == USIMPLE_LOCK_NULL)
        panic("%s:  null lock pointer", caller);
    if (l->lock_type != USLOCK_TAG)
        panic("%s:  %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
    if (!(l->debug.state & USLOCK_INIT))
        panic("%s:  %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
    return USLOCK_CHECKING(l);
}
/*
 *    Debug checks on a usimple_lock just before attempting
 *    to acquire it.
 */
void
usld_lock_pre(
    usimple_lock_t  l,
    pc_t        pc)
{
    char    caller[] = "usimple_lock";

    if (!usld_lock_common_checks(l, caller))
        return;

/*
 *    Note that we have a weird case where we are getting a lock when we are
 *    in the process of putting the system to sleep. We are running with no
 *    current threads, therefore we can't tell if we are trying to retake a lock
 *    we have or someone on the other processor has it.  Therefore we just
 *    ignore this test if the locking thread is 0.
 */

    if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
        l->debug.lock_thread == (void *) current_thread()) {
        printf("%s:  lock %p already locked (at %p) by",
               caller, l, l->debug.lock_pc);
        printf(" current thread %p (new attempt at pc %p)\n",
               l->debug.lock_thread, pc);
        panic("%s", caller);
    }
    mp_disable_preemption();
    usl_trace(l, cpu_number(), pc, caller);
    mp_enable_preemption();
}
/*
 *    Debug checks on a usimple_lock just after acquiring it.
 *
 *    Pre-emption has been disabled at this point,
 *    so we are safe in using cpu_number.
 */
void
usld_lock_post(
    usimple_lock_t  l,
    pc_t        pc)
{
    int     mycpu;
    char    caller[] = "successful usimple_lock";

    if (!usld_lock_common_checks(l, caller))
        return;

    if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
        panic("%s:  lock %p became uninitialized",
              caller, l);
    if ((l->debug.state & USLOCK_TAKEN))
        panic("%s:  lock 0x%p became TAKEN by someone else",
              caller, l);

    mycpu = cpu_number();
    l->debug.lock_thread = (void *)current_thread();
    l->debug.state |= USLOCK_TAKEN;
    l->debug.lock_pc = pc;
    l->debug.lock_cpu = mycpu;

    usl_trace(l, mycpu, pc, caller);
}
/*
 *    Debug checks on a usimple_lock just before
 *    releasing it.  Note that the caller has not
 *    yet released the hardware lock.
 *
 *    Preemption is still disabled, so there's
 *    no problem using cpu_number.
 */
void
usld_unlock(
    usimple_lock_t  l,
    pc_t        pc)
{
    int     mycpu;
    char    caller[] = "usimple_unlock";

    if (!usld_lock_common_checks(l, caller))
        return;

    mycpu = cpu_number();

    if (!(l->debug.state & USLOCK_TAKEN))
        panic("%s:  lock 0x%p hasn't been taken",
              caller, l);
    if (l->debug.lock_thread != (void *) current_thread())
        panic("%s:  unlocking lock 0x%p, owned by thread %p",
              caller, l, l->debug.lock_thread);
    if (l->debug.lock_cpu != mycpu) {
        printf("%s:  unlocking lock 0x%p on cpu 0x%x",
               caller, l, mycpu);
        printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
        panic("%s", caller);
    }
    usl_trace(l, mycpu, pc, caller);

    l->debug.unlock_thread = l->debug.lock_thread;
    l->debug.lock_thread = INVALID_PC;
    l->debug.state &= ~USLOCK_TAKEN;
    l->debug.unlock_pc = pc;
    l->debug.unlock_cpu = mycpu;
}
/*
 *    Debug checks on a usimple_lock just before
 *    attempting to acquire it.
 *
 *    Preemption isn't guaranteed to be disabled.
 */
void
usld_lock_try_pre(
    usimple_lock_t  l,
    pc_t        pc)
{
    char    caller[] = "usimple_lock_try";

    if (!usld_lock_common_checks(l, caller))
        return;
    mp_disable_preemption();
    usl_trace(l, cpu_number(), pc, caller);
    mp_enable_preemption();
}
/*
 *    Debug checks on a usimple_lock just after
 *    successfully attempting to acquire it.
 *
 *    Preemption has been disabled by the
 *    lock acquisition attempt, so it's safe
 *    to use cpu_number.
 */
void
usld_lock_try_post(
    usimple_lock_t  l,
    pc_t        pc)
{
    int     mycpu;
    char    caller[] = "successful usimple_lock_try";

    if (!usld_lock_common_checks(l, caller))
        return;

    if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
        panic("%s:  lock 0x%p became uninitialized",
              caller, l);
    if ((l->debug.state & USLOCK_TAKEN))
        panic("%s:  lock 0x%p became TAKEN by someone else",
              caller, l);

    mycpu = cpu_number();
    l->debug.lock_thread = (void *) current_thread();
    l->debug.state |= USLOCK_TAKEN;
    l->debug.lock_pc = pc;
    l->debug.lock_cpu = mycpu;

    usl_trace(l, mycpu, pc, caller);
}
/*
 *    For very special cases, set traced_lock to point to a
 *    specific lock of interest.  The result is a series of
 *    XPRs showing lock operations on that lock.  The lock_seq
 *    value is used to show the order of those operations.
 */
usimple_lock_t  traced_lock;
unsigned int    lock_seq;

void
usl_trace(
    usimple_lock_t  l,
    int         mycpu,
    pc_t        pc,
    const char *    op_name)
{
    if (traced_lock == l) {
        XPR(XPR_SLOCK,
            "seq %d, cpu %d, %s @ %x\n",
            (uintptr_t) lock_seq, (uintptr_t) mycpu,
            (uintptr_t) op_name, (uintptr_t) pc, 0);
        lock_seq++;
    }
}

#endif /* USLOCK_DEBUG */
/*
 *      Routine:        lck_rw_alloc_init
 */
lck_rw_t *
lck_rw_alloc_init(
    lck_grp_t   *grp,
    lck_attr_t  *attr) {
    lck_rw_t    *lck;

    if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
        bzero(lck, sizeof(lck_rw_t));
        lck_rw_init(lck, grp, attr);
    }

    return(lck);
}

/*
 *      Routine:        lck_rw_free
 */
void
lck_rw_free(
    lck_rw_t    *lck,
    lck_grp_t   *grp) {
    lck_rw_destroy(lck, grp);
    kfree(lck, sizeof(lck_rw_t));
}
/*
 *      Routine:        lck_rw_init
 */
void
lck_rw_init(
    lck_rw_t    *lck,
    lck_grp_t   *grp,
    lck_attr_t  *attr)
{
    lck_attr_t  *lck_attr = (attr != LCK_ATTR_NULL) ?
                    attr : &LockDefaultLckAttr;

    hw_lock_byte_init(&lck->lck_rw_interlock);
    lck->lck_rw_want_write = FALSE;
    lck->lck_rw_want_upgrade = FALSE;
    lck->lck_rw_shared_count = 0;
    lck->lck_rw_can_sleep = TRUE;
    lck->lck_r_waiting = lck->lck_w_waiting = 0;
    lck->lck_rw_tag = 0;
    lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
                              LCK_ATTR_RW_SHARED_PRIORITY) == 0);

    lck_grp_reference(grp);
    lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
}
/*
 *      Routine:        lck_rw_destroy
 */
void
lck_rw_destroy(
    lck_rw_t    *lck,
    lck_grp_t   *grp)
{
    if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
        return;
#if MACH_LDEBUG
    lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
#endif
    lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
    lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
    lck_grp_deallocate(grp);
    return;
}
/*
 *    Sleep locks.  These use the same data structure and algorithm
 *    as the spin locks, but the process sleeps while it is waiting
 *    for the lock.  These work on uniprocessor systems.
 */

#define DECREMENTER_TIMEOUT 1000000

#define RW_LOCK_READER_EVENT(x) \
        ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_tag))))

#define RW_LOCK_WRITER_EVENT(x) \
        ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8))))
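/*
 * Readers and writers block on two distinct wait events derived from the
 * same lock: the address of lck_rw_tag for readers and of lck_rw_pad8 for
 * writers.  Any two distinct addresses inside the lock would do; deriving
 * them via offsetof lets thread_wakeup(RW_LOCK_WRITER_EVENT(lck)) rouse
 * only waiting writers while readers keep sleeping, and vice versa.
 */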
/*
 * We disable interrupts while holding the RW interlock to prevent an
 * interrupt from exacerbating hold time.
 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
 */
static boolean_t
lck_interlock_lock(lck_rw_t *lck)
{
    boolean_t   istate;

    istate = ml_set_interrupts_enabled(FALSE);
    hw_lock_byte_lock(&lck->lck_rw_interlock);

    return istate;
}

static void
lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
{
    hw_lock_byte_unlock(&lck->lck_rw_interlock);
    ml_set_interrupts_enabled(istate);
}
/*
 * This inline is used when busy-waiting for an rw lock.
 * If interrupts were disabled when the lock primitive was called,
 * we poll the IPI handler for pending tlb flushes.
 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
 */
static inline void
lck_rw_lock_pause(boolean_t interrupts_enabled)
{
    if (!interrupts_enabled)
        handle_pending_TLB_flushes();
    cpu_pause();
}
/*
 * compute the deadline to spin against when
 * waiting for a change of state on a lck_rw_t
 */
static inline uint64_t
lck_rw_deadline_for_spin(lck_rw_t *lck)
{
    if (lck->lck_rw_can_sleep) {
        if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
            /*
             * there are already threads waiting on this lock... this
             * implies that they have spun beyond their deadlines waiting for
             * the desired state to show up so we will not bother spinning at this time...
             *   or
             * the current number of threads sharing this lock exceeds our capacity to run them
             * concurrently and since all states we're going to spin for require the rw_shared_count
             * to be at 0, we'll not bother spinning since the latency for this to happen is
             * unpredictable...
             */
            return (mach_absolute_time());
        }
        return (mach_absolute_time() + MutexSpin);
    } else
        return (mach_absolute_time() + (100000LL * 1000000000LL));
}
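/*
 * The net effect is a simple adaptive policy, roughly:
 *
 *    deadline = now                  if waiters already exist or the
 *                                    reader count exceeds max_cpus
 *                                    (spinning is pointless, block)
 *    deadline = now + MutexSpin      for the common sleepable case
 *    deadline = now + a huge value   for locks that can never sleep,
 *                                    i.e. effectively spin forever
 *
 * Callers spin until either the desired state appears or this deadline
 * passes, and only then consider an assert_wait()/thread_block() pair.
 */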
/*
 *      Routine:        lck_rw_lock_exclusive
 */
void
lck_rw_lock_exclusive_gen(
    lck_rw_t    *lck)
{
    __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
    uint64_t    deadline = 0;
    int         slept = 0;
    int         gotlock = 0;
    int         lockheld = 0;
    wait_result_t   res = 0;
    boolean_t   istate = -1;

#if CONFIG_DTRACE
    boolean_t dtrace_ls_initialized = FALSE;
    boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
    uint64_t wait_interval = 0;
    int readers_at_sleep = 0;
#endif

    /*
     *    Try to acquire the lck_rw_want_write bit.
     */
    while ( !lck_rw_grab_want(lck)) {

#if CONFIG_DTRACE
        if (dtrace_ls_initialized == FALSE) {
            dtrace_ls_initialized = TRUE;
            dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
            dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
            dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
            if (dtrace_ls_enabled) {
                /*
                 * Either sleeping or spinning is happening,
                 * start a timing of our delay interval now.
                 */
                readers_at_sleep = lck->lck_rw_shared_count;
                wait_interval = mach_absolute_time();
            }
        }
#endif
        istate = ml_get_interrupts_enabled();

        deadline = lck_rw_deadline_for_spin(lck);

        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

        while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
            lck_rw_lock_pause(istate);

        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);

        if (gotlock)
            break;
        /*
         * if we get here, the deadline has expired w/o us
         * being able to grab the lock exclusively
         * check to see if we're allowed to do a thread_block
         */
        if (lck->lck_rw_can_sleep) {

            istate = lck_interlock_lock(lck);

            if (lck->lck_rw_want_write) {

                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

                lck->lck_w_waiting = TRUE;

                res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
                lck_interlock_unlock(lck, istate);

                if (res == THREAD_WAITING) {
                    res = thread_block(THREAD_CONTINUE_NULL);
                    slept++;
                }
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
            } else {
                lck->lck_rw_want_write = TRUE;
                lck_interlock_unlock(lck, istate);
                break;
            }
        }
    }
    /*
     * Wait for readers (and upgrades) to finish...
     * the test for these conditions must be done simultaneously with
     * a check of the interlock not being held since
     * the rw_shared_count will drop to 0 first and then want_upgrade
     * will be set to 1 in the shared_to_exclusive scenario... those
     * adjustments are done behind the interlock and represent an
     * atomic change in state and must be considered as such
     * however, once we see the read count at 0, the want_upgrade not set
     * and the interlock not held, we are safe to proceed
     */
    while (lck_rw_held_read_or_upgrade(lck)) {

#if CONFIG_DTRACE
        /*
         * Either sleeping or spinning is happening, start
         * a timing of our delay interval now.  If we set it
         * to -1 we don't have accurate data so we cannot later
         * decide to record a dtrace spin or sleep event.
         */
        if (dtrace_ls_initialized == FALSE) {
            dtrace_ls_initialized = TRUE;
            dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
            dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
            dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
            if (dtrace_ls_enabled) {
                /*
                 * Either sleeping or spinning is happening,
                 * start a timing of our delay interval now.
                 */
                readers_at_sleep = lck->lck_rw_shared_count;
                wait_interval = mach_absolute_time();
            }
        }
#endif
        istate = ml_get_interrupts_enabled();

        deadline = lck_rw_deadline_for_spin(lck);

        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

        while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
            lck_rw_lock_pause(istate);

        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);

        if ( !lockheld)
            break;
        /*
         * if we get here, the deadline has expired w/o us
         * being able to grab the lock exclusively
         * check to see if we're allowed to do a thread_block
         */
        if (lck->lck_rw_can_sleep) {

            istate = lck_interlock_lock(lck);

            if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

                lck->lck_w_waiting = TRUE;

                res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
                lck_interlock_unlock(lck, istate);

                if (res == THREAD_WAITING) {
                    res = thread_block(THREAD_CONTINUE_NULL);
                    slept++;
                }
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
            } else {
                lck_interlock_unlock(lck, istate);
                /*
                 * must own the lock now, since we checked for
                 * readers or upgrade owner behind the interlock
                 * no need for a call to 'lck_rw_held_read_or_upgrade'
                 */
                break;
            }
        }
    }

#if CONFIG_DTRACE
    /*
     * Decide what latencies we suffered that are Dtrace events.
     * If we have set wait_interval, then we either spun or slept.
     * At least we get out from under the interlock before we record
     * which is the best we can do here to minimize the impact
     * of the tracing.
     * If we have set wait_interval to -1, then dtrace was not enabled when we
     * started sleeping/spinning so we don't record this event.
     */
    if (dtrace_ls_enabled == TRUE) {
        if (slept == 0) {
            LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
                mach_absolute_time() - wait_interval, 1);
        } else {
            /*
             * For the blocking case, we also record if when we blocked
             * it was held for read or write, and how many readers.
             * Notice that above we recorded this before we dropped
             * the interlock so the count is accurate.
             */
            LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
                mach_absolute_time() - wait_interval, 1,
                (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
        }
    }
    LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
#endif
}
/*
 *      Routine:        lck_rw_done_gen
 *
 *    called from the assembly language wrapper...
 *    prior_lock_state is the value in the 1st
 *    word of the lock at the time of a successful
 *    atomic compare and exchange with the new value...
 *    it represents the state of the lock before we
 *    decremented the rw_shared_count or cleared either
 *    rw_want_upgrade or rw_want_write and
 *    the lck_x_waiting bits...  since the wrapper
 *    routine has already changed the state atomically,
 *    we just need to decide if we should
 *    wake up anyone and what value to return... we do
 *    this by examining the state of the lock before
 *    we changed it
 */
lck_rw_type_t
lck_rw_done_gen(
    lck_rw_t    *lck,
    int         prior_lock_state)
{
    lck_rw_t    *fake_lck;
    lck_rw_type_t   lock_type;
    thread_t    thread;
    uint32_t    rwlock_count;

    /*
     * prior_lock state is a snapshot of the 1st word of the
     * lock in question... we'll fake up a pointer to it
     * and carefully not access anything beyond whats defined
     * in the first word of a lck_rw_t
     */
    fake_lck = (lck_rw_t *)&prior_lock_state;

    if (fake_lck->lck_rw_shared_count <= 1) {
        if (fake_lck->lck_w_waiting)
            thread_wakeup(RW_LOCK_WRITER_EVENT(lck));

        if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
            thread_wakeup(RW_LOCK_READER_EVENT(lck));
    }
    if (fake_lck->lck_rw_shared_count)
        lock_type = LCK_RW_TYPE_SHARED;
    else
        lock_type = LCK_RW_TYPE_EXCLUSIVE;

    /* Check if dropping the lock means that we need to unpromote */
    thread = current_thread();
    rwlock_count = thread->rwlock_count--;
#if MACH_LDEBUG
    if (rwlock_count == 0) {
        panic("rw lock count underflow for thread %p", thread);
    }
#endif
    if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
        /* sched_flags checked without lock, but will be rechecked while clearing */
        lck_rw_clear_promotion(thread);
    }

#if CONFIG_DTRACE
    LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
#endif

    return(lock_type);
}
/*
 *    Routine:    lck_rw_unlock
 */
void
lck_rw_unlock(
    lck_rw_t    *lck,
    lck_rw_type_t   lck_rw_type)
{
    if (lck_rw_type == LCK_RW_TYPE_SHARED)
        lck_rw_unlock_shared(lck);
    else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
        lck_rw_unlock_exclusive(lck);
    else
        panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
}

/*
 *    Routine:    lck_rw_unlock_shared
 */
void
lck_rw_unlock_shared(
    lck_rw_t    *lck)
{
    lck_rw_type_t   ret;

    ret = lck_rw_done(lck);

    if (ret != LCK_RW_TYPE_SHARED)
        panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
}

/*
 *    Routine:    lck_rw_unlock_exclusive
 */
void
lck_rw_unlock_exclusive(
    lck_rw_t    *lck)
{
    lck_rw_type_t   ret;

    ret = lck_rw_done(lck);

    if (ret != LCK_RW_TYPE_EXCLUSIVE)
        panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
}

/*
 *    Routine:    lck_rw_lock
 */
void
lck_rw_lock(
    lck_rw_t    *lck,
    lck_rw_type_t   lck_rw_type)
{
    if (lck_rw_type == LCK_RW_TYPE_SHARED)
        lck_rw_lock_shared(lck);
    else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
        lck_rw_lock_exclusive(lck);
    else
        panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
}
/*
 *    Routine:    lck_rw_lock_shared_gen
 *    Function:
 *        assembly fast path code has determined that this lock
 *        is held exclusively... this is where we spin/block
 *        until we can acquire the lock in the shared mode
 */
void
lck_rw_lock_shared_gen(
    lck_rw_t    *lck)
{
    __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
    uint64_t    deadline = 0;
    int         gotlock = 0;
    int         slept = 0;
    wait_result_t   res = 0;
    boolean_t   istate = -1;

#if CONFIG_DTRACE
    uint64_t wait_interval = 0;
    int readers_at_sleep = 0;
    boolean_t dtrace_ls_initialized = FALSE;
    boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
#endif

    while ( !lck_rw_grab_shared(lck)) {

#if CONFIG_DTRACE
        if (dtrace_ls_initialized == FALSE) {
            dtrace_ls_initialized = TRUE;
            dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
            dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
            dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
            if (dtrace_ls_enabled) {
                /*
                 * Either sleeping or spinning is happening,
                 * start a timing of our delay interval now.
                 */
                readers_at_sleep = lck->lck_rw_shared_count;
                wait_interval = mach_absolute_time();
            }
        }
#endif
        istate = ml_get_interrupts_enabled();

        deadline = lck_rw_deadline_for_spin(lck);

        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
                     trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);

        while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
            lck_rw_lock_pause(istate);

        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
                     trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);

        if (gotlock)
            break;
        /*
         * if we get here, the deadline has expired w/o us
         * being able to grab the lock for read
         * check to see if we're allowed to do a thread_block
         */
        if (lck->lck_rw_can_sleep) {

            istate = lck_interlock_lock(lck);

            if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
                ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {

                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
                             trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);

                lck->lck_r_waiting = TRUE;

                res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
                lck_interlock_unlock(lck, istate);

                if (res == THREAD_WAITING) {
                    res = thread_block(THREAD_CONTINUE_NULL);
                    slept++;
                }
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
                             trace_lck, res, slept, 0, 0);
            } else {
                lck->lck_rw_shared_count++;
                lck_interlock_unlock(lck, istate);
                break;
            }
        }
    }

#if CONFIG_DTRACE
    if (dtrace_ls_enabled == TRUE) {
        if (slept == 0) {
            LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
        } else {
            LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
                mach_absolute_time() - wait_interval, 0,
                (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
        }
    }
    LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
#endif
}
/*
 *    Routine:    lck_rw_lock_shared_to_exclusive_failure
 *    Function:
 *        assembly fast path code has already dropped our read
 *        count and determined that someone else owns 'lck_rw_want_upgrade'
 *        if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
 *        all we need to do here is determine if a wakeup is needed
 */
boolean_t
lck_rw_lock_shared_to_exclusive_failure(
    lck_rw_t    *lck,
    int         prior_lock_state)
{
    lck_rw_t    *fake_lck;
    thread_t    thread = current_thread();
    uint32_t    rwlock_count;

    /* Check if dropping the lock means that we need to unpromote */
    rwlock_count = thread->rwlock_count--;
#if MACH_LDEBUG
    if (rwlock_count == 0) {
        panic("rw lock count underflow for thread %p", thread);
    }
#endif
    if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
        /* sched_flags checked without lock, but will be rechecked while clearing */
        lck_rw_clear_promotion(thread);
    }

    /*
     * prior_lock state is a snapshot of the 1st word of the
     * lock in question... we'll fake up a pointer to it
     * and carefully not access anything beyond whats defined
     * in the first word of a lck_rw_t
     */
    fake_lck = (lck_rw_t *)&prior_lock_state;

    if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
        /*
         *    Someone else has requested upgrade.
         *    Since we've released the read lock, wake
         *    him up if he's blocked waiting
         */
        thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
    }
    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
                 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);

    return (FALSE);
}
/*
 *    Routine:    lck_rw_lock_shared_to_exclusive_success
 *    Function:
 *        assembly fast path code has already dropped our read
 *        count and successfully acquired 'lck_rw_want_upgrade'
 *        we just need to wait for the rest of the readers to drain
 *        and then we can return as the exclusive holder of this lock
 */
boolean_t
lck_rw_lock_shared_to_exclusive_success(
    lck_rw_t    *lck)
{
    __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
    uint64_t    deadline = 0;
    int         slept = 0;
    int         still_shared = 0;
    wait_result_t   res;
    boolean_t   istate = -1;

#if CONFIG_DTRACE
    uint64_t wait_interval = 0;
    int readers_at_sleep = 0;
    boolean_t dtrace_ls_initialized = FALSE;
    boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
#endif

    while (lck->lck_rw_shared_count != 0) {

#if CONFIG_DTRACE
        if (dtrace_ls_initialized == FALSE) {
            dtrace_ls_initialized = TRUE;
            dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
            dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
            dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
            if (dtrace_ls_enabled) {
                /*
                 * Either sleeping or spinning is happening,
                 * start a timing of our delay interval now.
                 */
                readers_at_sleep = lck->lck_rw_shared_count;
                wait_interval = mach_absolute_time();
            }
        }
#endif
        istate = ml_get_interrupts_enabled();

        deadline = lck_rw_deadline_for_spin(lck);

        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
                     trace_lck, lck->lck_rw_shared_count, 0, 0, 0);

        while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
            lck_rw_lock_pause(istate);

        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
                     trace_lck, lck->lck_rw_shared_count, 0, 0, 0);

        if ( !still_shared)
            break;
        /*
         * if we get here, the deadline has expired w/o
         * the rw_shared_count having drained to 0
         * check to see if we're allowed to do a thread_block
         */
        if (lck->lck_rw_can_sleep) {

            istate = lck_interlock_lock(lck);

            if (lck->lck_rw_shared_count != 0) {
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
                             trace_lck, lck->lck_rw_shared_count, 0, 0, 0);

                lck->lck_w_waiting = TRUE;

                res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
                lck_interlock_unlock(lck, istate);

                if (res == THREAD_WAITING) {
                    res = thread_block(THREAD_CONTINUE_NULL);
                    slept++;
                }
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
                             trace_lck, res, slept, 0, 0);
            } else {
                lck_interlock_unlock(lck, istate);
                break;
            }
        }
    }
#if CONFIG_DTRACE
    /*
     * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
     */
    if (dtrace_ls_enabled == TRUE) {
        if (slept == 0) {
            LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
        } else {
            LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
                mach_absolute_time() - wait_interval, 1,
                (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
        }
    }
    LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
#endif
    return (TRUE);
}
/*
 *      Routine:        lck_rw_lock_exclusive_to_shared_gen
 *    Function:
 *        assembly fast path has already dropped
 *        our exclusive state and bumped lck_rw_shared_count
 *        all we need to do here is determine if anyone
 *        needs to be awakened.
 */
void
lck_rw_lock_exclusive_to_shared_gen(
    lck_rw_t    *lck,
    int         prior_lock_state)
{
    __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
    lck_rw_t    *fake_lck;

    /*
     * prior_lock state is a snapshot of the 1st word of the
     * lock in question... we'll fake up a pointer to it
     * and carefully not access anything beyond whats defined
     * in the first word of a lck_rw_t
     */
    fake_lck = (lck_rw_t *)&prior_lock_state;

    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
                 trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);

    /*
     * don't wake up anyone waiting to take the lock exclusively
     * since we hold a read count... when the read count drops to 0,
     * the writers will be woken.
     *
     * wake up any waiting readers if we don't have any writers waiting,
     * or the lock is NOT marked as rw_priv_excl (writers have privilege)
     */
    if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
        thread_wakeup(RW_LOCK_READER_EVENT(lck));

    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
                 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);

#if CONFIG_DTRACE
    LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
#endif
}
/*
 *      Routine:        lck_rw_try_lock
 */
boolean_t
lck_rw_try_lock(
    lck_rw_t    *lck,
    lck_rw_type_t   lck_rw_type)
{
    if (lck_rw_type == LCK_RW_TYPE_SHARED)
        return(lck_rw_try_lock_shared(lck));
    else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
        return(lck_rw_try_lock_exclusive(lck));
    else
        panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
    return(FALSE);
}
void
lck_rw_assert(
    lck_rw_t    *lck,
    unsigned int    type)
{
    switch (type) {
    case LCK_RW_ASSERT_SHARED:
        if (lck->lck_rw_shared_count != 0) {
            return;
        }
        break;
    case LCK_RW_ASSERT_EXCLUSIVE:
        if ((lck->lck_rw_want_write ||
             lck->lck_rw_want_upgrade) &&
            lck->lck_rw_shared_count == 0) {
            return;
        }
        break;
    case LCK_RW_ASSERT_HELD:
        if (lck->lck_rw_want_write ||
            lck->lck_rw_want_upgrade ||
            lck->lck_rw_shared_count != 0) {
            return;
        }
        break;
    case LCK_RW_ASSERT_NOTHELD:
        if (!(lck->lck_rw_want_write ||
              lck->lck_rw_want_upgrade ||
              lck->lck_rw_shared_count != 0)) {
            return;
        }
        break;
    default:
        break;
    }

    panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
}
/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
void
lck_rw_clear_promotions_x86(thread_t thread)
{
#if MACH_LDEBUG
    /* It's fatal to leave a RW lock locked and return to userspace */
    panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
#else
    /* Paper over the issue */
    thread->rwlock_count = 0;
    lck_rw_clear_promotion(thread);
#endif
}
/*
 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 */
boolean_t
kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) {
    if (not_in_kdp) {
        panic("panic: rw lock exclusive check done outside of kernel debugger");
    }
    return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
}
#ifdef MUTEX_ZONE
extern zone_t lck_mtx_zone;
#endif

/*
 *      Routine:        lck_mtx_alloc_init
 */
lck_mtx_t *
lck_mtx_alloc_init(
    lck_grp_t   *grp,
    lck_attr_t  *attr)
{
    lck_mtx_t   *lck;
#ifdef MUTEX_ZONE
    if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
        lck_mtx_init(lck, grp, attr);
#else
    if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
        lck_mtx_init(lck, grp, attr);
#endif
    return(lck);
}

/*
 *      Routine:        lck_mtx_free
 */
void
lck_mtx_free(
    lck_mtx_t   *lck,
    lck_grp_t   *grp)
{
    lck_mtx_destroy(lck, grp);
#ifdef MUTEX_ZONE
    zfree(lck_mtx_zone, lck);
#else
    kfree(lck, sizeof(lck_mtx_t));
#endif
}
/*
 *      Routine:        lck_mtx_ext_init
 */
void
lck_mtx_ext_init(
    lck_mtx_ext_t   *lck,
    lck_grp_t   *grp,
    lck_attr_t  *attr)
{
    bzero((void *)lck, sizeof(lck_mtx_ext_t));

    if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
        lck->lck_mtx_deb.type = MUTEX_TAG;
        lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
    }

    lck->lck_mtx_grp = grp;

    if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
        lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;

    lck->lck_mtx.lck_mtx_is_ext = 1;
    lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
}
/*
 *      Routine:        lck_mtx_init
 */
void
lck_mtx_init(
    lck_mtx_t   *lck,
    lck_grp_t   *grp,
    lck_attr_t  *attr)
{
    lck_mtx_ext_t   *lck_ext;
    lck_attr_t  *lck_attr;

    if (attr != LCK_ATTR_NULL)
        lck_attr = attr;
    else
        lck_attr = &LockDefaultLckAttr;

    if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
        if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
            lck_mtx_ext_init(lck_ext, grp, lck_attr);
            lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
            lck->lck_mtx_ptr = lck_ext;
        }
    } else {
        lck->lck_mtx_owner = 0;
        lck->lck_mtx_state = 0;
    }
    lck->lck_mtx_pad32 = 0xFFFFFFFF;
    lck_grp_reference(grp);
    lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
}
/*
 *      Routine:        lck_mtx_init_ext
 */
void
lck_mtx_init_ext(
    lck_mtx_t   *lck,
    lck_mtx_ext_t   *lck_ext,
    lck_grp_t   *grp,
    lck_attr_t  *attr)
{
    lck_attr_t  *lck_attr;

    if (attr != LCK_ATTR_NULL)
        lck_attr = attr;
    else
        lck_attr = &LockDefaultLckAttr;

    if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
        lck_mtx_ext_init(lck_ext, grp, lck_attr);
        lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
        lck->lck_mtx_ptr = lck_ext;
    } else {
        lck->lck_mtx_owner = 0;
        lck->lck_mtx_state = 0;
    }
    lck->lck_mtx_pad32 = 0xFFFFFFFF;

    lck_grp_reference(grp);
    lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
}
/*
 *      Routine:        lck_mtx_destroy
 */
void
lck_mtx_destroy(
    lck_mtx_t   *lck,
    lck_grp_t   *grp)
{
    boolean_t lck_is_indirect;

    if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
        return;
#if MACH_LDEBUG
    lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
#endif
    lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);

    lck_mtx_lock_mark_destroyed(lck);

    if (lck_is_indirect)
        kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
    lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
    lck_grp_deallocate(grp);
    return;
}
#define LCK_MTX_LCK_WAIT_CODE       0x20
#define LCK_MTX_LCK_WAKEUP_CODE     0x21
#define LCK_MTX_LCK_SPIN_CODE       0x22
#define LCK_MTX_LCK_ACQUIRE_CODE    0x23
#define LCK_MTX_LCK_DEMOTE_CODE     0x24
/*
 * Routine:     lck_mtx_unlock_wakeup_x86
 *
 * Invoked on unlock when there is
 * contention (i.e. the assembly routine sees that
 * mutex->lck_mtx_waiters != 0 or
 * mutex->lck_mtx_promoted != 0)...
 *
 * neither the mutex nor the interlock is held
 */
void
lck_mtx_unlock_wakeup_x86 (
    lck_mtx_t   *mutex,
    int         prior_lock_state)
{
    __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
    lck_mtx_t   fake_lck;

    /*
     * prior_lock state is a snapshot of the 2nd word of the
     * lock in question... we'll fake up a lock with the bits
     * copied into place and carefully not access anything
     * beyond whats defined in the second word of a lck_mtx_t
     */
    fake_lck.lck_mtx_state = prior_lock_state;

    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
                 trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);

    if (__probable(fake_lck.lck_mtx_waiters)) {
        if (fake_lck.lck_mtx_waiters > 1)
            thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri);
        else
            thread_wakeup_one(LCK_MTX_EVENT(mutex));
    }

    if (__improbable(fake_lck.lck_mtx_promoted)) {
        thread_t    thread = current_thread();

        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
                     thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);

        if (thread->promotions > 0) {
            spl_t   s = splsched();

            thread_lock(thread);

            if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) {

                thread->sched_flags &= ~TH_SFLAG_PROMOTED;

                if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
                    /* Thread still has a RW lock promotion */
                } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
                    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
                                          thread->sched_pri, DEPRESSPRI, 0, trace_lck, 0);

                    set_sched_pri(thread, DEPRESSPRI);
                } else {
                    if (thread->base_pri < thread->sched_pri) {
                        KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
                                              thread->sched_pri, thread->base_pri, 0, trace_lck, 0);

                        thread_recompute_sched_pri(thread, FALSE);
                    }
                }
            }
            thread_unlock(thread);
            splx(s);
        }
    }
    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
                 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
}
/*
 * Routine:     lck_mtx_lock_acquire_x86
 *
 * Invoked on acquiring the mutex when there is
 * contention (i.e. the assembly routine sees that
 * mutex->lck_mtx_waiters != 0 or
 * thread->was_promoted_on_wakeup != 0)...
 *
 * mutex is owned...  interlock is held... preemption is disabled
 */
void
lck_mtx_lock_acquire_x86(
    lck_mtx_t   *mutex)
{
    __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
    thread_t    thread;
    integer_t   priority;
    spl_t       s;

    thread = (thread_t)mutex->lck_mtx_owner;    /* faster than current_thread() */

    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
                 trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);

    if (mutex->lck_mtx_waiters)
        priority = mutex->lck_mtx_pri;
    else
        priority = 0;

    if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {

        KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
                              thread->sched_pri, priority, thread->was_promoted_on_wakeup, trace_lck, 0);

        s = splsched();
        thread_lock(thread);

        if (thread->sched_pri < priority) {
            /* Do not promote past promotion ceiling */
            assert(priority <= MAXPRI_PROMOTE);
            set_sched_pri(thread, priority);
        }
        if (mutex->lck_mtx_promoted == 0) {
            mutex->lck_mtx_promoted = 1;

            thread->promotions++;
            thread->sched_flags |= TH_SFLAG_PROMOTED;
        }
        thread->was_promoted_on_wakeup = 0;

        thread_unlock(thread);
        splx(s);
    }
    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
                 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
}
static boolean_t
lck_mtx_interlock_try_lock(lck_mtx_t *mutex, boolean_t *istate)
{
    boolean_t   retval;

    *istate = ml_set_interrupts_enabled(FALSE);
    retval = lck_mtx_ilk_try_lock(mutex);

    if (retval == 0)
        ml_set_interrupts_enabled(*istate);

    return retval;
}

static void
lck_mtx_interlock_unlock(lck_mtx_t *mutex, boolean_t istate)
{
    lck_mtx_ilk_unlock(mutex);
    ml_set_interrupts_enabled(istate);
}
/*
 * Routine:     lck_mtx_lock_spinwait_x86
 *
 * Invoked trying to acquire a mutex when there is contention but
 * the holder is running on another processor. We spin for up to a maximum
 * time waiting for the lock to be released.
 *
 * Called with the interlock unlocked.
 * returns 0 if mutex acquired
 * returns 1 if we spun
 * returns 2 if we didn't spin due to the holder not running
 */
int
lck_mtx_lock_spinwait_x86(
    lck_mtx_t   *mutex)
{
    __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
    thread_t    holder;
    uint64_t    overall_deadline;
    uint64_t    check_owner_deadline;
    uint64_t    cur_time;
    int         retval = 1;
    int         loopcount = 0;

    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
                 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);

    cur_time = mach_absolute_time();
    overall_deadline = cur_time + MutexSpin;
    check_owner_deadline = cur_time;

    /*
     * Spin while:
     *   - mutex is locked, and
     *   - its locked as a spin lock, and
     *   - owner is running on another processor, and
     *   - owner (processor) is not idling, and
     *   - we haven't spun for long enough.
     */
    do {
        if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
            retval = 0;
            break;
        }
        cur_time = mach_absolute_time();

        if (cur_time >= overall_deadline)
            break;

        if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) {
            boolean_t   istate;

            if (lck_mtx_interlock_try_lock(mutex, &istate)) {

                if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {

                    if ( !(holder->machine.specFlags & OnProc) ||
                         (holder->state & TH_IDLE)) {

                        lck_mtx_interlock_unlock(mutex, istate);

                        if (loopcount == 0)
                            retval = 2;
                        break;
                    }
                }
                lck_mtx_interlock_unlock(mutex, istate);

                check_owner_deadline = cur_time + (MutexSpin / 4);
            }
        }
        cpu_pause();

        loopcount++;

    } while (TRUE);

#if CONFIG_DTRACE
    /*
     * We've already kept a count via overall_deadline of how long we spun.
     * If dtrace is active, then we compute backwards to decide how
     * long we spun.
     *
     * Note that we record a different probe id depending on whether
     * this is a direct or indirect mutex.  This allows us to
     * penalize only lock groups that have debug/stats enabled
     * with dtrace processing if desired.
     */
    if (__probable(mutex->lck_mtx_is_ext == 0)) {
        LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
            mach_absolute_time() - (overall_deadline - MutexSpin));
    } else {
        LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
            mach_absolute_time() - (overall_deadline - MutexSpin));
    }
    /* The lockstat acquire event is recorded by the assembly code beneath us. */
#endif

    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
                 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);

    return retval;
}
/*
 * Routine:     lck_mtx_lock_wait_x86
 *
 * Invoked in order to wait on contention.
 *
 * Called with the interlock locked and
 * preemption disabled...
 * returns it unlocked and with preemption enabled
 */
void
lck_mtx_lock_wait_x86 (
    lck_mtx_t   *mutex)
{
    __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
    thread_t    self = current_thread();
    thread_t    holder;
    integer_t   priority;
    spl_t       s;
#if CONFIG_DTRACE
    uint64_t    sleep_start = 0;

    if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
        sleep_start = mach_absolute_time();
    }
#endif
    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
                 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);

    priority = self->sched_pri;

    if (priority < self->base_pri)
        priority = self->base_pri;
    if (priority < BASEPRI_DEFAULT)
        priority = BASEPRI_DEFAULT;

    /* Do not promote past promotion ceiling */
    priority = MIN(priority, MAXPRI_PROMOTE);

    if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri)
        mutex->lck_mtx_pri = priority;
    mutex->lck_mtx_waiters++;

    if ( (holder = (thread_t)mutex->lck_mtx_owner) &&
         holder->sched_pri < mutex->lck_mtx_pri ) {
        s = splsched();
        thread_lock(holder);

        /* holder priority may have been bumped by another thread
         * before thread_lock was taken
         */
        if (holder->sched_pri < mutex->lck_mtx_pri) {
            KERNEL_DEBUG_CONSTANT(
                MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
                holder->sched_pri, priority, thread_tid(holder), trace_lck, 0);
            /* Assert that we're not altering the priority of a
             * thread above the MAXPRI_PROMOTE band
             */
            assert(holder->sched_pri < MAXPRI_PROMOTE);
            set_sched_pri(holder, priority);

            if (mutex->lck_mtx_promoted == 0) {
                holder->promotions++;
                holder->sched_flags |= TH_SFLAG_PROMOTED;

                mutex->lck_mtx_promoted = 1;
            }
        }
        thread_unlock(holder);
        splx(s);
    }
    assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT);

    lck_mtx_ilk_unlock(mutex);

    thread_block(THREAD_CONTINUE_NULL);

    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
                 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);

#if CONFIG_DTRACE
    /*
     * Record the Dtrace lockstat probe for blocking, block time
     * measured from when we were entered.
     */
    if (sleep_start) {
        if (mutex->lck_mtx_is_ext == 0) {
            LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
                mach_absolute_time() - sleep_start);
        } else {
            LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
                mach_absolute_time() - sleep_start);
        }
    }
#endif
}
/*
 * Routine: kdp_lck_mtx_lock_spin_is_acquired
 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 * Returns: TRUE if lock is acquired.
 */
boolean_t
kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
{
    if (not_in_kdp) {
        panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
    }

    if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
        return TRUE;
    }

    return FALSE;
}