]> git.saurik.com Git - apple/xnu.git/blame - osfmk/arm/locks_arm.c
xnu-7195.101.1.tar.gz
[apple/xnu.git] / osfmk / arm / locks_arm.c
CommitLineData
5ba3f43e 1/*
cb323159 2 * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
5ba3f43e
A
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System Copyright (c) 1991,1990,1989,1988,1987 Carnegie
33 * Mellon University All Rights Reserved.
0a7de745 34 *
5ba3f43e
A
35 * Permission to use, copy, modify and distribute this software and its
36 * documentation is hereby granted, provided that both the copyright notice
37 * and this permission notice appear in all copies of the software,
38 * derivative works or modified versions, and any portions thereof, and that
39 * both notices appear in supporting documentation.
0a7de745 40 *
5ba3f43e
A
41 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION.
42 * CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES
43 * WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
0a7de745 44 *
5ba3f43e 45 * Carnegie Mellon requests users of this software to return to
0a7de745 46 *
5ba3f43e
A
47 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
48 * School of Computer Science Carnegie Mellon University Pittsburgh PA
49 * 15213-3890
0a7de745 50 *
5ba3f43e
A
51 * any improvements or extensions that they make and grant Carnegie Mellon the
52 * rights to redistribute these changes.
53 */
54/*
55 * File: kern/lock.c
56 * Author: Avadis Tevanian, Jr., Michael Wayne Young
57 * Date: 1985
58 *
59 * Locking primitives implementation
60 */
61
5ba3f43e
A
62#define LOCK_PRIVATE 1
63
64#include <mach_ldebug.h>
65
f427ee49 66#include <kern/zalloc.h>
0a7de745 67#include <kern/lock_stat.h>
5ba3f43e
A
68#include <kern/locks.h>
69#include <kern/misc_protos.h>
70#include <kern/thread.h>
71#include <kern/processor.h>
72#include <kern/sched_prim.h>
5ba3f43e
A
73#include <kern/debug.h>
74#include <kern/kcdata.h>
75#include <string.h>
ea3f0419
A
76#include <arm/cpu_internal.h>
77#include <os/hash.h>
78#include <arm/cpu_data.h>
5ba3f43e
A
79
80#include <arm/cpu_data_internal.h>
81#include <arm/proc_reg.h>
82#include <arm/smp.h>
83#include <machine/atomic.h>
84#include <machine/machine_cpu.h>
85
86#include <sys/kdebug.h>
87
0a7de745
A
88#if CONFIG_DTRACE
89#define DTRACE_RW_SHARED 0x0 //reader
90#define DTRACE_RW_EXCL 0x1 //writer
91#define DTRACE_NO_FLAG 0x0 //not applicable
92#endif /* CONFIG_DTRACE */
5ba3f43e 93
0a7de745
A
94#define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
95#define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
96#define LCK_RW_LCK_SHARED_CODE 0x102
97#define LCK_RW_LCK_SH_TO_EX_CODE 0x103
98#define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
99#define LCK_RW_LCK_EX_TO_SH_CODE 0x105
5ba3f43e
A
100
101
0a7de745 102#define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
5ba3f43e
A
103
104// Panic in tests that check lock usage correctness
105// These are undesirable when in a panic or when a debugger is running.
106#define LOCK_CORRECTNESS_PANIC() (kernel_debugger_entry_count == 0)
107
0a7de745
A
108#define ADAPTIVE_SPIN_ENABLE 0x1
109
0a7de745 110int lck_mtx_adaptive_spin_mode = ADAPTIVE_SPIN_ENABLE;
0a7de745
A
111
112#define SPINWAIT_OWNER_CHECK_COUNT 4
113
114typedef enum {
115 SPINWAIT_ACQUIRED, /* Got the lock. */
116 SPINWAIT_INTERLOCK, /* Got the interlock, no owner, but caller must finish acquiring the lock. */
ea3f0419
A
117 SPINWAIT_DID_SPIN_HIGH_THR, /* Got the interlock, spun, but failed to get the lock. */
118 SPINWAIT_DID_SPIN_OWNER_NOT_CORE, /* Got the interlock, spun, but failed to get the lock. */
119 SPINWAIT_DID_SPIN_NO_WINDOW_CONTENTION, /* Got the interlock, spun, but failed to get the lock. */
120 SPINWAIT_DID_SPIN_SLIDING_THR,/* Got the interlock, spun, but failed to get the lock. */
0a7de745
A
121 SPINWAIT_DID_NOT_SPIN, /* Got the interlock, did not spin. */
122} spinwait_result_t;
123
f427ee49 124#if CONFIG_DTRACE
5ba3f43e
A
125extern uint64_t dtrace_spin_threshold;
126#endif
127
128/* Forwards */
129
5ba3f43e
A
130extern unsigned int not_in_kdp;
131
132/*
133 * We often want to know the addresses of the callers
134 * of the various lock routines. However, this information
135 * is only used for debugging and statistics.
136 */
137typedef void *pc_t;
0a7de745
A
138#define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
139#define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
5ba3f43e 140
0a7de745 141#ifdef lint
5ba3f43e
A
142/*
143 * Eliminate lint complaints about unused local pc variables.
144 */
0a7de745
A
145#define OBTAIN_PC(pc, l) ++pc
146#else /* lint */
147#define OBTAIN_PC(pc, l)
148#endif /* lint */
5ba3f43e
A
149
150
151/*
152 * Portable lock package implementation of usimple_locks.
153 */
154
5ba3f43e
A
155/*
156 * Owner thread pointer when lock held in spin mode
157 */
158#define LCK_MTX_SPIN_TAG 0xfffffff0
159
160
0a7de745
A
161#define interlock_lock(lock) hw_lock_bit ((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT, LCK_GRP_NULL)
162#define interlock_try(lock) hw_lock_bit_try((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT, LCK_GRP_NULL)
163#define interlock_unlock(lock) hw_unlock_bit ((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT)
164#define lck_rw_ilk_lock(lock) hw_lock_bit ((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT, LCK_GRP_NULL)
165#define lck_rw_ilk_unlock(lock) hw_unlock_bit((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT)
5ba3f43e 166
cb323159 167#define load_memory_barrier() os_atomic_thread_fence(acquire)
5ba3f43e
A
168
169// Enforce program order of loads and stores.
cb323159
A
170#define ordered_load(target) \
171 os_atomic_load(target, compiler_acq_rel)
172#define ordered_store(target, value) \
173 os_atomic_store(target, value, compiler_acq_rel)
174
175#define ordered_load_mtx(lock) ordered_load(&(lock)->lck_mtx_data)
176#define ordered_store_mtx(lock, value) ordered_store(&(lock)->lck_mtx_data, (value))
177#define ordered_load_rw(lock) ordered_load(&(lock)->lck_rw_data)
178#define ordered_store_rw(lock, value) ordered_store(&(lock)->lck_rw_data, (value))
179#define ordered_load_rw_owner(lock) ordered_load(&(lock)->lck_rw_owner)
180#define ordered_store_rw_owner(lock, value) ordered_store(&(lock)->lck_rw_owner, (value))
181#define ordered_load_hw(lock) ordered_load(&(lock)->lock_data)
182#define ordered_store_hw(lock, value) ordered_store(&(lock)->lock_data, (value))
183#define ordered_load_bit(lock) ordered_load((lock))
184#define ordered_store_bit(lock, value) ordered_store((lock), (value))
5ba3f43e
A
185
186
187// Prevent the compiler from reordering memory operations around this
0a7de745 188#define compiler_memory_fence() __asm__ volatile ("" ::: "memory")
5ba3f43e 189
0a7de745
A
190#define LOCK_PANIC_TIMEOUT 0xc00000
191#define NOINLINE __attribute__((noinline))
5ba3f43e
A
192
193
/*
 * interrupts_disabled(mask): non-zero when the interrupt-mask bit(s) for
 * this architecture are set in the saved interrupt state `mask`.
 * The argument is parenthesized so compound expressions (e.g. `a | b`)
 * are masked as a whole.
 */
#if __arm__
#define interrupts_disabled(mask) ((mask) & PSR_INTMASK)
#else
#define interrupts_disabled(mask) ((mask) & DAIF_IRQF)
#endif
199
200
#if __arm__
/*
 * Interrupt-enable helpers (32-bit ARM only), issued via `cpsie`.
 * The macros deliberately carry no trailing semicolon: the caller supplies
 * it, which keeps them usable as a single statement in if/else bodies.
 */
#define enable_fiq()            __asm__ volatile ("cpsie f" ::: "memory")
#define enable_interrupts()     __asm__ volatile ("cpsie if" ::: "memory")
#endif
205
f427ee49
A
206ZONE_VIEW_DEFINE(ZV_LCK_SPIN, "lck_spin",
207 KHEAP_ID_DEFAULT, sizeof(lck_spin_t));
208
209ZONE_VIEW_DEFINE(ZV_LCK_MTX, "lck_mtx",
210 KHEAP_ID_DEFAULT, sizeof(lck_mtx_t));
211
212ZONE_VIEW_DEFINE(ZV_LCK_MTX_EXT, "lck_mtx_ext",
213 KHEAP_ID_DEFAULT, sizeof(lck_mtx_ext_t));
214
215ZONE_VIEW_DEFINE(ZV_LCK_RW, "lck_rw",
216 KHEAP_ID_DEFAULT, sizeof(lck_rw_t));
217
5ba3f43e
A
218/*
219 * Forward declarations
220 */
221
222static void lck_rw_lock_shared_gen(lck_rw_t *lck);
223static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
224static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
225static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
226static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
227static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
5ba3f43e
A
228static boolean_t lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait);
229
230/*
231 * atomic exchange API is a low level abstraction of the operations
232 * to atomically read, modify, and write a pointer. This abstraction works
233 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
234 * well as the ARM exclusive instructions.
235 *
236 * atomic_exchange_begin() - begin exchange and retrieve current value
237 * atomic_exchange_complete() - conclude an exchange
238 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
239 */
cb323159
A
240__unused static uint32_t
241load_exclusive32(uint32_t *target, enum memory_order ord)
242{
243 uint32_t value;
244
245#if __arm__
f427ee49 246 if (_os_atomic_mo_has_release(ord)) {
cb323159
A
247 // Pre-load release barrier
248 atomic_thread_fence(memory_order_release);
249 }
250 value = __builtin_arm_ldrex(target);
251#else
f427ee49 252 if (_os_atomic_mo_has_acquire(ord)) {
cb323159
A
253 value = __builtin_arm_ldaex(target); // ldaxr
254 } else {
255 value = __builtin_arm_ldrex(target); // ldxr
256 }
257#endif // __arm__
258 return value;
259}
260
261__unused static boolean_t
262store_exclusive32(uint32_t *target, uint32_t value, enum memory_order ord)
263{
264 boolean_t err;
265
266#if __arm__
267 err = __builtin_arm_strex(value, target);
f427ee49 268 if (_os_atomic_mo_has_acquire(ord)) {
cb323159
A
269 // Post-store acquire barrier
270 atomic_thread_fence(memory_order_acquire);
271 }
272#else
f427ee49 273 if (_os_atomic_mo_has_release(ord)) {
cb323159
A
274 err = __builtin_arm_stlex(value, target); // stlxr
275 } else {
276 err = __builtin_arm_strex(value, target); // stxr
277 }
278#endif // __arm__
279 return !err;
280}
281
5ba3f43e
A
282static uint32_t
283atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
284{
0a7de745 285 uint32_t val;
5ba3f43e 286
cb323159
A
287#if __ARM_ATOMICS_8_1
288 ord = memory_order_relaxed;
289#endif
5ba3f43e
A
290 val = load_exclusive32(target, ord);
291 *previous = val;
292 return val;
293}
294
295static boolean_t
296atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
297{
cb323159
A
298#if __ARM_ATOMICS_8_1
299 return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
300#else
0a7de745 301 (void)previous; // Previous not needed, monitor is held
5ba3f43e 302 return store_exclusive32(target, newval, ord);
cb323159 303#endif
5ba3f43e
A
304}
305
/*
 * Cancel an exchange started with atomic_exchange_begin32() by clearing
 * the exclusive monitor.
 */
static void
atomic_exchange_abort(void)
{
	os_atomic_clear_exclusive();
}
311
312static boolean_t
313atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
314{
0a7de745 315 uint32_t value, prev;
5ba3f43e 316
0a7de745 317 for (;;) {
5ba3f43e
A
318 value = atomic_exchange_begin32(target, &prev, ord);
319 if (value & test_mask) {
0a7de745
A
320 if (wait) {
321 wait_for_event(); // Wait with monitor held
322 } else {
323 atomic_exchange_abort(); // Clear exclusive monitor
324 }
5ba3f43e
A
325 return FALSE;
326 }
327 value |= set_mask;
0a7de745 328 if (atomic_exchange_complete32(target, prev, value, ord)) {
5ba3f43e 329 return TRUE;
0a7de745 330 }
5ba3f43e
A
331 }
332}
333
cb323159
A
334inline boolean_t
335hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
336{
337 return atomic_test_and_set32(target, test_mask, set_mask, ord, wait);
338}
339
f427ee49
A
340/*
341 * To help _disable_preemption() inline everywhere with LTO,
342 * we keep these nice non inlineable functions as the panic()
343 * codegen setup is quite large and for weird reasons causes a frame.
344 */
345__abortlike
346static void
347_disable_preemption_overflow(void)
348{
349 panic("Preemption count overflow");
350}
351
0a7de745
A
352void
353_disable_preemption(void)
5ba3f43e 354{
cb323159
A
355 thread_t thread = current_thread();
356 unsigned int count = thread->machine.preemption_count;
5ba3f43e 357
f427ee49
A
358 if (__improbable(++count == 0)) {
359 _disable_preemption_overflow();
cb323159
A
360 }
361
362 os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);
5ba3f43e
A
363}
364
cb323159
A
365/*
366 * This function checks whether an AST_URGENT has been pended.
367 *
368 * It is called once the preemption has been reenabled, which means the thread
369 * may have been preempted right before this was called, and when this function
370 * actually performs the check, we've changed CPU.
371 *
372 * This race is however benign: the point of AST_URGENT is to trigger a context
373 * switch, so if one happened, there's nothing left to check for, and AST_URGENT
374 * was cleared in the process.
375 *
376 * It follows that this check cannot have false negatives, which allows us
377 * to avoid fiddling with interrupt state for the vast majority of cases
378 * when the check will actually be negative.
379 */
380static NOINLINE void
381kernel_preempt_check(thread_t thread)
5ba3f43e 382{
cb323159
A
383 cpu_data_t *cpu_data_ptr;
384 long state;
385
5ba3f43e
A
386#if __arm__
387#define INTERRUPT_MASK PSR_IRQF
0a7de745 388#else // __arm__
5ba3f43e 389#define INTERRUPT_MASK DAIF_IRQF
0a7de745 390#endif // __arm__
5ba3f43e 391
cb323159
A
392 /*
393 * This check is racy and could load from another CPU's pending_ast mask,
394 * but as described above, this can't have false negatives.
395 */
396 cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel);
397 if (__probable((cpu_data_ptr->cpu_pending_ast & AST_URGENT) == 0)) {
398 return;
0a7de745 399 }
cb323159
A
400
401 /* If interrupts are masked, we can't take an AST here */
402 state = get_interrupts();
403 if ((state & INTERRUPT_MASK) == 0) {
404 disable_interrupts_noread(); // Disable interrupts
405
406 /*
407 * Reload cpu_data_ptr: a context switch would cause it to change.
408 * Now that interrupts are disabled, this will debounce false positives.
409 */
410 cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel);
411 if (thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT) {
5ba3f43e
A
412#if __arm__
413#if __ARM_USER_PROTECT__
cb323159 414 uintptr_t up = arm_user_protect_begin(thread);
0a7de745 415#endif // __ARM_USER_PROTECT__
cb323159 416 enable_fiq();
0a7de745 417#endif // __arm__
cb323159 418 ast_taken_kernel(); // Handle urgent AST
5ba3f43e
A
419#if __arm__
420#if __ARM_USER_PROTECT__
cb323159 421 arm_user_protect_end(thread, up, TRUE);
0a7de745 422#endif // __ARM_USER_PROTECT__
cb323159
A
423 enable_interrupts();
424 return; // Return early on arm only due to FIQ enabling
0a7de745 425#endif // __arm__
0a7de745 426 }
cb323159 427 restore_interrupts(state); // Enable interrupts
5ba3f43e 428 }
5ba3f43e 429}
5ba3f43e 430
f427ee49
A
431/*
432 * To help _enable_preemption() inline everywhere with LTO,
433 * we keep these nice non inlineable functions as the panic()
434 * codegen setup is quite large and for weird reasons causes a frame.
435 */
436__abortlike
437static void
438_enable_preemption_underflow(void)
439{
440 panic("Preemption count underflow");
441}
442
5ba3f43e 443void
cb323159 444_enable_preemption(void)
5ba3f43e 445{
cb323159
A
446 thread_t thread = current_thread();
447 unsigned int count = thread->machine.preemption_count;
5ba3f43e 448
cb323159 449 if (__improbable(count == 0)) {
f427ee49 450 _enable_preemption_underflow();
0a7de745 451 }
cb323159 452 count -= 1;
5ba3f43e 453
cb323159
A
454 os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);
455 if (count == 0) {
456 kernel_preempt_check(thread);
0a7de745 457 }
f427ee49
A
458
459 os_compiler_barrier();
d9a64523
A
460}
461
cb323159
A
462int
463get_preemption_level(void)
d9a64523 464{
cb323159 465 return current_thread()->machine.preemption_count;
d9a64523 466}
5ba3f43e
A
467
468/*
469 * Routine: lck_spin_alloc_init
470 */
471lck_spin_t *
472lck_spin_alloc_init(
0a7de745
A
473 lck_grp_t * grp,
474 lck_attr_t * attr)
5ba3f43e 475{
f427ee49 476 lck_spin_t *lck;
5ba3f43e 477
f427ee49
A
478 lck = zalloc(ZV_LCK_SPIN);
479 lck_spin_init(lck, grp, attr);
0a7de745 480 return lck;
5ba3f43e
A
481}
482
483/*
484 * Routine: lck_spin_free
485 */
486void
487lck_spin_free(
0a7de745
A
488 lck_spin_t * lck,
489 lck_grp_t * grp)
5ba3f43e
A
490{
491 lck_spin_destroy(lck, grp);
f427ee49 492 zfree(ZV_LCK_SPIN, lck);
5ba3f43e
A
493}
494
495/*
496 * Routine: lck_spin_init
497 */
498void
499lck_spin_init(
0a7de745
A
500 lck_spin_t * lck,
501 lck_grp_t * grp,
502 __unused lck_attr_t * attr)
5ba3f43e 503{
5ba3f43e 504 lck->type = LCK_SPIN_TYPE;
cb323159
A
505 hw_lock_init(&lck->hwlock);
506 if (grp) {
507 lck_grp_reference(grp);
508 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
509 }
5ba3f43e
A
510}
511
512/*
513 * arm_usimple_lock is a lck_spin_t without a group or attributes
514 */
c3c9b80d 515MARK_AS_HIBERNATE_TEXT void inline
5ba3f43e
A
516arm_usimple_lock_init(simple_lock_t lck, __unused unsigned short initial_value)
517{
518 lck->type = LCK_SPIN_TYPE;
519 hw_lock_init(&lck->hwlock);
5ba3f43e
A
520}
521
522
523/*
524 * Routine: lck_spin_lock
525 */
526void
527lck_spin_lock(lck_spin_t *lock)
528{
0a7de745
A
529#if DEVELOPMENT || DEBUG
530 if (lock->type != LCK_SPIN_TYPE) {
531 panic("Invalid spinlock %p", lock);
532 }
533#endif // DEVELOPMENT || DEBUG
534 hw_lock_lock(&lock->hwlock, LCK_GRP_NULL);
535}
536
537void
538lck_spin_lock_grp(lck_spin_t *lock, lck_grp_t *grp)
539{
540#pragma unused(grp)
541#if DEVELOPMENT || DEBUG
542 if (lock->type != LCK_SPIN_TYPE) {
5ba3f43e 543 panic("Invalid spinlock %p", lock);
0a7de745
A
544 }
545#endif // DEVELOPMENT || DEBUG
546 hw_lock_lock(&lock->hwlock, grp);
5ba3f43e
A
547}
548
d9a64523
A
549/*
550 * Routine: lck_spin_lock_nopreempt
551 */
552void
553lck_spin_lock_nopreempt(lck_spin_t *lock)
554{
0a7de745
A
555#if DEVELOPMENT || DEBUG
556 if (lock->type != LCK_SPIN_TYPE) {
557 panic("Invalid spinlock %p", lock);
558 }
559#endif // DEVELOPMENT || DEBUG
560 hw_lock_lock_nopreempt(&lock->hwlock, LCK_GRP_NULL);
561}
562
563void
564lck_spin_lock_nopreempt_grp(lck_spin_t *lock, lck_grp_t *grp)
565{
566#pragma unused(grp)
567#if DEVELOPMENT || DEBUG
568 if (lock->type != LCK_SPIN_TYPE) {
d9a64523 569 panic("Invalid spinlock %p", lock);
0a7de745
A
570 }
571#endif // DEVELOPMENT || DEBUG
572 hw_lock_lock_nopreempt(&lock->hwlock, grp);
d9a64523
A
573}
574
5ba3f43e
A
575/*
576 * Routine: lck_spin_try_lock
577 */
578int
579lck_spin_try_lock(lck_spin_t *lock)
580{
0a7de745
A
581 return hw_lock_try(&lock->hwlock, LCK_GRP_NULL);
582}
583
584int
585lck_spin_try_lock_grp(lck_spin_t *lock, lck_grp_t *grp)
586{
587#pragma unused(grp)
588 return hw_lock_try(&lock->hwlock, grp);
5ba3f43e
A
589}
590
d9a64523
A
591/*
592 * Routine: lck_spin_try_lock_nopreempt
593 */
594int
595lck_spin_try_lock_nopreempt(lck_spin_t *lock)
596{
0a7de745
A
597 return hw_lock_try_nopreempt(&lock->hwlock, LCK_GRP_NULL);
598}
599
600int
601lck_spin_try_lock_nopreempt_grp(lck_spin_t *lock, lck_grp_t *grp)
602{
603#pragma unused(grp)
604 return hw_lock_try_nopreempt(&lock->hwlock, grp);
d9a64523
A
605}
606
5ba3f43e
A
607/*
608 * Routine: lck_spin_unlock
609 */
610void
611lck_spin_unlock(lck_spin_t *lock)
612{
0a7de745
A
613#if DEVELOPMENT || DEBUG
614 if ((LCK_MTX_STATE_TO_THREAD(lock->lck_spin_data) != current_thread()) && LOCK_CORRECTNESS_PANIC()) {
5ba3f43e 615 panic("Spinlock not owned by thread %p = %lx", lock, lock->lck_spin_data);
0a7de745
A
616 }
617 if (lock->type != LCK_SPIN_TYPE) {
5ba3f43e 618 panic("Invalid spinlock type %p", lock);
0a7de745
A
619 }
620#endif // DEVELOPMENT || DEBUG
5ba3f43e
A
621 hw_lock_unlock(&lock->hwlock);
622}
623
d9a64523
A
624/*
625 * Routine: lck_spin_unlock_nopreempt
626 */
627void
628lck_spin_unlock_nopreempt(lck_spin_t *lock)
629{
0a7de745
A
630#if DEVELOPMENT || DEBUG
631 if ((LCK_MTX_STATE_TO_THREAD(lock->lck_spin_data) != current_thread()) && LOCK_CORRECTNESS_PANIC()) {
d9a64523 632 panic("Spinlock not owned by thread %p = %lx", lock, lock->lck_spin_data);
0a7de745
A
633 }
634 if (lock->type != LCK_SPIN_TYPE) {
d9a64523 635 panic("Invalid spinlock type %p", lock);
0a7de745
A
636 }
637#endif // DEVELOPMENT || DEBUG
d9a64523
A
638 hw_lock_unlock_nopreempt(&lock->hwlock);
639}
640
5ba3f43e
A
641/*
642 * Routine: lck_spin_destroy
643 */
644void
645lck_spin_destroy(
0a7de745
A
646 lck_spin_t * lck,
647 lck_grp_t * grp)
5ba3f43e 648{
0a7de745 649 if (lck->lck_spin_data == LCK_SPIN_TAG_DESTROYED) {
5ba3f43e 650 return;
0a7de745 651 }
5ba3f43e 652 lck->lck_spin_data = LCK_SPIN_TAG_DESTROYED;
cb323159
A
653 if (grp) {
654 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
655 lck_grp_deallocate(grp);
656 }
5ba3f43e
A
657}
658
659/*
660 * Routine: kdp_lck_spin_is_acquired
661 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
662 */
663boolean_t
0a7de745
A
664kdp_lck_spin_is_acquired(lck_spin_t *lck)
665{
5ba3f43e
A
666 if (not_in_kdp) {
667 panic("panic: spinlock acquired check done outside of kernel debugger");
668 }
669 return ((lck->lck_spin_data & ~LCK_SPIN_TAG_DESTROYED) != 0) ? TRUE:FALSE;
670}
671
672/*
673 * Initialize a usimple_lock.
674 *
675 * No change in preemption state.
676 */
677void
678usimple_lock_init(
0a7de745
A
679 usimple_lock_t l,
680 unsigned short tag)
5ba3f43e 681{
5ba3f43e 682 simple_lock_init((simple_lock_t) l, tag);
5ba3f43e
A
683}
684
685
686/*
687 * Acquire a usimple_lock.
688 *
689 * Returns with preemption disabled. Note
690 * that the hw_lock routines are responsible for
691 * maintaining preemption state.
692 */
693void
0a7de745
A
694(usimple_lock)(
695 usimple_lock_t l
696 LCK_GRP_ARG(lck_grp_t *grp))
5ba3f43e 697{
0a7de745 698 simple_lock((simple_lock_t) l, LCK_GRP_PROBEARG(grp));
5ba3f43e
A
699}
700
701
702extern void sync(void);
703
704/*
705 * Release a usimple_lock.
706 *
707 * Returns with preemption enabled. Note
708 * that the hw_lock routines are responsible for
709 * maintaining preemption state.
710 */
711void
0a7de745
A
712(usimple_unlock)(
713 usimple_lock_t l)
5ba3f43e 714{
0a7de745 715 simple_unlock((simple_lock_t)l);
5ba3f43e
A
716}
717
718
719/*
720 * Conditionally acquire a usimple_lock.
721 *
722 * On success, returns with preemption disabled.
723 * On failure, returns with preemption in the same state
724 * as when first invoked. Note that the hw_lock routines
725 * are responsible for maintaining preemption state.
726 *
727 * XXX No stats are gathered on a miss; I preserved this
728 * behavior from the original assembly-language code, but
729 * doesn't it make sense to log misses? XXX
730 */
0a7de745
A
731unsigned
732int
733(usimple_lock_try)(
734 usimple_lock_t l
735 LCK_GRP_ARG(lck_grp_t *grp))
5ba3f43e 736{
0a7de745 737 return simple_lock_try((simple_lock_t) l, grp);
5ba3f43e
A
738}
739
5ba3f43e
A
740/*
741 * The C portion of the shared/exclusive locks package.
742 */
743
744/*
745 * compute the deadline to spin against when
746 * waiting for a change of state on a lck_rw_t
747 */
5ba3f43e
A
748static inline uint64_t
749lck_rw_deadline_for_spin(lck_rw_t *lck)
750{
0a7de745 751 lck_rw_word_t word;
5ba3f43e
A
752
753 word.data = ordered_load_rw(lck);
754 if (word.can_sleep) {
755 if (word.r_waiting || word.w_waiting || (word.shared_count > machine_info.max_cpus)) {
756 /*
757 * there are already threads waiting on this lock... this
758 * implies that they have spun beyond their deadlines waiting for
759 * the desired state to show up so we will not bother spinning at this time...
760 * or
761 * the current number of threads sharing this lock exceeds our capacity to run them
762 * concurrently and since all states we're going to spin for require the rw_shared_count
763 * to be at 0, we'll not bother spinning since the latency for this to happen is
764 * unpredictable...
765 */
0a7de745 766 return mach_absolute_time();
5ba3f43e 767 }
0a7de745
A
768 return mach_absolute_time() + MutexSpin;
769 } else {
770 return mach_absolute_time() + (100000LL * 1000000000LL);
771 }
5ba3f43e 772}
5ba3f43e
A
773
774static boolean_t
775lck_rw_drain_status(lck_rw_t *lock, uint32_t status_mask, boolean_t wait __unused)
776{
0a7de745
A
777 uint64_t deadline = 0;
778 uint32_t data;
5ba3f43e 779
0a7de745 780 if (wait) {
5ba3f43e 781 deadline = lck_rw_deadline_for_spin(lock);
0a7de745 782 }
5ba3f43e 783
0a7de745 784 for (;;) {
5ba3f43e 785 data = load_exclusive32(&lock->lck_rw_data, memory_order_acquire_smp);
0a7de745 786 if ((data & status_mask) == 0) {
5ba3f43e 787 break;
0a7de745
A
788 }
789 if (wait) {
5ba3f43e 790 wait_for_event();
0a7de745 791 } else {
cb323159 792 os_atomic_clear_exclusive();
0a7de745
A
793 }
794 if (!wait || (mach_absolute_time() >= deadline)) {
5ba3f43e 795 return FALSE;
0a7de745 796 }
5ba3f43e 797 }
cb323159 798 os_atomic_clear_exclusive();
5ba3f43e 799 return TRUE;
5ba3f43e
A
800}
801
802/*
803 * Spin while interlock is held.
804 */
805static inline void
806lck_rw_interlock_spin(lck_rw_t *lock)
807{
0a7de745 808 uint32_t data;
5ba3f43e 809
0a7de745 810 for (;;) {
5ba3f43e 811 data = load_exclusive32(&lock->lck_rw_data, memory_order_relaxed);
0a7de745 812 if (data & LCK_RW_INTERLOCK) {
5ba3f43e 813 wait_for_event();
0a7de745 814 } else {
cb323159 815 os_atomic_clear_exclusive();
5ba3f43e
A
816 return;
817 }
818 }
5ba3f43e
A
819}
820
821/*
822 * We disable interrupts while holding the RW interlock to prevent an
823 * interrupt from exacerbating hold time.
824 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
825 */
826static inline boolean_t
827lck_interlock_lock(lck_rw_t *lck)
828{
0a7de745 829 boolean_t istate;
5ba3f43e 830
0a7de745 831 istate = ml_set_interrupts_enabled(FALSE);
5ba3f43e
A
832 lck_rw_ilk_lock(lck);
833 return istate;
834}
835
836static inline void
837lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
838{
839 lck_rw_ilk_unlock(lck);
840 ml_set_interrupts_enabled(istate);
841}
842
843
0a7de745
A
844#define LCK_RW_GRAB_WANT 0
845#define LCK_RW_GRAB_SHARED 1
5ba3f43e
A
846
847static boolean_t
848lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait)
849{
0a7de745
A
850 uint64_t deadline = 0;
851 uint32_t data, prev;
852 boolean_t do_exch;
5ba3f43e 853
0a7de745 854 if (wait) {
5ba3f43e 855 deadline = lck_rw_deadline_for_spin(lock);
0a7de745 856 }
5ba3f43e 857
0a7de745 858 for (;;) {
5ba3f43e
A
859 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
860 if (data & LCK_RW_INTERLOCK) {
861 atomic_exchange_abort();
862 lck_rw_interlock_spin(lock);
863 continue;
864 }
865 do_exch = FALSE;
866 if (mode == LCK_RW_GRAB_WANT) {
867 if ((data & LCK_RW_WANT_EXCL) == 0) {
868 data |= LCK_RW_WANT_EXCL;
869 do_exch = TRUE;
870 }
0a7de745 871 } else { // LCK_RW_GRAB_SHARED
5ba3f43e 872 if (((data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) == 0) ||
0a7de745 873 (((data & LCK_RW_SHARED_MASK)) && ((data & LCK_RW_PRIV_EXCL) == 0))) {
5ba3f43e
A
874 data += LCK_RW_SHARED_READER;
875 do_exch = TRUE;
876 }
877 }
878 if (do_exch) {
0a7de745 879 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 880 return TRUE;
0a7de745 881 }
5ba3f43e 882 } else {
0a7de745 883 if (wait) { // Non-waiting
5ba3f43e 884 wait_for_event();
0a7de745 885 } else {
5ba3f43e 886 atomic_exchange_abort();
0a7de745
A
887 }
888 if (!wait || (mach_absolute_time() >= deadline)) {
5ba3f43e 889 return FALSE;
0a7de745 890 }
5ba3f43e
A
891 }
892 }
893}
894
895
896/*
897 * Routine: lck_rw_alloc_init
898 */
899lck_rw_t *
900lck_rw_alloc_init(
0a7de745
A
901 lck_grp_t *grp,
902 lck_attr_t *attr)
5ba3f43e 903{
f427ee49 904 lck_rw_t *lck;
5ba3f43e 905
f427ee49
A
906 lck = zalloc_flags(ZV_LCK_RW, Z_WAITOK | Z_ZERO);
907 lck_rw_init(lck, grp, attr);
5ba3f43e
A
908 return lck;
909}
910
911/*
912 * Routine: lck_rw_free
913 */
914void
915lck_rw_free(
0a7de745
A
916 lck_rw_t *lck,
917 lck_grp_t *grp)
5ba3f43e
A
918{
919 lck_rw_destroy(lck, grp);
f427ee49 920 zfree(ZV_LCK_RW, lck);
5ba3f43e
A
921}
922
923/*
924 * Routine: lck_rw_init
925 */
926void
927lck_rw_init(
0a7de745
A
928 lck_rw_t *lck,
929 lck_grp_t *grp,
930 lck_attr_t *attr)
5ba3f43e 931{
0a7de745 932 if (attr == LCK_ATTR_NULL) {
5ba3f43e 933 attr = &LockDefaultLckAttr;
0a7de745 934 }
5ba3f43e
A
935 memset(lck, 0, sizeof(lck_rw_t));
936 lck->lck_rw_can_sleep = TRUE;
0a7de745 937 if ((attr->lck_attr_val & LCK_ATTR_RW_SHARED_PRIORITY) == 0) {
5ba3f43e 938 lck->lck_rw_priv_excl = TRUE;
0a7de745 939 }
5ba3f43e
A
940
941 lck_grp_reference(grp);
942 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
943}
944
945
946/*
947 * Routine: lck_rw_destroy
948 */
949void
950lck_rw_destroy(
0a7de745
A
951 lck_rw_t *lck,
952 lck_grp_t *grp)
5ba3f43e 953{
0a7de745 954 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
5ba3f43e 955 return;
0a7de745 956 }
5ba3f43e
A
957#if MACH_LDEBUG
958 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
959#endif
960 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
961 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
962 lck_grp_deallocate(grp);
963 return;
964}
965
966/*
967 * Routine: lck_rw_lock
968 */
969void
970lck_rw_lock(
0a7de745
A
971 lck_rw_t *lck,
972 lck_rw_type_t lck_rw_type)
5ba3f43e 973{
0a7de745 974 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
5ba3f43e 975 lck_rw_lock_shared(lck);
0a7de745 976 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
5ba3f43e 977 lck_rw_lock_exclusive(lck);
0a7de745 978 } else {
5ba3f43e 979 panic("lck_rw_lock(): Invalid RW lock type: %x", lck_rw_type);
0a7de745 980 }
5ba3f43e
A
981}
982
f427ee49
A
983#define LCK_RW_LOCK_EXCLUSIVE_TAS(lck) (atomic_test_and_set32(&(lck)->lck_rw_data, \
984 (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), \
985 LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE))
986
987/*
988 * Routine: lck_rw_lock_exclusive_check_contended
989 */
990bool
991lck_rw_lock_exclusive_check_contended(lck_rw_t *lock)
992{
993 thread_t thread = current_thread();
994 bool contended = false;
995
996 if (lock->lck_rw_can_sleep) {
997 thread->rwlock_count++;
998 } else if (get_preemption_level() == 0) {
999 panic("Taking non-sleepable RW lock with preemption enabled");
1000 }
1001 if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1002#if CONFIG_DTRACE
1003 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1004#endif /* CONFIG_DTRACE */
1005 } else {
1006 contended = true;
1007 lck_rw_lock_exclusive_gen(lock);
1008 }
1009#if MACH_ASSERT
1010 thread_t owner = ordered_load_rw_owner(lock);
1011 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1012#endif
1013 ordered_store_rw_owner(lock, thread);
1014 return contended;
1015}
1016
5ba3f43e
A
1017/*
1018 * Routine: lck_rw_lock_exclusive
1019 */
1020void
1021lck_rw_lock_exclusive(lck_rw_t *lock)
1022{
0a7de745 1023 thread_t thread = current_thread();
5ba3f43e 1024
f427ee49
A
1025 if (lock->lck_rw_can_sleep) {
1026 thread->rwlock_count++;
1027 } else if (get_preemption_level() == 0) {
1028 panic("Taking non-sleepable RW lock with preemption enabled");
1029 }
1030 if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
0a7de745 1031#if CONFIG_DTRACE
5ba3f43e 1032 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
0a7de745
A
1033#endif /* CONFIG_DTRACE */
1034 } else {
5ba3f43e 1035 lck_rw_lock_exclusive_gen(lock);
0a7de745 1036 }
5ba3f43e
A
1037#if MACH_ASSERT
1038 thread_t owner = ordered_load_rw_owner(lock);
1039 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1040#endif
1041 ordered_store_rw_owner(lock, thread);
1042}
1043
1044/*
1045 * Routine: lck_rw_lock_shared
1046 */
1047void
1048lck_rw_lock_shared(lck_rw_t *lock)
1049{
0a7de745 1050 uint32_t data, prev;
5ba3f43e 1051
f427ee49
A
1052 if (lock->lck_rw_can_sleep) {
1053 current_thread()->rwlock_count++;
1054 } else if (get_preemption_level() == 0) {
1055 panic("Taking non-sleepable RW lock with preemption enabled");
1056 }
0a7de745 1057 for (;;) {
5ba3f43e
A
1058 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1059 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1060 atomic_exchange_abort();
1061 lck_rw_lock_shared_gen(lock);
1062 break;
1063 }
1064 data += LCK_RW_SHARED_READER;
0a7de745 1065 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 1066 break;
0a7de745 1067 }
5ba3f43e
A
1068 cpu_pause();
1069 }
1070#if MACH_ASSERT
1071 thread_t owner = ordered_load_rw_owner(lock);
1072 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1073#endif
0a7de745 1074#if CONFIG_DTRACE
5ba3f43e 1075 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
0a7de745 1076#endif /* CONFIG_DTRACE */
5ba3f43e
A
1077 return;
1078}
1079
1080/*
1081 * Routine: lck_rw_lock_shared_to_exclusive
cb323159
A
1082 *
1083 * False returned upon failure, in this case the shared lock is dropped.
5ba3f43e
A
1084 */
1085boolean_t
1086lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
1087{
0a7de745 1088 uint32_t data, prev;
5ba3f43e 1089
0a7de745 1090 for (;;) {
5ba3f43e
A
1091 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1092 if (data & LCK_RW_INTERLOCK) {
1093 atomic_exchange_abort();
1094 lck_rw_interlock_spin(lock);
1095 continue;
1096 }
1097 if (data & LCK_RW_WANT_UPGRADE) {
1098 data -= LCK_RW_SHARED_READER;
0a7de745
A
1099 if ((data & LCK_RW_SHARED_MASK) == 0) { /* we were the last reader */
1100 data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */
1101 }
1102 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 1103 return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
0a7de745 1104 }
5ba3f43e 1105 } else {
0a7de745
A
1106 data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */
1107 data -= LCK_RW_SHARED_READER; /* and shed our read count */
1108 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 1109 break;
0a7de745 1110 }
5ba3f43e
A
1111 }
1112 cpu_pause();
1113 }
0a7de745
A
1114 /* we now own the WANT_UPGRADE */
1115 if (data & LCK_RW_SHARED_MASK) { /* check to see if all of the readers are drained */
1116 lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */
1117 }
5ba3f43e
A
1118#if MACH_ASSERT
1119 thread_t owner = ordered_load_rw_owner(lock);
1120 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1121#endif
1122 ordered_store_rw_owner(lock, current_thread());
0a7de745 1123#if CONFIG_DTRACE
5ba3f43e 1124 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
0a7de745 1125#endif /* CONFIG_DTRACE */
5ba3f43e
A
1126 return TRUE;
1127}
1128
1129
1130/*
1131 * Routine: lck_rw_lock_shared_to_exclusive_failure
1132 * Function:
1133 * Fast path code has already dropped our read
1134 * count and determined that someone else owns 'lck_rw_want_upgrade'
1135 * if 'lck_rw_shared_count' == 0, it has also already dropped 'lck_w_waiting'
1136 * all we need to do here is determine if a wakeup is needed
1137 */
1138static boolean_t
1139lck_rw_lock_shared_to_exclusive_failure(
0a7de745
A
1140 lck_rw_t *lck,
1141 uint32_t prior_lock_state)
5ba3f43e 1142{
0a7de745
A
1143 thread_t thread = current_thread();
1144 uint32_t rwlock_count;
5ba3f43e
A
1145
1146 /* Check if dropping the lock means that we need to unpromote */
f427ee49
A
1147 if (lck->lck_rw_can_sleep) {
1148 rwlock_count = thread->rwlock_count--;
1149 } else {
1150 rwlock_count = UINT32_MAX;
1151 }
5ba3f43e
A
1152#if MACH_LDEBUG
1153 if (rwlock_count == 0) {
1154 panic("rw lock count underflow for thread %p", thread);
1155 }
1156#endif
1157 if ((prior_lock_state & LCK_RW_W_WAITING) &&
0a7de745 1158 ((prior_lock_state & LCK_RW_SHARED_MASK) == LCK_RW_SHARED_READER)) {
5ba3f43e
A
1159 /*
1160 * Someone else has requested upgrade.
1161 * Since we've released the read lock, wake
1162 * him up if he's blocked waiting
1163 */
1164 thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1165 }
1166
1167 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1168 /* sched_flags checked without lock, but will be rechecked while clearing */
d9a64523 1169 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
5ba3f43e
A
1170 }
1171
1172 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
0a7de745 1173 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
5ba3f43e 1174
0a7de745 1175 return FALSE;
5ba3f43e
A
1176}
1177
1178/*
1179 * Routine: lck_rw_lock_shared_to_exclusive_success
1180 * Function:
1181 * fast path code has already dropped our read
1182 * count and successfully acquired 'lck_rw_want_upgrade'
1183 * we just need to wait for the rest of the readers to drain
1184 * and then we can return as the exclusive holder of this lock
1185 */
1186static boolean_t
1187lck_rw_lock_shared_to_exclusive_success(
0a7de745
A
1188 lck_rw_t *lock)
1189{
1190 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
1191 int slept = 0;
1192 lck_rw_word_t word;
1193 wait_result_t res;
1194 boolean_t istate;
1195 boolean_t not_shared;
1196
1197#if CONFIG_DTRACE
1198 uint64_t wait_interval = 0;
1199 int readers_at_sleep = 0;
1200 boolean_t dtrace_ls_initialized = FALSE;
1201 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
5ba3f43e
A
1202#endif
1203
1204 while (!lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, FALSE)) {
5ba3f43e 1205 word.data = ordered_load_rw(lock);
0a7de745 1206#if CONFIG_DTRACE
5ba3f43e
A
1207 if (dtrace_ls_initialized == FALSE) {
1208 dtrace_ls_initialized = TRUE;
1209 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1210 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1211 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1212 if (dtrace_ls_enabled) {
1213 /*
1214 * Either sleeping or spinning is happening,
1215 * start a timing of our delay interval now.
1216 */
1217 readers_at_sleep = word.shared_count;
1218 wait_interval = mach_absolute_time();
1219 }
1220 }
1221#endif
1222
1223 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
0a7de745 1224 trace_lck, word.shared_count, 0, 0, 0);
5ba3f43e
A
1225
1226 not_shared = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, TRUE);
1227
1228 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
0a7de745 1229 trace_lck, lock->lck_rw_shared_count, 0, 0, 0);
5ba3f43e 1230
0a7de745 1231 if (not_shared) {
5ba3f43e 1232 break;
0a7de745 1233 }
5ba3f43e
A
1234
1235 /*
1236 * if we get here, the spin deadline in lck_rw_wait_on_status()
1237 * has expired w/o the rw_shared_count having drained to 0
1238 * check to see if we're allowed to do a thread_block
1239 */
1240 if (word.can_sleep) {
5ba3f43e 1241 istate = lck_interlock_lock(lock);
0a7de745 1242
5ba3f43e
A
1243 word.data = ordered_load_rw(lock);
1244 if (word.shared_count != 0) {
1245 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
0a7de745 1246 trace_lck, word.shared_count, 0, 0, 0);
5ba3f43e
A
1247
1248 word.w_waiting = 1;
1249 ordered_store_rw(lock, word.data);
1250
1251 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
d9a64523 1252 res = assert_wait(LCK_RW_WRITER_EVENT(lock),
0a7de745 1253 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
5ba3f43e
A
1254 lck_interlock_unlock(lock, istate);
1255
1256 if (res == THREAD_WAITING) {
1257 res = thread_block(THREAD_CONTINUE_NULL);
1258 slept++;
1259 }
1260 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
0a7de745 1261 trace_lck, res, slept, 0, 0);
5ba3f43e
A
1262 } else {
1263 lck_interlock_unlock(lock, istate);
1264 break;
1265 }
1266 }
1267 }
0a7de745 1268#if CONFIG_DTRACE
5ba3f43e
A
1269 /*
1270 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1271 */
1272 if (dtrace_ls_enabled == TRUE) {
1273 if (slept == 0) {
0a7de745 1274 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lock, mach_absolute_time() - wait_interval, 0);
5ba3f43e 1275 } else {
0a7de745 1276 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lock,
5ba3f43e
A
1277 mach_absolute_time() - wait_interval, 1,
1278 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1279 }
1280 }
1281 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 1);
1282#endif
0a7de745 1283 return TRUE;
5ba3f43e
A
1284}
1285
1286
1287/*
1288 * Routine: lck_rw_lock_exclusive_to_shared
1289 */
1290
0a7de745
A
1291void
1292lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
5ba3f43e 1293{
0a7de745 1294 uint32_t data, prev;
5ba3f43e
A
1295
1296 assertf(lock->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1297 ordered_store_rw_owner(lock, THREAD_NULL);
0a7de745 1298 for (;;) {
5ba3f43e
A
1299 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
1300 if (data & LCK_RW_INTERLOCK) {
5ba3f43e 1301 atomic_exchange_abort();
0a7de745 1302 lck_rw_interlock_spin(lock); /* wait for interlock to clear */
5ba3f43e 1303 continue;
5ba3f43e
A
1304 }
1305 data += LCK_RW_SHARED_READER;
0a7de745 1306 if (data & LCK_RW_WANT_UPGRADE) {
5ba3f43e 1307 data &= ~(LCK_RW_WANT_UPGRADE);
0a7de745 1308 } else {
5ba3f43e 1309 data &= ~(LCK_RW_WANT_EXCL);
0a7de745
A
1310 }
1311 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
5ba3f43e 1312 data &= ~(LCK_RW_W_WAITING);
0a7de745
A
1313 }
1314 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
5ba3f43e 1315 break;
0a7de745 1316 }
5ba3f43e
A
1317 cpu_pause();
1318 }
1319 return lck_rw_lock_exclusive_to_shared_gen(lock, prev);
1320}
1321
1322/*
1323 * Routine: lck_rw_lock_exclusive_to_shared_gen
0a7de745 1324 * Function:
5ba3f43e
A
1325 * Fast path has already dropped
1326 * our exclusive state and bumped lck_rw_shared_count
1327 * all we need to do here is determine if anyone
1328 * needs to be awakened.
1329 */
1330static void
1331lck_rw_lock_exclusive_to_shared_gen(
0a7de745
A
1332 lck_rw_t *lck,
1333 uint32_t prior_lock_state)
5ba3f43e 1334{
0a7de745
A
1335 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1336 lck_rw_word_t fake_lck;
5ba3f43e
A
1337
1338 /*
1339 * prior_lock state is a snapshot of the 1st word of the
1340 * lock in question... we'll fake up a pointer to it
1341 * and carefully not access anything beyond whats defined
1342 * in the first word of a lck_rw_t
1343 */
1344 fake_lck.data = prior_lock_state;
1345
1346 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
0a7de745 1347 trace_lck, fake_lck->want_excl, fake_lck->want_upgrade, 0, 0);
5ba3f43e
A
1348
1349 /*
1350 * don't wake up anyone waiting to take the lock exclusively
1351 * since we hold a read count... when the read count drops to 0,
1352 * the writers will be woken.
1353 *
1354 * wake up any waiting readers if we don't have any writers waiting,
1355 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1356 */
0a7de745 1357 if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
5ba3f43e 1358 thread_wakeup(LCK_RW_READER_EVENT(lck));
0a7de745 1359 }
5ba3f43e
A
1360
1361 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
0a7de745 1362 trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
5ba3f43e
A
1363
1364#if CONFIG_DTRACE
1365 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1366#endif
1367}
1368
1369
1370/*
1371 * Routine: lck_rw_try_lock
1372 */
1373boolean_t
1374lck_rw_try_lock(
0a7de745
A
1375 lck_rw_t *lck,
1376 lck_rw_type_t lck_rw_type)
5ba3f43e 1377{
0a7de745 1378 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
5ba3f43e 1379 return lck_rw_try_lock_shared(lck);
0a7de745 1380 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
5ba3f43e 1381 return lck_rw_try_lock_exclusive(lck);
0a7de745 1382 } else {
5ba3f43e 1383 panic("lck_rw_try_lock(): Invalid rw lock type: %x", lck_rw_type);
0a7de745 1384 }
5ba3f43e
A
1385 return FALSE;
1386}
1387
1388/*
1389 * Routine: lck_rw_try_lock_shared
1390 */
1391
0a7de745
A
1392boolean_t
1393lck_rw_try_lock_shared(lck_rw_t *lock)
5ba3f43e 1394{
0a7de745 1395 uint32_t data, prev;
5ba3f43e 1396
0a7de745 1397 for (;;) {
5ba3f43e
A
1398 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1399 if (data & LCK_RW_INTERLOCK) {
5ba3f43e
A
1400 atomic_exchange_abort();
1401 lck_rw_interlock_spin(lock);
1402 continue;
5ba3f43e
A
1403 }
1404 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1405 atomic_exchange_abort();
0a7de745 1406 return FALSE; /* lock is busy */
5ba3f43e 1407 }
0a7de745
A
1408 data += LCK_RW_SHARED_READER; /* Increment reader refcount */
1409 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 1410 break;
0a7de745 1411 }
5ba3f43e
A
1412 cpu_pause();
1413 }
1414#if MACH_ASSERT
1415 thread_t owner = ordered_load_rw_owner(lock);
1416 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1417#endif
f427ee49
A
1418
1419 if (lock->lck_rw_can_sleep) {
1420 current_thread()->rwlock_count++;
1421 } else if (get_preemption_level() == 0) {
1422 panic("Taking non-sleepable RW lock with preemption enabled");
1423 }
1424
0a7de745 1425#if CONFIG_DTRACE
5ba3f43e 1426 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
0a7de745 1427#endif /* CONFIG_DTRACE */
5ba3f43e
A
1428 return TRUE;
1429}
1430
1431
1432/*
1433 * Routine: lck_rw_try_lock_exclusive
1434 */
1435
0a7de745
A
1436boolean_t
1437lck_rw_try_lock_exclusive(lck_rw_t *lock)
5ba3f43e 1438{
0a7de745
A
1439 uint32_t data, prev;
1440 thread_t thread;
5ba3f43e 1441
0a7de745 1442 for (;;) {
5ba3f43e
A
1443 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1444 if (data & LCK_RW_INTERLOCK) {
5ba3f43e
A
1445 atomic_exchange_abort();
1446 lck_rw_interlock_spin(lock);
1447 continue;
5ba3f43e
A
1448 }
1449 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1450 atomic_exchange_abort();
1451 return FALSE;
1452 }
1453 data |= LCK_RW_WANT_EXCL;
0a7de745 1454 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 1455 break;
0a7de745 1456 }
5ba3f43e
A
1457 cpu_pause();
1458 }
1459 thread = current_thread();
f427ee49
A
1460 if (lock->lck_rw_can_sleep) {
1461 thread->rwlock_count++;
1462 } else if (get_preemption_level() == 0) {
1463 panic("Taking non-sleepable RW lock with preemption enabled");
1464 }
5ba3f43e
A
1465#if MACH_ASSERT
1466 thread_t owner = ordered_load_rw_owner(lock);
1467 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1468#endif
1469 ordered_store_rw_owner(lock, thread);
0a7de745 1470#if CONFIG_DTRACE
5ba3f43e 1471 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
0a7de745 1472#endif /* CONFIG_DTRACE */
5ba3f43e
A
1473 return TRUE;
1474}
1475
1476
1477/*
1478 * Routine: lck_rw_unlock
1479 */
1480void
1481lck_rw_unlock(
0a7de745
A
1482 lck_rw_t *lck,
1483 lck_rw_type_t lck_rw_type)
5ba3f43e 1484{
0a7de745 1485 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
5ba3f43e 1486 lck_rw_unlock_shared(lck);
0a7de745 1487 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
5ba3f43e 1488 lck_rw_unlock_exclusive(lck);
0a7de745 1489 } else {
5ba3f43e 1490 panic("lck_rw_unlock(): Invalid RW lock type: %d", lck_rw_type);
0a7de745 1491 }
5ba3f43e
A
1492}
1493
1494
1495/*
1496 * Routine: lck_rw_unlock_shared
1497 */
1498void
1499lck_rw_unlock_shared(
0a7de745 1500 lck_rw_t *lck)
5ba3f43e 1501{
0a7de745 1502 lck_rw_type_t ret;
5ba3f43e
A
1503
1504 assertf(lck->lck_rw_owner == THREAD_NULL, "state=0x%x, owner=%p", lck->lck_rw_data, lck->lck_rw_owner);
1505 assertf(lck->lck_rw_shared_count > 0, "shared_count=0x%x", lck->lck_rw_shared_count);
1506 ret = lck_rw_done(lck);
1507
0a7de745 1508 if (ret != LCK_RW_TYPE_SHARED) {
5ba3f43e 1509 panic("lck_rw_unlock_shared(): lock %p held in mode: %d", lck, ret);
0a7de745 1510 }
5ba3f43e
A
1511}
1512
1513
1514/*
1515 * Routine: lck_rw_unlock_exclusive
1516 */
1517void
1518lck_rw_unlock_exclusive(
0a7de745 1519 lck_rw_t *lck)
5ba3f43e 1520{
0a7de745 1521 lck_rw_type_t ret;
5ba3f43e
A
1522
1523 assertf(lck->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lck->lck_rw_data, lck->lck_rw_owner);
1524 ret = lck_rw_done(lck);
1525
0a7de745 1526 if (ret != LCK_RW_TYPE_EXCLUSIVE) {
5ba3f43e 1527 panic("lck_rw_unlock_exclusive(): lock %p held in mode: %d", lck, ret);
0a7de745 1528 }
5ba3f43e
A
1529}
1530
1531
1532/*
1533 * Routine: lck_rw_lock_exclusive_gen
1534 */
1535static void
1536lck_rw_lock_exclusive_gen(
0a7de745 1537 lck_rw_t *lock)
5ba3f43e 1538{
0a7de745
A
1539 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
1540 lck_rw_word_t word;
1541 int slept = 0;
1542 boolean_t gotlock = 0;
1543 boolean_t not_shared_or_upgrade = 0;
1544 wait_result_t res = 0;
1545 boolean_t istate;
5ba3f43e 1546
0a7de745 1547#if CONFIG_DTRACE
5ba3f43e 1548 boolean_t dtrace_ls_initialized = FALSE;
0a7de745 1549 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
5ba3f43e
A
1550 uint64_t wait_interval = 0;
1551 int readers_at_sleep = 0;
1552#endif
1553
1554 /*
1555 * Try to acquire the lck_rw_want_excl bit.
1556 */
1557 while (!lck_rw_grab(lock, LCK_RW_GRAB_WANT, FALSE)) {
0a7de745 1558#if CONFIG_DTRACE
5ba3f43e
A
1559 if (dtrace_ls_initialized == FALSE) {
1560 dtrace_ls_initialized = TRUE;
1561 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1562 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1563 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1564 if (dtrace_ls_enabled) {
1565 /*
1566 * Either sleeping or spinning is happening,
1567 * start a timing of our delay interval now.
1568 */
1569 readers_at_sleep = lock->lck_rw_shared_count;
1570 wait_interval = mach_absolute_time();
1571 }
1572 }
1573#endif
1574
1575 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1576
1577 gotlock = lck_rw_grab(lock, LCK_RW_GRAB_WANT, TRUE);
1578
1579 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
1580
0a7de745 1581 if (gotlock) {
5ba3f43e 1582 break;
0a7de745 1583 }
5ba3f43e
A
1584 /*
1585 * if we get here, the deadline has expired w/o us
1586 * being able to grab the lock exclusively
1587 * check to see if we're allowed to do a thread_block
1588 */
1589 word.data = ordered_load_rw(lock);
1590 if (word.can_sleep) {
5ba3f43e
A
1591 istate = lck_interlock_lock(lock);
1592 word.data = ordered_load_rw(lock);
1593
1594 if (word.want_excl) {
5ba3f43e
A
1595 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1596
1597 word.w_waiting = 1;
1598 ordered_store_rw(lock, word.data);
1599
1600 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
d9a64523 1601 res = assert_wait(LCK_RW_WRITER_EVENT(lock),
0a7de745 1602 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
5ba3f43e
A
1603 lck_interlock_unlock(lock, istate);
1604
1605 if (res == THREAD_WAITING) {
1606 res = thread_block(THREAD_CONTINUE_NULL);
1607 slept++;
1608 }
1609 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1610 } else {
1611 word.want_excl = 1;
1612 ordered_store_rw(lock, word.data);
1613 lck_interlock_unlock(lock, istate);
1614 break;
1615 }
1616 }
1617 }
1618 /*
1619 * Wait for readers (and upgrades) to finish...
1620 */
1621 while (!lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, FALSE)) {
0a7de745 1622#if CONFIG_DTRACE
5ba3f43e
A
1623 /*
1624 * Either sleeping or spinning is happening, start
1625 * a timing of our delay interval now. If we set it
1626 * to -1 we don't have accurate data so we cannot later
1627 * decide to record a dtrace spin or sleep event.
1628 */
1629 if (dtrace_ls_initialized == FALSE) {
1630 dtrace_ls_initialized = TRUE;
1631 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1632 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1633 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1634 if (dtrace_ls_enabled) {
1635 /*
1636 * Either sleeping or spinning is happening,
1637 * start a timing of our delay interval now.
1638 */
1639 readers_at_sleep = lock->lck_rw_shared_count;
1640 wait_interval = mach_absolute_time();
1641 }
1642 }
1643#endif
1644
1645 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1646
1647 not_shared_or_upgrade = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, TRUE);
1648
1649 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, not_shared_or_upgrade, 0);
1650
0a7de745 1651 if (not_shared_or_upgrade) {
5ba3f43e 1652 break;
0a7de745 1653 }
5ba3f43e
A
1654 /*
1655 * if we get here, the deadline has expired w/o us
1656 * being able to grab the lock exclusively
1657 * check to see if we're allowed to do a thread_block
1658 */
1659 word.data = ordered_load_rw(lock);
1660 if (word.can_sleep) {
5ba3f43e
A
1661 istate = lck_interlock_lock(lock);
1662 word.data = ordered_load_rw(lock);
1663
1664 if (word.shared_count != 0 || word.want_upgrade) {
1665 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1666
1667 word.w_waiting = 1;
1668 ordered_store_rw(lock, word.data);
1669
1670 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
d9a64523 1671 res = assert_wait(LCK_RW_WRITER_EVENT(lock),
0a7de745 1672 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
5ba3f43e
A
1673 lck_interlock_unlock(lock, istate);
1674
1675 if (res == THREAD_WAITING) {
1676 res = thread_block(THREAD_CONTINUE_NULL);
1677 slept++;
1678 }
1679 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1680 } else {
1681 lck_interlock_unlock(lock, istate);
1682 /*
1683 * must own the lock now, since we checked for
1684 * readers or upgrade owner behind the interlock
1685 * no need for a call to 'lck_rw_drain_status'
1686 */
1687 break;
1688 }
1689 }
1690 }
1691
0a7de745 1692#if CONFIG_DTRACE
5ba3f43e
A
1693 /*
1694 * Decide what latencies we suffered that are Dtrace events.
1695 * If we have set wait_interval, then we either spun or slept.
1696 * At least we get out from under the interlock before we record
1697 * which is the best we can do here to minimize the impact
1698 * of the tracing.
1699 * If we have set wait_interval to -1, then dtrace was not enabled when we
1700 * started sleeping/spinning so we don't record this event.
1701 */
1702 if (dtrace_ls_enabled == TRUE) {
1703 if (slept == 0) {
0a7de745 1704 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lock,
5ba3f43e
A
1705 mach_absolute_time() - wait_interval, 1);
1706 } else {
1707 /*
1708 * For the blocking case, we also record if when we blocked
1709 * it was held for read or write, and how many readers.
1710 * Notice that above we recorded this before we dropped
1711 * the interlock so the count is accurate.
1712 */
0a7de745 1713 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lock,
5ba3f43e
A
1714 mach_absolute_time() - wait_interval, 1,
1715 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1716 }
1717 }
1718 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, 1);
0a7de745 1719#endif /* CONFIG_DTRACE */
5ba3f43e
A
1720}
1721
1722/*
1723 * Routine: lck_rw_done
1724 */
1725
0a7de745
A
1726lck_rw_type_t
1727lck_rw_done(lck_rw_t *lock)
5ba3f43e 1728{
0a7de745
A
1729 uint32_t data, prev;
1730 boolean_t once = FALSE;
5ba3f43e 1731
0a7de745 1732 for (;;) {
5ba3f43e 1733 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
0a7de745 1734 if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */
5ba3f43e
A
1735 atomic_exchange_abort();
1736 lck_rw_interlock_spin(lock);
1737 continue;
5ba3f43e 1738 }
0a7de745 1739 if (data & LCK_RW_SHARED_MASK) { /* lock is held shared */
5ba3f43e
A
1740 assertf(lock->lck_rw_owner == THREAD_NULL, "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1741 data -= LCK_RW_SHARED_READER;
0a7de745 1742 if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
5ba3f43e 1743 goto check_waiters;
0a7de745
A
1744 }
1745 } else { /* if reader count == 0, must be exclusive lock */
5ba3f43e
A
1746 if (data & LCK_RW_WANT_UPGRADE) {
1747 data &= ~(LCK_RW_WANT_UPGRADE);
1748 } else {
0a7de745 1749 if (data & LCK_RW_WANT_EXCL) {
5ba3f43e 1750 data &= ~(LCK_RW_WANT_EXCL);
0a7de745 1751 } else { /* lock is not 'owned', panic */
5ba3f43e 1752 panic("Releasing non-exclusive RW lock without a reader refcount!");
0a7de745 1753 }
5ba3f43e
A
1754 }
1755 if (!once) {
1756 // Only check for holder and clear it once
1757 assertf(lock->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1758 ordered_store_rw_owner(lock, THREAD_NULL);
1759 once = TRUE;
1760 }
1761check_waiters:
1762 /*
1763 * test the original values to match what
1764 * lck_rw_done_gen is going to do to determine
1765 * which wakeups need to happen...
1766 *
1767 * if !(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting)
1768 */
1769 if (prev & LCK_RW_W_WAITING) {
1770 data &= ~(LCK_RW_W_WAITING);
0a7de745 1771 if ((prev & LCK_RW_PRIV_EXCL) == 0) {
5ba3f43e 1772 data &= ~(LCK_RW_R_WAITING);
0a7de745
A
1773 }
1774 } else {
5ba3f43e 1775 data &= ~(LCK_RW_R_WAITING);
0a7de745 1776 }
5ba3f43e 1777 }
0a7de745 1778 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
5ba3f43e 1779 break;
0a7de745 1780 }
5ba3f43e
A
1781 cpu_pause();
1782 }
1783 return lck_rw_done_gen(lock, prev);
1784}
1785
1786/*
1787 * Routine: lck_rw_done_gen
1788 *
1789 * called from lck_rw_done()...
1790 * prior_lock_state is the value in the 1st
0a7de745 1791 * word of the lock at the time of a successful
5ba3f43e 1792 * atomic compare and exchange with the new value...
0a7de745 1793 * it represents the state of the lock before we
5ba3f43e 1794 * decremented the rw_shared_count or cleared either
0a7de745 1795 * rw_want_upgrade or rw_want_write and
5ba3f43e 1796 * the lck_x_waiting bits... since the wrapper
0a7de745 1797 * routine has already changed the state atomically,
5ba3f43e
A
1798 * we just need to decide if we should
1799 * wake up anyone and what value to return... we do
1800 * this by examining the state of the lock before
1801 * we changed it
1802 */
1803static lck_rw_type_t
1804lck_rw_done_gen(
0a7de745
A
1805 lck_rw_t *lck,
1806 uint32_t prior_lock_state)
5ba3f43e 1807{
0a7de745
A
1808 lck_rw_word_t fake_lck;
1809 lck_rw_type_t lock_type;
1810 thread_t thread;
1811 uint32_t rwlock_count;
5ba3f43e
A
1812
1813 /*
1814 * prior_lock state is a snapshot of the 1st word of the
1815 * lock in question... we'll fake up a pointer to it
1816 * and carefully not access anything beyond whats defined
1817 * in the first word of a lck_rw_t
1818 */
1819 fake_lck.data = prior_lock_state;
1820
1821 if (fake_lck.shared_count <= 1) {
0a7de745 1822 if (fake_lck.w_waiting) {
5ba3f43e 1823 thread_wakeup(LCK_RW_WRITER_EVENT(lck));
0a7de745 1824 }
5ba3f43e 1825
0a7de745 1826 if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
5ba3f43e 1827 thread_wakeup(LCK_RW_READER_EVENT(lck));
0a7de745 1828 }
5ba3f43e 1829 }
0a7de745 1830 if (fake_lck.shared_count) {
5ba3f43e 1831 lock_type = LCK_RW_TYPE_SHARED;
0a7de745 1832 } else {
5ba3f43e 1833 lock_type = LCK_RW_TYPE_EXCLUSIVE;
0a7de745 1834 }
5ba3f43e
A
1835
1836 /* Check if dropping the lock means that we need to unpromote */
1837 thread = current_thread();
f427ee49
A
1838 if (fake_lck.can_sleep) {
1839 rwlock_count = thread->rwlock_count--;
1840 } else {
1841 rwlock_count = UINT32_MAX;
1842 }
5ba3f43e 1843#if MACH_LDEBUG
0a7de745 1844 if (rwlock_count == 0) {
5ba3f43e 1845 panic("rw lock count underflow for thread %p", thread);
0a7de745 1846 }
5ba3f43e
A
1847#endif
1848 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1849 /* sched_flags checked without lock, but will be rechecked while clearing */
d9a64523 1850 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
5ba3f43e
A
1851 }
1852#if CONFIG_DTRACE
1853 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1854#endif
1855 return lock_type;
1856}
1857
1858/*
1859 * Routine: lck_rw_lock_shared_gen
1860 * Function:
1861 * Fast path code has determined that this lock
1862 * is held exclusively... this is where we spin/block
1863 * until we can acquire the lock in the shared mode
1864 */
1865static void
1866lck_rw_lock_shared_gen(
0a7de745 1867 lck_rw_t *lck)
5ba3f43e 1868{
0a7de745
A
1869 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1870 lck_rw_word_t word;
1871 boolean_t gotlock = 0;
1872 int slept = 0;
1873 wait_result_t res = 0;
1874 boolean_t istate;
5ba3f43e 1875
0a7de745 1876#if CONFIG_DTRACE
5ba3f43e
A
1877 uint64_t wait_interval = 0;
1878 int readers_at_sleep = 0;
1879 boolean_t dtrace_ls_initialized = FALSE;
1880 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1881#endif /* CONFIG_DTRACE */
1882
0a7de745
A
1883 while (!lck_rw_grab(lck, LCK_RW_GRAB_SHARED, FALSE)) {
1884#if CONFIG_DTRACE
5ba3f43e
A
1885 if (dtrace_ls_initialized == FALSE) {
1886 dtrace_ls_initialized = TRUE;
1887 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1888 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1889 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1890 if (dtrace_ls_enabled) {
1891 /*
1892 * Either sleeping or spinning is happening,
1893 * start a timing of our delay interval now.
1894 */
1895 readers_at_sleep = lck->lck_rw_shared_count;
1896 wait_interval = mach_absolute_time();
1897 }
1898 }
1899#endif
1900
1901 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
0a7de745 1902 trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, 0, 0);
5ba3f43e
A
1903
1904 gotlock = lck_rw_grab(lck, LCK_RW_GRAB_SHARED, TRUE);
1905
1906 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
0a7de745 1907 trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, gotlock, 0);
5ba3f43e 1908
0a7de745 1909 if (gotlock) {
5ba3f43e 1910 break;
0a7de745 1911 }
5ba3f43e
A
1912 /*
1913 * if we get here, the deadline has expired w/o us
1914 * being able to grab the lock for read
1915 * check to see if we're allowed to do a thread_block
1916 */
1917 if (lck->lck_rw_can_sleep) {
5ba3f43e
A
1918 istate = lck_interlock_lock(lck);
1919
1920 word.data = ordered_load_rw(lck);
1921 if ((word.want_excl || word.want_upgrade) &&
1922 ((word.shared_count == 0) || word.priv_excl)) {
5ba3f43e 1923 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
0a7de745 1924 trace_lck, word.want_excl, word.want_upgrade, 0, 0);
5ba3f43e
A
1925
1926 word.r_waiting = 1;
1927 ordered_store_rw(lck, word.data);
1928
1929 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
d9a64523 1930 res = assert_wait(LCK_RW_READER_EVENT(lck),
0a7de745 1931 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
5ba3f43e
A
1932 lck_interlock_unlock(lck, istate);
1933
1934 if (res == THREAD_WAITING) {
1935 res = thread_block(THREAD_CONTINUE_NULL);
1936 slept++;
1937 }
1938 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
0a7de745 1939 trace_lck, res, slept, 0, 0);
5ba3f43e
A
1940 } else {
1941 word.shared_count++;
1942 ordered_store_rw(lck, word.data);
1943 lck_interlock_unlock(lck, istate);
1944 break;
1945 }
1946 }
1947 }
1948
0a7de745 1949#if CONFIG_DTRACE
5ba3f43e
A
1950 if (dtrace_ls_enabled == TRUE) {
1951 if (slept == 0) {
0a7de745 1952 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
5ba3f43e 1953 } else {
0a7de745 1954 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
5ba3f43e
A
1955 mach_absolute_time() - wait_interval, 0,
1956 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1957 }
1958 }
1959 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
0a7de745 1960#endif /* CONFIG_DTRACE */
5ba3f43e
A
1961}
1962
f427ee49
A
1963/*
1964 * Required to verify thread ownership for exclusive locks by virtue of PPL
1965 * usage
1966 */
5ba3f43e
A
1967void
1968lck_rw_assert(
0a7de745
A
1969 lck_rw_t *lck,
1970 unsigned int type)
5ba3f43e
A
1971{
1972 switch (type) {
1973 case LCK_RW_ASSERT_SHARED:
1974 if ((lck->lck_rw_shared_count != 0) &&
1975 (lck->lck_rw_owner == THREAD_NULL)) {
1976 return;
1977 }
1978 break;
1979 case LCK_RW_ASSERT_EXCLUSIVE:
1980 if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
0a7de745 1981 (lck->lck_rw_shared_count == 0) &&
5ba3f43e
A
1982 (lck->lck_rw_owner == current_thread())) {
1983 return;
1984 }
1985 break;
1986 case LCK_RW_ASSERT_HELD:
0a7de745
A
1987 if (lck->lck_rw_shared_count != 0) {
1988 return; // Held shared
1989 }
5ba3f43e
A
1990 if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
1991 (lck->lck_rw_owner == current_thread())) {
0a7de745 1992 return; // Held exclusive
5ba3f43e
A
1993 }
1994 break;
1995 case LCK_RW_ASSERT_NOTHELD:
1996 if ((lck->lck_rw_shared_count == 0) &&
0a7de745 1997 !(lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
5ba3f43e
A
1998 (lck->lck_rw_owner == THREAD_NULL)) {
1999 return;
2000 }
2001 break;
2002 default:
2003 break;
2004 }
2005 panic("rw lock (%p)%s held (mode=%u)", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type);
2006}
2007
2008
2009/*
2010 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
2011 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2012 */
2013boolean_t
0a7de745
A
2014kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck)
2015{
5ba3f43e
A
2016 if (not_in_kdp) {
2017 panic("panic: rw lock exclusive check done outside of kernel debugger");
2018 }
2019 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_excl) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2020}
2021
2022/*
2023 * The C portion of the mutex package. These routines are only invoked
2024 * if the optimized assembler routines can't do the work.
2025 */
2026
2027/*
2028 * Forward declaration
2029 */
2030
0a7de745 2031void
5ba3f43e 2032lck_mtx_ext_init(
0a7de745
A
2033 lck_mtx_ext_t * lck,
2034 lck_grp_t * grp,
2035 lck_attr_t * attr);
5ba3f43e
A
2036
2037/*
2038 * Routine: lck_mtx_alloc_init
2039 */
2040lck_mtx_t *
2041lck_mtx_alloc_init(
0a7de745
A
2042 lck_grp_t * grp,
2043 lck_attr_t * attr)
5ba3f43e
A
2044{
2045 lck_mtx_t *lck;
2046
f427ee49
A
2047 lck = zalloc(ZV_LCK_MTX);
2048 lck_mtx_init(lck, grp, attr);
0a7de745 2049 return lck;
5ba3f43e
A
2050}
2051
2052/*
2053 * Routine: lck_mtx_free
2054 */
2055void
2056lck_mtx_free(
0a7de745
A
2057 lck_mtx_t * lck,
2058 lck_grp_t * grp)
5ba3f43e
A
2059{
2060 lck_mtx_destroy(lck, grp);
f427ee49 2061 zfree(ZV_LCK_MTX, lck);
5ba3f43e
A
2062}
2063
2064/*
2065 * Routine: lck_mtx_init
2066 */
2067void
2068lck_mtx_init(
0a7de745
A
2069 lck_mtx_t * lck,
2070 lck_grp_t * grp,
2071 lck_attr_t * attr)
5ba3f43e 2072{
0a7de745 2073#ifdef BER_XXX
5ba3f43e
A
2074 lck_mtx_ext_t *lck_ext;
2075#endif
2076 lck_attr_t *lck_attr;
2077
0a7de745 2078 if (attr != LCK_ATTR_NULL) {
5ba3f43e 2079 lck_attr = attr;
0a7de745 2080 } else {
5ba3f43e 2081 lck_attr = &LockDefaultLckAttr;
0a7de745 2082 }
5ba3f43e 2083
0a7de745 2084#ifdef BER_XXX
5ba3f43e 2085 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
f427ee49
A
2086 lck_ext = zalloc(ZV_LCK_MTX_EXT);
2087 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2088 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2089 lck->lck_mtx_ptr = lck_ext;
2090 lck->lck_mtx_type = LCK_MTX_TYPE;
5ba3f43e
A
2091 } else
2092#endif
2093 {
0a7de745 2094 lck->lck_mtx_ptr = NULL; // Clear any padding in the union fields below
5ba3f43e 2095 lck->lck_mtx_waiters = 0;
5ba3f43e
A
2096 lck->lck_mtx_type = LCK_MTX_TYPE;
2097 ordered_store_mtx(lck, 0);
2098 }
2099 lck_grp_reference(grp);
2100 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2101}
2102
2103/*
2104 * Routine: lck_mtx_init_ext
2105 */
2106void
2107lck_mtx_init_ext(
0a7de745
A
2108 lck_mtx_t * lck,
2109 lck_mtx_ext_t * lck_ext,
2110 lck_grp_t * grp,
2111 lck_attr_t * attr)
5ba3f43e
A
2112{
2113 lck_attr_t *lck_attr;
2114
0a7de745 2115 if (attr != LCK_ATTR_NULL) {
5ba3f43e 2116 lck_attr = attr;
0a7de745 2117 } else {
5ba3f43e 2118 lck_attr = &LockDefaultLckAttr;
0a7de745 2119 }
5ba3f43e
A
2120
2121 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2122 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2123 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2124 lck->lck_mtx_ptr = lck_ext;
2125 lck->lck_mtx_type = LCK_MTX_TYPE;
2126 } else {
2127 lck->lck_mtx_waiters = 0;
5ba3f43e
A
2128 lck->lck_mtx_type = LCK_MTX_TYPE;
2129 ordered_store_mtx(lck, 0);
2130 }
2131 lck_grp_reference(grp);
2132 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2133}
2134
2135/*
2136 * Routine: lck_mtx_ext_init
2137 */
2138void
2139lck_mtx_ext_init(
0a7de745
A
2140 lck_mtx_ext_t * lck,
2141 lck_grp_t * grp,
2142 lck_attr_t * attr)
5ba3f43e
A
2143{
2144 bzero((void *) lck, sizeof(lck_mtx_ext_t));
2145
2146 lck->lck_mtx.lck_mtx_type = LCK_MTX_TYPE;
2147
2148 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2149 lck->lck_mtx_deb.type = MUTEX_TAG;
2150 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2151 }
2152 lck->lck_mtx_grp = grp;
2153
0a7de745 2154 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) {
5ba3f43e 2155 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
0a7de745 2156 }
5ba3f43e
A
2157}
2158
2159/* The slow versions */
2160static void lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2161static boolean_t lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread);
2162static void lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2163
0a7de745
A
2164/* The adaptive spin function */
2165static spinwait_result_t lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2166
5ba3f43e
A
2167/*
2168 * Routine: lck_mtx_verify
2169 *
2170 * Verify if a mutex is valid
2171 */
2172static inline void
2173lck_mtx_verify(lck_mtx_t *lock)
2174{
0a7de745 2175 if (lock->lck_mtx_type != LCK_MTX_TYPE) {
5ba3f43e 2176 panic("Invalid mutex %p", lock);
0a7de745
A
2177 }
2178#if DEVELOPMENT || DEBUG
2179 if (lock->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
5ba3f43e 2180 panic("Mutex destroyed %p", lock);
0a7de745
A
2181 }
2182#endif /* DEVELOPMENT || DEBUG */
5ba3f43e
A
2183}
2184
2185/*
2186 * Routine: lck_mtx_check_preemption
2187 *
2188 * Verify preemption is enabled when attempting to acquire a mutex.
2189 */
2190
2191static inline void
2192lck_mtx_check_preemption(lck_mtx_t *lock)
2193{
0a7de745 2194#if DEVELOPMENT || DEBUG
f427ee49
A
2195 if (current_cpu_datap()->cpu_hibernate) {
2196 return;
2197 }
2198
5ba3f43e
A
2199 int pl = get_preemption_level();
2200
0a7de745 2201 if (pl != 0) {
5ba3f43e 2202 panic("Attempt to take mutex with preemption disabled. Lock=%p, level=%d", lock, pl);
0a7de745 2203 }
5ba3f43e
A
2204#else
2205 (void)lock;
2206#endif
2207}
2208
2209/*
2210 * Routine: lck_mtx_lock
2211 */
2212void
2213lck_mtx_lock(lck_mtx_t *lock)
2214{
0a7de745 2215 thread_t thread;
5ba3f43e
A
2216
2217 lck_mtx_verify(lock);
2218 lck_mtx_check_preemption(lock);
2219 thread = current_thread();
cb323159
A
2220 if (os_atomic_cmpxchg(&lock->lck_mtx_data,
2221 0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
0a7de745 2222#if CONFIG_DTRACE
5ba3f43e
A
2223 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
2224#endif /* CONFIG_DTRACE */
2225 return;
2226 }
2227 lck_mtx_lock_contended(lock, thread, FALSE);
2228}
2229
2230/*
0a7de745 2231 * This is the slow version of mutex locking.
5ba3f43e
A
2232 */
2233static void NOINLINE
2234lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
2235{
0a7de745
A
2236 thread_t holding_thread;
2237 uintptr_t state;
2238 int waiters = 0;
2239 spinwait_result_t sw_res;
cb323159 2240 struct turnstile *ts = NULL;
d9a64523
A
2241
2242 /* Loop waiting until I see that the mutex is unowned */
0a7de745
A
2243 for (;;) {
2244 sw_res = lck_mtx_lock_contended_spinwait_arm(lock, thread, interlocked);
2245 interlocked = FALSE;
2246
2247 switch (sw_res) {
2248 case SPINWAIT_ACQUIRED:
cb323159
A
2249 if (ts != NULL) {
2250 interlock_lock(lock);
2251 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2252 interlock_unlock(lock);
2253 }
0a7de745
A
2254 goto done;
2255 case SPINWAIT_INTERLOCK:
2256 goto set_owner;
2257 default:
2258 break;
2259 }
2260
5ba3f43e
A
2261 state = ordered_load_mtx(lock);
2262 holding_thread = LCK_MTX_STATE_TO_THREAD(state);
0a7de745 2263 if (holding_thread == NULL) {
5ba3f43e 2264 break;
0a7de745 2265 }
5ba3f43e 2266 ordered_store_mtx(lock, (state | LCK_ILOCK | ARM_LCK_WAITERS)); // Set waiters bit and wait
cb323159 2267 lck_mtx_lock_wait(lock, holding_thread, &ts);
d9a64523 2268 /* returns interlock unlocked */
5ba3f43e 2269 }
d9a64523 2270
0a7de745 2271set_owner:
d9a64523 2272 /* Hooray, I'm the new owner! */
0a7de745
A
2273 state = ordered_load_mtx(lock);
2274
2275 if (state & ARM_LCK_WAITERS) {
2276 /* Skip lck_mtx_lock_acquire if there are no waiters. */
cb323159
A
2277 waiters = lck_mtx_lock_acquire(lock, ts);
2278 /*
2279 * lck_mtx_lock_acquire will call
2280 * turnstile_complete
2281 */
2282 } else {
2283 if (ts != NULL) {
2284 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2285 }
0a7de745
A
2286 }
2287
5ba3f43e 2288 state = LCK_MTX_THREAD_TO_STATE(thread);
0a7de745 2289 if (waiters != 0) {
5ba3f43e 2290 state |= ARM_LCK_WAITERS;
0a7de745 2291 }
0a7de745
A
2292 state |= LCK_ILOCK; // Preserve interlock
2293 ordered_store_mtx(lock, state); // Set ownership
2294 interlock_unlock(lock); // Release interlock, enable preemption
0a7de745
A
2295
2296done:
5ba3f43e
A
2297 load_memory_barrier();
2298
cb323159
A
2299 assert(thread->turnstile != NULL);
2300
2301 if (ts != NULL) {
2302 turnstile_cleanup();
2303 }
2304
0a7de745 2305#if CONFIG_DTRACE
5ba3f43e
A
2306 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
2307#endif /* CONFIG_DTRACE */
2308}
2309
0a7de745
A
2310/*
2311 * Routine: lck_mtx_lock_spinwait_arm
2312 *
2313 * Invoked trying to acquire a mutex when there is contention but
2314 * the holder is running on another processor. We spin for up to a maximum
2315 * time waiting for the lock to be released.
2316 */
2317static spinwait_result_t
2318lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
2319{
2320 int has_interlock = (int)interlocked;
0a7de745 2321 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
ea3f0419
A
2322 thread_t owner, prev_owner;
2323 uint64_t window_deadline, sliding_deadline, high_deadline;
2324 uint64_t start_time, cur_time, avg_hold_time, bias, delta;
2325 int loopcount = 0;
2326 uint i, prev_owner_cpu;
2327 int total_hold_time_samples, window_hold_time_samples, unfairness;
2328 bool owner_on_core, adjust;
2329 uintptr_t state, new_state, waiters;
2330 spinwait_result_t retval = SPINWAIT_DID_SPIN_HIGH_THR;
0a7de745
A
2331
2332 if (__improbable(!(lck_mtx_adaptive_spin_mode & ADAPTIVE_SPIN_ENABLE))) {
2333 if (!has_interlock) {
2334 interlock_lock(lock);
2335 }
2336
2337 return SPINWAIT_DID_NOT_SPIN;
2338 }
2339
0a7de745
A
2340 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
2341 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(LCK_MTX_STATE_TO_THREAD(state)), lock->lck_mtx_waiters, 0, 0);
2342
ea3f0419
A
2343 start_time = mach_absolute_time();
2344 /*
2345 * window_deadline represents the "learning" phase.
2346 * The thread collects statistics about the lock during
2347 * window_deadline and then it makes a decision on whether to spin more
2348 * or block according to the concurrency behavior
2349 * observed.
2350 *
2351 * Every thread can spin at least low_MutexSpin.
2352 */
2353 window_deadline = start_time + low_MutexSpin;
2354 /*
2355 * Sliding_deadline is the adjusted spin deadline
2356 * computed after the "learning" phase.
2357 */
2358 sliding_deadline = window_deadline;
2359 /*
2360 * High_deadline is a hard deadline. No thread
2361 * can spin more than this deadline.
2362 */
2363 if (high_MutexSpin >= 0) {
2364 high_deadline = start_time + high_MutexSpin;
2365 } else {
2366 high_deadline = start_time + low_MutexSpin * real_ncpus;
0a7de745
A
2367 }
2368
ea3f0419
A
2369 /*
2370 * Do not know yet which is the owner cpu.
2371 * Initialize prev_owner_cpu with next cpu.
2372 */
2373 prev_owner_cpu = (cpu_number() + 1) % real_ncpus;
2374 total_hold_time_samples = 0;
2375 window_hold_time_samples = 0;
2376 avg_hold_time = 0;
2377 adjust = TRUE;
2378 bias = (os_hash_kernel_pointer(lock) + cpu_number()) % real_ncpus;
2379
0a7de745
A
2380 /* Snoop the lock state */
2381 state = ordered_load_mtx(lock);
ea3f0419
A
2382 owner = LCK_MTX_STATE_TO_THREAD(state);
2383 prev_owner = owner;
2384
2385 if (has_interlock) {
2386 if (owner == NULL) {
2387 retval = SPINWAIT_INTERLOCK;
2388 goto done_spinning;
2389 } else {
2390 /*
2391 * We are holding the interlock, so
2392 * we can safely dereference owner.
2393 */
f427ee49 2394 if (!machine_thread_on_core(owner) || (owner->state & TH_IDLE)) {
ea3f0419
A
2395 retval = SPINWAIT_DID_NOT_SPIN;
2396 goto done_spinning;
2397 }
2398 }
2399 interlock_unlock(lock);
2400 has_interlock = 0;
2401 }
0a7de745
A
2402
2403 /*
2404 * Spin while:
2405 * - mutex is locked, and
2406 * - it's locked as a spin lock, and
2407 * - owner is running on another processor, and
0a7de745
A
2408 * - we haven't spun for long enough.
2409 */
2410 do {
ea3f0419
A
2411 /*
2412 * Try to acquire the lock.
2413 */
2414 owner = LCK_MTX_STATE_TO_THREAD(state);
2415 if (owner == NULL) {
2416 waiters = state & ARM_LCK_WAITERS;
2417 if (waiters) {
2418 /*
2419 * preserve the waiter bit
2420 * and try acquire the interlock.
2421 * Note: we will successfully acquire
2422 * the interlock only if we can also
2423 * acquire the lock.
2424 */
2425 new_state = ARM_LCK_WAITERS | LCK_ILOCK;
2426 has_interlock = 1;
2427 retval = SPINWAIT_INTERLOCK;
2428 disable_preemption();
2429 } else {
2430 new_state = LCK_MTX_THREAD_TO_STATE(thread);
2431 retval = SPINWAIT_ACQUIRED;
0a7de745
A
2432 }
2433
ea3f0419
A
2434 /*
2435 * The cmpxchg will succed only if the lock
2436 * is not owned (doesn't have an owner set)
2437 * and it is not interlocked.
2438 * It will not fail if there are waiters.
2439 */
2440 if (os_atomic_cmpxchgv(&lock->lck_mtx_data,
2441 waiters, new_state, &state, acquire)) {
2442 goto done_spinning;
2443 } else {
2444 if (waiters) {
2445 has_interlock = 0;
2446 enable_preemption();
2447 }
2448 }
2449 }
0a7de745 2450
ea3f0419 2451 cur_time = mach_absolute_time();
0a7de745 2452
ea3f0419
A
2453 /*
2454 * Never spin past high_deadline.
2455 */
2456 if (cur_time >= high_deadline) {
2457 retval = SPINWAIT_DID_SPIN_HIGH_THR;
2458 break;
2459 }
0a7de745 2460
ea3f0419
A
2461 /*
2462 * Check if owner is on core. If not block.
2463 */
2464 owner = LCK_MTX_STATE_TO_THREAD(state);
2465 if (owner) {
2466 i = prev_owner_cpu;
2467 owner_on_core = FALSE;
0a7de745 2468
ea3f0419
A
2469 disable_preemption();
2470 state = ordered_load_mtx(lock);
2471 owner = LCK_MTX_STATE_TO_THREAD(state);
0a7de745 2472
ea3f0419
A
2473 /*
2474 * For scalability we want to check if the owner is on core
2475 * without locking the mutex interlock.
2476 * If we do not lock the mutex interlock, the owner that we see might be
2477 * invalid, so we cannot dereference it. Therefore we cannot check
2478 * any field of the thread to tell us if it is on core.
2479 * Check if the thread that is running on the other cpus matches the owner.
2480 */
2481 if (owner) {
2482 do {
2483 cpu_data_t *cpu_data_ptr = CpuDataEntries[i].cpu_data_vaddr;
2484 if ((cpu_data_ptr != NULL) && (cpu_data_ptr->cpu_active_thread == owner)) {
2485 owner_on_core = TRUE;
2486 break;
0a7de745 2487 }
ea3f0419
A
2488 if (++i >= real_ncpus) {
2489 i = 0;
2490 }
2491 } while (i != prev_owner_cpu);
2492 enable_preemption();
2493
2494 if (owner_on_core) {
2495 prev_owner_cpu = i;
2496 } else {
2497 prev_owner = owner;
2498 state = ordered_load_mtx(lock);
2499 owner = LCK_MTX_STATE_TO_THREAD(state);
2500 if (owner == prev_owner) {
2501 /*
2502 * Owner is not on core.
2503 * Stop spinning.
2504 */
2505 if (loopcount == 0) {
2506 retval = SPINWAIT_DID_NOT_SPIN;
2507 } else {
2508 retval = SPINWAIT_DID_SPIN_OWNER_NOT_CORE;
2509 }
2510 break;
2511 }
2512 /*
2513 * Fall through if the owner changed while we were scanning.
2514 * The new owner could potentially be on core, so loop
2515 * again.
2516 */
0a7de745 2517 }
ea3f0419
A
2518 } else {
2519 enable_preemption();
0a7de745
A
2520 }
2521 }
2522
ea3f0419
A
2523 /*
2524 * Save how many times we see the owner changing.
2525 * We can roughly estimate the the mutex hold
2526 * time and the fairness with that.
2527 */
2528 if (owner != prev_owner) {
2529 prev_owner = owner;
2530 total_hold_time_samples++;
2531 window_hold_time_samples++;
0a7de745
A
2532 }
2533
ea3f0419
A
2534 /*
2535 * Learning window expired.
2536 * Try to adjust the sliding_deadline.
2537 */
2538 if (cur_time >= window_deadline) {
2539 /*
2540 * If there was not contention during the window
2541 * stop spinning.
2542 */
2543 if (window_hold_time_samples < 1) {
2544 retval = SPINWAIT_DID_SPIN_NO_WINDOW_CONTENTION;
2545 break;
2546 }
2547
2548 if (adjust) {
2549 /*
2550 * For a fair lock, we'd wait for at most (NCPU-1) periods,
2551 * but the lock is unfair, so let's try to estimate by how much.
2552 */
2553 unfairness = total_hold_time_samples / real_ncpus;
2554
2555 if (unfairness == 0) {
2556 /*
2557 * We observed the owner changing `total_hold_time_samples` times which
2558 * let us estimate the average hold time of this mutex for the duration
2559 * of the spin time.
2560 * avg_hold_time = (cur_time - start_time) / total_hold_time_samples;
2561 *
2562 * In this case spin at max avg_hold_time * (real_ncpus - 1)
2563 */
2564 delta = cur_time - start_time;
2565 sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples;
2566 } else {
2567 /*
2568 * In this case at least one of the other cpus was able to get the lock twice
2569 * while I was spinning.
2570 * We could spin longer but it won't necessarily help if the system is unfair.
2571 * Try to randomize the wait to reduce contention.
2572 *
2573 * We compute how much time we could potentially spin
2574 * and distribute it over the cpus.
2575 *
2576 * bias is an integer between 0 and real_ncpus.
2577 * distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias
2578 */
2579 delta = high_deadline - cur_time;
2580 sliding_deadline = cur_time + ((delta * bias) / real_ncpus);
2581 adjust = FALSE;
2582 }
2583 }
0a7de745 2584
ea3f0419
A
2585 window_deadline += low_MutexSpin;
2586 window_hold_time_samples = 0;
0a7de745
A
2587 }
2588
ea3f0419
A
2589 /*
2590 * Stop spinning if we past
2591 * the adjusted deadline.
2592 */
2593 if (cur_time >= sliding_deadline) {
2594 retval = SPINWAIT_DID_SPIN_SLIDING_THR;
2595 break;
2596 }
0a7de745 2597
ea3f0419
A
2598 /*
2599 * We want to arm the monitor for wfe,
2600 * so load exclusively the lock.
2601 *
2602 * NOTE:
2603 * we rely on the fact that wfe will
2604 * eventually return even if the cache line
2605 * is not modified. This way we will keep
2606 * looping and checking if the deadlines expired.
2607 */
2608 state = os_atomic_load_exclusive(&lock->lck_mtx_data, relaxed);
2609 owner = LCK_MTX_STATE_TO_THREAD(state);
2610 if (owner != NULL) {
2611 wait_for_event();
2612 state = ordered_load_mtx(lock);
2613 } else {
2614 atomic_exchange_abort();
0a7de745
A
2615 }
2616
2617 loopcount++;
2618 } while (TRUE);
2619
ea3f0419 2620done_spinning:
0a7de745
A
2621#if CONFIG_DTRACE
2622 /*
0a7de745
A
2623 * Note that we record a different probe id depending on whether
2624 * this is a direct or indirect mutex. This allows us to
2625 * penalize only lock groups that have debug/stats enabled
2626 * with dtrace processing if desired.
2627 */
2628 if (__probable(lock->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)) {
2629 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, lock,
ea3f0419 2630 mach_absolute_time() - start_time);
0a7de745
A
2631 } else {
2632 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, lock,
ea3f0419 2633 mach_absolute_time() - start_time);
0a7de745
A
2634 }
2635 /* The lockstat acquire event is recorded by the caller. */
2636#endif
2637
2638 state = ordered_load_mtx(lock);
2639
2640 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
2641 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(LCK_MTX_STATE_TO_THREAD(state)), lock->lck_mtx_waiters, retval, 0);
0a7de745
A
2642 if ((!has_interlock) && (retval != SPINWAIT_ACQUIRED)) {
2643 /* We must own either the lock or the interlock on return. */
2644 interlock_lock(lock);
2645 }
2646
2647 return retval;
2648}
2649
ea3f0419 2650
5ba3f43e
A
2651/*
2652 * Common code for mutex locking as spinlock
2653 */
2654static inline void
2655lck_mtx_lock_spin_internal(lck_mtx_t *lock, boolean_t allow_held_as_mutex)
2656{
0a7de745 2657 uintptr_t state;
5ba3f43e
A
2658
2659 interlock_lock(lock);
2660 state = ordered_load_mtx(lock);
2661 if (LCK_MTX_STATE_TO_THREAD(state)) {
0a7de745 2662 if (allow_held_as_mutex) {
5ba3f43e 2663 lck_mtx_lock_contended(lock, current_thread(), TRUE);
0a7de745 2664 } else {
5ba3f43e
A
2665 // "Always" variants can never block. If the lock is held and blocking is not allowed
2666 // then someone is mixing always and non-always calls on the same lock, which is
2667 // forbidden.
2668 panic("Attempting to block on a lock taken as spin-always %p", lock);
0a7de745 2669 }
5ba3f43e
A
2670 return;
2671 }
0a7de745
A
2672 state &= ARM_LCK_WAITERS; // Preserve waiters bit
2673 state |= (LCK_MTX_SPIN_TAG | LCK_ILOCK); // Add spin tag and maintain interlock
5ba3f43e
A
2674 ordered_store_mtx(lock, state);
2675 load_memory_barrier();
2676
0a7de745 2677#if CONFIG_DTRACE
5ba3f43e
A
2678 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
2679#endif /* CONFIG_DTRACE */
2680}
2681
2682/*
2683 * Routine: lck_mtx_lock_spin
2684 */
2685void
2686lck_mtx_lock_spin(lck_mtx_t *lock)
2687{
2688 lck_mtx_check_preemption(lock);
2689 lck_mtx_lock_spin_internal(lock, TRUE);
2690}
2691
2692/*
2693 * Routine: lck_mtx_lock_spin_always
2694 */
2695void
2696lck_mtx_lock_spin_always(lck_mtx_t *lock)
2697{
2698 lck_mtx_lock_spin_internal(lock, FALSE);
2699}
2700
2701/*
2702 * Routine: lck_mtx_try_lock
2703 */
2704boolean_t
2705lck_mtx_try_lock(lck_mtx_t *lock)
2706{
0a7de745 2707 thread_t thread = current_thread();
5ba3f43e
A
2708
2709 lck_mtx_verify(lock);
cb323159
A
2710 if (os_atomic_cmpxchg(&lock->lck_mtx_data,
2711 0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
0a7de745 2712#if CONFIG_DTRACE
5ba3f43e
A
2713 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_ACQUIRE, lock, 0);
2714#endif /* CONFIG_DTRACE */
2715 return TRUE;
2716 }
2717 return lck_mtx_try_lock_contended(lock, thread);
2718}
2719
2720static boolean_t NOINLINE
2721lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread)
2722{
0a7de745
A
2723 thread_t holding_thread;
2724 uintptr_t state;
2725 int waiters;
5ba3f43e 2726
5ba3f43e
A
2727 interlock_lock(lock);
2728 state = ordered_load_mtx(lock);
2729 holding_thread = LCK_MTX_STATE_TO_THREAD(state);
2730 if (holding_thread) {
2731 interlock_unlock(lock);
2732 return FALSE;
2733 }
cb323159 2734 waiters = lck_mtx_lock_acquire(lock, NULL);
5ba3f43e 2735 state = LCK_MTX_THREAD_TO_STATE(thread);
0a7de745 2736 if (waiters != 0) {
5ba3f43e 2737 state |= ARM_LCK_WAITERS;
0a7de745 2738 }
0a7de745
A
2739 state |= LCK_ILOCK; // Preserve interlock
2740 ordered_store_mtx(lock, state); // Set ownership
2741 interlock_unlock(lock); // Release interlock, enable preemption
5ba3f43e 2742 load_memory_barrier();
cb323159
A
2743
2744 turnstile_cleanup();
2745
5ba3f43e
A
2746 return TRUE;
2747}
2748
2749static inline boolean_t
2750lck_mtx_try_lock_spin_internal(lck_mtx_t *lock, boolean_t allow_held_as_mutex)
2751{
0a7de745 2752 uintptr_t state;
5ba3f43e 2753
0a7de745 2754 if (!interlock_try(lock)) {
5ba3f43e 2755 return FALSE;
0a7de745 2756 }
5ba3f43e 2757 state = ordered_load_mtx(lock);
0a7de745 2758 if (LCK_MTX_STATE_TO_THREAD(state)) {
5ba3f43e 2759 // Lock is held as mutex
0a7de745 2760 if (allow_held_as_mutex) {
5ba3f43e 2761 interlock_unlock(lock);
0a7de745 2762 } else {
5ba3f43e
A
2763 // "Always" variants can never block. If the lock is held as a normal mutex
2764 // then someone is mixing always and non-always calls on the same lock, which is
2765 // forbidden.
2766 panic("Spin-mutex held as full mutex %p", lock);
0a7de745 2767 }
5ba3f43e
A
2768 return FALSE;
2769 }
0a7de745
A
2770 state &= ARM_LCK_WAITERS; // Preserve waiters bit
2771 state |= (LCK_MTX_SPIN_TAG | LCK_ILOCK); // Add spin tag and maintain interlock
5ba3f43e
A
2772 ordered_store_mtx(lock, state);
2773 load_memory_barrier();
2774
0a7de745 2775#if CONFIG_DTRACE
5ba3f43e
A
2776 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
2777#endif /* CONFIG_DTRACE */
2778 return TRUE;
2779}
2780
2781/*
2782 * Routine: lck_mtx_try_lock_spin
2783 */
2784boolean_t
2785lck_mtx_try_lock_spin(lck_mtx_t *lock)
2786{
2787 return lck_mtx_try_lock_spin_internal(lock, TRUE);
2788}
2789
2790/*
2791 * Routine: lck_mtx_try_lock_spin_always
2792 */
2793boolean_t
2794lck_mtx_try_lock_spin_always(lck_mtx_t *lock)
2795{
2796 return lck_mtx_try_lock_spin_internal(lock, FALSE);
2797}
2798
2799
2800
2801/*
2802 * Routine: lck_mtx_unlock
2803 */
2804void
2805lck_mtx_unlock(lck_mtx_t *lock)
2806{
0a7de745
A
2807 thread_t thread = current_thread();
2808 uintptr_t state;
2809 boolean_t ilk_held = FALSE;
5ba3f43e
A
2810
2811 lck_mtx_verify(lock);
2812
2813 state = ordered_load_mtx(lock);
2814 if (state & LCK_ILOCK) {
0a7de745
A
2815 if (LCK_MTX_STATE_TO_THREAD(state) == (thread_t)LCK_MTX_SPIN_TAG) {
2816 ilk_held = TRUE; // Interlock is held by (presumably) this thread
2817 }
5ba3f43e
A
2818 goto slow_case;
2819 }
2820 // Locked as a mutex
cb323159
A
2821 if (os_atomic_cmpxchg(&lock->lck_mtx_data,
2822 LCK_MTX_THREAD_TO_STATE(thread), 0, release)) {
0a7de745 2823#if CONFIG_DTRACE
5ba3f43e
A
2824 LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
2825#endif /* CONFIG_DTRACE */
2826 return;
2827 }
2828slow_case:
2829 lck_mtx_unlock_contended(lock, thread, ilk_held);
2830}
2831
2832static void NOINLINE
2833lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t ilk_held)
2834{
0a7de745 2835 uintptr_t state;
cb323159 2836 boolean_t cleanup = FALSE;
5ba3f43e
A
2837
2838 if (ilk_held) {
2839 state = ordered_load_mtx(lock);
2840 } else {
5ba3f43e
A
2841 interlock_lock(lock);
2842 state = ordered_load_mtx(lock);
0a7de745 2843 if (thread != LCK_MTX_STATE_TO_THREAD(state)) {
5ba3f43e 2844 panic("lck_mtx_unlock(): Attempt to release lock not owned by thread (%p)", lock);
0a7de745 2845 }
d9a64523 2846 if (state & ARM_LCK_WAITERS) {
cb323159
A
2847 if (lck_mtx_unlock_wakeup(lock, thread)) {
2848 state = ARM_LCK_WAITERS;
2849 } else {
2850 state = 0;
2851 }
2852 cleanup = TRUE;
2853 goto unlock;
d9a64523 2854 }
5ba3f43e 2855 }
d9a64523 2856 state &= ARM_LCK_WAITERS; /* Clear state, retain waiters bit */
cb323159 2857unlock:
5ba3f43e
A
2858 state |= LCK_ILOCK;
2859 ordered_store_mtx(lock, state);
2860 interlock_unlock(lock);
cb323159
A
2861 if (cleanup) {
2862 /*
2863 * Do not do any turnstile operations outside of this block.
2864 * lock/unlock is called at early stage of boot with single thread,
2865 * when turnstile is not yet initialized.
2866 * Even without contention we can come throught the slow path
2867 * if the mutex is acquired as a spin lock.
2868 */
2869 turnstile_cleanup();
2870 }
5ba3f43e 2871
0a7de745 2872#if CONFIG_DTRACE
5ba3f43e
A
2873 LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
2874#endif /* CONFIG_DTRACE */
2875}
2876
2877/*
2878 * Routine: lck_mtx_assert
2879 */
2880void
2881lck_mtx_assert(lck_mtx_t *lock, unsigned int type)
2882{
0a7de745
A
2883 thread_t thread, holder;
2884 uintptr_t state;
5ba3f43e
A
2885
2886 state = ordered_load_mtx(lock);
2887 holder = LCK_MTX_STATE_TO_THREAD(state);
2888 if (holder == (thread_t)LCK_MTX_SPIN_TAG) {
0a7de745
A
2889 // Lock is held in spin mode, owner is unknown.
2890 return; // Punt
5ba3f43e
A
2891 }
2892 thread = current_thread();
2893 if (type == LCK_MTX_ASSERT_OWNED) {
0a7de745 2894 if (thread != holder) {
5ba3f43e 2895 panic("lck_mtx_assert(): mutex (%p) owned", lock);
0a7de745 2896 }
5ba3f43e 2897 } else if (type == LCK_MTX_ASSERT_NOTOWNED) {
0a7de745 2898 if (thread == holder) {
5ba3f43e 2899 panic("lck_mtx_assert(): mutex (%p) not owned", lock);
0a7de745
A
2900 }
2901 } else {
5ba3f43e 2902 panic("lck_mtx_assert(): invalid arg (%u)", type);
0a7de745 2903 }
5ba3f43e
A
2904}
2905
2906/*
2907 * Routine: lck_mtx_ilk_unlock
2908 */
2909boolean_t
2910lck_mtx_ilk_unlock(lck_mtx_t *lock)
2911{
2912 interlock_unlock(lock);
2913 return TRUE;
2914}
2915
2916/*
2917 * Routine: lck_mtx_convert_spin
2918 *
2919 * Convert a mutex held for spin into a held full mutex
2920 */
2921void
2922lck_mtx_convert_spin(lck_mtx_t *lock)
2923{
0a7de745
A
2924 thread_t thread = current_thread();
2925 uintptr_t state;
2926 int waiters;
5ba3f43e
A
2927
2928 state = ordered_load_mtx(lock);
0a7de745
A
2929 if (LCK_MTX_STATE_TO_THREAD(state) == thread) {
2930 return; // Already owned as mutex, return
2931 }
2932 if ((state & LCK_ILOCK) == 0 || (LCK_MTX_STATE_TO_THREAD(state) != (thread_t)LCK_MTX_SPIN_TAG)) {
5ba3f43e 2933 panic("lck_mtx_convert_spin: Not held as spinlock (%p)", lock);
0a7de745
A
2934 }
2935 state &= ~(LCK_MTX_THREAD_MASK); // Clear the spin tag
5ba3f43e 2936 ordered_store_mtx(lock, state);
cb323159 2937 waiters = lck_mtx_lock_acquire(lock, NULL); // Acquire to manage priority boosts
5ba3f43e 2938 state = LCK_MTX_THREAD_TO_STATE(thread);
0a7de745 2939 if (waiters != 0) {
5ba3f43e 2940 state |= ARM_LCK_WAITERS;
0a7de745 2941 }
5ba3f43e 2942 state |= LCK_ILOCK;
0a7de745
A
2943 ordered_store_mtx(lock, state); // Set ownership
2944 interlock_unlock(lock); // Release interlock, enable preemption
cb323159 2945 turnstile_cleanup();
5ba3f43e
A
2946}
2947
2948
2949/*
2950 * Routine: lck_mtx_destroy
2951 */
2952void
2953lck_mtx_destroy(
0a7de745
A
2954 lck_mtx_t * lck,
2955 lck_grp_t * grp)
5ba3f43e 2956{
0a7de745 2957 if (lck->lck_mtx_type != LCK_MTX_TYPE) {
5ba3f43e 2958 panic("Destroying invalid mutex %p", lck);
0a7de745
A
2959 }
2960 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
5ba3f43e 2961 panic("Destroying previously destroyed lock %p", lck);
0a7de745 2962 }
5ba3f43e
A
2963 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2964 lck->lck_mtx_tag = LCK_MTX_TAG_DESTROYED;
2965 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2966 lck_grp_deallocate(grp);
2967 return;
2968}
2969
2970/*
2971 * Routine: lck_spin_assert
2972 */
2973void
2974lck_spin_assert(lck_spin_t *lock, unsigned int type)
2975{
0a7de745
A
2976 thread_t thread, holder;
2977 uintptr_t state;
5ba3f43e 2978
0a7de745 2979 if (lock->type != LCK_SPIN_TYPE) {
5ba3f43e 2980 panic("Invalid spinlock %p", lock);
0a7de745 2981 }
5ba3f43e
A
2982
2983 state = lock->lck_spin_data;
2984 holder = (thread_t)(state & ~LCK_ILOCK);
2985 thread = current_thread();
2986 if (type == LCK_ASSERT_OWNED) {
0a7de745 2987 if (holder == 0) {
5ba3f43e 2988 panic("Lock not owned %p = %lx", lock, state);
0a7de745
A
2989 }
2990 if (holder != thread) {
5ba3f43e 2991 panic("Lock not owned by current thread %p = %lx", lock, state);
0a7de745
A
2992 }
2993 if ((state & LCK_ILOCK) == 0) {
5ba3f43e 2994 panic("Lock bit not set %p = %lx", lock, state);
0a7de745 2995 }
5ba3f43e
A
2996 } else if (type == LCK_ASSERT_NOTOWNED) {
2997 if (holder != 0) {
0a7de745 2998 if (holder == thread) {
5ba3f43e 2999 panic("Lock owned by current thread %p = %lx", lock, state);
0a7de745 3000 }
5ba3f43e 3001 }
0a7de745 3002 } else {
5ba3f43e 3003 panic("lck_spin_assert(): invalid arg (%u)", type);
0a7de745 3004 }
5ba3f43e
A
3005}
3006
3007boolean_t
3008lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
3009{
0a7de745 3010 lck_rw_word_t word;
5ba3f43e
A
3011
3012 lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
3013
3014 word.data = ordered_load_rw(lck);
3015 if (word.want_excl || word.want_upgrade || force_yield) {
3016 lck_rw_unlock_shared(lck);
3017 mutex_pause(2);
3018 lck_rw_lock_shared(lck);
3019 return TRUE;
3020 }
3021
3022 return FALSE;
3023}
3024
3025/*
3026 * Routine: kdp_lck_mtx_lock_spin_is_acquired
3027 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
3028 */
3029boolean_t
3030kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
3031{
0a7de745 3032 uintptr_t state;
5ba3f43e
A
3033
3034 if (not_in_kdp) {
3035 panic("panic: spinlock acquired check done outside of kernel debugger");
3036 }
3037 state = ordered_load_mtx(lck);
0a7de745 3038 if (state == LCK_MTX_TAG_DESTROYED) {
5ba3f43e 3039 return FALSE;
0a7de745
A
3040 }
3041 if (LCK_MTX_STATE_TO_THREAD(state) || (state & LCK_ILOCK)) {
5ba3f43e 3042 return TRUE;
0a7de745 3043 }
5ba3f43e
A
3044 return FALSE;
3045}
3046
3047void
3048kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3049{
3050 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
3051 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
3052 uintptr_t state = ordered_load_mtx(mutex);
3053 thread_t holder = LCK_MTX_STATE_TO_THREAD(state);
3054 if ((uintptr_t)holder == (uintptr_t)LCK_MTX_SPIN_TAG) {
3055 waitinfo->owner = STACKSHOT_WAITOWNER_MTXSPIN;
3056 } else {
3057 assertf(state != (uintptr_t)LCK_MTX_TAG_DESTROYED, "state=0x%llx", (uint64_t)state);
3058 assertf(state != (uintptr_t)LCK_MTX_TAG_INDIRECT, "state=0x%llx", (uint64_t)state);
3059 waitinfo->owner = thread_tid(holder);
3060 }
3061}
3062
3063void
3064kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3065{
0a7de745
A
3066 lck_rw_t *rwlck = NULL;
3067 switch (waitinfo->wait_type) {
3068 case kThreadWaitKernelRWLockRead:
3069 rwlck = READ_EVENT_TO_RWLOCK(event);
3070 break;
3071 case kThreadWaitKernelRWLockWrite:
3072 case kThreadWaitKernelRWLockUpgrade:
3073 rwlck = WRITE_EVENT_TO_RWLOCK(event);
3074 break;
3075 default:
3076 panic("%s was called with an invalid blocking type", __FUNCTION__);
3077 break;
5ba3f43e
A
3078 }
3079 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
3080 waitinfo->owner = thread_tid(rwlck->lck_rw_owner);
3081}