]> git.saurik.com Git - apple/xnu.git/blame_incremental - osfmk/arm/locks_arm.c
xnu-7195.101.1.tar.gz
[apple/xnu.git] / osfmk / arm / locks_arm.c
... / ...
CommitLineData
1/*
2 * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System Copyright (c) 1991,1990,1989,1988,1987 Carnegie
33 * Mellon University All Rights Reserved.
34 *
35 * Permission to use, copy, modify and distribute this software and its
36 * documentation is hereby granted, provided that both the copyright notice
37 * and this permission notice appear in all copies of the software,
38 * derivative works or modified versions, and any portions thereof, and that
39 * both notices appear in supporting documentation.
40 *
41 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION.
42 * CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES
43 * WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
44 *
45 * Carnegie Mellon requests users of this software to return to
46 *
47 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
48 * School of Computer Science Carnegie Mellon University Pittsburgh PA
49 * 15213-3890
50 *
51 * any improvements or extensions that they make and grant Carnegie Mellon the
52 * rights to redistribute these changes.
53 */
54/*
55 * File: kern/lock.c
56 * Author: Avadis Tevanian, Jr., Michael Wayne Young
57 * Date: 1985
58 *
59 * Locking primitives implementation
60 */
61
62#define LOCK_PRIVATE 1
63
64#include <mach_ldebug.h>
65
66#include <kern/zalloc.h>
67#include <kern/lock_stat.h>
68#include <kern/locks.h>
69#include <kern/misc_protos.h>
70#include <kern/thread.h>
71#include <kern/processor.h>
72#include <kern/sched_prim.h>
73#include <kern/debug.h>
74#include <kern/kcdata.h>
75#include <string.h>
76#include <arm/cpu_internal.h>
77#include <os/hash.h>
78#include <arm/cpu_data.h>
79
80#include <arm/cpu_data_internal.h>
81#include <arm/proc_reg.h>
82#include <arm/smp.h>
83#include <machine/atomic.h>
84#include <machine/machine_cpu.h>
85
86#include <sys/kdebug.h>
87
88#if CONFIG_DTRACE
89#define DTRACE_RW_SHARED 0x0 //reader
90#define DTRACE_RW_EXCL 0x1 //writer
91#define DTRACE_NO_FLAG 0x0 //not applicable
92#endif /* CONFIG_DTRACE */
93
94#define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
95#define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
96#define LCK_RW_LCK_SHARED_CODE 0x102
97#define LCK_RW_LCK_SH_TO_EX_CODE 0x103
98#define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
99#define LCK_RW_LCK_EX_TO_SH_CODE 0x105
100
101
102#define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
103
104// Panic in tests that check lock usage correctness
105// These are undesirable when in a panic or when a debugger is running.
106#define LOCK_CORRECTNESS_PANIC() (kernel_debugger_entry_count == 0)
107
108#define ADAPTIVE_SPIN_ENABLE 0x1
109
110int lck_mtx_adaptive_spin_mode = ADAPTIVE_SPIN_ENABLE;
111
112#define SPINWAIT_OWNER_CHECK_COUNT 4
113
/*
 * Outcome codes for a mutex spin-wait attempt.
 * Every value other than SPINWAIT_ACQUIRED means the interlock was obtained.
 */
typedef enum {
	/* Got the lock. */
	SPINWAIT_ACQUIRED,
	/* Got the interlock, no owner, but caller must finish acquiring the lock. */
	SPINWAIT_INTERLOCK,
	/* Got the interlock, spun, but failed to get the lock. */
	SPINWAIT_DID_SPIN_HIGH_THR,
	/* Got the interlock, spun, but failed to get the lock. */
	SPINWAIT_DID_SPIN_OWNER_NOT_CORE,
	/* Got the interlock, spun, but failed to get the lock. */
	SPINWAIT_DID_SPIN_NO_WINDOW_CONTENTION,
	/* Got the interlock, spun, but failed to get the lock. */
	SPINWAIT_DID_SPIN_SLIDING_THR,
	/* Got the interlock, did not spin. */
	SPINWAIT_DID_NOT_SPIN,
} spinwait_result_t;
123
124#if CONFIG_DTRACE
125extern uint64_t dtrace_spin_threshold;
126#endif
127
128/* Forwards */
129
130extern unsigned int not_in_kdp;
131
132/*
133 * We often want to know the addresses of the callers
134 * of the various lock routines. However, this information
135 * is only used for debugging and statistics.
136 */
137typedef void *pc_t;
138#define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
139#define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
140
141#ifdef lint
142/*
143 * Eliminate lint complaints about unused local pc variables.
144 */
145#define OBTAIN_PC(pc, l) ++pc
146#else /* lint */
147#define OBTAIN_PC(pc, l)
148#endif /* lint */
149
150
151/*
152 * Portable lock package implementation of usimple_locks.
153 */
154
155/*
156 * Owner thread pointer when lock held in spin mode
157 */
158#define LCK_MTX_SPIN_TAG 0xfffffff0
159
160
161#define interlock_lock(lock) hw_lock_bit ((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT, LCK_GRP_NULL)
162#define interlock_try(lock) hw_lock_bit_try((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT, LCK_GRP_NULL)
163#define interlock_unlock(lock) hw_unlock_bit ((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT)
164#define lck_rw_ilk_lock(lock) hw_lock_bit ((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT, LCK_GRP_NULL)
165#define lck_rw_ilk_unlock(lock) hw_unlock_bit((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT)
166
167#define load_memory_barrier() os_atomic_thread_fence(acquire)
168
169// Enforce program order of loads and stores.
170#define ordered_load(target) \
171 os_atomic_load(target, compiler_acq_rel)
172#define ordered_store(target, value) \
173 os_atomic_store(target, value, compiler_acq_rel)
174
175#define ordered_load_mtx(lock) ordered_load(&(lock)->lck_mtx_data)
176#define ordered_store_mtx(lock, value) ordered_store(&(lock)->lck_mtx_data, (value))
177#define ordered_load_rw(lock) ordered_load(&(lock)->lck_rw_data)
178#define ordered_store_rw(lock, value) ordered_store(&(lock)->lck_rw_data, (value))
179#define ordered_load_rw_owner(lock) ordered_load(&(lock)->lck_rw_owner)
180#define ordered_store_rw_owner(lock, value) ordered_store(&(lock)->lck_rw_owner, (value))
181#define ordered_load_hw(lock) ordered_load(&(lock)->lock_data)
182#define ordered_store_hw(lock, value) ordered_store(&(lock)->lock_data, (value))
183#define ordered_load_bit(lock) ordered_load((lock))
184#define ordered_store_bit(lock, value) ordered_store((lock), (value))
185
186
187// Prevent the compiler from reordering memory operations around this
188#define compiler_memory_fence() __asm__ volatile ("" ::: "memory")
189
190#define LOCK_PANIC_TIMEOUT 0xc00000
191#define NOINLINE __attribute__((noinline))
192
193
/*
 * interrupts_disabled(mask): non-zero when the interrupt-mask bits are set
 * in the saved interrupt state `mask`.
 * The argument is parenthesized so that an expression argument
 * (e.g. `a | b`) is masked as a whole.
 */
#if __arm__
#define interrupts_disabled(mask) ((mask) & PSR_INTMASK)
#else
#define interrupts_disabled(mask) ((mask) & DAIF_IRQF)
#endif
199
200
/*
 * 32-bit ARM only: unmask FIQ, or IRQ and FIQ together, with `cpsie`.
 * The "memory" clobber keeps the compiler from moving memory accesses
 * across the unmask.  The expansions carry no trailing semicolon, so
 * `enable_fiq();` is exactly one statement and is safe in an unbraced
 * if/else.
 */
#if __arm__
#define enable_fiq() __asm__ volatile ("cpsie f" ::: "memory")
#define enable_interrupts() __asm__ volatile ("cpsie if" ::: "memory")
#endif
205
206ZONE_VIEW_DEFINE(ZV_LCK_SPIN, "lck_spin",
207 KHEAP_ID_DEFAULT, sizeof(lck_spin_t));
208
209ZONE_VIEW_DEFINE(ZV_LCK_MTX, "lck_mtx",
210 KHEAP_ID_DEFAULT, sizeof(lck_mtx_t));
211
212ZONE_VIEW_DEFINE(ZV_LCK_MTX_EXT, "lck_mtx_ext",
213 KHEAP_ID_DEFAULT, sizeof(lck_mtx_ext_t));
214
215ZONE_VIEW_DEFINE(ZV_LCK_RW, "lck_rw",
216 KHEAP_ID_DEFAULT, sizeof(lck_rw_t));
217
218/*
219 * Forward declarations
220 */
221
222static void lck_rw_lock_shared_gen(lck_rw_t *lck);
223static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
224static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
225static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
226static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
227static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
228static boolean_t lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait);
229
230/*
231 * atomic exchange API is a low level abstraction of the operations
232 * to atomically read, modify, and write a pointer. This abstraction works
233 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
234 * well as the ARM exclusive instructions.
235 *
236 * atomic_exchange_begin() - begin exchange and retrieve current value
237 * atomic_exchange_complete() - conclude an exchange
238 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
239 */
240__unused static uint32_t
241load_exclusive32(uint32_t *target, enum memory_order ord)
242{
243 uint32_t value;
244
245#if __arm__
246 if (_os_atomic_mo_has_release(ord)) {
247 // Pre-load release barrier
248 atomic_thread_fence(memory_order_release);
249 }
250 value = __builtin_arm_ldrex(target);
251#else
252 if (_os_atomic_mo_has_acquire(ord)) {
253 value = __builtin_arm_ldaex(target); // ldaxr
254 } else {
255 value = __builtin_arm_ldrex(target); // ldxr
256 }
257#endif // __arm__
258 return value;
259}
260
261__unused static boolean_t
262store_exclusive32(uint32_t *target, uint32_t value, enum memory_order ord)
263{
264 boolean_t err;
265
266#if __arm__
267 err = __builtin_arm_strex(value, target);
268 if (_os_atomic_mo_has_acquire(ord)) {
269 // Post-store acquire barrier
270 atomic_thread_fence(memory_order_acquire);
271 }
272#else
273 if (_os_atomic_mo_has_release(ord)) {
274 err = __builtin_arm_stlex(value, target); // stlxr
275 } else {
276 err = __builtin_arm_strex(value, target); // stxr
277 }
278#endif // __arm__
279 return !err;
280}
281
282static uint32_t
283atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
284{
285 uint32_t val;
286
287#if __ARM_ATOMICS_8_1
288 ord = memory_order_relaxed;
289#endif
290 val = load_exclusive32(target, ord);
291 *previous = val;
292 return val;
293}
294
295static boolean_t
296atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
297{
298#if __ARM_ATOMICS_8_1
299 return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
300#else
301 (void)previous; // Previous not needed, monitor is held
302 return store_exclusive32(target, newval, ord);
303#endif
304}
305
/*
 * atomic_exchange_abort: cancel an exchange begun with
 * atomic_exchange_begin32() by clearing the exclusive monitor.
 */
static void
atomic_exchange_abort(void)
{
	os_atomic_clear_exclusive();
}
311
312static boolean_t
313atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
314{
315 uint32_t value, prev;
316
317 for (;;) {
318 value = atomic_exchange_begin32(target, &prev, ord);
319 if (value & test_mask) {
320 if (wait) {
321 wait_for_event(); // Wait with monitor held
322 } else {
323 atomic_exchange_abort(); // Clear exclusive monitor
324 }
325 return FALSE;
326 }
327 value |= set_mask;
328 if (atomic_exchange_complete32(target, prev, value, ord)) {
329 return TRUE;
330 }
331 }
332}
333
334inline boolean_t
335hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
336{
337 return atomic_test_and_set32(target, test_mask, set_mask, ord, wait);
338}
339
340/*
341 * To help _disable_preemption() inline everywhere with LTO,
342 * we keep these nice non inlineable functions as the panic()
343 * codegen setup is quite large and for weird reasons causes a frame.
344 */
345__abortlike
346static void
347_disable_preemption_overflow(void)
348{
349 panic("Preemption count overflow");
350}
351
352void
353_disable_preemption(void)
354{
355 thread_t thread = current_thread();
356 unsigned int count = thread->machine.preemption_count;
357
358 if (__improbable(++count == 0)) {
359 _disable_preemption_overflow();
360 }
361
362 os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);
363}
364
365/*
366 * This function checks whether an AST_URGENT has been pended.
367 *
368 * It is called once the preemption has been reenabled, which means the thread
369 * may have been preempted right before this was called, and when this function
370 * actually performs the check, we've changed CPU.
371 *
372 * This race is however benign: the point of AST_URGENT is to trigger a context
373 * switch, so if one happened, there's nothing left to check for, and AST_URGENT
374 * was cleared in the process.
375 *
376 * It follows that this check cannot have false negatives, which allows us
377 * to avoid fiddling with interrupt state for the vast majority of cases
378 * when the check will actually be negative.
379 */
380static NOINLINE void
381kernel_preempt_check(thread_t thread)
382{
383 cpu_data_t *cpu_data_ptr;
384 long state;
385
386#if __arm__
387#define INTERRUPT_MASK PSR_IRQF
388#else // __arm__
389#define INTERRUPT_MASK DAIF_IRQF
390#endif // __arm__
391
392 /*
393 * This check is racy and could load from another CPU's pending_ast mask,
394 * but as described above, this can't have false negatives.
395 */
396 cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel);
397 if (__probable((cpu_data_ptr->cpu_pending_ast & AST_URGENT) == 0)) {
398 return;
399 }
400
401 /* If interrupts are masked, we can't take an AST here */
402 state = get_interrupts();
403 if ((state & INTERRUPT_MASK) == 0) {
404 disable_interrupts_noread(); // Disable interrupts
405
406 /*
407 * Reload cpu_data_ptr: a context switch would cause it to change.
408 * Now that interrupts are disabled, this will debounce false positives.
409 */
410 cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel);
411 if (thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT) {
412#if __arm__
413#if __ARM_USER_PROTECT__
414 uintptr_t up = arm_user_protect_begin(thread);
415#endif // __ARM_USER_PROTECT__
416 enable_fiq();
417#endif // __arm__
418 ast_taken_kernel(); // Handle urgent AST
419#if __arm__
420#if __ARM_USER_PROTECT__
421 arm_user_protect_end(thread, up, TRUE);
422#endif // __ARM_USER_PROTECT__
423 enable_interrupts();
424 return; // Return early on arm only due to FIQ enabling
425#endif // __arm__
426 }
427 restore_interrupts(state); // Enable interrupts
428 }
429}
430
431/*
432 * To help _enable_preemption() inline everywhere with LTO,
433 * we keep these nice non inlineable functions as the panic()
434 * codegen setup is quite large and for weird reasons causes a frame.
435 */
436__abortlike
437static void
438_enable_preemption_underflow(void)
439{
440 panic("Preemption count underflow");
441}
442
443void
444_enable_preemption(void)
445{
446 thread_t thread = current_thread();
447 unsigned int count = thread->machine.preemption_count;
448
449 if (__improbable(count == 0)) {
450 _enable_preemption_underflow();
451 }
452 count -= 1;
453
454 os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);
455 if (count == 0) {
456 kernel_preempt_check(thread);
457 }
458
459 os_compiler_barrier();
460}
461
462int
463get_preemption_level(void)
464{
465 return current_thread()->machine.preemption_count;
466}
467
468/*
469 * Routine: lck_spin_alloc_init
470 */
471lck_spin_t *
472lck_spin_alloc_init(
473 lck_grp_t * grp,
474 lck_attr_t * attr)
475{
476 lck_spin_t *lck;
477
478 lck = zalloc(ZV_LCK_SPIN);
479 lck_spin_init(lck, grp, attr);
480 return lck;
481}
482
483/*
484 * Routine: lck_spin_free
485 */
486void
487lck_spin_free(
488 lck_spin_t * lck,
489 lck_grp_t * grp)
490{
491 lck_spin_destroy(lck, grp);
492 zfree(ZV_LCK_SPIN, lck);
493}
494
495/*
496 * Routine: lck_spin_init
497 */
498void
499lck_spin_init(
500 lck_spin_t * lck,
501 lck_grp_t * grp,
502 __unused lck_attr_t * attr)
503{
504 lck->type = LCK_SPIN_TYPE;
505 hw_lock_init(&lck->hwlock);
506 if (grp) {
507 lck_grp_reference(grp);
508 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
509 }
510}
511
512/*
513 * arm_usimple_lock is a lck_spin_t without a group or attributes
514 */
515MARK_AS_HIBERNATE_TEXT void inline
516arm_usimple_lock_init(simple_lock_t lck, __unused unsigned short initial_value)
517{
518 lck->type = LCK_SPIN_TYPE;
519 hw_lock_init(&lck->hwlock);
520}
521
522
523/*
524 * Routine: lck_spin_lock
525 */
526void
527lck_spin_lock(lck_spin_t *lock)
528{
529#if DEVELOPMENT || DEBUG
530 if (lock->type != LCK_SPIN_TYPE) {
531 panic("Invalid spinlock %p", lock);
532 }
533#endif // DEVELOPMENT || DEBUG
534 hw_lock_lock(&lock->hwlock, LCK_GRP_NULL);
535}
536
537void
538lck_spin_lock_grp(lck_spin_t *lock, lck_grp_t *grp)
539{
540#pragma unused(grp)
541#if DEVELOPMENT || DEBUG
542 if (lock->type != LCK_SPIN_TYPE) {
543 panic("Invalid spinlock %p", lock);
544 }
545#endif // DEVELOPMENT || DEBUG
546 hw_lock_lock(&lock->hwlock, grp);
547}
548
549/*
550 * Routine: lck_spin_lock_nopreempt
551 */
552void
553lck_spin_lock_nopreempt(lck_spin_t *lock)
554{
555#if DEVELOPMENT || DEBUG
556 if (lock->type != LCK_SPIN_TYPE) {
557 panic("Invalid spinlock %p", lock);
558 }
559#endif // DEVELOPMENT || DEBUG
560 hw_lock_lock_nopreempt(&lock->hwlock, LCK_GRP_NULL);
561}
562
563void
564lck_spin_lock_nopreempt_grp(lck_spin_t *lock, lck_grp_t *grp)
565{
566#pragma unused(grp)
567#if DEVELOPMENT || DEBUG
568 if (lock->type != LCK_SPIN_TYPE) {
569 panic("Invalid spinlock %p", lock);
570 }
571#endif // DEVELOPMENT || DEBUG
572 hw_lock_lock_nopreempt(&lock->hwlock, grp);
573}
574
575/*
576 * Routine: lck_spin_try_lock
577 */
578int
579lck_spin_try_lock(lck_spin_t *lock)
580{
581 return hw_lock_try(&lock->hwlock, LCK_GRP_NULL);
582}
583
584int
585lck_spin_try_lock_grp(lck_spin_t *lock, lck_grp_t *grp)
586{
587#pragma unused(grp)
588 return hw_lock_try(&lock->hwlock, grp);
589}
590
591/*
592 * Routine: lck_spin_try_lock_nopreempt
593 */
594int
595lck_spin_try_lock_nopreempt(lck_spin_t *lock)
596{
597 return hw_lock_try_nopreempt(&lock->hwlock, LCK_GRP_NULL);
598}
599
600int
601lck_spin_try_lock_nopreempt_grp(lck_spin_t *lock, lck_grp_t *grp)
602{
603#pragma unused(grp)
604 return hw_lock_try_nopreempt(&lock->hwlock, grp);
605}
606
607/*
608 * Routine: lck_spin_unlock
609 */
610void
611lck_spin_unlock(lck_spin_t *lock)
612{
613#if DEVELOPMENT || DEBUG
614 if ((LCK_MTX_STATE_TO_THREAD(lock->lck_spin_data) != current_thread()) && LOCK_CORRECTNESS_PANIC()) {
615 panic("Spinlock not owned by thread %p = %lx", lock, lock->lck_spin_data);
616 }
617 if (lock->type != LCK_SPIN_TYPE) {
618 panic("Invalid spinlock type %p", lock);
619 }
620#endif // DEVELOPMENT || DEBUG
621 hw_lock_unlock(&lock->hwlock);
622}
623
624/*
625 * Routine: lck_spin_unlock_nopreempt
626 */
627void
628lck_spin_unlock_nopreempt(lck_spin_t *lock)
629{
630#if DEVELOPMENT || DEBUG
631 if ((LCK_MTX_STATE_TO_THREAD(lock->lck_spin_data) != current_thread()) && LOCK_CORRECTNESS_PANIC()) {
632 panic("Spinlock not owned by thread %p = %lx", lock, lock->lck_spin_data);
633 }
634 if (lock->type != LCK_SPIN_TYPE) {
635 panic("Invalid spinlock type %p", lock);
636 }
637#endif // DEVELOPMENT || DEBUG
638 hw_lock_unlock_nopreempt(&lock->hwlock);
639}
640
641/*
642 * Routine: lck_spin_destroy
643 */
644void
645lck_spin_destroy(
646 lck_spin_t * lck,
647 lck_grp_t * grp)
648{
649 if (lck->lck_spin_data == LCK_SPIN_TAG_DESTROYED) {
650 return;
651 }
652 lck->lck_spin_data = LCK_SPIN_TAG_DESTROYED;
653 if (grp) {
654 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
655 lck_grp_deallocate(grp);
656 }
657}
658
659/*
660 * Routine: kdp_lck_spin_is_acquired
661 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
662 */
663boolean_t
664kdp_lck_spin_is_acquired(lck_spin_t *lck)
665{
666 if (not_in_kdp) {
667 panic("panic: spinlock acquired check done outside of kernel debugger");
668 }
669 return ((lck->lck_spin_data & ~LCK_SPIN_TAG_DESTROYED) != 0) ? TRUE:FALSE;
670}
671
672/*
673 * Initialize a usimple_lock.
674 *
675 * No change in preemption state.
676 */
677void
678usimple_lock_init(
679 usimple_lock_t l,
680 unsigned short tag)
681{
682 simple_lock_init((simple_lock_t) l, tag);
683}
684
685
686/*
687 * Acquire a usimple_lock.
688 *
689 * Returns with preemption disabled. Note
690 * that the hw_lock routines are responsible for
691 * maintaining preemption state.
692 */
693void
694(usimple_lock)(
695 usimple_lock_t l
696 LCK_GRP_ARG(lck_grp_t *grp))
697{
698 simple_lock((simple_lock_t) l, LCK_GRP_PROBEARG(grp));
699}
700
701
702extern void sync(void);
703
704/*
705 * Release a usimple_lock.
706 *
707 * Returns with preemption enabled. Note
708 * that the hw_lock routines are responsible for
709 * maintaining preemption state.
710 */
711void
712(usimple_unlock)(
713 usimple_lock_t l)
714{
715 simple_unlock((simple_lock_t)l);
716}
717
718
719/*
720 * Conditionally acquire a usimple_lock.
721 *
722 * On success, returns with preemption disabled.
723 * On failure, returns with preemption in the same state
724 * as when first invoked. Note that the hw_lock routines
725 * are responsible for maintaining preemption state.
726 *
727 * XXX No stats are gathered on a miss; I preserved this
728 * behavior from the original assembly-language code, but
729 * doesn't it make sense to log misses? XXX
730 */
731unsigned
732int
733(usimple_lock_try)(
734 usimple_lock_t l
735 LCK_GRP_ARG(lck_grp_t *grp))
736{
737 return simple_lock_try((simple_lock_t) l, grp);
738}
739
740/*
741 * The C portion of the shared/exclusive locks package.
742 */
743
744/*
745 * compute the deadline to spin against when
746 * waiting for a change of state on a lck_rw_t
747 */
748static inline uint64_t
749lck_rw_deadline_for_spin(lck_rw_t *lck)
750{
751 lck_rw_word_t word;
752
753 word.data = ordered_load_rw(lck);
754 if (word.can_sleep) {
755 if (word.r_waiting || word.w_waiting || (word.shared_count > machine_info.max_cpus)) {
756 /*
757 * there are already threads waiting on this lock... this
758 * implies that they have spun beyond their deadlines waiting for
759 * the desired state to show up so we will not bother spinning at this time...
760 * or
761 * the current number of threads sharing this lock exceeds our capacity to run them
762 * concurrently and since all states we're going to spin for require the rw_shared_count
763 * to be at 0, we'll not bother spinning since the latency for this to happen is
764 * unpredictable...
765 */
766 return mach_absolute_time();
767 }
768 return mach_absolute_time() + MutexSpin;
769 } else {
770 return mach_absolute_time() + (100000LL * 1000000000LL);
771 }
772}
773
774static boolean_t
775lck_rw_drain_status(lck_rw_t *lock, uint32_t status_mask, boolean_t wait __unused)
776{
777 uint64_t deadline = 0;
778 uint32_t data;
779
780 if (wait) {
781 deadline = lck_rw_deadline_for_spin(lock);
782 }
783
784 for (;;) {
785 data = load_exclusive32(&lock->lck_rw_data, memory_order_acquire_smp);
786 if ((data & status_mask) == 0) {
787 break;
788 }
789 if (wait) {
790 wait_for_event();
791 } else {
792 os_atomic_clear_exclusive();
793 }
794 if (!wait || (mach_absolute_time() >= deadline)) {
795 return FALSE;
796 }
797 }
798 os_atomic_clear_exclusive();
799 return TRUE;
800}
801
802/*
803 * Spin while interlock is held.
804 */
805static inline void
806lck_rw_interlock_spin(lck_rw_t *lock)
807{
808 uint32_t data;
809
810 for (;;) {
811 data = load_exclusive32(&lock->lck_rw_data, memory_order_relaxed);
812 if (data & LCK_RW_INTERLOCK) {
813 wait_for_event();
814 } else {
815 os_atomic_clear_exclusive();
816 return;
817 }
818 }
819}
820
821/*
822 * We disable interrupts while holding the RW interlock to prevent an
823 * interrupt from exacerbating hold time.
824 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
825 */
826static inline boolean_t
827lck_interlock_lock(lck_rw_t *lck)
828{
829 boolean_t istate;
830
831 istate = ml_set_interrupts_enabled(FALSE);
832 lck_rw_ilk_lock(lck);
833 return istate;
834}
835
836static inline void
837lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
838{
839 lck_rw_ilk_unlock(lck);
840 ml_set_interrupts_enabled(istate);
841}
842
843
844#define LCK_RW_GRAB_WANT 0
845#define LCK_RW_GRAB_SHARED 1
846
847static boolean_t
848lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait)
849{
850 uint64_t deadline = 0;
851 uint32_t data, prev;
852 boolean_t do_exch;
853
854 if (wait) {
855 deadline = lck_rw_deadline_for_spin(lock);
856 }
857
858 for (;;) {
859 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
860 if (data & LCK_RW_INTERLOCK) {
861 atomic_exchange_abort();
862 lck_rw_interlock_spin(lock);
863 continue;
864 }
865 do_exch = FALSE;
866 if (mode == LCK_RW_GRAB_WANT) {
867 if ((data & LCK_RW_WANT_EXCL) == 0) {
868 data |= LCK_RW_WANT_EXCL;
869 do_exch = TRUE;
870 }
871 } else { // LCK_RW_GRAB_SHARED
872 if (((data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) == 0) ||
873 (((data & LCK_RW_SHARED_MASK)) && ((data & LCK_RW_PRIV_EXCL) == 0))) {
874 data += LCK_RW_SHARED_READER;
875 do_exch = TRUE;
876 }
877 }
878 if (do_exch) {
879 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
880 return TRUE;
881 }
882 } else {
883 if (wait) { // Non-waiting
884 wait_for_event();
885 } else {
886 atomic_exchange_abort();
887 }
888 if (!wait || (mach_absolute_time() >= deadline)) {
889 return FALSE;
890 }
891 }
892 }
893}
894
895
896/*
897 * Routine: lck_rw_alloc_init
898 */
899lck_rw_t *
900lck_rw_alloc_init(
901 lck_grp_t *grp,
902 lck_attr_t *attr)
903{
904 lck_rw_t *lck;
905
906 lck = zalloc_flags(ZV_LCK_RW, Z_WAITOK | Z_ZERO);
907 lck_rw_init(lck, grp, attr);
908 return lck;
909}
910
911/*
912 * Routine: lck_rw_free
913 */
914void
915lck_rw_free(
916 lck_rw_t *lck,
917 lck_grp_t *grp)
918{
919 lck_rw_destroy(lck, grp);
920 zfree(ZV_LCK_RW, lck);
921}
922
923/*
924 * Routine: lck_rw_init
925 */
926void
927lck_rw_init(
928 lck_rw_t *lck,
929 lck_grp_t *grp,
930 lck_attr_t *attr)
931{
932 if (attr == LCK_ATTR_NULL) {
933 attr = &LockDefaultLckAttr;
934 }
935 memset(lck, 0, sizeof(lck_rw_t));
936 lck->lck_rw_can_sleep = TRUE;
937 if ((attr->lck_attr_val & LCK_ATTR_RW_SHARED_PRIORITY) == 0) {
938 lck->lck_rw_priv_excl = TRUE;
939 }
940
941 lck_grp_reference(grp);
942 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
943}
944
945
946/*
947 * Routine: lck_rw_destroy
948 */
949void
950lck_rw_destroy(
951 lck_rw_t *lck,
952 lck_grp_t *grp)
953{
954 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
955 return;
956 }
957#if MACH_LDEBUG
958 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
959#endif
960 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
961 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
962 lck_grp_deallocate(grp);
963 return;
964}
965
966/*
967 * Routine: lck_rw_lock
968 */
969void
970lck_rw_lock(
971 lck_rw_t *lck,
972 lck_rw_type_t lck_rw_type)
973{
974 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
975 lck_rw_lock_shared(lck);
976 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
977 lck_rw_lock_exclusive(lck);
978 } else {
979 panic("lck_rw_lock(): Invalid RW lock type: %x", lck_rw_type);
980 }
981}
982
/*
 * Fast-path exclusive acquire: set WANT_EXCL in one non-waiting
 * test-and-set, provided there are no readers and none of WANT_EXCL,
 * WANT_UPGRADE or the interlock bit is set.  Evaluates to TRUE on success.
 */
#define LCK_RW_LOCK_EXCLUSIVE_TAS(lck)                                          \
	(atomic_test_and_set32(&(lck)->lck_rw_data,                             \
	    (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL |                            \
	    LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK),                            \
	    LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE))
986
987/*
988 * Routine: lck_rw_lock_exclusive_check_contended
989 */
990bool
991lck_rw_lock_exclusive_check_contended(lck_rw_t *lock)
992{
993 thread_t thread = current_thread();
994 bool contended = false;
995
996 if (lock->lck_rw_can_sleep) {
997 thread->rwlock_count++;
998 } else if (get_preemption_level() == 0) {
999 panic("Taking non-sleepable RW lock with preemption enabled");
1000 }
1001 if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1002#if CONFIG_DTRACE
1003 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1004#endif /* CONFIG_DTRACE */
1005 } else {
1006 contended = true;
1007 lck_rw_lock_exclusive_gen(lock);
1008 }
1009#if MACH_ASSERT
1010 thread_t owner = ordered_load_rw_owner(lock);
1011 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1012#endif
1013 ordered_store_rw_owner(lock, thread);
1014 return contended;
1015}
1016
1017/*
1018 * Routine: lck_rw_lock_exclusive
1019 */
1020void
1021lck_rw_lock_exclusive(lck_rw_t *lock)
1022{
1023 thread_t thread = current_thread();
1024
1025 if (lock->lck_rw_can_sleep) {
1026 thread->rwlock_count++;
1027 } else if (get_preemption_level() == 0) {
1028 panic("Taking non-sleepable RW lock with preemption enabled");
1029 }
1030 if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1031#if CONFIG_DTRACE
1032 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1033#endif /* CONFIG_DTRACE */
1034 } else {
1035 lck_rw_lock_exclusive_gen(lock);
1036 }
1037#if MACH_ASSERT
1038 thread_t owner = ordered_load_rw_owner(lock);
1039 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1040#endif
1041 ordered_store_rw_owner(lock, thread);
1042}
1043
1044/*
1045 * Routine: lck_rw_lock_shared
1046 */
1047void
1048lck_rw_lock_shared(lck_rw_t *lock)
1049{
1050 uint32_t data, prev;
1051
1052 if (lock->lck_rw_can_sleep) {
1053 current_thread()->rwlock_count++;
1054 } else if (get_preemption_level() == 0) {
1055 panic("Taking non-sleepable RW lock with preemption enabled");
1056 }
1057 for (;;) {
1058 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1059 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1060 atomic_exchange_abort();
1061 lck_rw_lock_shared_gen(lock);
1062 break;
1063 }
1064 data += LCK_RW_SHARED_READER;
1065 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1066 break;
1067 }
1068 cpu_pause();
1069 }
1070#if MACH_ASSERT
1071 thread_t owner = ordered_load_rw_owner(lock);
1072 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1073#endif
1074#if CONFIG_DTRACE
1075 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1076#endif /* CONFIG_DTRACE */
1077 return;
1078}
1079
/*
 *	Routine:	lck_rw_lock_shared_to_exclusive
 *
 *	Try to upgrade the caller's shared hold on 'lock' to an exclusive hold.
 *
 *	Returns TRUE when the caller ends up recorded as the exclusive owner.
 *	Returns FALSE when LCK_RW_WANT_UPGRADE was already set in the lock
 *	word; in that case the caller's read count has been dropped.
 */
boolean_t
lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
{
	uint32_t	data, prev;

	for (;;) {
		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
		if (data & LCK_RW_INTERLOCK) {
			/* interlock bit is set: abandon this attempt, spin, then retry */
			atomic_exchange_abort();
			lck_rw_interlock_spin(lock);
			continue;
		}
		if (data & LCK_RW_WANT_UPGRADE) {
			/* the upgrade bit is already set: just give up our read count */
			data -= LCK_RW_SHARED_READER;
			if ((data & LCK_RW_SHARED_MASK) == 0) {		/* we were the last reader */
				data &= ~(LCK_RW_W_WAITING);		/* so clear the wait indicator */
			}
			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
				/* hand the pre-update word to the failure path so it can decide on a wakeup */
				return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
			}
		} else {
			data |= LCK_RW_WANT_UPGRADE;		/* ask for WANT_UPGRADE */
			data -= LCK_RW_SHARED_READER;		/* and shed our read count */
			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
				break;
			}
		}
		/* the exchange did not complete: pause, then retry from a fresh load */
		cpu_pause();
	}
	/* we now own the WANT_UPGRADE */
	if (data & LCK_RW_SHARED_MASK) {		/* check to see if all of the readers are drained */
		lck_rw_lock_shared_to_exclusive_success(lock);	/* if not, we need to go wait */
	}
#if MACH_ASSERT
	/* no owner may be recorded before we record ourselves */
	thread_t owner = ordered_load_rw_owner(lock);
	assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
#endif
	ordered_store_rw_owner(lock, current_thread());
#if	CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
#endif	/* CONFIG_DTRACE */
	return TRUE;
}
1128
1129
1130/*
1131 * Routine: lck_rw_lock_shared_to_exclusive_failure
1132 * Function:
1133 * Fast path code has already dropped our read
1134 * count and determined that someone else owns 'lck_rw_want_upgrade'
1135 * if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
1136 * all we need to do here is determine if a wakeup is needed
1137 */
1138static boolean_t
1139lck_rw_lock_shared_to_exclusive_failure(
1140 lck_rw_t *lck,
1141 uint32_t prior_lock_state)
1142{
1143 thread_t thread = current_thread();
1144 uint32_t rwlock_count;
1145
1146 /* Check if dropping the lock means that we need to unpromote */
1147 if (lck->lck_rw_can_sleep) {
1148 rwlock_count = thread->rwlock_count--;
1149 } else {
1150 rwlock_count = UINT32_MAX;
1151 }
1152#if MACH_LDEBUG
1153 if (rwlock_count == 0) {
1154 panic("rw lock count underflow for thread %p", thread);
1155 }
1156#endif
1157 if ((prior_lock_state & LCK_RW_W_WAITING) &&
1158 ((prior_lock_state & LCK_RW_SHARED_MASK) == LCK_RW_SHARED_READER)) {
1159 /*
1160 * Someone else has requested upgrade.
1161 * Since we've released the read lock, wake
1162 * him up if he's blocked waiting
1163 */
1164 thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1165 }
1166
1167 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1168 /* sched_flags checked without lock, but will be rechecked while clearing */
1169 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1170 }
1171
1172 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1173 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1174
1175 return FALSE;
1176}
1177
/*
 *	Routine:	lck_rw_lock_shared_to_exclusive_success
 *	Function:
 *		fast path code has already dropped our read
 *		count and successfully acquired 'lck_rw_want_upgrade'
 *		we just need to wait for the rest of the readers to drain
 *		and then we can return as the exclusive holder of this lock
 *
 *		Each iteration first spins via lck_rw_drain_status(); if the
 *		readers are still present and the lock word has can_sleep set,
 *		the thread marks w_waiting under the interlock and blocks on
 *		the writer event.  Always returns TRUE.
 */
static boolean_t
lck_rw_lock_shared_to_exclusive_success(
	lck_rw_t	*lock)
{
	__kdebug_only uintptr_t	trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
	int			slept = 0;	/* number of times we actually blocked */
	lck_rw_word_t		word;		/* local snapshot of the lock word */
	wait_result_t		res;
	boolean_t		istate;		/* value returned by lck_interlock_lock(), passed back on unlock */
	boolean_t		not_shared;

#if	CONFIG_DTRACE
	uint64_t		wait_interval = 0;
	int			readers_at_sleep = 0;
	boolean_t		dtrace_ls_initialized = FALSE;
	boolean_t		dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
#endif

	while (!lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, FALSE)) {
		word.data = ordered_load_rw(lock);
#if	CONFIG_DTRACE
		/* sample the lockstat probes only once, on the first contended pass */
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
			dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = word.shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
		    trace_lck, word.shared_count, 0, 0, 0);

		not_shared = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, TRUE);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
		    trace_lck, lock->lck_rw_shared_count, 0, 0, 0);

		if (not_shared) {
			break;
		}

		/*
		 * if we get here, the spin deadline in lck_rw_drain_status()
		 * has expired w/o the rw_shared_count having drained to 0
		 * check to see if we're allowed to do a thread_block
		 */
		if (word.can_sleep) {
			istate = lck_interlock_lock(lock);

			/* re-read the lock word now that the interlock is held */
			word.data = ordered_load_rw(lock);
			if (word.shared_count != 0) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
				    trace_lck, word.shared_count, 0, 0, 0);

				/* publish that a writer is waiting before going to sleep */
				word.w_waiting = 1;
				ordered_store_rw(lock, word.data);

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lock, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
				    trace_lck, res, slept, 0, 0);
			} else {
				/* the readers drained while we were taking the interlock */
				lck_interlock_unlock(lock, istate);
				break;
			}
		}
	}
#if	CONFIG_DTRACE
	/*
	 * dtrace_ls_enabled is only set if we went around the loop above with a
	 * lockstat probe enabled; 'slept' tells a pure spin apart from a block.
	 */
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lock, mach_absolute_time() - wait_interval, 0);
		} else {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lock,
			    mach_absolute_time() - wait_interval, 1,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 1);
#endif
	return TRUE;
}
1285
1286
1287/*
1288 * Routine: lck_rw_lock_exclusive_to_shared
1289 */
1290
1291void
1292lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
1293{
1294 uint32_t data, prev;
1295
1296 assertf(lock->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1297 ordered_store_rw_owner(lock, THREAD_NULL);
1298 for (;;) {
1299 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
1300 if (data & LCK_RW_INTERLOCK) {
1301 atomic_exchange_abort();
1302 lck_rw_interlock_spin(lock); /* wait for interlock to clear */
1303 continue;
1304 }
1305 data += LCK_RW_SHARED_READER;
1306 if (data & LCK_RW_WANT_UPGRADE) {
1307 data &= ~(LCK_RW_WANT_UPGRADE);
1308 } else {
1309 data &= ~(LCK_RW_WANT_EXCL);
1310 }
1311 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
1312 data &= ~(LCK_RW_W_WAITING);
1313 }
1314 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
1315 break;
1316 }
1317 cpu_pause();
1318 }
1319 return lck_rw_lock_exclusive_to_shared_gen(lock, prev);
1320}
1321
1322/*
1323 * Routine: lck_rw_lock_exclusive_to_shared_gen
1324 * Function:
1325 * Fast path has already dropped
1326 * our exclusive state and bumped lck_rw_shared_count
1327 * all we need to do here is determine if anyone
1328 * needs to be awakened.
1329 */
1330static void
1331lck_rw_lock_exclusive_to_shared_gen(
1332 lck_rw_t *lck,
1333 uint32_t prior_lock_state)
1334{
1335 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1336 lck_rw_word_t fake_lck;
1337
1338 /*
1339 * prior_lock state is a snapshot of the 1st word of the
1340 * lock in question... we'll fake up a pointer to it
1341 * and carefully not access anything beyond whats defined
1342 * in the first word of a lck_rw_t
1343 */
1344 fake_lck.data = prior_lock_state;
1345
1346 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1347 trace_lck, fake_lck->want_excl, fake_lck->want_upgrade, 0, 0);
1348
1349 /*
1350 * don't wake up anyone waiting to take the lock exclusively
1351 * since we hold a read count... when the read count drops to 0,
1352 * the writers will be woken.
1353 *
1354 * wake up any waiting readers if we don't have any writers waiting,
1355 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1356 */
1357 if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1358 thread_wakeup(LCK_RW_READER_EVENT(lck));
1359 }
1360
1361 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1362 trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1363
1364#if CONFIG_DTRACE
1365 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1366#endif
1367}
1368
1369
1370/*
1371 * Routine: lck_rw_try_lock
1372 */
1373boolean_t
1374lck_rw_try_lock(
1375 lck_rw_t *lck,
1376 lck_rw_type_t lck_rw_type)
1377{
1378 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1379 return lck_rw_try_lock_shared(lck);
1380 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1381 return lck_rw_try_lock_exclusive(lck);
1382 } else {
1383 panic("lck_rw_try_lock(): Invalid rw lock type: %x", lck_rw_type);
1384 }
1385 return FALSE;
1386}
1387
1388/*
1389 * Routine: lck_rw_try_lock_shared
1390 */
1391
1392boolean_t
1393lck_rw_try_lock_shared(lck_rw_t *lock)
1394{
1395 uint32_t data, prev;
1396
1397 for (;;) {
1398 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1399 if (data & LCK_RW_INTERLOCK) {
1400 atomic_exchange_abort();
1401 lck_rw_interlock_spin(lock);
1402 continue;
1403 }
1404 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1405 atomic_exchange_abort();
1406 return FALSE; /* lock is busy */
1407 }
1408 data += LCK_RW_SHARED_READER; /* Increment reader refcount */
1409 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1410 break;
1411 }
1412 cpu_pause();
1413 }
1414#if MACH_ASSERT
1415 thread_t owner = ordered_load_rw_owner(lock);
1416 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1417#endif
1418
1419 if (lock->lck_rw_can_sleep) {
1420 current_thread()->rwlock_count++;
1421 } else if (get_preemption_level() == 0) {
1422 panic("Taking non-sleepable RW lock with preemption enabled");
1423 }
1424
1425#if CONFIG_DTRACE
1426 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1427#endif /* CONFIG_DTRACE */
1428 return TRUE;
1429}
1430
1431
1432/*
1433 * Routine: lck_rw_try_lock_exclusive
1434 */
1435
1436boolean_t
1437lck_rw_try_lock_exclusive(lck_rw_t *lock)
1438{
1439 uint32_t data, prev;
1440 thread_t thread;
1441
1442 for (;;) {
1443 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1444 if (data & LCK_RW_INTERLOCK) {
1445 atomic_exchange_abort();
1446 lck_rw_interlock_spin(lock);
1447 continue;
1448 }
1449 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1450 atomic_exchange_abort();
1451 return FALSE;
1452 }
1453 data |= LCK_RW_WANT_EXCL;
1454 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1455 break;
1456 }
1457 cpu_pause();
1458 }
1459 thread = current_thread();
1460 if (lock->lck_rw_can_sleep) {
1461 thread->rwlock_count++;
1462 } else if (get_preemption_level() == 0) {
1463 panic("Taking non-sleepable RW lock with preemption enabled");
1464 }
1465#if MACH_ASSERT
1466 thread_t owner = ordered_load_rw_owner(lock);
1467 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1468#endif
1469 ordered_store_rw_owner(lock, thread);
1470#if CONFIG_DTRACE
1471 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1472#endif /* CONFIG_DTRACE */
1473 return TRUE;
1474}
1475
1476
1477/*
1478 * Routine: lck_rw_unlock
1479 */
1480void
1481lck_rw_unlock(
1482 lck_rw_t *lck,
1483 lck_rw_type_t lck_rw_type)
1484{
1485 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1486 lck_rw_unlock_shared(lck);
1487 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1488 lck_rw_unlock_exclusive(lck);
1489 } else {
1490 panic("lck_rw_unlock(): Invalid RW lock type: %d", lck_rw_type);
1491 }
1492}
1493
1494
1495/*
1496 * Routine: lck_rw_unlock_shared
1497 */
1498void
1499lck_rw_unlock_shared(
1500 lck_rw_t *lck)
1501{
1502 lck_rw_type_t ret;
1503
1504 assertf(lck->lck_rw_owner == THREAD_NULL, "state=0x%x, owner=%p", lck->lck_rw_data, lck->lck_rw_owner);
1505 assertf(lck->lck_rw_shared_count > 0, "shared_count=0x%x", lck->lck_rw_shared_count);
1506 ret = lck_rw_done(lck);
1507
1508 if (ret != LCK_RW_TYPE_SHARED) {
1509 panic("lck_rw_unlock_shared(): lock %p held in mode: %d", lck, ret);
1510 }
1511}
1512
1513
1514/*
1515 * Routine: lck_rw_unlock_exclusive
1516 */
1517void
1518lck_rw_unlock_exclusive(
1519 lck_rw_t *lck)
1520{
1521 lck_rw_type_t ret;
1522
1523 assertf(lck->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lck->lck_rw_data, lck->lck_rw_owner);
1524 ret = lck_rw_done(lck);
1525
1526 if (ret != LCK_RW_TYPE_EXCLUSIVE) {
1527 panic("lck_rw_unlock_exclusive(): lock %p held in mode: %d", lck, ret);
1528 }
1529}
1530
1531
/*
 *      Routine:        lck_rw_lock_exclusive_gen
 *
 *	Contended path for taking 'lock' exclusively.  Works in two phases:
 *	  1. obtain the want_excl bit (first loop),
 *	  2. wait for the reader count and the want_upgrade bit to clear
 *	     (second loop).
 *	Each phase first spins; if that fails and the lock word has
 *	can_sleep set, the thread marks w_waiting under the interlock and
 *	blocks on the writer event.
 */
static void
lck_rw_lock_exclusive_gen(
	lck_rw_t	*lock)
{
	__kdebug_only uintptr_t	trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
	lck_rw_word_t		word;		/* local snapshot of the lock word */
	int			slept = 0;	/* number of times we actually blocked */
	boolean_t		gotlock = 0;
	boolean_t		not_shared_or_upgrade = 0;
	wait_result_t		res = 0;
	boolean_t		istate;		/* value returned by lck_interlock_lock(), passed back on unlock */

#if	CONFIG_DTRACE
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
#endif

	/*
	 *	Try to acquire the lck_rw_want_excl bit.
	 */
	while (!lck_rw_grab(lock, LCK_RW_GRAB_WANT, FALSE)) {
#if	CONFIG_DTRACE
		/* sample the lockstat probes only once, on the first contended pass */
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lock->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

		gotlock = lck_rw_grab(lock, LCK_RW_GRAB_WANT, TRUE);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);

		if (gotlock) {
			break;
		}
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock exclusively
		 * check to see if we're allowed to do a thread_block
		 */
		word.data = ordered_load_rw(lock);
		if (word.can_sleep) {
			istate = lck_interlock_lock(lock);
			/* re-read the lock word now that the interlock is held */
			word.data = ordered_load_rw(lock);

			if (word.want_excl) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

				/* publish that a writer is waiting before going to sleep */
				word.w_waiting = 1;
				ordered_store_rw(lock, word.data);

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lock, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
			} else {
				/* want_excl is clear and we hold the interlock: take the bit directly */
				word.want_excl = 1;
				ordered_store_rw(lock, word.data);
				lck_interlock_unlock(lock, istate);
				break;
			}
		}
	}
	/*
	 * Wait for readers (and upgrades) to finish...
	 */
	while (!lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, FALSE)) {
#if	CONFIG_DTRACE
		/*
		 * Either sleeping or spinning is happening, start
		 * a timing of our delay interval now.  This is skipped
		 * if the first loop above already sampled the probes
		 * (dtrace_ls_initialized), so the interval covers both
		 * phases in that case.
		 */
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lock->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

		not_shared_or_upgrade = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, TRUE);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, not_shared_or_upgrade, 0);

		if (not_shared_or_upgrade) {
			break;
		}
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock exclusively
		 * check to see if we're allowed to do a thread_block
		 */
		word.data = ordered_load_rw(lock);
		if (word.can_sleep) {
			istate = lck_interlock_lock(lock);
			/* re-read the lock word now that the interlock is held */
			word.data = ordered_load_rw(lock);

			if (word.shared_count != 0 || word.want_upgrade) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

				/* publish that a writer is waiting before going to sleep */
				word.w_waiting = 1;
				ordered_store_rw(lock, word.data);

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lock, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
			} else {
				lck_interlock_unlock(lock, istate);
				/*
				 * must own the lock now, since we checked for
				 * readers or upgrade owner behind the interlock
				 * no need for a call to 'lck_rw_drain_status'
				 */
				break;
			}
		}
	}

#if	CONFIG_DTRACE
	/*
	 * Decide what latencies we suffered that are Dtrace events.
	 * If dtrace_ls_enabled is set, we either spun or slept with a
	 * lockstat probe enabled and wait_interval holds the start time.
	 * At least we get out from under the interlock before we record
	 * which is the best we can do here to minimize the impact
	 * of the tracing.
	 * If dtrace_ls_enabled is not set, no probe was enabled when we
	 * started sleeping/spinning so we don't record this event.
	 */
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lock,
			    mach_absolute_time() - wait_interval, 1);
		} else {
			/*
			 * For the blocking case, we also record if when we blocked
			 * it was held for read or write, and how many readers.
			 * readers_at_sleep was sampled above, when the timing
			 * of the delay interval was started.
			 */
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lock,
			    mach_absolute_time() - wait_interval, 1,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, 1);
#endif	/* CONFIG_DTRACE */
}
1721
/*
 *      Routine:        lck_rw_done
 *
 *	Release 'lock' in whichever mode it is held.  The lock word is
 *	updated atomically: a non-zero reader count is decremented,
 *	otherwise WANT_UPGRADE or WANT_EXCL is cleared (panicking if
 *	neither is set).  Waiter bits are cleared to match the wakeups
 *	that lck_rw_done_gen() will issue from the pre-update word.
 *
 *	Returns the value of lck_rw_done_gen(), i.e. the mode that
 *	was released.
 */

lck_rw_type_t
lck_rw_done(lck_rw_t *lock)
{
	uint32_t	data, prev;
	boolean_t	once = FALSE;	/* owner already checked and cleared on an earlier pass */

	for (;;) {
		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
		if (data & LCK_RW_INTERLOCK) {		/* wait for interlock to clear */
			atomic_exchange_abort();
			lck_rw_interlock_spin(lock);
			continue;
		}
		if (data & LCK_RW_SHARED_MASK) {	/* lock is held shared */
			assertf(lock->lck_rw_owner == THREAD_NULL, "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
			data -= LCK_RW_SHARED_READER;
			if ((data & LCK_RW_SHARED_MASK) == 0) {	/* if reader count has now gone to 0, check for waiters */
				goto check_waiters;
			}
			/* other readers remain: the waiter bits are left untouched */
		} else {					/* if reader count == 0, must be exclusive lock */
			if (data & LCK_RW_WANT_UPGRADE) {
				data &= ~(LCK_RW_WANT_UPGRADE);
			} else {
				if (data & LCK_RW_WANT_EXCL) {
					data &= ~(LCK_RW_WANT_EXCL);
				} else {				/* lock is not 'owned', panic */
					panic("Releasing non-exclusive RW lock without a reader refcount!");
				}
			}
			if (!once) {
				// Only check for holder and clear it once
				assertf(lock->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
				ordered_store_rw_owner(lock, THREAD_NULL);
				once = TRUE;
			}
check_waiters:
			/*
			 * test the original values to match what
			 * lck_rw_done_gen is going to do to determine
			 * which wakeups need to happen...
			 *
			 * if !(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting)
			 *
			 * (this label sits inside the exclusive branch, but is
			 * also reached by the goto above when the last reader
			 * leaves)
			 */
			if (prev & LCK_RW_W_WAITING) {
				data &= ~(LCK_RW_W_WAITING);
				if ((prev & LCK_RW_PRIV_EXCL) == 0) {
					data &= ~(LCK_RW_R_WAITING);
				}
			} else {
				data &= ~(LCK_RW_R_WAITING);
			}
		}
		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
			break;
		}
		/* the exchange did not complete: pause, then retry from a fresh load */
		cpu_pause();
	}
	return lck_rw_done_gen(lock, prev);
}
1785
1786/*
1787 * Routine: lck_rw_done_gen
1788 *
1789 * called from the assembly language wrapper...
1790 * prior_lock_state is the value in the 1st
1791 * word of the lock at the time of a successful
1792 * atomic compare and exchange with the new value...
1793 * it represents the state of the lock before we
1794 * decremented the rw_shared_count or cleared either
1795 * rw_want_upgrade or rw_want_write and
1796 * the lck_x_waiting bits... since the wrapper
1797 * routine has already changed the state atomically,
1798 * we just need to decide if we should
1799 * wake up anyone and what value to return... we do
1800 * this by examining the state of the lock before
1801 * we changed it
1802 */
1803static lck_rw_type_t
1804lck_rw_done_gen(
1805 lck_rw_t *lck,
1806 uint32_t prior_lock_state)
1807{
1808 lck_rw_word_t fake_lck;
1809 lck_rw_type_t lock_type;
1810 thread_t thread;
1811 uint32_t rwlock_count;
1812
1813 /*
1814 * prior_lock state is a snapshot of the 1st word of the
1815 * lock in question... we'll fake up a pointer to it
1816 * and carefully not access anything beyond whats defined
1817 * in the first word of a lck_rw_t
1818 */
1819 fake_lck.data = prior_lock_state;
1820
1821 if (fake_lck.shared_count <= 1) {
1822 if (fake_lck.w_waiting) {
1823 thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1824 }
1825
1826 if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1827 thread_wakeup(LCK_RW_READER_EVENT(lck));
1828 }
1829 }
1830 if (fake_lck.shared_count) {
1831 lock_type = LCK_RW_TYPE_SHARED;
1832 } else {
1833 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1834 }
1835
1836 /* Check if dropping the lock means that we need to unpromote */
1837 thread = current_thread();
1838 if (fake_lck.can_sleep) {
1839 rwlock_count = thread->rwlock_count--;
1840 } else {
1841 rwlock_count = UINT32_MAX;
1842 }
1843#if MACH_LDEBUG
1844 if (rwlock_count == 0) {
1845 panic("rw lock count underflow for thread %p", thread);
1846 }
1847#endif
1848 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1849 /* sched_flags checked without lock, but will be rechecked while clearing */
1850 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1851 }
1852#if CONFIG_DTRACE
1853 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1854#endif
1855 return lock_type;
1856}
1857
/*
 *	Routine:	lck_rw_lock_shared_gen
 *	Function:
 *		Fast path code has determined that this lock
 *		is held exclusively... this is where we spin/block
 *		until we can acquire the lock in the shared mode
 *
 *		Each iteration first spins via lck_rw_grab(); if that fails
 *		and the lock is sleepable, the thread either marks r_waiting
 *		under the interlock and blocks on the reader event, or takes
 *		a read count directly while holding the interlock.
 */
static void
lck_rw_lock_shared_gen(
	lck_rw_t	*lck)
{
	__kdebug_only uintptr_t	trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
	lck_rw_word_t		word;		/* local snapshot of the lock word */
	boolean_t		gotlock = 0;
	int			slept = 0;	/* number of times we actually blocked */
	wait_result_t		res = 0;
	boolean_t		istate;		/* value returned by lck_interlock_lock(), passed back on unlock */

#if	CONFIG_DTRACE
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
#endif /* CONFIG_DTRACE */

	while (!lck_rw_grab(lck, LCK_RW_GRAB_SHARED, FALSE)) {
#if	CONFIG_DTRACE
		/* sample the lockstat probes only once, on the first contended pass */
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
			dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
		    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, 0, 0);

		gotlock = lck_rw_grab(lck, LCK_RW_GRAB_SHARED, TRUE);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
		    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, gotlock, 0);

		if (gotlock) {
			break;
		}
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock for read
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {
			istate = lck_interlock_lock(lck);

			/* read the lock word now that the interlock is held */
			word.data = ordered_load_rw(lck);
			/*
			 * block only if a writer/upgrader is present AND either
			 * no readers hold the lock or writers have privilege
			 */
			if ((word.want_excl || word.want_upgrade) &&
			    ((word.shared_count == 0) || word.priv_excl)) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
				    trace_lck, word.want_excl, word.want_upgrade, 0, 0);

				/* publish that a reader is waiting before going to sleep */
				word.r_waiting = 1;
				ordered_store_rw(lck, word.data);

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
				res = assert_wait(LCK_RW_READER_EVENT(lck),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
				    trace_lck, res, slept, 0, 0);
			} else {
				/* no need to wait: take a read count while holding the interlock */
				word.shared_count++;
				ordered_store_rw(lck, word.data);
				lck_interlock_unlock(lck, istate);
				break;
			}
		}
	}

#if	CONFIG_DTRACE
	/* report only if a lockstat probe was enabled when we started waiting */
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
		} else {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
			    mach_absolute_time() - wait_interval, 0,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
#endif	/* CONFIG_DTRACE */
}
1962
1963/*
1964 * Required to verify thread ownership for exclusive locks by virtue of PPL
1965 * usage
1966 */
1967void
1968lck_rw_assert(
1969 lck_rw_t *lck,
1970 unsigned int type)
1971{
1972 switch (type) {
1973 case LCK_RW_ASSERT_SHARED:
1974 if ((lck->lck_rw_shared_count != 0) &&
1975 (lck->lck_rw_owner == THREAD_NULL)) {
1976 return;
1977 }
1978 break;
1979 case LCK_RW_ASSERT_EXCLUSIVE:
1980 if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
1981 (lck->lck_rw_shared_count == 0) &&
1982 (lck->lck_rw_owner == current_thread())) {
1983 return;
1984 }
1985 break;
1986 case LCK_RW_ASSERT_HELD:
1987 if (lck->lck_rw_shared_count != 0) {
1988 return; // Held shared
1989 }
1990 if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
1991 (lck->lck_rw_owner == current_thread())) {
1992 return; // Held exclusive
1993 }
1994 break;
1995 case LCK_RW_ASSERT_NOTHELD:
1996 if ((lck->lck_rw_shared_count == 0) &&
1997 !(lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
1998 (lck->lck_rw_owner == THREAD_NULL)) {
1999 return;
2000 }
2001 break;
2002 default:
2003 break;
2004 }
2005 panic("rw lock (%p)%s held (mode=%u)", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type);
2006}
2007
2008
2009/*
2010 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
2011 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2012 */
2013boolean_t
2014kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck)
2015{
2016 if (not_in_kdp) {
2017 panic("panic: rw lock exclusive check done outside of kernel debugger");
2018 }
2019 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_excl) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2020}
2021
2022/*
2023 * The C portion of the mutex package. These routines are only invoked
2024 * if the optimized assembler routines can't do the work.
2025 */
2026
/*
 * Forward declaration: lck_mtx_ext_init() is referenced by the mutex
 * init routines below before its definition appears in this file.
 */

void
lck_mtx_ext_init(
	lck_mtx_ext_t * lck,
	lck_grp_t * grp,
	lck_attr_t * attr);
2036
2037/*
2038 * Routine: lck_mtx_alloc_init
2039 */
2040lck_mtx_t *
2041lck_mtx_alloc_init(
2042 lck_grp_t * grp,
2043 lck_attr_t * attr)
2044{
2045 lck_mtx_t *lck;
2046
2047 lck = zalloc(ZV_LCK_MTX);
2048 lck_mtx_init(lck, grp, attr);
2049 return lck;
2050}
2051
2052/*
2053 * Routine: lck_mtx_free
2054 */
2055void
2056lck_mtx_free(
2057 lck_mtx_t * lck,
2058 lck_grp_t * grp)
2059{
2060 lck_mtx_destroy(lck, grp);
2061 zfree(ZV_LCK_MTX, lck);
2062}
2063
2064/*
2065 * Routine: lck_mtx_init
2066 */
2067void
2068lck_mtx_init(
2069 lck_mtx_t * lck,
2070 lck_grp_t * grp,
2071 lck_attr_t * attr)
2072{
2073#ifdef BER_XXX
2074 lck_mtx_ext_t *lck_ext;
2075#endif
2076 lck_attr_t *lck_attr;
2077
2078 if (attr != LCK_ATTR_NULL) {
2079 lck_attr = attr;
2080 } else {
2081 lck_attr = &LockDefaultLckAttr;
2082 }
2083
2084#ifdef BER_XXX
2085 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2086 lck_ext = zalloc(ZV_LCK_MTX_EXT);
2087 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2088 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2089 lck->lck_mtx_ptr = lck_ext;
2090 lck->lck_mtx_type = LCK_MTX_TYPE;
2091 } else
2092#endif
2093 {
2094 lck->lck_mtx_ptr = NULL; // Clear any padding in the union fields below
2095 lck->lck_mtx_waiters = 0;
2096 lck->lck_mtx_type = LCK_MTX_TYPE;
2097 ordered_store_mtx(lck, 0);
2098 }
2099 lck_grp_reference(grp);
2100 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2101}
2102
2103/*
2104 * Routine: lck_mtx_init_ext
2105 */
2106void
2107lck_mtx_init_ext(
2108 lck_mtx_t * lck,
2109 lck_mtx_ext_t * lck_ext,
2110 lck_grp_t * grp,
2111 lck_attr_t * attr)
2112{
2113 lck_attr_t *lck_attr;
2114
2115 if (attr != LCK_ATTR_NULL) {
2116 lck_attr = attr;
2117 } else {
2118 lck_attr = &LockDefaultLckAttr;
2119 }
2120
2121 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2122 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2123 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2124 lck->lck_mtx_ptr = lck_ext;
2125 lck->lck_mtx_type = LCK_MTX_TYPE;
2126 } else {
2127 lck->lck_mtx_waiters = 0;
2128 lck->lck_mtx_type = LCK_MTX_TYPE;
2129 ordered_store_mtx(lck, 0);
2130 }
2131 lck_grp_reference(grp);
2132 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2133}
2134
2135/*
2136 * Routine: lck_mtx_ext_init
2137 */
2138void
2139lck_mtx_ext_init(
2140 lck_mtx_ext_t * lck,
2141 lck_grp_t * grp,
2142 lck_attr_t * attr)
2143{
2144 bzero((void *) lck, sizeof(lck_mtx_ext_t));
2145
2146 lck->lck_mtx.lck_mtx_type = LCK_MTX_TYPE;
2147
2148 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2149 lck->lck_mtx_deb.type = MUTEX_TAG;
2150 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2151 }
2152 lck->lck_mtx_grp = grp;
2153
2154 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) {
2155 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
2156 }
2157}
2158
2159/* The slow versions */
2160static void lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2161static boolean_t lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread);
2162static void lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2163
2164/* The adaptive spin function */
2165static spinwait_result_t lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2166
2167/*
2168 * Routine: lck_mtx_verify
2169 *
2170 * Verify if a mutex is valid
2171 */
2172static inline void
2173lck_mtx_verify(lck_mtx_t *lock)
2174{
2175 if (lock->lck_mtx_type != LCK_MTX_TYPE) {
2176 panic("Invalid mutex %p", lock);
2177 }
2178#if DEVELOPMENT || DEBUG
2179 if (lock->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
2180 panic("Mutex destroyed %p", lock);
2181 }
2182#endif /* DEVELOPMENT || DEBUG */
2183}
2184
2185/*
2186 * Routine: lck_mtx_check_preemption
2187 *
2188 * Verify preemption is enabled when attempting to acquire a mutex.
2189 */
2190
2191static inline void
2192lck_mtx_check_preemption(lck_mtx_t *lock)
2193{
2194#if DEVELOPMENT || DEBUG
2195 if (current_cpu_datap()->cpu_hibernate) {
2196 return;
2197 }
2198
2199 int pl = get_preemption_level();
2200
2201 if (pl != 0) {
2202 panic("Attempt to take mutex with preemption disabled. Lock=%p, level=%d", lock, pl);
2203 }
2204#else
2205 (void)lock;
2206#endif
2207}
2208
2209/*
2210 * Routine: lck_mtx_lock
2211 */
2212void
2213lck_mtx_lock(lck_mtx_t *lock)
2214{
2215 thread_t thread;
2216
2217 lck_mtx_verify(lock);
2218 lck_mtx_check_preemption(lock);
2219 thread = current_thread();
2220 if (os_atomic_cmpxchg(&lock->lck_mtx_data,
2221 0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
2222#if CONFIG_DTRACE
2223 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
2224#endif /* CONFIG_DTRACE */
2225 return;
2226 }
2227 lck_mtx_lock_contended(lock, thread, FALSE);
2228}
2229
2230/*
2231 * This is the slow version of mutex locking.
2232 */
2233static void NOINLINE
2234lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
2235{
2236 thread_t holding_thread;
2237 uintptr_t state;
2238 int waiters = 0;
2239 spinwait_result_t sw_res;
2240 struct turnstile *ts = NULL;
2241
2242 /* Loop waiting until I see that the mutex is unowned */
2243 for (;;) {
2244 sw_res = lck_mtx_lock_contended_spinwait_arm(lock, thread, interlocked);
2245 interlocked = FALSE;
2246
2247 switch (sw_res) {
2248 case SPINWAIT_ACQUIRED:
2249 if (ts != NULL) {
2250 interlock_lock(lock);
2251 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2252 interlock_unlock(lock);
2253 }
2254 goto done;
2255 case SPINWAIT_INTERLOCK:
2256 goto set_owner;
2257 default:
2258 break;
2259 }
2260
2261 state = ordered_load_mtx(lock);
2262 holding_thread = LCK_MTX_STATE_TO_THREAD(state);
2263 if (holding_thread == NULL) {
2264 break;
2265 }
2266 ordered_store_mtx(lock, (state | LCK_ILOCK | ARM_LCK_WAITERS)); // Set waiters bit and wait
2267 lck_mtx_lock_wait(lock, holding_thread, &ts);
2268 /* returns interlock unlocked */
2269 }
2270
2271set_owner:
2272 /* Hooray, I'm the new owner! */
2273 state = ordered_load_mtx(lock);
2274
2275 if (state & ARM_LCK_WAITERS) {
2276 /* Skip lck_mtx_lock_acquire if there are no waiters. */
2277 waiters = lck_mtx_lock_acquire(lock, ts);
2278 /*
2279 * lck_mtx_lock_acquire will call
2280 * turnstile_complete
2281 */
2282 } else {
2283 if (ts != NULL) {
2284 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2285 }
2286 }
2287
2288 state = LCK_MTX_THREAD_TO_STATE(thread);
2289 if (waiters != 0) {
2290 state |= ARM_LCK_WAITERS;
2291 }
2292 state |= LCK_ILOCK; // Preserve interlock
2293 ordered_store_mtx(lock, state); // Set ownership
2294 interlock_unlock(lock); // Release interlock, enable preemption
2295
2296done:
2297 load_memory_barrier();
2298
2299 assert(thread->turnstile != NULL);
2300
2301 if (ts != NULL) {
2302 turnstile_cleanup();
2303 }
2304
2305#if CONFIG_DTRACE
2306 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
2307#endif /* CONFIG_DTRACE */
2308}
2309
2310/*
2311 * Routine: lck_mtx_lock_spinwait_arm
2312 *
2313 * Invoked trying to acquire a mutex when there is contention but
2314 * the holder is running on another processor. We spin for up to a maximum
2315 * time waiting for the lock to be released.
2316 */
2317static spinwait_result_t
2318lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
2319{
2320 int has_interlock = (int)interlocked;
2321 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
2322 thread_t owner, prev_owner;
2323 uint64_t window_deadline, sliding_deadline, high_deadline;
2324 uint64_t start_time, cur_time, avg_hold_time, bias, delta;
2325 int loopcount = 0;
2326 uint i, prev_owner_cpu;
2327 int total_hold_time_samples, window_hold_time_samples, unfairness;
2328 bool owner_on_core, adjust;
2329 uintptr_t state, new_state, waiters;
2330 spinwait_result_t retval = SPINWAIT_DID_SPIN_HIGH_THR;
2331
2332 if (__improbable(!(lck_mtx_adaptive_spin_mode & ADAPTIVE_SPIN_ENABLE))) {
2333 if (!has_interlock) {
2334 interlock_lock(lock);
2335 }
2336
2337 return SPINWAIT_DID_NOT_SPIN;
2338 }
2339
2340 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
2341 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(LCK_MTX_STATE_TO_THREAD(state)), lock->lck_mtx_waiters, 0, 0);
2342
2343 start_time = mach_absolute_time();
2344 /*
2345 * window_deadline represents the "learning" phase.
2346 * The thread collects statistics about the lock during
2347 * window_deadline and then it makes a decision on whether to spin more
2348 * or block according to the concurrency behavior
2349 * observed.
2350 *
2351 * Every thread can spin at least low_MutexSpin.
2352 */
2353 window_deadline = start_time + low_MutexSpin;
2354 /*
2355 * Sliding_deadline is the adjusted spin deadline
2356 * computed after the "learning" phase.
2357 */
2358 sliding_deadline = window_deadline;
2359 /*
2360 * High_deadline is a hard deadline. No thread
2361 * can spin more than this deadline.
2362 */
2363 if (high_MutexSpin >= 0) {
2364 high_deadline = start_time + high_MutexSpin;
2365 } else {
2366 high_deadline = start_time + low_MutexSpin * real_ncpus;
2367 }
2368
2369 /*
2370 * Do not know yet which is the owner cpu.
2371 * Initialize prev_owner_cpu with next cpu.
2372 */
2373 prev_owner_cpu = (cpu_number() + 1) % real_ncpus;
2374 total_hold_time_samples = 0;
2375 window_hold_time_samples = 0;
2376 avg_hold_time = 0;
2377 adjust = TRUE;
2378 bias = (os_hash_kernel_pointer(lock) + cpu_number()) % real_ncpus;
2379
2380 /* Snoop the lock state */
2381 state = ordered_load_mtx(lock);
2382 owner = LCK_MTX_STATE_TO_THREAD(state);
2383 prev_owner = owner;
2384
2385 if (has_interlock) {
2386 if (owner == NULL) {
2387 retval = SPINWAIT_INTERLOCK;
2388 goto done_spinning;
2389 } else {
2390 /*
2391 * We are holding the interlock, so
2392 * we can safely dereference owner.
2393 */
2394 if (!machine_thread_on_core(owner) || (owner->state & TH_IDLE)) {
2395 retval = SPINWAIT_DID_NOT_SPIN;
2396 goto done_spinning;
2397 }
2398 }
2399 interlock_unlock(lock);
2400 has_interlock = 0;
2401 }
2402
2403 /*
2404 * Spin while:
2405 * - mutex is locked, and
2406 * - it's locked as a spin lock, and
2407 * - owner is running on another processor, and
2408 * - we haven't spun for long enough.
2409 */
2410 do {
2411 /*
2412 * Try to acquire the lock.
2413 */
2414 owner = LCK_MTX_STATE_TO_THREAD(state);
2415 if (owner == NULL) {
2416 waiters = state & ARM_LCK_WAITERS;
2417 if (waiters) {
2418 /*
2419 * preserve the waiter bit
2420 * and try acquire the interlock.
2421 * Note: we will successfully acquire
2422 * the interlock only if we can also
2423 * acquire the lock.
2424 */
2425 new_state = ARM_LCK_WAITERS | LCK_ILOCK;
2426 has_interlock = 1;
2427 retval = SPINWAIT_INTERLOCK;
2428 disable_preemption();
2429 } else {
2430 new_state = LCK_MTX_THREAD_TO_STATE(thread);
2431 retval = SPINWAIT_ACQUIRED;
2432 }
2433
2434 /*
2435 * The cmpxchg will succed only if the lock
2436 * is not owned (doesn't have an owner set)
2437 * and it is not interlocked.
2438 * It will not fail if there are waiters.
2439 */
2440 if (os_atomic_cmpxchgv(&lock->lck_mtx_data,
2441 waiters, new_state, &state, acquire)) {
2442 goto done_spinning;
2443 } else {
2444 if (waiters) {
2445 has_interlock = 0;
2446 enable_preemption();
2447 }
2448 }
2449 }
2450
2451 cur_time = mach_absolute_time();
2452
2453 /*
2454 * Never spin past high_deadline.
2455 */
2456 if (cur_time >= high_deadline) {
2457 retval = SPINWAIT_DID_SPIN_HIGH_THR;
2458 break;
2459 }
2460
2461 /*
2462 * Check if owner is on core. If not block.
2463 */
2464 owner = LCK_MTX_STATE_TO_THREAD(state);
2465 if (owner) {
2466 i = prev_owner_cpu;
2467 owner_on_core = FALSE;
2468
2469 disable_preemption();
2470 state = ordered_load_mtx(lock);
2471 owner = LCK_MTX_STATE_TO_THREAD(state);
2472
2473 /*
2474 * For scalability we want to check if the owner is on core
2475 * without locking the mutex interlock.
2476 * If we do not lock the mutex interlock, the owner that we see might be
2477 * invalid, so we cannot dereference it. Therefore we cannot check
2478 * any field of the thread to tell us if it is on core.
2479 * Check if the thread that is running on the other cpus matches the owner.
2480 */
2481 if (owner) {
2482 do {
2483 cpu_data_t *cpu_data_ptr = CpuDataEntries[i].cpu_data_vaddr;
2484 if ((cpu_data_ptr != NULL) && (cpu_data_ptr->cpu_active_thread == owner)) {
2485 owner_on_core = TRUE;
2486 break;
2487 }
2488 if (++i >= real_ncpus) {
2489 i = 0;
2490 }
2491 } while (i != prev_owner_cpu);
2492 enable_preemption();
2493
2494 if (owner_on_core) {
2495 prev_owner_cpu = i;
2496 } else {
2497 prev_owner = owner;
2498 state = ordered_load_mtx(lock);
2499 owner = LCK_MTX_STATE_TO_THREAD(state);
2500 if (owner == prev_owner) {
2501 /*
2502 * Owner is not on core.
2503 * Stop spinning.
2504 */
2505 if (loopcount == 0) {
2506 retval = SPINWAIT_DID_NOT_SPIN;
2507 } else {
2508 retval = SPINWAIT_DID_SPIN_OWNER_NOT_CORE;
2509 }
2510 break;
2511 }
2512 /*
2513 * Fall through if the owner changed while we were scanning.
2514 * The new owner could potentially be on core, so loop
2515 * again.
2516 */
2517 }
2518 } else {
2519 enable_preemption();
2520 }
2521 }
2522
2523 /*
2524 * Save how many times we see the owner changing.
2525 * We can roughly estimate the the mutex hold
2526 * time and the fairness with that.
2527 */
2528 if (owner != prev_owner) {
2529 prev_owner = owner;
2530 total_hold_time_samples++;
2531 window_hold_time_samples++;
2532 }
2533
2534 /*
2535 * Learning window expired.
2536 * Try to adjust the sliding_deadline.
2537 */
2538 if (cur_time >= window_deadline) {
2539 /*
2540 * If there was not contention during the window
2541 * stop spinning.
2542 */
2543 if (window_hold_time_samples < 1) {
2544 retval = SPINWAIT_DID_SPIN_NO_WINDOW_CONTENTION;
2545 break;
2546 }
2547
2548 if (adjust) {
2549 /*
2550 * For a fair lock, we'd wait for at most (NCPU-1) periods,
2551 * but the lock is unfair, so let's try to estimate by how much.
2552 */
2553 unfairness = total_hold_time_samples / real_ncpus;
2554
2555 if (unfairness == 0) {
2556 /*
2557 * We observed the owner changing `total_hold_time_samples` times which
2558 * let us estimate the average hold time of this mutex for the duration
2559 * of the spin time.
2560 * avg_hold_time = (cur_time - start_time) / total_hold_time_samples;
2561 *
2562 * In this case spin at max avg_hold_time * (real_ncpus - 1)
2563 */
2564 delta = cur_time - start_time;
2565 sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples;
2566 } else {
2567 /*
2568 * In this case at least one of the other cpus was able to get the lock twice
2569 * while I was spinning.
2570 * We could spin longer but it won't necessarily help if the system is unfair.
2571 * Try to randomize the wait to reduce contention.
2572 *
2573 * We compute how much time we could potentially spin
2574 * and distribute it over the cpus.
2575 *
2576 * bias is an integer between 0 and real_ncpus.
2577 * distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias
2578 */
2579 delta = high_deadline - cur_time;
2580 sliding_deadline = cur_time + ((delta * bias) / real_ncpus);
2581 adjust = FALSE;
2582 }
2583 }
2584
2585 window_deadline += low_MutexSpin;
2586 window_hold_time_samples = 0;
2587 }
2588
2589 /*
2590 * Stop spinning if we past
2591 * the adjusted deadline.
2592 */
2593 if (cur_time >= sliding_deadline) {
2594 retval = SPINWAIT_DID_SPIN_SLIDING_THR;
2595 break;
2596 }
2597
2598 /*
2599 * We want to arm the monitor for wfe,
2600 * so load exclusively the lock.
2601 *
2602 * NOTE:
2603 * we rely on the fact that wfe will
2604 * eventually return even if the cache line
2605 * is not modified. This way we will keep
2606 * looping and checking if the deadlines expired.
2607 */
2608 state = os_atomic_load_exclusive(&lock->lck_mtx_data, relaxed);
2609 owner = LCK_MTX_STATE_TO_THREAD(state);
2610 if (owner != NULL) {
2611 wait_for_event();
2612 state = ordered_load_mtx(lock);
2613 } else {
2614 atomic_exchange_abort();
2615 }
2616
2617 loopcount++;
2618 } while (TRUE);
2619
2620done_spinning:
2621#if CONFIG_DTRACE
2622 /*
2623 * Note that we record a different probe id depending on whether
2624 * this is a direct or indirect mutex. This allows us to
2625 * penalize only lock groups that have debug/stats enabled
2626 * with dtrace processing if desired.
2627 */
2628 if (__probable(lock->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)) {
2629 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, lock,
2630 mach_absolute_time() - start_time);
2631 } else {
2632 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, lock,
2633 mach_absolute_time() - start_time);
2634 }
2635 /* The lockstat acquire event is recorded by the caller. */
2636#endif
2637
2638 state = ordered_load_mtx(lock);
2639
2640 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
2641 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(LCK_MTX_STATE_TO_THREAD(state)), lock->lck_mtx_waiters, retval, 0);
2642 if ((!has_interlock) && (retval != SPINWAIT_ACQUIRED)) {
2643 /* We must own either the lock or the interlock on return. */
2644 interlock_lock(lock);
2645 }
2646
2647 return retval;
2648}
2649
2650
2651/*
2652 * Common code for mutex locking as spinlock
2653 */
2654static inline void
2655lck_mtx_lock_spin_internal(lck_mtx_t *lock, boolean_t allow_held_as_mutex)
2656{
2657 uintptr_t state;
2658
2659 interlock_lock(lock);
2660 state = ordered_load_mtx(lock);
2661 if (LCK_MTX_STATE_TO_THREAD(state)) {
2662 if (allow_held_as_mutex) {
2663 lck_mtx_lock_contended(lock, current_thread(), TRUE);
2664 } else {
2665 // "Always" variants can never block. If the lock is held and blocking is not allowed
2666 // then someone is mixing always and non-always calls on the same lock, which is
2667 // forbidden.
2668 panic("Attempting to block on a lock taken as spin-always %p", lock);
2669 }
2670 return;
2671 }
2672 state &= ARM_LCK_WAITERS; // Preserve waiters bit
2673 state |= (LCK_MTX_SPIN_TAG | LCK_ILOCK); // Add spin tag and maintain interlock
2674 ordered_store_mtx(lock, state);
2675 load_memory_barrier();
2676
2677#if CONFIG_DTRACE
2678 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
2679#endif /* CONFIG_DTRACE */
2680}
2681
2682/*
2683 * Routine: lck_mtx_lock_spin
2684 */
2685void
2686lck_mtx_lock_spin(lck_mtx_t *lock)
2687{
2688 lck_mtx_check_preemption(lock);
2689 lck_mtx_lock_spin_internal(lock, TRUE);
2690}
2691
2692/*
2693 * Routine: lck_mtx_lock_spin_always
2694 */
2695void
2696lck_mtx_lock_spin_always(lck_mtx_t *lock)
2697{
2698 lck_mtx_lock_spin_internal(lock, FALSE);
2699}
2700
2701/*
2702 * Routine: lck_mtx_try_lock
2703 */
2704boolean_t
2705lck_mtx_try_lock(lck_mtx_t *lock)
2706{
2707 thread_t thread = current_thread();
2708
2709 lck_mtx_verify(lock);
2710 if (os_atomic_cmpxchg(&lock->lck_mtx_data,
2711 0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
2712#if CONFIG_DTRACE
2713 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_ACQUIRE, lock, 0);
2714#endif /* CONFIG_DTRACE */
2715 return TRUE;
2716 }
2717 return lck_mtx_try_lock_contended(lock, thread);
2718}
2719
2720static boolean_t NOINLINE
2721lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread)
2722{
2723 thread_t holding_thread;
2724 uintptr_t state;
2725 int waiters;
2726
2727 interlock_lock(lock);
2728 state = ordered_load_mtx(lock);
2729 holding_thread = LCK_MTX_STATE_TO_THREAD(state);
2730 if (holding_thread) {
2731 interlock_unlock(lock);
2732 return FALSE;
2733 }
2734 waiters = lck_mtx_lock_acquire(lock, NULL);
2735 state = LCK_MTX_THREAD_TO_STATE(thread);
2736 if (waiters != 0) {
2737 state |= ARM_LCK_WAITERS;
2738 }
2739 state |= LCK_ILOCK; // Preserve interlock
2740 ordered_store_mtx(lock, state); // Set ownership
2741 interlock_unlock(lock); // Release interlock, enable preemption
2742 load_memory_barrier();
2743
2744 turnstile_cleanup();
2745
2746 return TRUE;
2747}
2748
2749static inline boolean_t
2750lck_mtx_try_lock_spin_internal(lck_mtx_t *lock, boolean_t allow_held_as_mutex)
2751{
2752 uintptr_t state;
2753
2754 if (!interlock_try(lock)) {
2755 return FALSE;
2756 }
2757 state = ordered_load_mtx(lock);
2758 if (LCK_MTX_STATE_TO_THREAD(state)) {
2759 // Lock is held as mutex
2760 if (allow_held_as_mutex) {
2761 interlock_unlock(lock);
2762 } else {
2763 // "Always" variants can never block. If the lock is held as a normal mutex
2764 // then someone is mixing always and non-always calls on the same lock, which is
2765 // forbidden.
2766 panic("Spin-mutex held as full mutex %p", lock);
2767 }
2768 return FALSE;
2769 }
2770 state &= ARM_LCK_WAITERS; // Preserve waiters bit
2771 state |= (LCK_MTX_SPIN_TAG | LCK_ILOCK); // Add spin tag and maintain interlock
2772 ordered_store_mtx(lock, state);
2773 load_memory_barrier();
2774
2775#if CONFIG_DTRACE
2776 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
2777#endif /* CONFIG_DTRACE */
2778 return TRUE;
2779}
2780
2781/*
2782 * Routine: lck_mtx_try_lock_spin
2783 */
2784boolean_t
2785lck_mtx_try_lock_spin(lck_mtx_t *lock)
2786{
2787 return lck_mtx_try_lock_spin_internal(lock, TRUE);
2788}
2789
2790/*
2791 * Routine: lck_mtx_try_lock_spin_always
2792 */
2793boolean_t
2794lck_mtx_try_lock_spin_always(lck_mtx_t *lock)
2795{
2796 return lck_mtx_try_lock_spin_internal(lock, FALSE);
2797}
2798
2799
2800
2801/*
2802 * Routine: lck_mtx_unlock
2803 */
2804void
2805lck_mtx_unlock(lck_mtx_t *lock)
2806{
2807 thread_t thread = current_thread();
2808 uintptr_t state;
2809 boolean_t ilk_held = FALSE;
2810
2811 lck_mtx_verify(lock);
2812
2813 state = ordered_load_mtx(lock);
2814 if (state & LCK_ILOCK) {
2815 if (LCK_MTX_STATE_TO_THREAD(state) == (thread_t)LCK_MTX_SPIN_TAG) {
2816 ilk_held = TRUE; // Interlock is held by (presumably) this thread
2817 }
2818 goto slow_case;
2819 }
2820 // Locked as a mutex
2821 if (os_atomic_cmpxchg(&lock->lck_mtx_data,
2822 LCK_MTX_THREAD_TO_STATE(thread), 0, release)) {
2823#if CONFIG_DTRACE
2824 LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
2825#endif /* CONFIG_DTRACE */
2826 return;
2827 }
2828slow_case:
2829 lck_mtx_unlock_contended(lock, thread, ilk_held);
2830}
2831
2832static void NOINLINE
2833lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t ilk_held)
2834{
2835 uintptr_t state;
2836 boolean_t cleanup = FALSE;
2837
2838 if (ilk_held) {
2839 state = ordered_load_mtx(lock);
2840 } else {
2841 interlock_lock(lock);
2842 state = ordered_load_mtx(lock);
2843 if (thread != LCK_MTX_STATE_TO_THREAD(state)) {
2844 panic("lck_mtx_unlock(): Attempt to release lock not owned by thread (%p)", lock);
2845 }
2846 if (state & ARM_LCK_WAITERS) {
2847 if (lck_mtx_unlock_wakeup(lock, thread)) {
2848 state = ARM_LCK_WAITERS;
2849 } else {
2850 state = 0;
2851 }
2852 cleanup = TRUE;
2853 goto unlock;
2854 }
2855 }
2856 state &= ARM_LCK_WAITERS; /* Clear state, retain waiters bit */
2857unlock:
2858 state |= LCK_ILOCK;
2859 ordered_store_mtx(lock, state);
2860 interlock_unlock(lock);
2861 if (cleanup) {
2862 /*
2863 * Do not do any turnstile operations outside of this block.
2864 * lock/unlock is called at early stage of boot with single thread,
2865 * when turnstile is not yet initialized.
2866 * Even without contention we can come throught the slow path
2867 * if the mutex is acquired as a spin lock.
2868 */
2869 turnstile_cleanup();
2870 }
2871
2872#if CONFIG_DTRACE
2873 LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
2874#endif /* CONFIG_DTRACE */
2875}
2876
2877/*
2878 * Routine: lck_mtx_assert
2879 */
2880void
2881lck_mtx_assert(lck_mtx_t *lock, unsigned int type)
2882{
2883 thread_t thread, holder;
2884 uintptr_t state;
2885
2886 state = ordered_load_mtx(lock);
2887 holder = LCK_MTX_STATE_TO_THREAD(state);
2888 if (holder == (thread_t)LCK_MTX_SPIN_TAG) {
2889 // Lock is held in spin mode, owner is unknown.
2890 return; // Punt
2891 }
2892 thread = current_thread();
2893 if (type == LCK_MTX_ASSERT_OWNED) {
2894 if (thread != holder) {
2895 panic("lck_mtx_assert(): mutex (%p) owned", lock);
2896 }
2897 } else if (type == LCK_MTX_ASSERT_NOTOWNED) {
2898 if (thread == holder) {
2899 panic("lck_mtx_assert(): mutex (%p) not owned", lock);
2900 }
2901 } else {
2902 panic("lck_mtx_assert(): invalid arg (%u)", type);
2903 }
2904}
2905
2906/*
2907 * Routine: lck_mtx_ilk_unlock
2908 */
2909boolean_t
2910lck_mtx_ilk_unlock(lck_mtx_t *lock)
2911{
2912 interlock_unlock(lock);
2913 return TRUE;
2914}
2915
2916/*
2917 * Routine: lck_mtx_convert_spin
2918 *
2919 * Convert a mutex held for spin into a held full mutex
2920 */
2921void
2922lck_mtx_convert_spin(lck_mtx_t *lock)
2923{
2924 thread_t thread = current_thread();
2925 uintptr_t state;
2926 int waiters;
2927
2928 state = ordered_load_mtx(lock);
2929 if (LCK_MTX_STATE_TO_THREAD(state) == thread) {
2930 return; // Already owned as mutex, return
2931 }
2932 if ((state & LCK_ILOCK) == 0 || (LCK_MTX_STATE_TO_THREAD(state) != (thread_t)LCK_MTX_SPIN_TAG)) {
2933 panic("lck_mtx_convert_spin: Not held as spinlock (%p)", lock);
2934 }
2935 state &= ~(LCK_MTX_THREAD_MASK); // Clear the spin tag
2936 ordered_store_mtx(lock, state);
2937 waiters = lck_mtx_lock_acquire(lock, NULL); // Acquire to manage priority boosts
2938 state = LCK_MTX_THREAD_TO_STATE(thread);
2939 if (waiters != 0) {
2940 state |= ARM_LCK_WAITERS;
2941 }
2942 state |= LCK_ILOCK;
2943 ordered_store_mtx(lock, state); // Set ownership
2944 interlock_unlock(lock); // Release interlock, enable preemption
2945 turnstile_cleanup();
2946}
2947
2948
2949/*
2950 * Routine: lck_mtx_destroy
2951 */
2952void
2953lck_mtx_destroy(
2954 lck_mtx_t * lck,
2955 lck_grp_t * grp)
2956{
2957 if (lck->lck_mtx_type != LCK_MTX_TYPE) {
2958 panic("Destroying invalid mutex %p", lck);
2959 }
2960 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
2961 panic("Destroying previously destroyed lock %p", lck);
2962 }
2963 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2964 lck->lck_mtx_tag = LCK_MTX_TAG_DESTROYED;
2965 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2966 lck_grp_deallocate(grp);
2967 return;
2968}
2969
2970/*
2971 * Routine: lck_spin_assert
2972 */
2973void
2974lck_spin_assert(lck_spin_t *lock, unsigned int type)
2975{
2976 thread_t thread, holder;
2977 uintptr_t state;
2978
2979 if (lock->type != LCK_SPIN_TYPE) {
2980 panic("Invalid spinlock %p", lock);
2981 }
2982
2983 state = lock->lck_spin_data;
2984 holder = (thread_t)(state & ~LCK_ILOCK);
2985 thread = current_thread();
2986 if (type == LCK_ASSERT_OWNED) {
2987 if (holder == 0) {
2988 panic("Lock not owned %p = %lx", lock, state);
2989 }
2990 if (holder != thread) {
2991 panic("Lock not owned by current thread %p = %lx", lock, state);
2992 }
2993 if ((state & LCK_ILOCK) == 0) {
2994 panic("Lock bit not set %p = %lx", lock, state);
2995 }
2996 } else if (type == LCK_ASSERT_NOTOWNED) {
2997 if (holder != 0) {
2998 if (holder == thread) {
2999 panic("Lock owned by current thread %p = %lx", lock, state);
3000 }
3001 }
3002 } else {
3003 panic("lck_spin_assert(): invalid arg (%u)", type);
3004 }
3005}
3006
3007boolean_t
3008lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
3009{
3010 lck_rw_word_t word;
3011
3012 lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
3013
3014 word.data = ordered_load_rw(lck);
3015 if (word.want_excl || word.want_upgrade || force_yield) {
3016 lck_rw_unlock_shared(lck);
3017 mutex_pause(2);
3018 lck_rw_lock_shared(lck);
3019 return TRUE;
3020 }
3021
3022 return FALSE;
3023}
3024
3025/*
3026 * Routine: kdp_lck_mtx_lock_spin_is_acquired
3027 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
3028 */
3029boolean_t
3030kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
3031{
3032 uintptr_t state;
3033
3034 if (not_in_kdp) {
3035 panic("panic: spinlock acquired check done outside of kernel debugger");
3036 }
3037 state = ordered_load_mtx(lck);
3038 if (state == LCK_MTX_TAG_DESTROYED) {
3039 return FALSE;
3040 }
3041 if (LCK_MTX_STATE_TO_THREAD(state) || (state & LCK_ILOCK)) {
3042 return TRUE;
3043 }
3044 return FALSE;
3045}
3046
3047void
3048kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3049{
3050 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
3051 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
3052 uintptr_t state = ordered_load_mtx(mutex);
3053 thread_t holder = LCK_MTX_STATE_TO_THREAD(state);
3054 if ((uintptr_t)holder == (uintptr_t)LCK_MTX_SPIN_TAG) {
3055 waitinfo->owner = STACKSHOT_WAITOWNER_MTXSPIN;
3056 } else {
3057 assertf(state != (uintptr_t)LCK_MTX_TAG_DESTROYED, "state=0x%llx", (uint64_t)state);
3058 assertf(state != (uintptr_t)LCK_MTX_TAG_INDIRECT, "state=0x%llx", (uint64_t)state);
3059 waitinfo->owner = thread_tid(holder);
3060 }
3061}
3062
3063void
3064kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3065{
3066 lck_rw_t *rwlck = NULL;
3067 switch (waitinfo->wait_type) {
3068 case kThreadWaitKernelRWLockRead:
3069 rwlck = READ_EVENT_TO_RWLOCK(event);
3070 break;
3071 case kThreadWaitKernelRWLockWrite:
3072 case kThreadWaitKernelRWLockUpgrade:
3073 rwlck = WRITE_EVENT_TO_RWLOCK(event);
3074 break;
3075 default:
3076 panic("%s was called with an invalid blocking type", __FUNCTION__);
3077 break;
3078 }
3079 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
3080 waitinfo->owner = thread_tid(rwlck->lck_rw_owner);
3081}