1 /*
2 * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System Copyright (c) 1991,1990,1989,1988,1987 Carnegie
33 * Mellon University All Rights Reserved.
34 *
35 * Permission to use, copy, modify and distribute this software and its
36 * documentation is hereby granted, provided that both the copyright notice
37 * and this permission notice appear in all copies of the software,
38 * derivative works or modified versions, and any portions thereof, and that
39 * both notices appear in supporting documentation.
40 *
41 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION.
42 * CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES
43 * WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
44 *
45 * Carnegie Mellon requests users of this software to return to
46 *
47 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
48 * School of Computer Science Carnegie Mellon University Pittsburgh PA
49 * 15213-3890
50 *
51 * any improvements or extensions that they make and grant Carnegie Mellon the
52 * rights to redistribute these changes.
53 */
54 /*
55 * File: kern/lock.c
56 * Author: Avadis Tevanian, Jr., Michael Wayne Young
57 * Date: 1985
58 *
59 * Locking primitives implementation
60 */
61
62 #define LOCK_PRIVATE 1
63
64 #include <mach_ldebug.h>
65
66 #include <kern/zalloc.h>
67 #include <kern/lock_stat.h>
68 #include <kern/locks.h>
69 #include <kern/misc_protos.h>
70 #include <kern/thread.h>
71 #include <kern/processor.h>
72 #include <kern/sched_prim.h>
73 #include <kern/debug.h>
74 #include <kern/kcdata.h>
75 #include <string.h>
76 #include <arm/cpu_internal.h>
77 #include <os/hash.h>
78 #include <arm/cpu_data.h>
79
80 #include <arm/cpu_data_internal.h>
81 #include <arm/proc_reg.h>
82 #include <arm/smp.h>
83 #include <machine/atomic.h>
84 #include <machine/machine_cpu.h>
85
86 #include <sys/kdebug.h>
87
88 #if CONFIG_DTRACE
89 #define DTRACE_RW_SHARED 0x0 //reader
90 #define DTRACE_RW_EXCL 0x1 //writer
91 #define DTRACE_NO_FLAG 0x0 //not applicable
92 #endif /* CONFIG_DTRACE */
93
94 #define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
95 #define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
96 #define LCK_RW_LCK_SHARED_CODE 0x102
97 #define LCK_RW_LCK_SH_TO_EX_CODE 0x103
98 #define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
99 #define LCK_RW_LCK_EX_TO_SH_CODE 0x105
100
101
102 #define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
103
104 // Panic in tests that check lock usage correctness
105 // These are undesirable when already in a panic or while a debugger is running.
106 #define LOCK_CORRECTNESS_PANIC() (kernel_debugger_entry_count == 0)
107
108 #define ADAPTIVE_SPIN_ENABLE 0x1
109
110 int lck_mtx_adaptive_spin_mode = ADAPTIVE_SPIN_ENABLE;
111
112 #define SPINWAIT_OWNER_CHECK_COUNT 4
113
114 typedef enum {
115 SPINWAIT_ACQUIRED, /* Got the lock. */
116 SPINWAIT_INTERLOCK, /* Got the interlock, no owner, but caller must finish acquiring the lock. */
117 SPINWAIT_DID_SPIN_HIGH_THR, /* Got the interlock, spun, but failed to get the lock. */
118 SPINWAIT_DID_SPIN_OWNER_NOT_CORE, /* Got the interlock, spun, but failed to get the lock. */
119 SPINWAIT_DID_SPIN_NO_WINDOW_CONTENTION, /* Got the interlock, spun, but failed to get the lock. */
120 SPINWAIT_DID_SPIN_SLIDING_THR,/* Got the interlock, spun, but failed to get the lock. */
121 SPINWAIT_DID_NOT_SPIN, /* Got the interlock, did not spin. */
122 } spinwait_result_t;
123
124 #if CONFIG_DTRACE
125 extern uint64_t dtrace_spin_threshold;
126 #endif
127
128 /* Forwards */
129
130 extern unsigned int not_in_kdp;
131
132 /*
133 * We often want to know the addresses of the callers
134 * of the various lock routines. However, this information
135 * is only used for debugging and statistics.
136 */
137 typedef void *pc_t;
138 #define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
139 #define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
140
141 #ifdef lint
142 /*
143 * Eliminate lint complaints about unused local pc variables.
144 */
145 #define OBTAIN_PC(pc, l) ++pc
146 #else /* lint */
147 #define OBTAIN_PC(pc, l)
148 #endif /* lint */
149
150
151 /*
152 * Portable lock package implementation of usimple_locks.
153 */
154
155 /*
156 * Owner thread pointer when lock held in spin mode
157 */
158 #define LCK_MTX_SPIN_TAG 0xfffffff0
159
160
161 #define interlock_lock(lock) hw_lock_bit ((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT, LCK_GRP_NULL)
162 #define interlock_try(lock) hw_lock_bit_try((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT, LCK_GRP_NULL)
163 #define interlock_unlock(lock) hw_unlock_bit ((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT)
164 #define lck_rw_ilk_lock(lock) hw_lock_bit ((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT, LCK_GRP_NULL)
165 #define lck_rw_ilk_unlock(lock) hw_unlock_bit((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT)
166
167 #define load_memory_barrier() os_atomic_thread_fence(acquire)
168
169 // Enforce program order of loads and stores.
170 #define ordered_load(target) \
171 os_atomic_load(target, compiler_acq_rel)
172 #define ordered_store(target, value) \
173 os_atomic_store(target, value, compiler_acq_rel)
174
175 #define ordered_load_mtx(lock) ordered_load(&(lock)->lck_mtx_data)
176 #define ordered_store_mtx(lock, value) ordered_store(&(lock)->lck_mtx_data, (value))
177 #define ordered_load_rw(lock) ordered_load(&(lock)->lck_rw_data)
178 #define ordered_store_rw(lock, value) ordered_store(&(lock)->lck_rw_data, (value))
179 #define ordered_load_rw_owner(lock) ordered_load(&(lock)->lck_rw_owner)
180 #define ordered_store_rw_owner(lock, value) ordered_store(&(lock)->lck_rw_owner, (value))
181 #define ordered_load_hw(lock) ordered_load(&(lock)->lock_data)
182 #define ordered_store_hw(lock, value) ordered_store(&(lock)->lock_data, (value))
183 #define ordered_load_bit(lock) ordered_load((lock))
184 #define ordered_store_bit(lock, value) ordered_store((lock), (value))
185
186
187 // Prevent the compiler from reordering memory operations around this
188 #define compiler_memory_fence() __asm__ volatile ("" ::: "memory")
189
190 #define LOCK_PANIC_TIMEOUT 0xc00000
191 #define NOINLINE __attribute__((noinline))
192
193
194 #if __arm__
195 #define interrupts_disabled(mask) (mask & PSR_INTMASK)
196 #else
197 #define interrupts_disabled(mask) (mask & DAIF_IRQF)
198 #endif
199
200
201 #if __arm__
202 #define enable_fiq() __asm__ volatile ("cpsie f" ::: "memory");
203 #define enable_interrupts() __asm__ volatile ("cpsie if" ::: "memory");
204 #endif
205
206 ZONE_VIEW_DEFINE(ZV_LCK_SPIN, "lck_spin",
207 KHEAP_ID_DEFAULT, sizeof(lck_spin_t));
208
209 ZONE_VIEW_DEFINE(ZV_LCK_MTX, "lck_mtx",
210 KHEAP_ID_DEFAULT, sizeof(lck_mtx_t));
211
212 ZONE_VIEW_DEFINE(ZV_LCK_MTX_EXT, "lck_mtx_ext",
213 KHEAP_ID_DEFAULT, sizeof(lck_mtx_ext_t));
214
215 ZONE_VIEW_DEFINE(ZV_LCK_RW, "lck_rw",
216 KHEAP_ID_DEFAULT, sizeof(lck_rw_t));
217
218 /*
219 * Forward declarations
220 */
221
222 static void lck_rw_lock_shared_gen(lck_rw_t *lck);
223 static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
224 static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
225 static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
226 static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
227 static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
228 static boolean_t lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait);
229
230 /*
231 * atomic exchange API is a low level abstraction of the operations
232 * to atomically read, modify, and write a pointer. This abstraction works
233 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
234 * well as the ARM exclusive instructions.
235 *
236 * atomic_exchange_begin() - begin exchange and retrieve current value
237 * atomic_exchange_complete() - conclude an exchange
238 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
239 */
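/*
 * Illustrative usage sketch (not a kernel routine in this file): the
 * typical begin/complete/abort retry loop, as used by lck_rw_grab() and
 * lck_rw_done() below.  set_flag_atomically() is a hypothetical helper
 * shown only to demonstrate the pattern.
 *
 *	static boolean_t
 *	set_flag_atomically(uint32_t *target, uint32_t flag)
 *	{
 *		uint32_t data, prev;
 *
 *		for (;;) {
 *			data = atomic_exchange_begin32(target, &prev, memory_order_acquire_smp);
 *			if (data & flag) {
 *				atomic_exchange_abort();	// release the exclusive monitor
 *				return FALSE;			// flag already set
 *			}
 *			data |= flag;
 *			if (atomic_exchange_complete32(target, prev, data, memory_order_acquire_smp)) {
 *				return TRUE;			// exchange concluded
 *			}
 *			cpu_pause();				// lost the reservation, retry
 *		}
 *	}
 */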
240 __unused static uint32_t
241 load_exclusive32(uint32_t *target, enum memory_order ord)
242 {
243 uint32_t value;
244
245 #if __arm__
246 if (_os_atomic_mo_has_release(ord)) {
247 // Pre-load release barrier
248 atomic_thread_fence(memory_order_release);
249 }
250 value = __builtin_arm_ldrex(target);
251 #else
252 if (_os_atomic_mo_has_acquire(ord)) {
253 value = __builtin_arm_ldaex(target); // ldaxr
254 } else {
255 value = __builtin_arm_ldrex(target); // ldxr
256 }
257 #endif // __arm__
258 return value;
259 }
260
261 __unused static boolean_t
262 store_exclusive32(uint32_t *target, uint32_t value, enum memory_order ord)
263 {
264 boolean_t err;
265
266 #if __arm__
267 err = __builtin_arm_strex(value, target);
268 if (_os_atomic_mo_has_acquire(ord)) {
269 // Post-store acquire barrier
270 atomic_thread_fence(memory_order_acquire);
271 }
272 #else
273 if (_os_atomic_mo_has_release(ord)) {
274 err = __builtin_arm_stlex(value, target); // stlxr
275 } else {
276 err = __builtin_arm_strex(value, target); // stxr
277 }
278 #endif // __arm__
279 return !err;
280 }
281
282 static uint32_t
283 atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
284 {
285 uint32_t val;
286
287 #if __ARM_ATOMICS_8_1
288 ord = memory_order_relaxed;
289 #endif
290 val = load_exclusive32(target, ord);
291 *previous = val;
292 return val;
293 }
294
295 static boolean_t
296 atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
297 {
298 #if __ARM_ATOMICS_8_1
299 return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
300 #else
301 (void)previous; // Previous not needed, monitor is held
302 return store_exclusive32(target, newval, ord);
303 #endif
304 }
305
306 static void
307 atomic_exchange_abort(void)
308 {
309 os_atomic_clear_exclusive();
310 }
311
312 static boolean_t
313 atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
314 {
315 uint32_t value, prev;
316
317 for (;;) {
318 value = atomic_exchange_begin32(target, &prev, ord);
319 if (value & test_mask) {
320 if (wait) {
321 wait_for_event(); // Wait with monitor held
322 } else {
323 atomic_exchange_abort(); // Clear exclusive monitor
324 }
325 return FALSE;
326 }
327 value |= set_mask;
328 if (atomic_exchange_complete32(target, prev, value, ord)) {
329 return TRUE;
330 }
331 }
332 }
333
334 inline boolean_t
335 hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
336 {
337 return atomic_test_and_set32(target, test_mask, set_mask, ord, wait);
338 }
339
340 /*
341 * To help _disable_preemption() inline everywhere with LTO,
342 * we keep these nice non inlineable functions as the panic()
343 * codegen setup is quite large and for weird reasons causes a frame.
344 */
345 __abortlike
346 static void
347 _disable_preemption_overflow(void)
348 {
349 panic("Preemption count overflow");
350 }
351
352 void
353 _disable_preemption(void)
354 {
355 thread_t thread = current_thread();
356 unsigned int count = thread->machine.preemption_count;
357
358 if (__improbable(++count == 0)) {
359 _disable_preemption_overflow();
360 }
361
362 os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);
363 }
364
365 /*
366 * This function checks whether an AST_URGENT has been pended.
367 *
368 * It is called once the preemption has been reenabled, which means the thread
369 * may have been preempted right before this was called, and when this function
370 * actually performs the check, we've changed CPU.
371 *
372 * This race is however benign: the point of AST_URGENT is to trigger a context
373 * switch, so if one happened, there's nothing left to check for, and AST_URGENT
374 * was cleared in the process.
375 *
376 * It follows that this check cannot have false negatives, which allows us
377 * to avoid fiddling with interrupt state for the vast majority of cases
378 * when the check will actually be negative.
379 */
380 static NOINLINE void
381 kernel_preempt_check(thread_t thread)
382 {
383 cpu_data_t *cpu_data_ptr;
384 long state;
385
386 #if __arm__
387 #define INTERRUPT_MASK PSR_IRQF
388 #else // __arm__
389 #define INTERRUPT_MASK DAIF_IRQF
390 #endif // __arm__
391
392 /*
393 * This check is racy and could load from another CPU's pending_ast mask,
394 * but as described above, this can't have false negatives.
395 */
396 cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel);
397 if (__probable((cpu_data_ptr->cpu_pending_ast & AST_URGENT) == 0)) {
398 return;
399 }
400
401 /* If interrupts are masked, we can't take an AST here */
402 state = get_interrupts();
403 if ((state & INTERRUPT_MASK) == 0) {
404 disable_interrupts_noread(); // Disable interrupts
405
406 /*
407 * Reload cpu_data_ptr: a context switch would cause it to change.
408 * Now that interrupts are disabled, this will debounce false positives.
409 */
410 cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel);
411 if (thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT) {
412 #if __arm__
413 #if __ARM_USER_PROTECT__
414 uintptr_t up = arm_user_protect_begin(thread);
415 #endif // __ARM_USER_PROTECT__
416 enable_fiq();
417 #endif // __arm__
418 ast_taken_kernel(); // Handle urgent AST
419 #if __arm__
420 #if __ARM_USER_PROTECT__
421 arm_user_protect_end(thread, up, TRUE);
422 #endif // __ARM_USER_PROTECT__
423 enable_interrupts();
424 return; // Return early on arm only due to FIQ enabling
425 #endif // __arm__
426 }
427 restore_interrupts(state); // Enable interrupts
428 }
429 }
430
431 /*
432 * To help _enable_preemption() inline everywhere with LTO,
433 * we keep these nice non inlineable functions as the panic()
434 * codegen setup is quite large and for weird reasons causes a frame.
435 */
436 __abortlike
437 static void
438 _enable_preemption_underflow(void)
439 {
440 panic("Preemption count underflow");
441 }
442
443 void
444 _enable_preemption(void)
445 {
446 thread_t thread = current_thread();
447 unsigned int count = thread->machine.preemption_count;
448
449 if (__improbable(count == 0)) {
450 _enable_preemption_underflow();
451 }
452 count -= 1;
453
454 os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);
455 if (count == 0) {
456 kernel_preempt_check(thread);
457 }
458
459 os_compiler_barrier();
460 }
461
462 int
463 get_preemption_level(void)
464 {
465 return current_thread()->machine.preemption_count;
466 }
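/*
 * Illustrative sketch (assumed caller pattern, not a routine in this
 * file): _disable_preemption()/_enable_preemption() must be balanced,
 * and the urgent-AST check in kernel_preempt_check() only runs once the
 * count returns to zero.
 *
 *	_disable_preemption();
 *	assert(get_preemption_level() > 0);
 *	// ... touch per-CPU state that must not migrate ...
 *	_enable_preemption();		// may take a pending urgent AST here
 */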
467
468 /*
469 * Routine: lck_spin_alloc_init
470 */
471 lck_spin_t *
472 lck_spin_alloc_init(
473 lck_grp_t * grp,
474 lck_attr_t * attr)
475 {
476 lck_spin_t *lck;
477
478 lck = zalloc(ZV_LCK_SPIN);
479 lck_spin_init(lck, grp, attr);
480 return lck;
481 }
482
483 /*
484 * Routine: lck_spin_free
485 */
486 void
487 lck_spin_free(
488 lck_spin_t * lck,
489 lck_grp_t * grp)
490 {
491 lck_spin_destroy(lck, grp);
492 zfree(ZV_LCK_SPIN, lck);
493 }
494
495 /*
496 * Routine: lck_spin_init
497 */
498 void
499 lck_spin_init(
500 lck_spin_t * lck,
501 lck_grp_t * grp,
502 __unused lck_attr_t * attr)
503 {
504 lck->type = LCK_SPIN_TYPE;
505 hw_lock_init(&lck->hwlock);
506 if (grp) {
507 lck_grp_reference(grp);
508 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
509 }
510 }
511
512 /*
513 * arm_usimple_lock is a lck_spin_t without a group or attributes
514 */
515 MARK_AS_HIBERNATE_TEXT void inline
516 arm_usimple_lock_init(simple_lock_t lck, __unused unsigned short initial_value)
517 {
518 lck->type = LCK_SPIN_TYPE;
519 hw_lock_init(&lck->hwlock);
520 }
521
522
523 /*
524 * Routine: lck_spin_lock
525 */
526 void
527 lck_spin_lock(lck_spin_t *lock)
528 {
529 #if DEVELOPMENT || DEBUG
530 if (lock->type != LCK_SPIN_TYPE) {
531 panic("Invalid spinlock %p", lock);
532 }
533 #endif // DEVELOPMENT || DEBUG
534 hw_lock_lock(&lock->hwlock, LCK_GRP_NULL);
535 }
536
537 void
538 lck_spin_lock_grp(lck_spin_t *lock, lck_grp_t *grp)
539 {
540 #pragma unused(grp)
541 #if DEVELOPMENT || DEBUG
542 if (lock->type != LCK_SPIN_TYPE) {
543 panic("Invalid spinlock %p", lock);
544 }
545 #endif // DEVELOPMENT || DEBUG
546 hw_lock_lock(&lock->hwlock, grp);
547 }
548
549 /*
550 * Routine: lck_spin_lock_nopreempt
551 */
552 void
553 lck_spin_lock_nopreempt(lck_spin_t *lock)
554 {
555 #if DEVELOPMENT || DEBUG
556 if (lock->type != LCK_SPIN_TYPE) {
557 panic("Invalid spinlock %p", lock);
558 }
559 #endif // DEVELOPMENT || DEBUG
560 hw_lock_lock_nopreempt(&lock->hwlock, LCK_GRP_NULL);
561 }
562
563 void
564 lck_spin_lock_nopreempt_grp(lck_spin_t *lock, lck_grp_t *grp)
565 {
566 #pragma unused(grp)
567 #if DEVELOPMENT || DEBUG
568 if (lock->type != LCK_SPIN_TYPE) {
569 panic("Invalid spinlock %p", lock);
570 }
571 #endif // DEVELOPMENT || DEBUG
572 hw_lock_lock_nopreempt(&lock->hwlock, grp);
573 }
574
575 /*
576 * Routine: lck_spin_try_lock
577 */
578 int
579 lck_spin_try_lock(lck_spin_t *lock)
580 {
581 return hw_lock_try(&lock->hwlock, LCK_GRP_NULL);
582 }
583
584 int
585 lck_spin_try_lock_grp(lck_spin_t *lock, lck_grp_t *grp)
586 {
587 #pragma unused(grp)
588 return hw_lock_try(&lock->hwlock, grp);
589 }
590
591 /*
592 * Routine: lck_spin_try_lock_nopreempt
593 */
594 int
595 lck_spin_try_lock_nopreempt(lck_spin_t *lock)
596 {
597 return hw_lock_try_nopreempt(&lock->hwlock, LCK_GRP_NULL);
598 }
599
600 int
601 lck_spin_try_lock_nopreempt_grp(lck_spin_t *lock, lck_grp_t *grp)
602 {
603 #pragma unused(grp)
604 return hw_lock_try_nopreempt(&lock->hwlock, grp);
605 }
606
607 /*
608 * Routine: lck_spin_unlock
609 */
610 void
611 lck_spin_unlock(lck_spin_t *lock)
612 {
613 #if DEVELOPMENT || DEBUG
614 if ((LCK_MTX_STATE_TO_THREAD(lock->lck_spin_data) != current_thread()) && LOCK_CORRECTNESS_PANIC()) {
615 panic("Spinlock not owned by thread %p = %lx", lock, lock->lck_spin_data);
616 }
617 if (lock->type != LCK_SPIN_TYPE) {
618 panic("Invalid spinlock type %p", lock);
619 }
620 #endif // DEVELOPMENT || DEBUG
621 hw_lock_unlock(&lock->hwlock);
622 }
623
624 /*
625 * Routine: lck_spin_unlock_nopreempt
626 */
627 void
628 lck_spin_unlock_nopreempt(lck_spin_t *lock)
629 {
630 #if DEVELOPMENT || DEBUG
631 if ((LCK_MTX_STATE_TO_THREAD(lock->lck_spin_data) != current_thread()) && LOCK_CORRECTNESS_PANIC()) {
632 panic("Spinlock not owned by thread %p = %lx", lock, lock->lck_spin_data);
633 }
634 if (lock->type != LCK_SPIN_TYPE) {
635 panic("Invalid spinlock type %p", lock);
636 }
637 #endif // DEVELOPMENT || DEBUG
638 hw_lock_unlock_nopreempt(&lock->hwlock);
639 }
640
641 /*
642 * Routine: lck_spin_destroy
643 */
644 void
645 lck_spin_destroy(
646 lck_spin_t * lck,
647 lck_grp_t * grp)
648 {
649 if (lck->lck_spin_data == LCK_SPIN_TAG_DESTROYED) {
650 return;
651 }
652 lck->lck_spin_data = LCK_SPIN_TAG_DESTROYED;
653 if (grp) {
654 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
655 lck_grp_deallocate(grp);
656 }
657 }
658
659 /*
660 * Routine: kdp_lck_spin_is_acquired
661 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
662 */
663 boolean_t
664 kdp_lck_spin_is_acquired(lck_spin_t *lck)
665 {
666 if (not_in_kdp) {
667 panic("spinlock acquired check done outside of kernel debugger");
668 }
669 return ((lck->lck_spin_data & ~LCK_SPIN_TAG_DESTROYED) != 0) ? TRUE:FALSE;
670 }
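/*
 * Illustrative spin lock lifecycle sketch (assumes a caller-provided
 * lck_grp_t named my_grp; not part of this file):
 *
 *	lck_spin_t *sl = lck_spin_alloc_init(my_grp, LCK_ATTR_NULL);
 *
 *	lck_spin_lock(sl);		// returns with preemption disabled
 *	// ... short, non-blocking critical section ...
 *	lck_spin_unlock(sl);
 *
 *	lck_spin_free(sl, my_grp);	// destroys and frees back to ZV_LCK_SPIN
 */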
671
672 /*
673 * Initialize a usimple_lock.
674 *
675 * No change in preemption state.
676 */
677 void
678 usimple_lock_init(
679 usimple_lock_t l,
680 unsigned short tag)
681 {
682 simple_lock_init((simple_lock_t) l, tag);
683 }
684
685
686 /*
687 * Acquire a usimple_lock.
688 *
689 * Returns with preemption disabled. Note
690 * that the hw_lock routines are responsible for
691 * maintaining preemption state.
692 */
693 void
694 (usimple_lock)(
695 usimple_lock_t l
696 LCK_GRP_ARG(lck_grp_t *grp))
697 {
698 simple_lock((simple_lock_t) l, LCK_GRP_PROBEARG(grp));
699 }
700
701
702 extern void sync(void);
703
704 /*
705 * Release a usimple_lock.
706 *
707 * Returns with preemption enabled. Note
708 * that the hw_lock routines are responsible for
709 * maintaining preemption state.
710 */
711 void
712 (usimple_unlock)(
713 usimple_lock_t l)
714 {
715 simple_unlock((simple_lock_t)l);
716 }
717
718
719 /*
720 * Conditionally acquire a usimple_lock.
721 *
722 * On success, returns with preemption disabled.
723 * On failure, returns with preemption in the same state
724 * as when first invoked. Note that the hw_lock routines
725 * are responsible for maintaining preemption state.
726 *
727 * XXX No stats are gathered on a miss; I preserved this
728 * behavior from the original assembly-language code, but
729 * doesn't it make sense to log misses? XXX
730 */
731 unsigned
732 int
733 (usimple_lock_try)(
734 usimple_lock_t l
735 LCK_GRP_ARG(lck_grp_t *grp))
736 {
737 return simple_lock_try((simple_lock_t) l, grp);
738 }
739
740 /*
741 * The C portion of the shared/exclusive locks package.
742 */
743
744 /*
745 * compute the deadline to spin against when
746 * waiting for a change of state on a lck_rw_t
747 */
748 static inline uint64_t
749 lck_rw_deadline_for_spin(lck_rw_t *lck)
750 {
751 lck_rw_word_t word;
752
753 word.data = ordered_load_rw(lck);
754 if (word.can_sleep) {
755 if (word.r_waiting || word.w_waiting || (word.shared_count > machine_info.max_cpus)) {
756 /*
757 * there are already threads waiting on this lock... this
758 * implies that they have spun beyond their deadlines waiting for
759 * the desired state to show up so we will not bother spinning at this time...
760 * or
761 * the current number of threads sharing this lock exceeds our capacity to run them
762 * concurrently and since all states we're going to spin for require the rw_shared_count
763 * to be at 0, we'll not bother spinning since the latency for this to happen is
764 * unpredictable...
765 */
766 return mach_absolute_time();
767 }
768 return mach_absolute_time() + MutexSpin;
769 } else {
770 return mach_absolute_time() + (100000LL * 1000000000LL);
771 }
772 }
773
774 static boolean_t
775 lck_rw_drain_status(lck_rw_t *lock, uint32_t status_mask, boolean_t wait __unused)
776 {
777 uint64_t deadline = 0;
778 uint32_t data;
779
780 if (wait) {
781 deadline = lck_rw_deadline_for_spin(lock);
782 }
783
784 for (;;) {
785 data = load_exclusive32(&lock->lck_rw_data, memory_order_acquire_smp);
786 if ((data & status_mask) == 0) {
787 break;
788 }
789 if (wait) {
790 wait_for_event();
791 } else {
792 os_atomic_clear_exclusive();
793 }
794 if (!wait || (mach_absolute_time() >= deadline)) {
795 return FALSE;
796 }
797 }
798 os_atomic_clear_exclusive();
799 return TRUE;
800 }
801
802 /*
803 * Spin while interlock is held.
804 */
805 static inline void
806 lck_rw_interlock_spin(lck_rw_t *lock)
807 {
808 uint32_t data;
809
810 for (;;) {
811 data = load_exclusive32(&lock->lck_rw_data, memory_order_relaxed);
812 if (data & LCK_RW_INTERLOCK) {
813 wait_for_event();
814 } else {
815 os_atomic_clear_exclusive();
816 return;
817 }
818 }
819 }
820
821 /*
822 * We disable interrupts while holding the RW interlock to prevent an
823 * interrupt from exacerbating hold time.
824 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
825 */
826 static inline boolean_t
827 lck_interlock_lock(lck_rw_t *lck)
828 {
829 boolean_t istate;
830
831 istate = ml_set_interrupts_enabled(FALSE);
832 lck_rw_ilk_lock(lck);
833 return istate;
834 }
835
836 static inline void
837 lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
838 {
839 lck_rw_ilk_unlock(lck);
840 ml_set_interrupts_enabled(istate);
841 }
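/*
 * Illustrative sketch of the interlock pattern used by the slow paths
 * below (not a routine itself):
 *
 *	boolean_t istate = lck_interlock_lock(lock);	// IRQs off, interlock held
 *	// ... examine/update the lck_rw_t word, queue a wait if needed ...
 *	lck_interlock_unlock(lock, istate);		// drop interlock, restore IRQs
 */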
842
843
844 #define LCK_RW_GRAB_WANT 0
845 #define LCK_RW_GRAB_SHARED 1
846
847 static boolean_t
848 lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait)
849 {
850 uint64_t deadline = 0;
851 uint32_t data, prev;
852 boolean_t do_exch;
853
854 if (wait) {
855 deadline = lck_rw_deadline_for_spin(lock);
856 }
857
858 for (;;) {
859 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
860 if (data & LCK_RW_INTERLOCK) {
861 atomic_exchange_abort();
862 lck_rw_interlock_spin(lock);
863 continue;
864 }
865 do_exch = FALSE;
866 if (mode == LCK_RW_GRAB_WANT) {
867 if ((data & LCK_RW_WANT_EXCL) == 0) {
868 data |= LCK_RW_WANT_EXCL;
869 do_exch = TRUE;
870 }
871 } else { // LCK_RW_GRAB_SHARED
872 if (((data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) == 0) ||
873 (((data & LCK_RW_SHARED_MASK)) && ((data & LCK_RW_PRIV_EXCL) == 0))) {
874 data += LCK_RW_SHARED_READER;
875 do_exch = TRUE;
876 }
877 }
878 if (do_exch) {
879 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
880 return TRUE;
881 }
882 } else {
883 if (wait) { // Wait with monitor held
884 wait_for_event();
885 } else {
886 atomic_exchange_abort();
887 }
888 if (!wait || (mach_absolute_time() >= deadline)) {
889 return FALSE;
890 }
891 }
892 }
893 }
894
895
896 /*
897 * Routine: lck_rw_alloc_init
898 */
899 lck_rw_t *
900 lck_rw_alloc_init(
901 lck_grp_t *grp,
902 lck_attr_t *attr)
903 {
904 lck_rw_t *lck;
905
906 lck = zalloc_flags(ZV_LCK_RW, Z_WAITOK | Z_ZERO);
907 lck_rw_init(lck, grp, attr);
908 return lck;
909 }
910
911 /*
912 * Routine: lck_rw_free
913 */
914 void
915 lck_rw_free(
916 lck_rw_t *lck,
917 lck_grp_t *grp)
918 {
919 lck_rw_destroy(lck, grp);
920 zfree(ZV_LCK_RW, lck);
921 }
922
923 /*
924 * Routine: lck_rw_init
925 */
926 void
927 lck_rw_init(
928 lck_rw_t *lck,
929 lck_grp_t *grp,
930 lck_attr_t *attr)
931 {
932 if (attr == LCK_ATTR_NULL) {
933 attr = &LockDefaultLckAttr;
934 }
935 memset(lck, 0, sizeof(lck_rw_t));
936 lck->lck_rw_can_sleep = TRUE;
937 if ((attr->lck_attr_val & LCK_ATTR_RW_SHARED_PRIORITY) == 0) {
938 lck->lck_rw_priv_excl = TRUE;
939 }
940
941 lck_grp_reference(grp);
942 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
943 }
944
945
946 /*
947 * Routine: lck_rw_destroy
948 */
949 void
950 lck_rw_destroy(
951 lck_rw_t *lck,
952 lck_grp_t *grp)
953 {
954 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
955 return;
956 }
957 #if MACH_LDEBUG
958 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
959 #endif
960 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
961 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
962 lck_grp_deallocate(grp);
963 return;
964 }
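/*
 * Illustrative RW lock lifecycle sketch (assumes a caller-provided
 * lck_grp_t named my_grp; not part of this file):
 *
 *	lck_rw_t *rw = lck_rw_alloc_init(my_grp, LCK_ATTR_NULL);
 *
 *	lck_rw_lock_shared(rw);			// many readers may hold this
 *	// ... read-only access ...
 *	lck_rw_unlock_shared(rw);
 *
 *	lck_rw_lock_exclusive(rw);		// single writer
 *	// ... mutation ...
 *	lck_rw_unlock_exclusive(rw);
 *
 *	lck_rw_free(rw, my_grp);		// destroys and frees back to ZV_LCK_RW
 */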
965
966 /*
967 * Routine: lck_rw_lock
968 */
969 void
970 lck_rw_lock(
971 lck_rw_t *lck,
972 lck_rw_type_t lck_rw_type)
973 {
974 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
975 lck_rw_lock_shared(lck);
976 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
977 lck_rw_lock_exclusive(lck);
978 } else {
979 panic("lck_rw_lock(): Invalid RW lock type: %x", lck_rw_type);
980 }
981 }
982
983 #define LCK_RW_LOCK_EXCLUSIVE_TAS(lck) (atomic_test_and_set32(&(lck)->lck_rw_data, \
984 (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), \
985 LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE))
986
987 /*
988 * Routine: lck_rw_lock_exclusive_check_contended
989 */
990 bool
991 lck_rw_lock_exclusive_check_contended(lck_rw_t *lock)
992 {
993 thread_t thread = current_thread();
994 bool contended = false;
995
996 if (lock->lck_rw_can_sleep) {
997 thread->rwlock_count++;
998 } else if (get_preemption_level() == 0) {
999 panic("Taking non-sleepable RW lock with preemption enabled");
1000 }
1001 if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1002 #if CONFIG_DTRACE
1003 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1004 #endif /* CONFIG_DTRACE */
1005 } else {
1006 contended = true;
1007 lck_rw_lock_exclusive_gen(lock);
1008 }
1009 #if MACH_ASSERT
1010 thread_t owner = ordered_load_rw_owner(lock);
1011 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1012 #endif
1013 ordered_store_rw_owner(lock, thread);
1014 return contended;
1015 }
1016
1017 /*
1018 * Routine: lck_rw_lock_exclusive
1019 */
1020 void
1021 lck_rw_lock_exclusive(lck_rw_t *lock)
1022 {
1023 thread_t thread = current_thread();
1024
1025 if (lock->lck_rw_can_sleep) {
1026 thread->rwlock_count++;
1027 } else if (get_preemption_level() == 0) {
1028 panic("Taking non-sleepable RW lock with preemption enabled");
1029 }
1030 if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1031 #if CONFIG_DTRACE
1032 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1033 #endif /* CONFIG_DTRACE */
1034 } else {
1035 lck_rw_lock_exclusive_gen(lock);
1036 }
1037 #if MACH_ASSERT
1038 thread_t owner = ordered_load_rw_owner(lock);
1039 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1040 #endif
1041 ordered_store_rw_owner(lock, thread);
1042 }
1043
1044 /*
1045 * Routine: lck_rw_lock_shared
1046 */
1047 void
1048 lck_rw_lock_shared(lck_rw_t *lock)
1049 {
1050 uint32_t data, prev;
1051
1052 if (lock->lck_rw_can_sleep) {
1053 current_thread()->rwlock_count++;
1054 } else if (get_preemption_level() == 0) {
1055 panic("Taking non-sleepable RW lock with preemption enabled");
1056 }
1057 for (;;) {
1058 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1059 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1060 atomic_exchange_abort();
1061 lck_rw_lock_shared_gen(lock);
1062 break;
1063 }
1064 data += LCK_RW_SHARED_READER;
1065 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1066 break;
1067 }
1068 cpu_pause();
1069 }
1070 #if MACH_ASSERT
1071 thread_t owner = ordered_load_rw_owner(lock);
1072 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1073 #endif
1074 #if CONFIG_DTRACE
1075 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1076 #endif /* CONFIG_DTRACE */
1077 return;
1078 }
1079
1080 /*
1081 * Routine: lck_rw_lock_shared_to_exclusive
1082 *
1083 * FALSE is returned upon failure; in that case the shared lock has been dropped.
1084 */
1085 boolean_t
1086 lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
1087 {
1088 uint32_t data, prev;
1089
1090 for (;;) {
1091 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1092 if (data & LCK_RW_INTERLOCK) {
1093 atomic_exchange_abort();
1094 lck_rw_interlock_spin(lock);
1095 continue;
1096 }
1097 if (data & LCK_RW_WANT_UPGRADE) {
1098 data -= LCK_RW_SHARED_READER;
1099 if ((data & LCK_RW_SHARED_MASK) == 0) { /* we were the last reader */
1100 data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */
1101 }
1102 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1103 return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1104 }
1105 } else {
1106 data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */
1107 data -= LCK_RW_SHARED_READER; /* and shed our read count */
1108 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1109 break;
1110 }
1111 }
1112 cpu_pause();
1113 }
1114 /* we now own the WANT_UPGRADE */
1115 if (data & LCK_RW_SHARED_MASK) { /* check to see if all of the readers are drained */
1116 lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */
1117 }
1118 #if MACH_ASSERT
1119 thread_t owner = ordered_load_rw_owner(lock);
1120 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1121 #endif
1122 ordered_store_rw_owner(lock, current_thread());
1123 #if CONFIG_DTRACE
1124 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1125 #endif /* CONFIG_DTRACE */
1126 return TRUE;
1127 }
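/*
 * Illustrative upgrade sketch (assumed caller pattern, not a routine in
 * this file): on failure the shared hold has already been dropped, so
 * the caller must re-acquire the lock and typically re-validate state.
 *
 *	lck_rw_lock_shared(rw);
 *	if (!lck_rw_lock_shared_to_exclusive(rw)) {
 *		// shared hold was dropped for us; start over
 *		lck_rw_lock_exclusive(rw);
 *		// ... re-validate whatever was observed under the shared hold ...
 *	}
 *	// ... exclusive work ...
 *	lck_rw_unlock_exclusive(rw);
 */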
1128
1129
1130 /*
1131 * Routine: lck_rw_lock_shared_to_exclusive_failure
1132 * Function:
1133 * Fast path code has already dropped our read
1134 * count and determined that someone else owns 'lck_rw_want_upgrade'
1135 * if 'lck_rw_shared_count' == 0, it has also already dropped 'lck_w_waiting';
1136 * all we need to do here is determine if a wakeup is needed
1137 */
1138 static boolean_t
1139 lck_rw_lock_shared_to_exclusive_failure(
1140 lck_rw_t *lck,
1141 uint32_t prior_lock_state)
1142 {
1143 thread_t thread = current_thread();
1144 uint32_t rwlock_count;
1145
1146 /* Check if dropping the lock means that we need to unpromote */
1147 if (lck->lck_rw_can_sleep) {
1148 rwlock_count = thread->rwlock_count--;
1149 } else {
1150 rwlock_count = UINT32_MAX;
1151 }
1152 #if MACH_LDEBUG
1153 if (rwlock_count == 0) {
1154 panic("rw lock count underflow for thread %p", thread);
1155 }
1156 #endif
1157 if ((prior_lock_state & LCK_RW_W_WAITING) &&
1158 ((prior_lock_state & LCK_RW_SHARED_MASK) == LCK_RW_SHARED_READER)) {
1159 /*
1160 * Someone else has requested upgrade.
1161 * Since we've released the read lock, wake
1162 * him up if he's blocked waiting
1163 */
1164 thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1165 }
1166
1167 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1168 /* sched_flags checked without lock, but will be rechecked while clearing */
1169 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1170 }
1171
1172 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1173 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1174
1175 return FALSE;
1176 }
1177
1178 /*
1179 * Routine: lck_rw_lock_shared_to_exclusive_success
1180 * Function:
1181 * fast path code has already dropped our read
1182 * count and successfully acquired 'lck_rw_want_upgrade'
1183 * we just need to wait for the rest of the readers to drain
1184 * and then we can return as the exclusive holder of this lock
1185 */
1186 static boolean_t
1187 lck_rw_lock_shared_to_exclusive_success(
1188 lck_rw_t *lock)
1189 {
1190 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
1191 int slept = 0;
1192 lck_rw_word_t word;
1193 wait_result_t res;
1194 boolean_t istate;
1195 boolean_t not_shared;
1196
1197 #if CONFIG_DTRACE
1198 uint64_t wait_interval = 0;
1199 int readers_at_sleep = 0;
1200 boolean_t dtrace_ls_initialized = FALSE;
1201 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1202 #endif
1203
1204 while (!lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, FALSE)) {
1205 word.data = ordered_load_rw(lock);
1206 #if CONFIG_DTRACE
1207 if (dtrace_ls_initialized == FALSE) {
1208 dtrace_ls_initialized = TRUE;
1209 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1210 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1211 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1212 if (dtrace_ls_enabled) {
1213 /*
1214 * Either sleeping or spinning is happening,
1215 * start a timing of our delay interval now.
1216 */
1217 readers_at_sleep = word.shared_count;
1218 wait_interval = mach_absolute_time();
1219 }
1220 }
1221 #endif
1222
1223 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1224 trace_lck, word.shared_count, 0, 0, 0);
1225
1226 not_shared = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, TRUE);
1227
1228 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1229 trace_lck, lock->lck_rw_shared_count, 0, 0, 0);
1230
1231 if (not_shared) {
1232 break;
1233 }
1234
1235 /*
1236 * if we get here, the spin deadline in lck_rw_drain_status()
1237 * has expired w/o the rw_shared_count having drained to 0
1238 * check to see if we're allowed to do a thread_block
1239 */
1240 if (word.can_sleep) {
1241 istate = lck_interlock_lock(lock);
1242
1243 word.data = ordered_load_rw(lock);
1244 if (word.shared_count != 0) {
1245 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1246 trace_lck, word.shared_count, 0, 0, 0);
1247
1248 word.w_waiting = 1;
1249 ordered_store_rw(lock, word.data);
1250
1251 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1252 res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1253 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1254 lck_interlock_unlock(lock, istate);
1255
1256 if (res == THREAD_WAITING) {
1257 res = thread_block(THREAD_CONTINUE_NULL);
1258 slept++;
1259 }
1260 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1261 trace_lck, res, slept, 0, 0);
1262 } else {
1263 lck_interlock_unlock(lock, istate);
1264 break;
1265 }
1266 }
1267 }
1268 #if CONFIG_DTRACE
1269 /*
1270 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1271 */
1272 if (dtrace_ls_enabled == TRUE) {
1273 if (slept == 0) {
1274 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lock, mach_absolute_time() - wait_interval, 0);
1275 } else {
1276 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lock,
1277 mach_absolute_time() - wait_interval, 1,
1278 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1279 }
1280 }
1281 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 1);
1282 #endif
1283 return TRUE;
1284 }
1285
1286
1287 /*
1288 * Routine: lck_rw_lock_exclusive_to_shared
1289 */
1290
1291 void
1292 lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
1293 {
1294 uint32_t data, prev;
1295
1296 assertf(lock->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1297 ordered_store_rw_owner(lock, THREAD_NULL);
1298 for (;;) {
1299 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
1300 if (data & LCK_RW_INTERLOCK) {
1301 atomic_exchange_abort();
1302 lck_rw_interlock_spin(lock); /* wait for interlock to clear */
1303 continue;
1304 }
1305 data += LCK_RW_SHARED_READER;
1306 if (data & LCK_RW_WANT_UPGRADE) {
1307 data &= ~(LCK_RW_WANT_UPGRADE);
1308 } else {
1309 data &= ~(LCK_RW_WANT_EXCL);
1310 }
1311 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
1312 data &= ~(LCK_RW_W_WAITING);
1313 }
1314 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
1315 break;
1316 }
1317 cpu_pause();
1318 }
1319 return lck_rw_lock_exclusive_to_shared_gen(lock, prev);
1320 }
1321
1322 /*
1323 * Routine: lck_rw_lock_exclusive_to_shared_gen
1324 * Function:
1325 * Fast path has already dropped
1326 * our exclusive state and bumped lck_rw_shared_count
1327 * all we need to do here is determine if anyone
1328 * needs to be awakened.
1329 */
1330 static void
1331 lck_rw_lock_exclusive_to_shared_gen(
1332 lck_rw_t *lck,
1333 uint32_t prior_lock_state)
1334 {
1335 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1336 lck_rw_word_t fake_lck;
1337
1338 /*
1339 * prior_lock_state is a snapshot of the 1st word of the
1340 * lock in question... we fake up a lck_rw_word_t from it
1341 * and carefully do not access anything beyond what's defined
1342 * in the first word of a lck_rw_t
1343 */
1344 fake_lck.data = prior_lock_state;
1345
1346 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1347 trace_lck, fake_lck.want_excl, fake_lck.want_upgrade, 0, 0);
1348
1349 /*
1350 * don't wake up anyone waiting to take the lock exclusively
1351 * since we hold a read count... when the read count drops to 0,
1352 * the writers will be woken.
1353 *
1354 * wake up any waiting readers if we don't have any writers waiting,
1355 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1356 */
1357 if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1358 thread_wakeup(LCK_RW_READER_EVENT(lck));
1359 }
1360
1361 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1362 trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1363
1364 #if CONFIG_DTRACE
1365 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1366 #endif
1367 }
1368
1369
1370 /*
1371 * Routine: lck_rw_try_lock
1372 */
1373 boolean_t
1374 lck_rw_try_lock(
1375 lck_rw_t *lck,
1376 lck_rw_type_t lck_rw_type)
1377 {
1378 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1379 return lck_rw_try_lock_shared(lck);
1380 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1381 return lck_rw_try_lock_exclusive(lck);
1382 } else {
1383 panic("lck_rw_try_lock(): Invalid rw lock type: %x", lck_rw_type);
1384 }
1385 return FALSE;
1386 }
1387
1388 /*
1389 * Routine: lck_rw_try_lock_shared
1390 */
1391
1392 boolean_t
1393 lck_rw_try_lock_shared(lck_rw_t *lock)
1394 {
1395 uint32_t data, prev;
1396
1397 for (;;) {
1398 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1399 if (data & LCK_RW_INTERLOCK) {
1400 atomic_exchange_abort();
1401 lck_rw_interlock_spin(lock);
1402 continue;
1403 }
1404 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1405 atomic_exchange_abort();
1406 return FALSE; /* lock is busy */
1407 }
1408 data += LCK_RW_SHARED_READER; /* Increment reader refcount */
1409 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1410 break;
1411 }
1412 cpu_pause();
1413 }
1414 #if MACH_ASSERT
1415 thread_t owner = ordered_load_rw_owner(lock);
1416 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1417 #endif
1418
1419 if (lock->lck_rw_can_sleep) {
1420 current_thread()->rwlock_count++;
1421 } else if (get_preemption_level() == 0) {
1422 panic("Taking non-sleepable RW lock with preemption enabled");
1423 }
1424
1425 #if CONFIG_DTRACE
1426 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1427 #endif /* CONFIG_DTRACE */
1428 return TRUE;
1429 }
1430
1431
1432 /*
1433 * Routine: lck_rw_try_lock_exclusive
1434 */
1435
1436 boolean_t
1437 lck_rw_try_lock_exclusive(lck_rw_t *lock)
1438 {
1439 uint32_t data, prev;
1440 thread_t thread;
1441
1442 for (;;) {
1443 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1444 if (data & LCK_RW_INTERLOCK) {
1445 atomic_exchange_abort();
1446 lck_rw_interlock_spin(lock);
1447 continue;
1448 }
1449 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1450 atomic_exchange_abort();
1451 return FALSE;
1452 }
1453 data |= LCK_RW_WANT_EXCL;
1454 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1455 break;
1456 }
1457 cpu_pause();
1458 }
1459 thread = current_thread();
1460 if (lock->lck_rw_can_sleep) {
1461 thread->rwlock_count++;
1462 } else if (get_preemption_level() == 0) {
1463 panic("Taking non-sleepable RW lock with preemption enabled");
1464 }
1465 #if MACH_ASSERT
1466 thread_t owner = ordered_load_rw_owner(lock);
1467 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1468 #endif
1469 ordered_store_rw_owner(lock, thread);
1470 #if CONFIG_DTRACE
1471 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1472 #endif /* CONFIG_DTRACE */
1473 return TRUE;
1474 }
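/*
 * Illustrative try-lock sketch (assumed caller pattern, not a routine in
 * this file): try-locks fail immediately instead of spinning or
 * blocking, so the caller needs a fallback path.
 *
 *	if (lck_rw_try_lock_exclusive(rw)) {
 *		// ... exclusive work ...
 *		lck_rw_unlock_exclusive(rw);
 *	} else {
 *		// lock was busy; defer, retry later, or take the blocking path
 *	}
 */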
1475
1476
1477 /*
1478 * Routine: lck_rw_unlock
1479 */
1480 void
1481 lck_rw_unlock(
1482 lck_rw_t *lck,
1483 lck_rw_type_t lck_rw_type)
1484 {
1485 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1486 lck_rw_unlock_shared(lck);
1487 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1488 lck_rw_unlock_exclusive(lck);
1489 } else {
1490 panic("lck_rw_unlock(): Invalid RW lock type: %d", lck_rw_type);
1491 }
1492 }
1493
1494
1495 /*
1496 * Routine: lck_rw_unlock_shared
1497 */
1498 void
1499 lck_rw_unlock_shared(
1500 lck_rw_t *lck)
1501 {
1502 lck_rw_type_t ret;
1503
1504 assertf(lck->lck_rw_owner == THREAD_NULL, "state=0x%x, owner=%p", lck->lck_rw_data, lck->lck_rw_owner);
1505 assertf(lck->lck_rw_shared_count > 0, "shared_count=0x%x", lck->lck_rw_shared_count);
1506 ret = lck_rw_done(lck);
1507
1508 if (ret != LCK_RW_TYPE_SHARED) {
1509 panic("lck_rw_unlock_shared(): lock %p held in mode: %d", lck, ret);
1510 }
1511 }
1512
1513
1514 /*
1515 * Routine: lck_rw_unlock_exclusive
1516 */
1517 void
1518 lck_rw_unlock_exclusive(
1519 lck_rw_t *lck)
1520 {
1521 lck_rw_type_t ret;
1522
1523 assertf(lck->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lck->lck_rw_data, lck->lck_rw_owner);
1524 ret = lck_rw_done(lck);
1525
1526 if (ret != LCK_RW_TYPE_EXCLUSIVE) {
1527 panic("lck_rw_unlock_exclusive(): lock %p held in mode: %d", lck, ret);
1528 }
1529 }
1530
1531
1532 /*
1533 * Routine: lck_rw_lock_exclusive_gen
1534 */
1535 static void
1536 lck_rw_lock_exclusive_gen(
1537 lck_rw_t *lock)
1538 {
1539 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
1540 lck_rw_word_t word;
1541 int slept = 0;
1542 boolean_t gotlock = 0;
1543 boolean_t not_shared_or_upgrade = 0;
1544 wait_result_t res = 0;
1545 boolean_t istate;
1546
1547 #if CONFIG_DTRACE
1548 boolean_t dtrace_ls_initialized = FALSE;
1549 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
1550 uint64_t wait_interval = 0;
1551 int readers_at_sleep = 0;
1552 #endif
1553
1554 /*
1555 * Try to acquire the lck_rw_want_excl bit.
1556 */
1557 while (!lck_rw_grab(lock, LCK_RW_GRAB_WANT, FALSE)) {
1558 #if CONFIG_DTRACE
1559 if (dtrace_ls_initialized == FALSE) {
1560 dtrace_ls_initialized = TRUE;
1561 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1562 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1563 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1564 if (dtrace_ls_enabled) {
1565 /*
1566 * Either sleeping or spinning is happening,
1567 * start a timing of our delay interval now.
1568 */
1569 readers_at_sleep = lock->lck_rw_shared_count;
1570 wait_interval = mach_absolute_time();
1571 }
1572 }
1573 #endif
1574
1575 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1576
1577 gotlock = lck_rw_grab(lock, LCK_RW_GRAB_WANT, TRUE);
1578
1579 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
1580
1581 if (gotlock) {
1582 break;
1583 }
1584 /*
1585 * if we get here, the deadline has expired w/o us
1586 * being able to grab the lock exclusively
1587 * check to see if we're allowed to do a thread_block
1588 */
1589 word.data = ordered_load_rw(lock);
1590 if (word.can_sleep) {
1591 istate = lck_interlock_lock(lock);
1592 word.data = ordered_load_rw(lock);
1593
1594 if (word.want_excl) {
1595 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1596
1597 word.w_waiting = 1;
1598 ordered_store_rw(lock, word.data);
1599
1600 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1601 res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1602 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1603 lck_interlock_unlock(lock, istate);
1604
1605 if (res == THREAD_WAITING) {
1606 res = thread_block(THREAD_CONTINUE_NULL);
1607 slept++;
1608 }
1609 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1610 } else {
1611 word.want_excl = 1;
1612 ordered_store_rw(lock, word.data);
1613 lck_interlock_unlock(lock, istate);
1614 break;
1615 }
1616 }
1617 }
1618 /*
1619 * Wait for readers (and upgrades) to finish...
1620 */
1621 while (!lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, FALSE)) {
1622 #if CONFIG_DTRACE
1623 /*
1624 * Either sleeping or spinning is happening, start
1625 * a timing of our delay interval now. If we set it
1626 * to -1 we don't have accurate data so we cannot later
1627 * decide to record a dtrace spin or sleep event.
1628 */
1629 if (dtrace_ls_initialized == FALSE) {
1630 dtrace_ls_initialized = TRUE;
1631 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1632 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1633 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1634 if (dtrace_ls_enabled) {
1635 /*
1636 * Either sleeping or spinning is happening,
1637 * start a timing of our delay interval now.
1638 */
1639 readers_at_sleep = lock->lck_rw_shared_count;
1640 wait_interval = mach_absolute_time();
1641 }
1642 }
1643 #endif
1644
1645 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1646
1647 not_shared_or_upgrade = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, TRUE);
1648
1649 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, not_shared_or_upgrade, 0);
1650
1651 if (not_shared_or_upgrade) {
1652 break;
1653 }
1654 /*
1655 * if we get here, the deadline has expired w/o us
1656 * being able to grab the lock exclusively
1657 * check to see if we're allowed to do a thread_block
1658 */
1659 word.data = ordered_load_rw(lock);
1660 if (word.can_sleep) {
1661 istate = lck_interlock_lock(lock);
1662 word.data = ordered_load_rw(lock);
1663
1664 if (word.shared_count != 0 || word.want_upgrade) {
1665 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1666
1667 word.w_waiting = 1;
1668 ordered_store_rw(lock, word.data);
1669
1670 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1671 res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1672 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1673 lck_interlock_unlock(lock, istate);
1674
1675 if (res == THREAD_WAITING) {
1676 res = thread_block(THREAD_CONTINUE_NULL);
1677 slept++;
1678 }
1679 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1680 } else {
1681 lck_interlock_unlock(lock, istate);
1682 /*
1683 * must own the lock now, since we checked for
1684 * readers or upgrade owner behind the interlock
1685 * no need for a call to 'lck_rw_drain_status'
1686 */
1687 break;
1688 }
1689 }
1690 }
1691
1692 #if CONFIG_DTRACE
1693 /*
1694 * Decide what latencies we suffered that are Dtrace events.
1695 * If we have set wait_interval, then we either spun or slept.
1696 * At least we get out from under the interlock before we record
1697 * which is the best we can do here to minimize the impact
1698 * of the tracing.
1699 * If we have set wait_interval to -1, then dtrace was not enabled when we
1700 * started sleeping/spinning so we don't record this event.
1701 */
1702 if (dtrace_ls_enabled == TRUE) {
1703 if (slept == 0) {
1704 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lock,
1705 mach_absolute_time() - wait_interval, 1);
1706 } else {
1707 /*
1708 * For the blocking case, we also record if when we blocked
1709 * it was held for read or write, and how many readers.
1710 * Notice that above we recorded this before we dropped
1711 * the interlock so the count is accurate.
1712 */
1713 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lock,
1714 mach_absolute_time() - wait_interval, 1,
1715 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1716 }
1717 }
1718 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, 1);
1719 #endif /* CONFIG_DTRACE */
1720 }
1721
1722 /*
1723 * Routine: lck_rw_done
1724 */
1725
1726 lck_rw_type_t
1727 lck_rw_done(lck_rw_t *lock)
1728 {
1729 uint32_t data, prev;
1730 boolean_t once = FALSE;
1731
1732 for (;;) {
1733 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
1734 if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */
1735 atomic_exchange_abort();
1736 lck_rw_interlock_spin(lock);
1737 continue;
1738 }
1739 if (data & LCK_RW_SHARED_MASK) { /* lock is held shared */
1740 assertf(lock->lck_rw_owner == THREAD_NULL, "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1741 data -= LCK_RW_SHARED_READER;
1742 if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
1743 goto check_waiters;
1744 }
1745 } else { /* if reader count == 0, must be exclusive lock */
1746 if (data & LCK_RW_WANT_UPGRADE) {
1747 data &= ~(LCK_RW_WANT_UPGRADE);
1748 } else {
1749 if (data & LCK_RW_WANT_EXCL) {
1750 data &= ~(LCK_RW_WANT_EXCL);
1751 } else { /* lock is not 'owned', panic */
1752 panic("Releasing non-exclusive RW lock without a reader refcount!");
1753 }
1754 }
1755 if (!once) {
1756 // Only check for holder and clear it once
1757 assertf(lock->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1758 ordered_store_rw_owner(lock, THREAD_NULL);
1759 once = TRUE;
1760 }
1761 check_waiters:
1762 /*
1763 * test the original values to match what
1764 * lck_rw_done_gen is going to do to determine
1765 * which wakeups need to happen...
1766 *
1767 * if !(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting)
1768 */
1769 if (prev & LCK_RW_W_WAITING) {
1770 data &= ~(LCK_RW_W_WAITING);
1771 if ((prev & LCK_RW_PRIV_EXCL) == 0) {
1772 data &= ~(LCK_RW_R_WAITING);
1773 }
1774 } else {
1775 data &= ~(LCK_RW_R_WAITING);
1776 }
1777 }
1778 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
1779 break;
1780 }
1781 cpu_pause();
1782 }
1783 return lck_rw_done_gen(lock, prev);
1784 }
1785
1786 /*
1787 * Routine: lck_rw_done_gen
1788 *
1789 * called from lck_rw_done() above...
1790 * prior_lock_state is the value in the 1st
1791 * word of the lock at the time of a successful
1792 * atomic compare and exchange with the new value...
1793 * it represents the state of the lock before we
1794 * decremented the rw_shared_count or cleared either
1795 * rw_want_upgrade or rw_want_excl and
1796 * the lck_x_waiting bits... since the calling
1797 * routine has already changed the state atomically,
1798 * we just need to decide if we should
1799 * wake up anyone and what value to return... we do
1800 * this by examining the state of the lock before
1801 * we changed it
1802 */
1803 static lck_rw_type_t
1804 lck_rw_done_gen(
1805 lck_rw_t *lck,
1806 uint32_t prior_lock_state)
1807 {
1808 lck_rw_word_t fake_lck;
1809 lck_rw_type_t lock_type;
1810 thread_t thread;
1811 uint32_t rwlock_count;
1812
1813 /*
1814 * prior_lock_state is a snapshot of the 1st word of the
1815 * lock in question... we fake up a lck_rw_word_t from it
1816 * and carefully do not access anything beyond what's defined
1817 * in the first word of a lck_rw_t
1818 */
1819 fake_lck.data = prior_lock_state;
1820
1821 if (fake_lck.shared_count <= 1) {
1822 if (fake_lck.w_waiting) {
1823 thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1824 }
1825
1826 if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1827 thread_wakeup(LCK_RW_READER_EVENT(lck));
1828 }
1829 }
1830 if (fake_lck.shared_count) {
1831 lock_type = LCK_RW_TYPE_SHARED;
1832 } else {
1833 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1834 }
1835
1836 /* Check if dropping the lock means that we need to unpromote */
1837 thread = current_thread();
1838 if (fake_lck.can_sleep) {
1839 rwlock_count = thread->rwlock_count--;
1840 } else {
1841 rwlock_count = UINT32_MAX;
1842 }
1843 #if MACH_LDEBUG
1844 if (rwlock_count == 0) {
1845 panic("rw lock count underflow for thread %p", thread);
1846 }
1847 #endif
1848 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1849 /* sched_flags checked without lock, but will be rechecked while clearing */
1850 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1851 }
1852 #if CONFIG_DTRACE
1853 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1854 #endif
1855 return lock_type;
1856 }
1857
1858 /*
1859 * Routine: lck_rw_lock_shared_gen
1860 * Function:
1861 * Fast path code has determined that this lock
1862 * is held exclusively... this is where we spin/block
1863 * until we can acquire the lock in the shared mode
1864 */
1865 static void
1866 lck_rw_lock_shared_gen(
1867 lck_rw_t *lck)
1868 {
1869 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1870 lck_rw_word_t word;
1871 boolean_t gotlock = 0;
1872 int slept = 0;
1873 wait_result_t res = 0;
1874 boolean_t istate;
1875
1876 #if CONFIG_DTRACE
1877 uint64_t wait_interval = 0;
1878 int readers_at_sleep = 0;
1879 boolean_t dtrace_ls_initialized = FALSE;
1880 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1881 #endif /* CONFIG_DTRACE */
1882
1883 while (!lck_rw_grab(lck, LCK_RW_GRAB_SHARED, FALSE)) {
1884 #if CONFIG_DTRACE
1885 if (dtrace_ls_initialized == FALSE) {
1886 dtrace_ls_initialized = TRUE;
1887 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1888 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1889 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1890 if (dtrace_ls_enabled) {
1891 /*
1892 * Either sleeping or spinning is happening,
1893 * start timing our delay interval now.
1894 */
1895 readers_at_sleep = lck->lck_rw_shared_count;
1896 wait_interval = mach_absolute_time();
1897 }
1898 }
1899 #endif
1900
1901 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1902 trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, 0, 0);
1903
1904 gotlock = lck_rw_grab(lck, LCK_RW_GRAB_SHARED, TRUE);
1905
1906 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1907 trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, gotlock, 0);
1908
1909 if (gotlock) {
1910 break;
1911 }
1912 /*
1913 * if we get here, the deadline has expired w/o us
1914 * being able to grab the lock for read;
1915 * check to see whether we're allowed to do a thread_block
1916 */
1917 if (lck->lck_rw_can_sleep) {
1918 istate = lck_interlock_lock(lck);
1919
1920 word.data = ordered_load_rw(lck);
1921 if ((word.want_excl || word.want_upgrade) &&
1922 ((word.shared_count == 0) || word.priv_excl)) {
1923 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1924 trace_lck, word.want_excl, word.want_upgrade, 0, 0);
1925
1926 word.r_waiting = 1;
1927 ordered_store_rw(lck, word.data);
1928
1929 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1930 res = assert_wait(LCK_RW_READER_EVENT(lck),
1931 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1932 lck_interlock_unlock(lck, istate);
1933
1934 if (res == THREAD_WAITING) {
1935 res = thread_block(THREAD_CONTINUE_NULL);
1936 slept++;
1937 }
1938 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1939 trace_lck, res, slept, 0, 0);
1940 } else {
1941 word.shared_count++;
1942 ordered_store_rw(lck, word.data);
1943 lck_interlock_unlock(lck, istate);
1944 break;
1945 }
1946 }
1947 }
1948
1949 #if CONFIG_DTRACE
1950 if (dtrace_ls_enabled == TRUE) {
1951 if (slept == 0) {
1952 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1953 } else {
1954 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1955 mach_absolute_time() - wait_interval, 0,
1956 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1957 }
1958 }
1959 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1960 #endif /* CONFIG_DTRACE */
1961 }
1962
1963 /*
1964 * Verifying thread ownership for exclusive locks is required because of
1965 * PPL usage
1966 */
1967 void
1968 lck_rw_assert(
1969 lck_rw_t *lck,
1970 unsigned int type)
1971 {
1972 switch (type) {
1973 case LCK_RW_ASSERT_SHARED:
1974 if ((lck->lck_rw_shared_count != 0) &&
1975 (lck->lck_rw_owner == THREAD_NULL)) {
1976 return;
1977 }
1978 break;
1979 case LCK_RW_ASSERT_EXCLUSIVE:
1980 if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
1981 (lck->lck_rw_shared_count == 0) &&
1982 (lck->lck_rw_owner == current_thread())) {
1983 return;
1984 }
1985 break;
1986 case LCK_RW_ASSERT_HELD:
1987 if (lck->lck_rw_shared_count != 0) {
1988 return; // Held shared
1989 }
1990 if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
1991 (lck->lck_rw_owner == current_thread())) {
1992 return; // Held exclusive
1993 }
1994 break;
1995 case LCK_RW_ASSERT_NOTHELD:
1996 if ((lck->lck_rw_shared_count == 0) &&
1997 !(lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
1998 (lck->lck_rw_owner == THREAD_NULL)) {
1999 return;
2000 }
2001 break;
2002 default:
2003 break;
2004 }
2005 panic("rw lock (%p)%s held (mode=%u)", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type);
2006 }
2007
2008
2009 /*
2010 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
2011 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2012 */
2013 boolean_t
2014 kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck)
2015 {
2016 if (not_in_kdp) {
2017 panic("panic: rw lock exclusive check done outside of kernel debugger");
2018 }
2019 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_excl) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2020 }
2021
2022 /*
2023 * The C portion of the mutex package. These routines are only invoked
2024 * if the optimized assembler routines can't do the work.
2025 */
2026
2027 /*
2028 * Forward declaration
2029 */
2030
2031 void
2032 lck_mtx_ext_init(
2033 lck_mtx_ext_t * lck,
2034 lck_grp_t * grp,
2035 lck_attr_t * attr);
2036
2037 /*
2038 * Routine: lck_mtx_alloc_init
2039 */
2040 lck_mtx_t *
2041 lck_mtx_alloc_init(
2042 lck_grp_t * grp,
2043 lck_attr_t * attr)
2044 {
2045 lck_mtx_t *lck;
2046
2047 lck = zalloc(ZV_LCK_MTX);
2048 lck_mtx_init(lck, grp, attr);
2049 return lck;
2050 }
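/*
 * Typical usage (illustrative sketch; `my_grp` stands for a lock group the
 * caller has already created):
 *
 *	lck_mtx_t *m = lck_mtx_alloc_init(my_grp, LCK_ATTR_NULL);
 *	lck_mtx_lock(m);
 *	...critical section...
 *	lck_mtx_unlock(m);
 *	lck_mtx_free(m, my_grp);
 */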
2051
2052 /*
2053 * Routine: lck_mtx_free
2054 */
2055 void
2056 lck_mtx_free(
2057 lck_mtx_t * lck,
2058 lck_grp_t * grp)
2059 {
2060 lck_mtx_destroy(lck, grp);
2061 zfree(ZV_LCK_MTX, lck);
2062 }
2063
2064 /*
2065 * Routine: lck_mtx_init
2066 */
2067 void
2068 lck_mtx_init(
2069 lck_mtx_t * lck,
2070 lck_grp_t * grp,
2071 lck_attr_t * attr)
2072 {
2073 #ifdef BER_XXX
2074 lck_mtx_ext_t *lck_ext;
2075 #endif
2076 lck_attr_t *lck_attr;
2077
2078 if (attr != LCK_ATTR_NULL) {
2079 lck_attr = attr;
2080 } else {
2081 lck_attr = &LockDefaultLckAttr;
2082 }
2083
2084 #ifdef BER_XXX
2085 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2086 lck_ext = zalloc(ZV_LCK_MTX_EXT);
2087 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2088 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2089 lck->lck_mtx_ptr = lck_ext;
2090 lck->lck_mtx_type = LCK_MTX_TYPE;
2091 } else
2092 #endif
2093 {
2094 lck->lck_mtx_ptr = NULL; // Clear any padding in the union fields below
2095 lck->lck_mtx_waiters = 0;
2096 lck->lck_mtx_type = LCK_MTX_TYPE;
2097 ordered_store_mtx(lck, 0);
2098 }
2099 lck_grp_reference(grp);
2100 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2101 }
2102
2103 /*
2104 * Routine: lck_mtx_init_ext
2105 */
2106 void
2107 lck_mtx_init_ext(
2108 lck_mtx_t * lck,
2109 lck_mtx_ext_t * lck_ext,
2110 lck_grp_t * grp,
2111 lck_attr_t * attr)
2112 {
2113 lck_attr_t *lck_attr;
2114
2115 if (attr != LCK_ATTR_NULL) {
2116 lck_attr = attr;
2117 } else {
2118 lck_attr = &LockDefaultLckAttr;
2119 }
2120
2121 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2122 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2123 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2124 lck->lck_mtx_ptr = lck_ext;
2125 lck->lck_mtx_type = LCK_MTX_TYPE;
2126 } else {
2127 lck->lck_mtx_waiters = 0;
2128 lck->lck_mtx_type = LCK_MTX_TYPE;
2129 ordered_store_mtx(lck, 0);
2130 }
2131 lck_grp_reference(grp);
2132 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2133 }
2134
2135 /*
2136 * Routine: lck_mtx_ext_init
2137 */
2138 void
2139 lck_mtx_ext_init(
2140 lck_mtx_ext_t * lck,
2141 lck_grp_t * grp,
2142 lck_attr_t * attr)
2143 {
2144 bzero((void *) lck, sizeof(lck_mtx_ext_t));
2145
2146 lck->lck_mtx.lck_mtx_type = LCK_MTX_TYPE;
2147
2148 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2149 lck->lck_mtx_deb.type = MUTEX_TAG;
2150 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2151 }
2152 lck->lck_mtx_grp = grp;
2153
2154 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) {
2155 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
2156 }
2157 }
2158
2159 /* The slow versions */
2160 static void lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2161 static boolean_t lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread);
2162 static void lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2163
2164 /* The adaptive spin function */
2165 static spinwait_result_t lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2166
2167 /*
2168 * Routine: lck_mtx_verify
2169 *
2170 * Verify if a mutex is valid
2171 */
2172 static inline void
2173 lck_mtx_verify(lck_mtx_t *lock)
2174 {
2175 if (lock->lck_mtx_type != LCK_MTX_TYPE) {
2176 panic("Invalid mutex %p", lock);
2177 }
2178 #if DEVELOPMENT || DEBUG
2179 if (lock->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
2180 panic("Mutex destroyed %p", lock);
2181 }
2182 #endif /* DEVELOPMENT || DEBUG */
2183 }
2184
2185 /*
2186 * Routine: lck_mtx_check_preemption
2187 *
2188 * Verify preemption is enabled when attempting to acquire a mutex.
2189 */
2190
2191 static inline void
2192 lck_mtx_check_preemption(lck_mtx_t *lock)
2193 {
2194 #if DEVELOPMENT || DEBUG
2195 if (current_cpu_datap()->cpu_hibernate) {
2196 return;
2197 }
2198
2199 int pl = get_preemption_level();
2200
2201 if (pl != 0) {
2202 panic("Attempt to take mutex with preemption disabled. Lock=%p, level=%d", lock, pl);
2203 }
2204 #else
2205 (void)lock;
2206 #endif
2207 }
2208
2209 /*
2210 * Routine: lck_mtx_lock
2211 */
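/*
 * Fast path note: an unowned, uncontended mutex has lck_mtx_data == 0, so a
 * single acquire cmpxchg installing LCK_MTX_THREAD_TO_STATE(thread) takes the
 * lock; any owner, waiter or interlock bit forces the contended path.
 */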
2212 void
2213 lck_mtx_lock(lck_mtx_t *lock)
2214 {
2215 thread_t thread;
2216
2217 lck_mtx_verify(lock);
2218 lck_mtx_check_preemption(lock);
2219 thread = current_thread();
2220 if (os_atomic_cmpxchg(&lock->lck_mtx_data,
2221 0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
2222 #if CONFIG_DTRACE
2223 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
2224 #endif /* CONFIG_DTRACE */
2225 return;
2226 }
2227 lck_mtx_lock_contended(lock, thread, FALSE);
2228 }
2229
2230 /*
2231 * This is the slow version of mutex locking.
2232 */
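/*
 * Note on turnstiles: lck_mtx_lock_wait() may hand back a turnstile in `ts`
 * while blocking; it is completed either inside lck_mtx_lock_acquire() or
 * explicitly below, and turnstile_cleanup() runs only after the interlock
 * has been released.
 */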
2233 static void NOINLINE
2234 lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
2235 {
2236 thread_t holding_thread;
2237 uintptr_t state;
2238 int waiters = 0;
2239 spinwait_result_t sw_res;
2240 struct turnstile *ts = NULL;
2241
2242 /* Loop waiting until I see that the mutex is unowned */
2243 for (;;) {
2244 sw_res = lck_mtx_lock_contended_spinwait_arm(lock, thread, interlocked);
2245 interlocked = FALSE;
2246
2247 switch (sw_res) {
2248 case SPINWAIT_ACQUIRED:
2249 if (ts != NULL) {
2250 interlock_lock(lock);
2251 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2252 interlock_unlock(lock);
2253 }
2254 goto done;
2255 case SPINWAIT_INTERLOCK:
2256 goto set_owner;
2257 default:
2258 break;
2259 }
2260
2261 state = ordered_load_mtx(lock);
2262 holding_thread = LCK_MTX_STATE_TO_THREAD(state);
2263 if (holding_thread == NULL) {
2264 break;
2265 }
2266 ordered_store_mtx(lock, (state | LCK_ILOCK | ARM_LCK_WAITERS)); // Set waiters bit and wait
2267 lck_mtx_lock_wait(lock, holding_thread, &ts);
2268 /* returns interlock unlocked */
2269 }
2270
2271 set_owner:
2272 /* Hooray, I'm the new owner! */
2273 state = ordered_load_mtx(lock);
2274
2275 if (state & ARM_LCK_WAITERS) {
2276 /* Skip lck_mtx_lock_acquire if there are no waiters. */
2277 waiters = lck_mtx_lock_acquire(lock, ts);
2278 /*
2279 * lck_mtx_lock_acquire will call
2280 * turnstile_complete
2281 */
2282 } else {
2283 if (ts != NULL) {
2284 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2285 }
2286 }
2287
2288 state = LCK_MTX_THREAD_TO_STATE(thread);
2289 if (waiters != 0) {
2290 state |= ARM_LCK_WAITERS;
2291 }
2292 state |= LCK_ILOCK; // Preserve interlock
2293 ordered_store_mtx(lock, state); // Set ownership
2294 interlock_unlock(lock); // Release interlock, enable preemption
2295
2296 done:
2297 load_memory_barrier();
2298
2299 assert(thread->turnstile != NULL);
2300
2301 if (ts != NULL) {
2302 turnstile_cleanup();
2303 }
2304
2305 #if CONFIG_DTRACE
2306 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
2307 #endif /* CONFIG_DTRACE */
2308 }
2309
2310 /*
2311 * Routine: lck_mtx_lock_contended_spinwait_arm
2312 *
2313 * Invoked when trying to acquire a mutex while there is contention and
2314 * the holder is running on another processor. We spin for up to a maximum
2315 * time waiting for the lock to be released.
2316 */
2317 static spinwait_result_t
2318 lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
2319 {
2320 int has_interlock = (int)interlocked;
2321 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
2322 thread_t owner, prev_owner;
2323 uint64_t window_deadline, sliding_deadline, high_deadline;
2324 uint64_t start_time, cur_time, avg_hold_time, bias, delta;
2325 int loopcount = 0;
2326 uint i, prev_owner_cpu;
2327 int total_hold_time_samples, window_hold_time_samples, unfairness;
2328 bool owner_on_core, adjust;
2329 uintptr_t state, new_state, waiters;
2330 spinwait_result_t retval = SPINWAIT_DID_SPIN_HIGH_THR;
2331
2332 if (__improbable(!(lck_mtx_adaptive_spin_mode & ADAPTIVE_SPIN_ENABLE))) {
2333 if (!has_interlock) {
2334 interlock_lock(lock);
2335 }
2336
2337 return SPINWAIT_DID_NOT_SPIN;
2338 }
2339
/* Snapshot the lock state so the trace point below reports a valid owner */
state = ordered_load_mtx(lock);

2340 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
2341 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(LCK_MTX_STATE_TO_THREAD(state)), lock->lck_mtx_waiters, 0, 0);
2342
2343 start_time = mach_absolute_time();
2344 /*
2345 * window_deadline represents the "learning" phase.
2346 * The thread collects statistics about the lock during
2347 * this window and then decides whether to spin more
2348 * or block according to the concurrency behavior
2349 * observed.
2350 *
2351 * Every thread can spin at least low_MutexSpin.
2352 */
2353 window_deadline = start_time + low_MutexSpin;
2354 /*
2355 * Sliding_deadline is the adjusted spin deadline
2356 * computed after the "learning" phase.
2357 */
2358 sliding_deadline = window_deadline;
2359 /*
2360 * High_deadline is a hard deadline. No thread
2361 * can spin more than this deadline.
2362 */
2363 if (high_MutexSpin >= 0) {
2364 high_deadline = start_time + high_MutexSpin;
2365 } else {
2366 high_deadline = start_time + low_MutexSpin * real_ncpus;
2367 }
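/*
 * In short: window_deadline bounds the current sampling window,
 * sliding_deadline is the adaptive stop time recomputed from the observed
 * hold times, and high_deadline is the absolute cap on spinning.
 */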
2368
2369 /*
2370 * We do not yet know which cpu the owner is running on.
2371 * Initialize prev_owner_cpu with the next cpu.
2372 */
2373 prev_owner_cpu = (cpu_number() + 1) % real_ncpus;
2374 total_hold_time_samples = 0;
2375 window_hold_time_samples = 0;
2376 avg_hold_time = 0;
2377 adjust = TRUE;
2378 bias = (os_hash_kernel_pointer(lock) + cpu_number()) % real_ncpus;
2379
2380 /* Snoop the lock state */
2381 state = ordered_load_mtx(lock);
2382 owner = LCK_MTX_STATE_TO_THREAD(state);
2383 prev_owner = owner;
2384
2385 if (has_interlock) {
2386 if (owner == NULL) {
2387 retval = SPINWAIT_INTERLOCK;
2388 goto done_spinning;
2389 } else {
2390 /*
2391 * We are holding the interlock, so
2392 * we can safely dereference owner.
2393 */
2394 if (!machine_thread_on_core(owner) || (owner->state & TH_IDLE)) {
2395 retval = SPINWAIT_DID_NOT_SPIN;
2396 goto done_spinning;
2397 }
2398 }
2399 interlock_unlock(lock);
2400 has_interlock = 0;
2401 }
2402
2403 /*
2404 * Spin while:
2405 * - mutex is locked, and
2406 * - it's locked as a spin lock, and
2407 * - owner is running on another processor, and
2408 * - we haven't spun for long enough.
2409 */
2410 do {
2411 /*
2412 * Try to acquire the lock.
2413 */
2414 owner = LCK_MTX_STATE_TO_THREAD(state);
2415 if (owner == NULL) {
2416 waiters = state & ARM_LCK_WAITERS;
2417 if (waiters) {
2418 /*
2419 * preserve the waiter bit
2420 * and try to acquire the interlock.
2421 * Note: we will successfully acquire
2422 * the interlock only if we can also
2423 * acquire the lock.
2424 */
2425 new_state = ARM_LCK_WAITERS | LCK_ILOCK;
2426 has_interlock = 1;
2427 retval = SPINWAIT_INTERLOCK;
2428 disable_preemption();
2429 } else {
2430 new_state = LCK_MTX_THREAD_TO_STATE(thread);
2431 retval = SPINWAIT_ACQUIRED;
2432 }
2433
2434 /*
2435 * The cmpxchg will succeed only if the lock
2436 * is not owned (doesn't have an owner set) and it is
2437 * not interlocked; the expected value carries only the
2438 * waiter bit, so waiters alone will not make it fail.
2439 */
2440 if (os_atomic_cmpxchgv(&lock->lck_mtx_data,
2441 waiters, new_state, &state, acquire)) {
2442 goto done_spinning;
2443 } else {
2444 if (waiters) {
2445 has_interlock = 0;
2446 enable_preemption();
2447 }
2448 }
2449 }
2450
2451 cur_time = mach_absolute_time();
2452
2453 /*
2454 * Never spin past high_deadline.
2455 */
2456 if (cur_time >= high_deadline) {
2457 retval = SPINWAIT_DID_SPIN_HIGH_THR;
2458 break;
2459 }
2460
2461 /*
2462 * Check if the owner is on core. If not, block.
2463 */
2464 owner = LCK_MTX_STATE_TO_THREAD(state);
2465 if (owner) {
2466 i = prev_owner_cpu;
2467 owner_on_core = FALSE;
2468
2469 disable_preemption();
2470 state = ordered_load_mtx(lock);
2471 owner = LCK_MTX_STATE_TO_THREAD(state);
2472
2473 /*
2474 * For scalability we want to check if the owner is on core
2475 * without locking the mutex interlock.
2476 * If we do not lock the mutex interlock, the owner that we see might be
2477 * invalid, so we cannot dereference it. Therefore we cannot check
2478 * any field of the thread to tell us if it is on core.
2479 * Instead, check whether any other cpu's running thread matches the owner.
2480 */
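/*
 * Note that the scan below only compares cpu_active_thread pointers against
 * `owner`; the owner pointer itself is never dereferenced, so a stale value
 * is harmless here.
 */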
2481 if (owner) {
2482 do {
2483 cpu_data_t *cpu_data_ptr = CpuDataEntries[i].cpu_data_vaddr;
2484 if ((cpu_data_ptr != NULL) && (cpu_data_ptr->cpu_active_thread == owner)) {
2485 owner_on_core = TRUE;
2486 break;
2487 }
2488 if (++i >= real_ncpus) {
2489 i = 0;
2490 }
2491 } while (i != prev_owner_cpu);
2492 enable_preemption();
2493
2494 if (owner_on_core) {
2495 prev_owner_cpu = i;
2496 } else {
2497 prev_owner = owner;
2498 state = ordered_load_mtx(lock);
2499 owner = LCK_MTX_STATE_TO_THREAD(state);
2500 if (owner == prev_owner) {
2501 /*
2502 * Owner is not on core.
2503 * Stop spinning.
2504 */
2505 if (loopcount == 0) {
2506 retval = SPINWAIT_DID_NOT_SPIN;
2507 } else {
2508 retval = SPINWAIT_DID_SPIN_OWNER_NOT_CORE;
2509 }
2510 break;
2511 }
2512 /*
2513 * Fall through if the owner changed while we were scanning.
2514 * The new owner could potentially be on core, so loop
2515 * again.
2516 */
2517 }
2518 } else {
2519 enable_preemption();
2520 }
2521 }
2522
2523 /*
2524 * Save how many times we see the owner changing.
2525 * We can roughly estimate the mutex hold
2526 * time and the fairness with that.
2527 */
2528 if (owner != prev_owner) {
2529 prev_owner = owner;
2530 total_hold_time_samples++;
2531 window_hold_time_samples++;
2532 }
2533
2534 /*
2535 * Learning window expired.
2536 * Try to adjust the sliding_deadline.
2537 */
2538 if (cur_time >= window_deadline) {
2539 /*
2540 * If there was no contention during the window,
2541 * stop spinning.
2542 */
2543 if (window_hold_time_samples < 1) {
2544 retval = SPINWAIT_DID_SPIN_NO_WINDOW_CONTENTION;
2545 break;
2546 }
2547
2548 if (adjust) {
2549 /*
2550 * For a fair lock, we'd wait for at most (NCPU-1) periods,
2551 * but the lock is unfair, so let's try to estimate by how much.
2552 */
2553 unfairness = total_hold_time_samples / real_ncpus;
2554
2555 if (unfairness == 0) {
2556 /*
2557 * We observed the owner changing `total_hold_time_samples` times which
2558 * let us estimate the average hold time of this mutex for the duration
2559 * of the spin time.
2560 * avg_hold_time = (cur_time - start_time) / total_hold_time_samples;
2561 *
2562 * In this case spin at max avg_hold_time * (real_ncpus - 1)
2563 */
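/*
 * Illustrative numbers: 2 owner changes observed over 10us of spinning on a
 * 4-cpu system give avg_hold_time ~= 5us, so sliding_deadline becomes
 * start_time + 15us, i.e. roughly 5us of additional spinning.
 */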
2564 delta = cur_time - start_time;
2565 sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples;
2566 } else {
2567 /*
2568 * In this case at least one of the other cpus was able to get the lock twice
2569 * while I was spinning.
2570 * We could spin longer but it won't necessarily help if the system is unfair.
2571 * Try to randomize the wait to reduce contention.
2572 *
2573 * We compute how much time we could potentially spin
2574 * and distribute it over the cpus.
2575 *
2576 * bias is an integer between 0 and real_ncpus - 1.
2577 * distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias
2578 */
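/*
 * Illustrative numbers: with real_ncpus == 4 and bias == 2, the deadline
 * becomes cur_time + (high_deadline - cur_time) / 2.
 */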
2579 delta = high_deadline - cur_time;
2580 sliding_deadline = cur_time + ((delta * bias) / real_ncpus);
2581 adjust = FALSE;
2582 }
2583 }
2584
2585 window_deadline += low_MutexSpin;
2586 window_hold_time_samples = 0;
2587 }
2588
2589 /*
2590 * Stop spinning if we are past
2591 * the adjusted deadline.
2592 */
2593 if (cur_time >= sliding_deadline) {
2594 retval = SPINWAIT_DID_SPIN_SLIDING_THR;
2595 break;
2596 }
2597
2598 /*
2599 * We want to arm the monitor for wfe,
2600 * so load the lock exclusively.
2601 *
2602 * NOTE:
2603 * we rely on the fact that wfe will
2604 * eventually return even if the cache line
2605 * is not modified. This way we will keep
2606 * looping and checking whether the deadlines have expired.
2607 */
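/*
 * The exclusive load arms the local monitor; wait_for_event() then issues
 * the wfe, and atomic_exchange_abort() (presumably clearing the exclusive
 * reservation) is used when we skip the wait.
 */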
2608 state = os_atomic_load_exclusive(&lock->lck_mtx_data, relaxed);
2609 owner = LCK_MTX_STATE_TO_THREAD(state);
2610 if (owner != NULL) {
2611 wait_for_event();
2612 state = ordered_load_mtx(lock);
2613 } else {
2614 atomic_exchange_abort();
2615 }
2616
2617 loopcount++;
2618 } while (TRUE);
2619
2620 done_spinning:
2621 #if CONFIG_DTRACE
2622 /*
2623 * Note that we record a different probe id depending on whether
2624 * this is a direct or indirect mutex. This allows us to
2625 * penalize only lock groups that have debug/stats enabled
2626 * with dtrace processing if desired.
2627 */
2628 if (__probable(lock->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)) {
2629 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, lock,
2630 mach_absolute_time() - start_time);
2631 } else {
2632 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, lock,
2633 mach_absolute_time() - start_time);
2634 }
2635 /* The lockstat acquire event is recorded by the caller. */
2636 #endif
2637
2638 state = ordered_load_mtx(lock);
2639
2640 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
2641 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(LCK_MTX_STATE_TO_THREAD(state)), lock->lck_mtx_waiters, retval, 0);
2642 if ((!has_interlock) && (retval != SPINWAIT_ACQUIRED)) {
2643 /* We must own either the lock or the interlock on return. */
2644 interlock_lock(lock);
2645 }
2646
2647 return retval;
2648 }
2649
2650
2651 /*
2652 * Common code for mutex locking as spinlock
2653 */
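/*
 * While held this way the owner field contains LCK_MTX_SPIN_TAG rather than
 * a thread pointer and the interlock bit stays set, so the owning thread is
 * not recorded (see lck_mtx_assert).
 */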
2654 static inline void
2655 lck_mtx_lock_spin_internal(lck_mtx_t *lock, boolean_t allow_held_as_mutex)
2656 {
2657 uintptr_t state;
2658
2659 interlock_lock(lock);
2660 state = ordered_load_mtx(lock);
2661 if (LCK_MTX_STATE_TO_THREAD(state)) {
2662 if (allow_held_as_mutex) {
2663 lck_mtx_lock_contended(lock, current_thread(), TRUE);
2664 } else {
2665 // "Always" variants can never block. If the lock is held and blocking is not allowed
2666 // then someone is mixing always and non-always calls on the same lock, which is
2667 // forbidden.
2668 panic("Attempting to block on a lock taken as spin-always %p", lock);
2669 }
2670 return;
2671 }
2672 state &= ARM_LCK_WAITERS; // Preserve waiters bit
2673 state |= (LCK_MTX_SPIN_TAG | LCK_ILOCK); // Add spin tag and maintain interlock
2674 ordered_store_mtx(lock, state);
2675 load_memory_barrier();
2676
2677 #if CONFIG_DTRACE
2678 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
2679 #endif /* CONFIG_DTRACE */
2680 }
2681
2682 /*
2683 * Routine: lck_mtx_lock_spin
2684 */
2685 void
2686 lck_mtx_lock_spin(lck_mtx_t *lock)
2687 {
2688 lck_mtx_check_preemption(lock);
2689 lck_mtx_lock_spin_internal(lock, TRUE);
2690 }
2691
2692 /*
2693 * Routine: lck_mtx_lock_spin_always
2694 */
2695 void
2696 lck_mtx_lock_spin_always(lck_mtx_t *lock)
2697 {
2698 lck_mtx_lock_spin_internal(lock, FALSE);
2699 }
2700
2701 /*
2702 * Routine: lck_mtx_try_lock
2703 */
2704 boolean_t
2705 lck_mtx_try_lock(lck_mtx_t *lock)
2706 {
2707 thread_t thread = current_thread();
2708
2709 lck_mtx_verify(lock);
2710 if (os_atomic_cmpxchg(&lock->lck_mtx_data,
2711 0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
2712 #if CONFIG_DTRACE
2713 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_ACQUIRE, lock, 0);
2714 #endif /* CONFIG_DTRACE */
2715 return TRUE;
2716 }
2717 return lck_mtx_try_lock_contended(lock, thread);
2718 }
2719
2720 static boolean_t NOINLINE
2721 lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread)
2722 {
2723 thread_t holding_thread;
2724 uintptr_t state;
2725 int waiters;
2726
2727 interlock_lock(lock);
2728 state = ordered_load_mtx(lock);
2729 holding_thread = LCK_MTX_STATE_TO_THREAD(state);
2730 if (holding_thread) {
2731 interlock_unlock(lock);
2732 return FALSE;
2733 }
2734 waiters = lck_mtx_lock_acquire(lock, NULL);
2735 state = LCK_MTX_THREAD_TO_STATE(thread);
2736 if (waiters != 0) {
2737 state |= ARM_LCK_WAITERS;
2738 }
2739 state |= LCK_ILOCK; // Preserve interlock
2740 ordered_store_mtx(lock, state); // Set ownership
2741 interlock_unlock(lock); // Release interlock, enable preemption
2742 load_memory_barrier();
2743
2744 turnstile_cleanup();
2745
2746 return TRUE;
2747 }
2748
2749 static inline boolean_t
2750 lck_mtx_try_lock_spin_internal(lck_mtx_t *lock, boolean_t allow_held_as_mutex)
2751 {
2752 uintptr_t state;
2753
2754 if (!interlock_try(lock)) {
2755 return FALSE;
2756 }
2757 state = ordered_load_mtx(lock);
2758 if (LCK_MTX_STATE_TO_THREAD(state)) {
2759 // Lock is held as mutex
2760 if (allow_held_as_mutex) {
2761 interlock_unlock(lock);
2762 } else {
2763 // "Always" variants can never block. If the lock is held as a normal mutex
2764 // then someone is mixing always and non-always calls on the same lock, which is
2765 // forbidden.
2766 panic("Spin-mutex held as full mutex %p", lock);
2767 }
2768 return FALSE;
2769 }
2770 state &= ARM_LCK_WAITERS; // Preserve waiters bit
2771 state |= (LCK_MTX_SPIN_TAG | LCK_ILOCK); // Add spin tag and maintain interlock
2772 ordered_store_mtx(lock, state);
2773 load_memory_barrier();
2774
2775 #if CONFIG_DTRACE
2776 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
2777 #endif /* CONFIG_DTRACE */
2778 return TRUE;
2779 }
2780
2781 /*
2782 * Routine: lck_mtx_try_lock_spin
2783 */
2784 boolean_t
2785 lck_mtx_try_lock_spin(lck_mtx_t *lock)
2786 {
2787 return lck_mtx_try_lock_spin_internal(lock, TRUE);
2788 }
2789
2790 /*
2791 * Routine: lck_mtx_try_lock_spin_always
2792 */
2793 boolean_t
2794 lck_mtx_try_lock_spin_always(lck_mtx_t *lock)
2795 {
2796 return lck_mtx_try_lock_spin_internal(lock, FALSE);
2797 }
2798
2799
2800
2801 /*
2802 * Routine: lck_mtx_unlock
2803 */
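/*
 * Fast path: a release cmpxchg from LCK_MTX_THREAD_TO_STATE(thread) back to
 * 0 succeeds only when no waiter or interlock bits are set; otherwise we
 * fall into the contended path.
 */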
2804 void
2805 lck_mtx_unlock(lck_mtx_t *lock)
2806 {
2807 thread_t thread = current_thread();
2808 uintptr_t state;
2809 boolean_t ilk_held = FALSE;
2810
2811 lck_mtx_verify(lock);
2812
2813 state = ordered_load_mtx(lock);
2814 if (state & LCK_ILOCK) {
2815 if (LCK_MTX_STATE_TO_THREAD(state) == (thread_t)LCK_MTX_SPIN_TAG) {
2816 ilk_held = TRUE; // Interlock is held by (presumably) this thread
2817 }
2818 goto slow_case;
2819 }
2820 // Locked as a mutex
2821 if (os_atomic_cmpxchg(&lock->lck_mtx_data,
2822 LCK_MTX_THREAD_TO_STATE(thread), 0, release)) {
2823 #if CONFIG_DTRACE
2824 LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
2825 #endif /* CONFIG_DTRACE */
2826 return;
2827 }
2828 slow_case:
2829 lck_mtx_unlock_contended(lock, thread, ilk_held);
2830 }
2831
2832 static void NOINLINE
2833 lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t ilk_held)
2834 {
2835 uintptr_t state;
2836 boolean_t cleanup = FALSE;
2837
2838 if (ilk_held) {
2839 state = ordered_load_mtx(lock);
2840 } else {
2841 interlock_lock(lock);
2842 state = ordered_load_mtx(lock);
2843 if (thread != LCK_MTX_STATE_TO_THREAD(state)) {
2844 panic("lck_mtx_unlock(): Attempt to release lock not owned by thread (%p)", lock);
2845 }
2846 if (state & ARM_LCK_WAITERS) {
2847 if (lck_mtx_unlock_wakeup(lock, thread)) {
2848 state = ARM_LCK_WAITERS;
2849 } else {
2850 state = 0;
2851 }
2852 cleanup = TRUE;
2853 goto unlock;
2854 }
2855 }
2856 state &= ARM_LCK_WAITERS; /* Clear state, retain waiters bit */
2857 unlock:
2858 state |= LCK_ILOCK;
2859 ordered_store_mtx(lock, state);
2860 interlock_unlock(lock);
2861 if (cleanup) {
2862 /*
2863 * Do not do any turnstile operations outside of this block.
2864 * lock/unlock is called at an early stage of boot with a single thread,
2865 * when turnstiles are not yet initialized.
2866 * Even without contention we can come through the slow path
2867 * if the mutex is acquired as a spin lock.
2868 */
2869 turnstile_cleanup();
2870 }
2871
2872 #if CONFIG_DTRACE
2873 LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
2874 #endif /* CONFIG_DTRACE */
2875 }
2876
2877 /*
2878 * Routine: lck_mtx_assert
2879 */
2880 void
2881 lck_mtx_assert(lck_mtx_t *lock, unsigned int type)
2882 {
2883 thread_t thread, holder;
2884 uintptr_t state;
2885
2886 state = ordered_load_mtx(lock);
2887 holder = LCK_MTX_STATE_TO_THREAD(state);
2888 if (holder == (thread_t)LCK_MTX_SPIN_TAG) {
2889 // Lock is held in spin mode, owner is unknown.
2890 return; // Punt
2891 }
2892 thread = current_thread();
2893 if (type == LCK_MTX_ASSERT_OWNED) {
2894 if (thread != holder) {
2895 panic("lck_mtx_assert(): mutex (%p) owned", lock);
2896 }
2897 } else if (type == LCK_MTX_ASSERT_NOTOWNED) {
2898 if (thread == holder) {
2899 panic("lck_mtx_assert(): mutex (%p) not owned", lock);
2900 }
2901 } else {
2902 panic("lck_mtx_assert(): invalid arg (%u)", type);
2903 }
2904 }
2905
2906 /*
2907 * Routine: lck_mtx_ilk_unlock
2908 */
2909 boolean_t
2910 lck_mtx_ilk_unlock(lck_mtx_t *lock)
2911 {
2912 interlock_unlock(lock);
2913 return TRUE;
2914 }
2915
2916 /*
2917 * Routine: lck_mtx_convert_spin
2918 *
2919 * Convert a mutex held for spin into a held full mutex
2920 */
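/*
 * The interlock remains held across the conversion: the spin tag is cleared,
 * ownership is set to the calling thread, and only then is the interlock
 * released.
 */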
2921 void
2922 lck_mtx_convert_spin(lck_mtx_t *lock)
2923 {
2924 thread_t thread = current_thread();
2925 uintptr_t state;
2926 int waiters;
2927
2928 state = ordered_load_mtx(lock);
2929 if (LCK_MTX_STATE_TO_THREAD(state) == thread) {
2930 return; // Already owned as mutex, return
2931 }
2932 if ((state & LCK_ILOCK) == 0 || (LCK_MTX_STATE_TO_THREAD(state) != (thread_t)LCK_MTX_SPIN_TAG)) {
2933 panic("lck_mtx_convert_spin: Not held as spinlock (%p)", lock);
2934 }
2935 state &= ~(LCK_MTX_THREAD_MASK); // Clear the spin tag
2936 ordered_store_mtx(lock, state);
2937 waiters = lck_mtx_lock_acquire(lock, NULL); // Acquire to manage priority boosts
2938 state = LCK_MTX_THREAD_TO_STATE(thread);
2939 if (waiters != 0) {
2940 state |= ARM_LCK_WAITERS;
2941 }
2942 state |= LCK_ILOCK;
2943 ordered_store_mtx(lock, state); // Set ownership
2944 interlock_unlock(lock); // Release interlock, enable preemption
2945 turnstile_cleanup();
2946 }
2947
2948
2949 /*
2950 * Routine: lck_mtx_destroy
2951 */
2952 void
2953 lck_mtx_destroy(
2954 lck_mtx_t * lck,
2955 lck_grp_t * grp)
2956 {
2957 if (lck->lck_mtx_type != LCK_MTX_TYPE) {
2958 panic("Destroying invalid mutex %p", lck);
2959 }
2960 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
2961 panic("Destroying previously destroyed lock %p", lck);
2962 }
2963 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2964 lck->lck_mtx_tag = LCK_MTX_TAG_DESTROYED;
2965 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2966 lck_grp_deallocate(grp);
2967 return;
2968 }
2969
2970 /*
2971 * Routine: lck_spin_assert
2972 */
2973 void
2974 lck_spin_assert(lck_spin_t *lock, unsigned int type)
2975 {
2976 thread_t thread, holder;
2977 uintptr_t state;
2978
2979 if (lock->type != LCK_SPIN_TYPE) {
2980 panic("Invalid spinlock %p", lock);
2981 }
2982
2983 state = lock->lck_spin_data;
2984 holder = (thread_t)(state & ~LCK_ILOCK);
2985 thread = current_thread();
2986 if (type == LCK_ASSERT_OWNED) {
2987 if (holder == 0) {
2988 panic("Lock not owned %p = %lx", lock, state);
2989 }
2990 if (holder != thread) {
2991 panic("Lock not owned by current thread %p = %lx", lock, state);
2992 }
2993 if ((state & LCK_ILOCK) == 0) {
2994 panic("Lock bit not set %p = %lx", lock, state);
2995 }
2996 } else if (type == LCK_ASSERT_NOTOWNED) {
2997 if (holder != 0) {
2998 if (holder == thread) {
2999 panic("Lock owned by current thread %p = %lx", lock, state);
3000 }
3001 }
3002 } else {
3003 panic("lck_spin_assert(): invalid arg (%u)", type);
3004 }
3005 }
3006
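/*
 * Yield a held shared lock if a writer is waiting (or if force_yield is set):
 * drop the lock, pause briefly, re-take it shared, and return TRUE to
 * indicate that we yielded.
 */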
3007 boolean_t
3008 lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
3009 {
3010 lck_rw_word_t word;
3011
3012 lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
3013
3014 word.data = ordered_load_rw(lck);
3015 if (word.want_excl || word.want_upgrade || force_yield) {
3016 lck_rw_unlock_shared(lck);
3017 mutex_pause(2);
3018 lck_rw_lock_shared(lck);
3019 return TRUE;
3020 }
3021
3022 return FALSE;
3023 }
3024
3025 /*
3026 * Routine: kdp_lck_mtx_lock_spin_is_acquired
3027 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
3028 */
3029 boolean_t
3030 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
3031 {
3032 uintptr_t state;
3033
3034 if (not_in_kdp) {
3035 panic("panic: spinlock acquired check done outside of kernel debugger");
3036 }
3037 state = ordered_load_mtx(lck);
3038 if (state == LCK_MTX_TAG_DESTROYED) {
3039 return FALSE;
3040 }
3041 if (LCK_MTX_STATE_TO_THREAD(state) || (state & LCK_ILOCK)) {
3042 return TRUE;
3043 }
3044 return FALSE;
3045 }
3046
3047 void
3048 kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3049 {
3050 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
3051 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
3052 uintptr_t state = ordered_load_mtx(mutex);
3053 thread_t holder = LCK_MTX_STATE_TO_THREAD(state);
3054 if ((uintptr_t)holder == (uintptr_t)LCK_MTX_SPIN_TAG) {
3055 waitinfo->owner = STACKSHOT_WAITOWNER_MTXSPIN;
3056 } else {
3057 assertf(state != (uintptr_t)LCK_MTX_TAG_DESTROYED, "state=0x%llx", (uint64_t)state);
3058 assertf(state != (uintptr_t)LCK_MTX_TAG_INDIRECT, "state=0x%llx", (uint64_t)state);
3059 waitinfo->owner = thread_tid(holder);
3060 }
3061 }
3062
3063 void
3064 kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3065 {
3066 lck_rw_t *rwlck = NULL;
3067 switch (waitinfo->wait_type) {
3068 case kThreadWaitKernelRWLockRead:
3069 rwlck = READ_EVENT_TO_RWLOCK(event);
3070 break;
3071 case kThreadWaitKernelRWLockWrite:
3072 case kThreadWaitKernelRWLockUpgrade:
3073 rwlck = WRITE_EVENT_TO_RWLOCK(event);
3074 break;
3075 default:
3076 panic("%s was called with an invalid blocking type", __FUNCTION__);
3077 break;
3078 }
3079 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
3080 waitinfo->owner = thread_tid(rwlck->lck_rw_owner);
3081 }