1 /*
2 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 * File: kern/lock.c
58 * Author: Avadis Tevanian, Jr., Michael Wayne Young
59 * Date: 1985
60 *
61 * Locking primitives implementation
62 */
63
64 #define LOCK_PRIVATE 1
65
66 #include <mach_ldebug.h>
67
68 #include <kern/lock_stat.h>
69 #include <kern/locks.h>
70 #include <kern/kalloc.h>
71 #include <kern/misc_protos.h>
72 #include <kern/thread.h>
73 #include <kern/processor.h>
74 #include <kern/cpu_data.h>
75 #include <kern/cpu_number.h>
76 #include <kern/sched_prim.h>
77 #include <kern/debug.h>
78 #include <string.h>
79
80 #include <i386/machine_routines.h> /* machine_timeout_suspended() */
81 #include <machine/atomic.h>
82 #include <machine/machine_cpu.h>
83 #include <i386/mp.h>
84 #include <machine/atomic.h>
85 #include <sys/kdebug.h>
86 #include <i386/locks_i386_inlines.h>
87 #include <kern/cpu_number.h>
88 #include <os/hash.h>
89
90 #if CONFIG_DTRACE
91 #define DTRACE_RW_SHARED 0x0 //reader
92 #define DTRACE_RW_EXCL 0x1 //writer
93 #define DTRACE_NO_FLAG 0x0 //not applicable
94 #endif /* CONFIG_DTRACE */
95
96 #define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
97 #define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
98 #define LCK_RW_LCK_SHARED_CODE 0x102
99 #define LCK_RW_LCK_SH_TO_EX_CODE 0x103
100 #define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
101 #define LCK_RW_LCK_EX_TO_SH_CODE 0x105
102
103 #define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106
104 #define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107
105 #define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108
106 #define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109
107 #define LCK_RW_LCK_SHARED_SPIN_CODE 0x110
108 #define LCK_RW_LCK_SHARED_WAIT_CODE 0x111
109 #define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112
110 #define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113
111
112
113 #define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
114
115 unsigned int LcksOpts = 0;
116
117 #if DEVELOPMENT || DEBUG
118 unsigned int LckDisablePreemptCheck = 0;
119 #endif
120
121 /* Forwards */
122
123 #if USLOCK_DEBUG
124 /*
125 * Perform simple lock checks.
126 */
127 int uslock_check = 1;
128 int max_lock_loops = 100000000;
129 decl_simple_lock_data(extern, printf_lock);
130 decl_simple_lock_data(extern, panic_lock);
131 #endif /* USLOCK_DEBUG */
132
133 extern unsigned int not_in_kdp;
134
135 /*
136 * We often want to know the addresses of the callers
137 * of the various lock routines. However, this information
138 * is only used for debugging and statistics.
139 */
140 typedef void *pc_t;
141 #define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
142 #define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
143 #if ANY_LOCK_DEBUG
144 #define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
145 #define DECL_PC(pc) pc_t pc;
146 #else /* ANY_LOCK_DEBUG */
147 #define DECL_PC(pc)
148 #ifdef lint
149 /*
150 * Eliminate lint complaints about unused local pc variables.
151 */
152 #define OBTAIN_PC(pc) ++pc
153 #else /* lint */
154 #define OBTAIN_PC(pc)
155 #endif /* lint */
156 #endif /* ANY_LOCK_DEBUG */
157
158 /*
159 * The atomic exchange API is a low-level abstraction of the operations
160 * needed to atomically read, modify, and write a memory word. This abstraction works
161 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
162 * well as the ARM exclusive instructions.
163 *
164 * atomic_exchange_begin() - begin exchange and retrieve current value
165 * atomic_exchange_complete() - conclude an exchange
166 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
167 */
168 static uint32_t
169 atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
170 {
171 uint32_t val;
172
173 (void)ord; // Memory order not used
174 val = os_atomic_load(target, relaxed);
175 *previous = val;
176 return val;
177 }
178
179 static boolean_t
180 atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
181 {
182 return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
183 }
184
185 static void
186 atomic_exchange_abort(void)
187 {
188 }
189
190 static boolean_t
191 atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
192 {
193 uint32_t value, prev;
194
195 for (;;) {
196 value = atomic_exchange_begin32(target, &prev, ord);
197 if (value & test_mask) {
198 if (wait) {
199 cpu_pause();
200 } else {
201 atomic_exchange_abort();
202 }
203 return FALSE;
204 }
205 value |= set_mask;
206 if (atomic_exchange_complete32(target, prev, value, ord)) {
207 return TRUE;
208 }
209 }
210 }
211
212 inline boolean_t
213 hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
214 {
215 return atomic_test_and_set32(target, test_mask, set_mask, ord, wait);
216 }
217
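/*
 * A minimal sketch of the begin/complete/abort pattern documented above
 * (hypothetical word layout: the high bit is a "busy" flag, the low bits a
 * counter). Kept under #if 0 as an illustration only; it is not part of the
 * lock implementation.
 */
#if 0
static boolean_t
example_increment_unless_busy(uint32_t *word)
{
	uint32_t data, prev;

	for (;;) {
		data = atomic_exchange_begin32(word, &prev, memory_order_relaxed);
		if (data & 0x80000000u) {               /* hypothetical busy bit */
			atomic_exchange_abort();        /* leave the word untouched */
			return FALSE;
		}
		data += 1;
		if (atomic_exchange_complete32(word, prev, data, memory_order_relaxed)) {
			return TRUE;                    /* our update was applied atomically */
		}
		cpu_pause();                            /* lost the race; retry */
	}
}
#endif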
218 /*
219 * Portable lock package implementation of usimple_locks.
220 */
221
222 #if USLOCK_DEBUG
223 #define USLDBG(stmt) stmt
224 void usld_lock_init(usimple_lock_t, unsigned short);
225 void usld_lock_pre(usimple_lock_t, pc_t);
226 void usld_lock_post(usimple_lock_t, pc_t);
227 void usld_unlock(usimple_lock_t, pc_t);
228 void usld_lock_try_pre(usimple_lock_t, pc_t);
229 void usld_lock_try_post(usimple_lock_t, pc_t);
230 int usld_lock_common_checks(usimple_lock_t, char *);
231 #else /* USLOCK_DEBUG */
232 #define USLDBG(stmt)
233 #endif /* USLOCK_DEBUG */
234
235 /*
236 * Forward definitions
237 */
238
239 static void lck_rw_lock_shared_gen(lck_rw_t *lck);
240 static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
241 static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
242 static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
243 static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
244 static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
245 void lck_rw_clear_promotions_x86(thread_t thread);
246 static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
247 static boolean_t lck_rw_grab_want(lck_rw_t *lock);
248 static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
249 static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, uint32_t state, boolean_t indirect);
250 static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state);
251 static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state);
252 static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state);
253 static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
254 static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
255
256
257 /*
258 * Routine: lck_spin_alloc_init
259 */
260 lck_spin_t *
261 lck_spin_alloc_init(
262 lck_grp_t *grp,
263 lck_attr_t *attr)
264 {
265 lck_spin_t *lck;
266
267 if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0) {
268 lck_spin_init(lck, grp, attr);
269 }
270
271 return lck;
272 }
273
274 /*
275 * Routine: lck_spin_free
276 */
277 void
278 lck_spin_free(
279 lck_spin_t *lck,
280 lck_grp_t *grp)
281 {
282 lck_spin_destroy(lck, grp);
283 kfree(lck, sizeof(lck_spin_t));
284 }
285
286 /*
287 * Routine: lck_spin_init
288 */
289 void
290 lck_spin_init(
291 lck_spin_t *lck,
292 lck_grp_t *grp,
293 __unused lck_attr_t *attr)
294 {
295 usimple_lock_init((usimple_lock_t) lck, 0);
296 if (grp) {
297 lck_grp_reference(grp);
298 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
299 }
300 }
301
302 /*
303 * Routine: lck_spin_destroy
304 */
305 void
306 lck_spin_destroy(
307 lck_spin_t *lck,
308 lck_grp_t *grp)
309 {
310 if (lck->interlock == LCK_SPIN_TAG_DESTROYED) {
311 return;
312 }
313 lck->interlock = LCK_SPIN_TAG_DESTROYED;
314 if (grp) {
315 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
316 lck_grp_deallocate(grp);
317 }
318 return;
319 }
320
321 /*
322 * Routine: lck_spin_lock
323 */
324 void
325 lck_spin_lock_grp(
326 lck_spin_t *lck,
327 lck_grp_t *grp)
328 {
329 #pragma unused(grp)
330 usimple_lock((usimple_lock_t) lck, grp);
331 }
332
333 void
334 lck_spin_lock(
335 lck_spin_t *lck)
336 {
337 usimple_lock((usimple_lock_t) lck, NULL);
338 }
339
340 /*
341 * Routine: lck_spin_unlock
342 */
343 void
344 lck_spin_unlock(
345 lck_spin_t *lck)
346 {
347 usimple_unlock((usimple_lock_t) lck);
348 }
349
350 boolean_t
351 lck_spin_try_lock_grp(
352 lck_spin_t *lck,
353 lck_grp_t *grp)
354 {
355 #pragma unused(grp)
356 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, grp);
357 #if DEVELOPMENT || DEBUG
358 if (lrval) {
359 pltrace(FALSE);
360 }
361 #endif
362 return lrval;
363 }
364
365
366 /*
367 * Routine: lck_spin_try_lock
368 */
369 boolean_t
370 lck_spin_try_lock(
371 lck_spin_t *lck)
372 {
373 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, LCK_GRP_NULL);
374 #if DEVELOPMENT || DEBUG
375 if (lrval) {
376 pltrace(FALSE);
377 }
378 #endif
379 return lrval;
380 }
381
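/*
 * A minimal usage sketch of the spin-lock interfaces above (hypothetical
 * lock group and lock; real callers keep these in their own subsystems).
 * Illustration only, kept under #if 0.
 */
#if 0
static void
example_spin_usage(void)
{
	lck_grp_t  *grp  = lck_grp_alloc_init("example", LCK_GRP_ATTR_NULL);
	lck_spin_t *lock = lck_spin_alloc_init(grp, LCK_ATTR_NULL);

	lck_spin_lock(lock);            /* returns with preemption disabled */
	/* ... short critical section; no blocking allowed ... */
	lck_spin_unlock(lock);

	if (lck_spin_try_lock(lock)) {  /* non-blocking attempt */
		lck_spin_unlock(lock);
	}

	lck_spin_free(lock, grp);
	lck_grp_free(grp);
}
#endif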
382 /*
383 * Routine: lck_spin_assert
384 */
385 void
386 lck_spin_assert(lck_spin_t *lock, unsigned int type)
387 {
388 thread_t thread, holder;
389 uintptr_t state;
390
391 if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
392 panic("lck_spin_assert(): invalid arg (%u)", type);
393 }
394
395 state = lock->interlock;
396 holder = (thread_t)state;
397 thread = current_thread();
398 if (type == LCK_ASSERT_OWNED) {
399 if (__improbable(holder == THREAD_NULL)) {
400 panic("Lock not owned %p = %lx", lock, state);
401 }
402 if (__improbable(holder != thread)) {
403 panic("Lock not owned by current thread %p = %lx", lock, state);
404 }
405 } else if (type == LCK_ASSERT_NOTOWNED) {
406 if (__improbable(holder != THREAD_NULL)) {
407 if (holder == thread) {
408 panic("Lock owned by current thread %p = %lx", lock, state);
409 }
410 }
411 }
412 }
413
414 /*
415 * Routine: kdp_lck_spin_is_acquired
416 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
417 * Returns: TRUE if lock is acquired.
418 */
419 boolean_t
420 kdp_lck_spin_is_acquired(lck_spin_t *lck)
421 {
422 if (not_in_kdp) {
423 panic("panic: spinlock acquired check done outside of kernel debugger");
424 }
425 return (lck->interlock != 0)? TRUE : FALSE;
426 }
427
428 /*
429 * Initialize a usimple_lock.
430 *
431 * No change in preemption state.
432 */
433 void
434 usimple_lock_init(
435 usimple_lock_t l,
436 __unused unsigned short tag)
437 {
438 #ifndef MACHINE_SIMPLE_LOCK
439 USLDBG(usld_lock_init(l, tag));
440 hw_lock_init(&l->interlock);
441 #else
442 simple_lock_init((simple_lock_t)l, tag);
443 #endif
444 }
445
446 volatile uint32_t spinlock_owner_cpu = ~0;
447 volatile usimple_lock_t spinlock_timed_out;
448
449 uint32_t
450 spinlock_timeout_NMI(uintptr_t thread_addr)
451 {
452 uint32_t i;
453
454 for (i = 0; i < real_ncpus; i++) {
455 if ((cpu_data_ptr[i] != NULL) && ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr)) {
456 spinlock_owner_cpu = i;
457 if ((uint32_t) cpu_number() != i) {
458 /* Cause NMI and panic on the owner's cpu */
459 NMIPI_panic(cpu_to_cpumask(i), SPINLOCK_TIMEOUT);
460 }
461 break;
462 }
463 }
464
465 return spinlock_owner_cpu;
466 }
467
468 /*
469 * Acquire a usimple_lock.
470 *
471 * Returns with preemption disabled. Note
472 * that the hw_lock routines are responsible for
473 * maintaining preemption state.
474 */
475 void
476 (usimple_lock)(
477 usimple_lock_t l
478 LCK_GRP_ARG(lck_grp_t *grp))
479 {
480 #ifndef MACHINE_SIMPLE_LOCK
481 DECL_PC(pc);
482
483 OBTAIN_PC(pc);
484 USLDBG(usld_lock_pre(l, pc));
485
486 if (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) {
487 boolean_t uslock_acquired = FALSE;
488 while (machine_timeout_suspended()) {
489 enable_preemption();
490 if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC, grp))) {
491 break;
492 }
493 }
494
495 if (uslock_acquired == FALSE) {
496 uint32_t lock_cpu;
497 uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
498 spinlock_timed_out = l;
499 lock_cpu = spinlock_timeout_NMI(lowner);
500 panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
501 l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time());
502 }
503 }
504 #if DEVELOPMENT || DEBUG
505 pltrace(FALSE);
506 #endif
507
508 USLDBG(usld_lock_post(l, pc));
509 #else
510 simple_lock((simple_lock_t)l, grp);
511 #endif
512 #if CONFIG_DTRACE
513 LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
514 #endif
515 }
516
517
518 /*
519 * Release a usimple_lock.
520 *
521 * Returns with preemption enabled. Note
522 * that the hw_lock routines are responsible for
523 * maintaining preemption state.
524 */
525 void
526 usimple_unlock(
527 usimple_lock_t l)
528 {
529 #ifndef MACHINE_SIMPLE_LOCK
530 DECL_PC(pc);
531
532 OBTAIN_PC(pc);
533 USLDBG(usld_unlock(l, pc));
534 #if DEVELOPMENT || DEBUG
535 pltrace(TRUE);
536 #endif
537 hw_lock_unlock(&l->interlock);
538 #else
539 simple_unlock_rwmb((simple_lock_t)l);
540 #endif
541 }
542
543
544 /*
545 * Conditionally acquire a usimple_lock.
546 *
547 * On success, returns with preemption disabled.
548 * On failure, returns with preemption in the same state
549 * as when first invoked. Note that the hw_lock routines
550 * are responsible for maintaining preemption state.
551 *
552 * XXX No stats are gathered on a miss; I preserved this
553 * behavior from the original assembly-language code, but
554 * doesn't it make sense to log misses? XXX
555 */
556 unsigned int
557 usimple_lock_try(
558 usimple_lock_t l,
559 lck_grp_t *grp)
560 {
561 #ifndef MACHINE_SIMPLE_LOCK
562 unsigned int success;
563 DECL_PC(pc);
564
565 OBTAIN_PC(pc);
566 USLDBG(usld_lock_try_pre(l, pc));
567 if ((success = hw_lock_try(&l->interlock, grp))) {
568 #if DEVELOPMENT || DEBUG
569 pltrace(FALSE);
570 #endif
571 USLDBG(usld_lock_try_post(l, pc));
572 }
573 return success;
574 #else
575 return simple_lock_try((simple_lock_t)l, grp);
576 #endif
577 }
578
579 /*
580 * Acquire a usimple_lock while polling for pending cpu signals
581 * and spinning on a lock.
582 *
583 */
584 unsigned
585 int
586 (usimple_lock_try_lock_mp_signal_safe_loop_deadline)(usimple_lock_t l,
587 uint64_t deadline
588 LCK_GRP_ARG(lck_grp_t *grp))
589 {
590 boolean_t istate = ml_get_interrupts_enabled();
591
592 if (deadline < mach_absolute_time()) {
593 return 0;
594 }
595
596 while (!simple_lock_try(l, grp)) {
597 if (!istate) {
598 cpu_signal_handler(NULL);
599 }
600
601 if (deadline < mach_absolute_time()) {
602 return 0;
603 }
604
605 cpu_pause();
606 }
607
608 return 1;
609 }
610
611 void
612 (usimple_lock_try_lock_loop)(usimple_lock_t l
613 LCK_GRP_ARG(lck_grp_t *grp))
614 {
615 usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ULLONG_MAX, grp);
616 }
617
618 unsigned
619 int
620 (usimple_lock_try_lock_mp_signal_safe_loop_duration)(usimple_lock_t l,
621 uint64_t duration
622 LCK_GRP_ARG(lck_grp_t *grp))
623 {
624 uint64_t deadline;
625 uint64_t base_at = mach_absolute_time();
626 uint64_t duration_at;
627
628 nanoseconds_to_absolutetime(duration, &duration_at);
629 deadline = base_at + duration_at;
630 if (deadline < base_at) {
631 /* deadline has overflowed, make it saturate */
632 deadline = ULLONG_MAX;
633 }
634
635 return usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, deadline, grp);
636 }
637
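/*
 * A minimal sketch of how a caller might use the bounded try-lock loops
 * above (hypothetical lock, group and timeout; illustration only).
 */
#if 0
static boolean_t
example_bounded_acquire(usimple_lock_t l, lck_grp_t *grp)
{
	/* spin for at most ~100 microseconds (duration is in nanoseconds) */
	if (!usimple_lock_try_lock_mp_signal_safe_loop_duration(l, 100000ULL, grp)) {
		return FALSE;           /* timed out; lock was not taken */
	}
	/* ... critical section, preemption is disabled ... */
	usimple_unlock(l);
	return TRUE;
}
#endif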
638 #if USLOCK_DEBUG
639 /*
640 * States of a usimple_lock. The default when initializing
641 * a usimple_lock is setting it up for debug checking.
642 */
643 #define USLOCK_CHECKED 0x0001 /* lock is being checked */
644 #define USLOCK_TAKEN 0x0002 /* lock has been taken */
645 #define USLOCK_INIT 0xBAA0 /* lock has been initialized */
646 #define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
647 #define USLOCK_CHECKING(l) (uslock_check && \
648 ((l)->debug.state & USLOCK_CHECKED))
649
650 /*
651 * Initialize the debugging information contained
652 * in a usimple_lock.
653 */
654 void
655 usld_lock_init(
656 usimple_lock_t l,
657 __unused unsigned short tag)
658 {
659 if (l == USIMPLE_LOCK_NULL) {
660 panic("lock initialization: null lock pointer");
661 }
662 l->lock_type = USLOCK_TAG;
663 l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
664 l->debug.lock_cpu = l->debug.unlock_cpu = 0;
665 l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
666 l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
667 l->debug.duration[0] = l->debug.duration[1] = 0;
668 l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
669 l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
670 l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
671 }
672
673
674 /*
675 * These checks apply to all usimple_locks, not just
676 * those with USLOCK_CHECKED turned on.
677 */
678 int
679 usld_lock_common_checks(
680 usimple_lock_t l,
681 char *caller)
682 {
683 if (l == USIMPLE_LOCK_NULL) {
684 panic("%s: null lock pointer", caller);
685 }
686 if (l->lock_type != USLOCK_TAG) {
687 panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
688 }
689 if (!(l->debug.state & USLOCK_INIT)) {
690 panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
691 }
692 return USLOCK_CHECKING(l);
693 }
694
695
696 /*
697 * Debug checks on a usimple_lock just before attempting
698 * to acquire it.
699 */
700 /* ARGSUSED */
701 void
702 usld_lock_pre(
703 usimple_lock_t l,
704 pc_t pc)
705 {
706 char caller[] = "usimple_lock";
707
708
709 if (!usld_lock_common_checks(l, caller)) {
710 return;
711 }
712
713 /*
714 * Note that we have a weird case where we are getting a lock when we are
715 * in the process of putting the system to sleep. We are running with no
716 * current threads, therefore we can't tell if we are trying to retake a lock
717 * we have or someone on the other processor has it. Therefore we just
718 * ignore this test if the locking thread is 0.
719 */
720
721 if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
722 l->debug.lock_thread == (void *) current_thread()) {
723 printf("%s: lock %p already locked (at %p) by",
724 caller, l, l->debug.lock_pc);
725 printf(" current thread %p (new attempt at pc %p)\n",
726 l->debug.lock_thread, pc);
727 panic("%s", caller);
728 }
729 mp_disable_preemption();
730 mp_enable_preemption();
731 }
732
733
734 /*
735 * Debug checks on a usimple_lock just after acquiring it.
736 *
737 * Pre-emption has been disabled at this point,
738 * so we are safe in using cpu_number.
739 */
740 void
741 usld_lock_post(
742 usimple_lock_t l,
743 pc_t pc)
744 {
745 int mycpu;
746 char caller[] = "successful usimple_lock";
747
748
749 if (!usld_lock_common_checks(l, caller)) {
750 return;
751 }
752
753 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
754 panic("%s: lock %p became uninitialized",
755 caller, l);
756 }
757 if ((l->debug.state & USLOCK_TAKEN)) {
758 panic("%s: lock 0x%p became TAKEN by someone else",
759 caller, l);
760 }
761
762 mycpu = cpu_number();
763 l->debug.lock_thread = (void *)current_thread();
764 l->debug.state |= USLOCK_TAKEN;
765 l->debug.lock_pc = pc;
766 l->debug.lock_cpu = mycpu;
767 }
768
769
770 /*
771 * Debug checks on a usimple_lock just before
772 * releasing it. Note that the caller has not
773 * yet released the hardware lock.
774 *
775 * Preemption is still disabled, so there's
776 * no problem using cpu_number.
777 */
778 void
779 usld_unlock(
780 usimple_lock_t l,
781 pc_t pc)
782 {
783 int mycpu;
784 char caller[] = "usimple_unlock";
785
786
787 if (!usld_lock_common_checks(l, caller)) {
788 return;
789 }
790
791 mycpu = cpu_number();
792
793 if (!(l->debug.state & USLOCK_TAKEN)) {
794 panic("%s: lock 0x%p hasn't been taken",
795 caller, l);
796 }
797 if (l->debug.lock_thread != (void *) current_thread()) {
798 panic("%s: unlocking lock 0x%p, owned by thread %p",
799 caller, l, l->debug.lock_thread);
800 }
801 if (l->debug.lock_cpu != mycpu) {
802 printf("%s: unlocking lock 0x%p on cpu 0x%x",
803 caller, l, mycpu);
804 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
805 panic("%s", caller);
806 }
807
808 l->debug.unlock_thread = l->debug.lock_thread;
809 l->debug.lock_thread = INVALID_THREAD;
810 l->debug.state &= ~USLOCK_TAKEN;
811 l->debug.unlock_pc = pc;
812 l->debug.unlock_cpu = mycpu;
813 }
814
815
816 /*
817 * Debug checks on a usimple_lock just before
818 * attempting to acquire it.
819 *
820 * Preemption isn't guaranteed to be disabled.
821 */
822 void
823 usld_lock_try_pre(
824 usimple_lock_t l,
825 __unused pc_t pc)
826 {
827 char caller[] = "usimple_lock_try";
828
829 if (!usld_lock_common_checks(l, caller)) {
830 return;
831 }
832 }
833
834
835 /*
836 * Debug checks on a usimple_lock just after
837 * successfully attempting to acquire it.
838 *
839 * Preemption has been disabled by the
840 * lock acquisition attempt, so it's safe
841 * to use cpu_number.
842 */
843 void
844 usld_lock_try_post(
845 usimple_lock_t l,
846 pc_t pc)
847 {
848 int mycpu;
849 char caller[] = "successful usimple_lock_try";
850
851 if (!usld_lock_common_checks(l, caller)) {
852 return;
853 }
854
855 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
856 panic("%s: lock 0x%p became uninitialized",
857 caller, l);
858 }
859 if ((l->debug.state & USLOCK_TAKEN)) {
860 panic("%s: lock 0x%p became TAKEN by someone else",
861 caller, l);
862 }
863
864 mycpu = cpu_number();
865 l->debug.lock_thread = (void *) current_thread();
866 l->debug.state |= USLOCK_TAKEN;
867 l->debug.lock_pc = pc;
868 l->debug.lock_cpu = mycpu;
869 }
870 #endif /* USLOCK_DEBUG */
871
872 /*
873 * Routine: lck_rw_alloc_init
874 */
875 lck_rw_t *
876 lck_rw_alloc_init(
877 lck_grp_t *grp,
878 lck_attr_t *attr)
879 {
880 lck_rw_t *lck;
881
882 if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
883 bzero(lck, sizeof(lck_rw_t));
884 lck_rw_init(lck, grp, attr);
885 }
886
887 return lck;
888 }
889
890 /*
891 * Routine: lck_rw_free
892 */
893 void
894 lck_rw_free(
895 lck_rw_t *lck,
896 lck_grp_t *grp)
897 {
898 lck_rw_destroy(lck, grp);
899 kfree(lck, sizeof(lck_rw_t));
900 }
901
902 /*
903 * Routine: lck_rw_init
904 */
905 void
906 lck_rw_init(
907 lck_rw_t *lck,
908 lck_grp_t *grp,
909 lck_attr_t *attr)
910 {
911 lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
912 attr : &LockDefaultLckAttr;
913
914 hw_lock_byte_init(&lck->lck_rw_interlock);
915 lck->lck_rw_want_write = FALSE;
916 lck->lck_rw_want_upgrade = FALSE;
917 lck->lck_rw_shared_count = 0;
918 lck->lck_rw_can_sleep = TRUE;
919 lck->lck_r_waiting = lck->lck_w_waiting = 0;
920 lck->lck_rw_tag = 0;
921 lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
922 LCK_ATTR_RW_SHARED_PRIORITY) == 0);
923
924 lck_grp_reference(grp);
925 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
926 }
927
928 /*
929 * Routine: lck_rw_destroy
930 */
931 void
932 lck_rw_destroy(
933 lck_rw_t *lck,
934 lck_grp_t *grp)
935 {
936 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
937 return;
938 }
939 #if MACH_LDEBUG
940 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
941 #endif
942 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
943 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
944 lck_grp_deallocate(grp);
945 return;
946 }
947
948 /*
949 * Sleep locks. These use the same data structure and algorithm
950 * as the spin locks, but the process sleeps while it is waiting
951 * for the lock. These work on uniprocessor systems.
952 */
953
954 #define DECREMENTER_TIMEOUT 1000000
955
956 /*
957 * We disable interrupts while holding the RW interlock to prevent an
958 * interrupt from exacerbating hold time.
959 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
960 */
961 static inline boolean_t
962 lck_interlock_lock(lck_rw_t *lck)
963 {
964 boolean_t istate;
965
966 istate = ml_set_interrupts_enabled(FALSE);
967 hw_lock_byte_lock(&lck->lck_rw_interlock);
968 return istate;
969 }
970
971 static inline void
972 lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
973 {
974 hw_lock_byte_unlock(&lck->lck_rw_interlock);
975 ml_set_interrupts_enabled(istate);
976 }
977
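/*
 * A minimal sketch of the pattern these helpers encapsulate (hypothetical
 * caller): keep the returned interrupt state and hand it back on unlock,
 * exactly as the wait paths below do. Illustration only.
 */
#if 0
static void
example_interlock_usage(lck_rw_t *lck)
{
	boolean_t istate = lck_interlock_lock(lck);
	/* ... examine or update lck_rw_t state behind the interlock ... */
	lck_interlock_unlock(lck, istate);
}
#endif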
978 /*
979 * This inline is used when busy-waiting for an rw lock.
980 * If interrupts were disabled when the lock primitive was called,
981 * we poll the IPI handler for pending tlb flushes.
982 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
983 */
984 static inline void
985 lck_rw_lock_pause(boolean_t interrupts_enabled)
986 {
987 if (!interrupts_enabled) {
988 handle_pending_TLB_flushes();
989 }
990 cpu_pause();
991 }
992
993 static inline boolean_t
994 lck_rw_held_read_or_upgrade(lck_rw_t *lock)
995 {
996 if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE)) {
997 return TRUE;
998 }
999 return FALSE;
1000 }
1001
1002 /*
1003 * compute the deadline to spin against when
1004 * waiting for a change of state on a lck_rw_t
1005 */
1006 static inline uint64_t
1007 lck_rw_deadline_for_spin(lck_rw_t *lck)
1008 {
1009 if (lck->lck_rw_can_sleep) {
1010 if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
1011 /*
1012 * there are already threads waiting on this lock... this
1013 * implies that they have spun beyond their deadlines waiting for
1014 * the desired state to show up so we will not bother spinning at this time...
1015 * or
1016 * the current number of threads sharing this lock exceeds our capacity to run them
1017 * concurrently and since all states we're going to spin for require the rw_shared_count
1018 * to be at 0, we'll not bother spinning since the latency for this to happen is
1019 * unpredictable...
1020 */
1021 return mach_absolute_time();
1022 }
1023 return mach_absolute_time() + MutexSpin;
1024 } else {
1025 return mach_absolute_time() + (100000LL * 1000000000LL);
1026 }
1027 }
1028
1029
1030 /*
1031 * Spin while interlock is held.
1032 */
1033
1034 static inline void
1035 lck_rw_interlock_spin(lck_rw_t *lock)
1036 {
1037 while (ordered_load(&lock->data) & LCK_RW_INTERLOCK) {
1038 cpu_pause();
1039 }
1040 }
1041
1042 static boolean_t
1043 lck_rw_grab_want(lck_rw_t *lock)
1044 {
1045 uint32_t data, prev;
1046
1047 for (;;) {
1048 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed);
1049 if ((data & LCK_RW_INTERLOCK) == 0) {
1050 break;
1051 }
1052 atomic_exchange_abort();
1053 lck_rw_interlock_spin(lock);
1054 }
1055 if (data & LCK_RW_WANT_WRITE) {
1056 atomic_exchange_abort();
1057 return FALSE;
1058 }
1059 data |= LCK_RW_WANT_WRITE;
1060 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_relaxed);
1061 }
1062
1063 static boolean_t
1064 lck_rw_grab_shared(lck_rw_t *lock)
1065 {
1066 uint32_t data, prev;
1067
1068 for (;;) {
1069 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1070 if ((data & LCK_RW_INTERLOCK) == 0) {
1071 break;
1072 }
1073 atomic_exchange_abort();
1074 lck_rw_interlock_spin(lock);
1075 }
1076 if (data & (LCK_RW_WANT_WRITE | LCK_RW_WANT_UPGRADE)) {
1077 if (((data & LCK_RW_SHARED_MASK) == 0) || (data & LCK_RW_PRIV_EXCL)) {
1078 atomic_exchange_abort();
1079 return FALSE;
1080 }
1081 }
1082 data += LCK_RW_SHARED_READER;
1083 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp);
1084 }
1085
1086 /*
1087 * Routine: lck_rw_lock_exclusive
1088 */
1089 static void
1090 lck_rw_lock_exclusive_gen(
1091 lck_rw_t *lck)
1092 {
1093 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1094 uint64_t deadline = 0;
1095 int slept = 0;
1096 int gotlock = 0;
1097 int lockheld = 0;
1098 wait_result_t res = 0;
1099 boolean_t istate = -1;
1100
1101 #if CONFIG_DTRACE
1102 boolean_t dtrace_ls_initialized = FALSE;
1103 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
1104 uint64_t wait_interval = 0;
1105 int readers_at_sleep = 0;
1106 #endif
1107
1108 /*
1109 * Try to acquire the lck_rw_want_write bit.
1110 */
1111 while (!lck_rw_grab_want(lck)) {
1112 #if CONFIG_DTRACE
1113 if (dtrace_ls_initialized == FALSE) {
1114 dtrace_ls_initialized = TRUE;
1115 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1116 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1117 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1118 if (dtrace_ls_enabled) {
1119 /*
1120 * Either sleeping or spinning is happening,
1121 * start a timing of our delay interval now.
1122 */
1123 readers_at_sleep = lck->lck_rw_shared_count;
1124 wait_interval = mach_absolute_time();
1125 }
1126 }
1127 #endif
1128 if (istate == -1) {
1129 istate = ml_get_interrupts_enabled();
1130 }
1131
1132 deadline = lck_rw_deadline_for_spin(lck);
1133
1134 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1135
1136 while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline) {
1137 lck_rw_lock_pause(istate);
1138 }
1139
1140 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
1141
1142 if (gotlock) {
1143 break;
1144 }
1145 /*
1146 * if we get here, the deadline has expired w/o us
1147 * being able to grab the lock exclusively
1148 * check to see if we're allowed to do a thread_block
1149 */
1150 if (lck->lck_rw_can_sleep) {
1151 istate = lck_interlock_lock(lck);
1152
1153 if (lck->lck_rw_want_write) {
1154 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1155
1156 lck->lck_w_waiting = TRUE;
1157
1158 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1159 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1160 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1161 lck_interlock_unlock(lck, istate);
1162
1163 if (res == THREAD_WAITING) {
1164 res = thread_block(THREAD_CONTINUE_NULL);
1165 slept++;
1166 }
1167 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1168 } else {
1169 lck->lck_rw_want_write = TRUE;
1170 lck_interlock_unlock(lck, istate);
1171 break;
1172 }
1173 }
1174 }
1175 /*
1176 * Wait for readers (and upgrades) to finish...
1177 * the test for these conditions must be done simultaneously with
1178 * a check of the interlock not being held since
1179 * the rw_shared_count will drop to 0 first and then want_upgrade
1180 * will be set to 1 in the shared_to_exclusive scenario... those
1181 * adjustments are done behind the interlock and represent an
1182 * atomic change in state and must be considered as such
1183 * however, once we see the read count at 0, the want_upgrade not set
1184 * and the interlock not held, we are safe to proceed
1185 */
1186 while (lck_rw_held_read_or_upgrade(lck)) {
1187 #if CONFIG_DTRACE
1188 /*
1189 * Either sleeping or spinning is happening, start
1190 * a timing of our delay interval now. If we set it
1191 * to -1 we don't have accurate data so we cannot later
1192 * decide to record a dtrace spin or sleep event.
1193 */
1194 if (dtrace_ls_initialized == FALSE) {
1195 dtrace_ls_initialized = TRUE;
1196 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1197 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1198 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1199 if (dtrace_ls_enabled) {
1200 /*
1201 * Either sleeping or spinning is happening,
1202 * start a timing of our delay interval now.
1203 */
1204 readers_at_sleep = lck->lck_rw_shared_count;
1205 wait_interval = mach_absolute_time();
1206 }
1207 }
1208 #endif
1209 if (istate == -1) {
1210 istate = ml_get_interrupts_enabled();
1211 }
1212
1213 deadline = lck_rw_deadline_for_spin(lck);
1214
1215 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1216
1217 while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline) {
1218 lck_rw_lock_pause(istate);
1219 }
1220
1221 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);
1222
1223 if (!lockheld) {
1224 break;
1225 }
1226 /*
1227 * if we get here, the deadline has expired w/o us
1228 * being able to grab the lock exclusively
1229 * check to see if we're allowed to do a thread_block
1230 */
1231 if (lck->lck_rw_can_sleep) {
1232 istate = lck_interlock_lock(lck);
1233
1234 if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
1235 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1236
1237 lck->lck_w_waiting = TRUE;
1238
1239 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1240 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1241 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1242 lck_interlock_unlock(lck, istate);
1243
1244 if (res == THREAD_WAITING) {
1245 res = thread_block(THREAD_CONTINUE_NULL);
1246 slept++;
1247 }
1248 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1249 } else {
1250 lck_interlock_unlock(lck, istate);
1251 /*
1252 * must own the lock now, since we checked for
1253 * readers or upgrade owner behind the interlock
1254 * no need for a call to 'lck_rw_held_read_or_upgrade'
1255 */
1256 break;
1257 }
1258 }
1259 }
1260
1261 #if CONFIG_DTRACE
1262 /*
1263 * Decide what latencies we suffered that are Dtrace events.
1264 * If we have set wait_interval, then we either spun or slept.
1265 * At least we get out from under the interlock before we record
1266 * which is the best we can do here to minimize the impact
1267 * of the tracing.
1268 * If we have set wait_interval to -1, then dtrace was not enabled when we
1269 * started sleeping/spinning so we don't record this event.
1270 */
1271 if (dtrace_ls_enabled == TRUE) {
1272 if (slept == 0) {
1273 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1274 mach_absolute_time() - wait_interval, 1);
1275 } else {
1276 /*
1277 * For the blocking case, we also record if when we blocked
1278 * it was held for read or write, and how many readers.
1279 * Notice that above we recorded this before we dropped
1280 * the interlock so the count is accurate.
1281 */
1282 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1283 mach_absolute_time() - wait_interval, 1,
1284 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1285 }
1286 }
1287 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1288 #endif
1289 }
1290
1291 /*
1292 * Routine: lck_rw_done
1293 */
1294
1295 lck_rw_type_t
1296 lck_rw_done(lck_rw_t *lock)
1297 {
1298 uint32_t data, prev;
1299
1300 for (;;) {
1301 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1302 if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */
1303 atomic_exchange_abort();
1304 lck_rw_interlock_spin(lock);
1305 continue;
1306 }
1307 if (data & LCK_RW_SHARED_MASK) {
1308 data -= LCK_RW_SHARED_READER;
1309 if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
1310 goto check_waiters;
1311 }
1312 } else { /* if reader count == 0, must be exclusive lock */
1313 if (data & LCK_RW_WANT_UPGRADE) {
1314 data &= ~(LCK_RW_WANT_UPGRADE);
1315 } else {
1316 if (data & LCK_RW_WANT_WRITE) {
1317 data &= ~(LCK_RW_WANT_EXCL);
1318 } else { /* lock is not 'owned', panic */
1319 panic("Releasing non-exclusive RW lock without a reader refcount!");
1320 }
1321 }
1322 check_waiters:
1323 if (prev & LCK_RW_W_WAITING) {
1324 data &= ~(LCK_RW_W_WAITING);
1325 if ((prev & LCK_RW_PRIV_EXCL) == 0) {
1326 data &= ~(LCK_RW_R_WAITING);
1327 }
1328 } else {
1329 data &= ~(LCK_RW_R_WAITING);
1330 }
1331 }
1332 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
1333 break;
1334 }
1335 cpu_pause();
1336 }
1337 return lck_rw_done_gen(lock, prev);
1338 }
1339
1340 /*
1341 * Routine: lck_rw_done_gen
1342 *
1343 * called from lck_rw_done()
1344 * prior_lock_state is the value in the 1st
1345 * word of the lock at the time of a successful
1346 * atomic compare and exchange with the new value...
1347 * it represents the state of the lock before we
1348 * decremented the rw_shared_count or cleared either
1349 * rw_want_upgrade or rw_want_write and
1350 * the lck_x_waiting bits... since the wrapper
1351 * routine has already changed the state atomically,
1352 * we just need to decide if we should
1353 * wake up anyone and what value to return... we do
1354 * this by examining the state of the lock before
1355 * we changed it
1356 */
1357 static lck_rw_type_t
1358 lck_rw_done_gen(
1359 lck_rw_t *lck,
1360 uint32_t prior_lock_state)
1361 {
1362 lck_rw_t *fake_lck;
1363 lck_rw_type_t lock_type;
1364 thread_t thread;
1365 uint32_t rwlock_count;
1366
1367 thread = current_thread();
1368 rwlock_count = thread->rwlock_count--;
1369 fake_lck = (lck_rw_t *)&prior_lock_state;
1370
1371 if (lck->lck_rw_can_sleep) {
1372 /*
1373 * prior_lock state is a snapshot of the 1st word of the
1374 * lock in question... we'll fake up a pointer to it
1375 * and carefully not access anything beyond whats defined
1376 * in the first word of a lck_rw_t
1377 */
1378
1379 if (fake_lck->lck_rw_shared_count <= 1) {
1380 if (fake_lck->lck_w_waiting) {
1381 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1382 }
1383
1384 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
1385 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1386 }
1387 }
1388 #if MACH_LDEBUG
1389 if (rwlock_count == 0) {
1390 panic("rw lock count underflow for thread %p", thread);
1391 }
1392 #endif
1393 /* Check if dropping the lock means that we need to unpromote */
1394
1395 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1396 /* sched_flags checked without lock, but will be rechecked while clearing */
1397 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1398 }
1399 }
1400 if (fake_lck->lck_rw_shared_count) {
1401 lock_type = LCK_RW_TYPE_SHARED;
1402 } else {
1403 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1404 }
1405
1406 #if CONFIG_DTRACE
1407 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1408 #endif
1409
1410 return lock_type;
1411 }
1412
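/*
 * A minimal sketch of the prior_lock_state technique used above: the 32-bit
 * snapshot taken before the atomic update is reinterpreted through the
 * lck_rw_t layout so its flag bits can be read by name (hypothetical helper,
 * illustration only).
 */
#if 0
static boolean_t
example_snapshot_had_writer_waiting(uint32_t prior_lock_state)
{
	lck_rw_t *fake_lck = (lck_rw_t *)&prior_lock_state;

	return fake_lck->lck_w_waiting ? TRUE : FALSE;
}
#endif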
1413
1414 /*
1415 * Routine: lck_rw_unlock
1416 */
1417 void
1418 lck_rw_unlock(
1419 lck_rw_t *lck,
1420 lck_rw_type_t lck_rw_type)
1421 {
1422 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1423 lck_rw_unlock_shared(lck);
1424 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1425 lck_rw_unlock_exclusive(lck);
1426 } else {
1427 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1428 }
1429 }
1430
1431
1432 /*
1433 * Routine: lck_rw_unlock_shared
1434 */
1435 void
1436 lck_rw_unlock_shared(
1437 lck_rw_t *lck)
1438 {
1439 lck_rw_type_t ret;
1440
1441 assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count);
1442 ret = lck_rw_done(lck);
1443
1444 if (ret != LCK_RW_TYPE_SHARED) {
1445 panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
1446 }
1447 }
1448
1449
1450 /*
1451 * Routine: lck_rw_unlock_exclusive
1452 */
1453 void
1454 lck_rw_unlock_exclusive(
1455 lck_rw_t *lck)
1456 {
1457 lck_rw_type_t ret;
1458
1459 ret = lck_rw_done(lck);
1460
1461 if (ret != LCK_RW_TYPE_EXCLUSIVE) {
1462 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1463 }
1464 }
1465
1466
1467 /*
1468 * Routine: lck_rw_lock
1469 */
1470 void
1471 lck_rw_lock(
1472 lck_rw_t *lck,
1473 lck_rw_type_t lck_rw_type)
1474 {
1475 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1476 lck_rw_lock_shared(lck);
1477 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1478 lck_rw_lock_exclusive(lck);
1479 } else {
1480 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1481 }
1482 }
1483
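/*
 * A minimal usage sketch of the rw-lock interfaces (hypothetical lock group
 * and lock; real callers usually embed the lck_rw_t in their own data
 * structures). Illustration only, kept under #if 0.
 */
#if 0
static void
example_rw_usage(void)
{
	lck_grp_t *grp  = lck_grp_alloc_init("example-rw", LCK_GRP_ATTR_NULL);
	lck_rw_t  *lock = lck_rw_alloc_init(grp, LCK_ATTR_NULL);

	/* reader side: any number of threads may hold the lock shared */
	lck_rw_lock_shared(lock);
	/* ... read shared data ... */
	lck_rw_unlock_shared(lock);

	/* writer side: exclusive access */
	lck_rw_lock_exclusive(lock);
	/* ... modify shared data ... */
	lck_rw_unlock_exclusive(lock);

	lck_rw_free(lock, grp);
	lck_grp_free(grp);
}
#endif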
1484 /*
1485 * Routine: lck_rw_lock_shared
1486 */
1487 void
1488 lck_rw_lock_shared(lck_rw_t *lock)
1489 {
1490 uint32_t data, prev;
1491
1492 current_thread()->rwlock_count++;
1493 for (;;) {
1494 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1495 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1496 atomic_exchange_abort();
1497 if (lock->lck_rw_can_sleep) {
1498 lck_rw_lock_shared_gen(lock);
1499 } else {
1500 cpu_pause();
1501 continue;
1502 }
1503 break;
1504 }
1505 data += LCK_RW_SHARED_READER;
1506 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1507 break;
1508 }
1509 cpu_pause();
1510 }
1511 #if CONFIG_DTRACE
1512 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1513 #endif /* CONFIG_DTRACE */
1514 return;
1515 }
1516
1517 /*
1518 * Routine: lck_rw_lock_shared_gen
1519 * Function:
1520 * the fast path code above has determined that this lock
1521 * is held exclusively... this is where we spin/block
1522 * until we can acquire the lock in the shared mode
1523 */
1524 static void
1525 lck_rw_lock_shared_gen(
1526 lck_rw_t *lck)
1527 {
1528 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1529 uint64_t deadline = 0;
1530 int gotlock = 0;
1531 int slept = 0;
1532 wait_result_t res = 0;
1533 boolean_t istate = -1;
1534
1535 #if CONFIG_DTRACE
1536 uint64_t wait_interval = 0;
1537 int readers_at_sleep = 0;
1538 boolean_t dtrace_ls_initialized = FALSE;
1539 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1540 #endif
1541
1542 while (!lck_rw_grab_shared(lck)) {
1543 #if CONFIG_DTRACE
1544 if (dtrace_ls_initialized == FALSE) {
1545 dtrace_ls_initialized = TRUE;
1546 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1547 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1548 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1549 if (dtrace_ls_enabled) {
1550 /*
1551 * Either sleeping or spinning is happening,
1552 * start a timing of our delay interval now.
1553 */
1554 readers_at_sleep = lck->lck_rw_shared_count;
1555 wait_interval = mach_absolute_time();
1556 }
1557 }
1558 #endif
1559 if (istate == -1) {
1560 istate = ml_get_interrupts_enabled();
1561 }
1562
1563 deadline = lck_rw_deadline_for_spin(lck);
1564
1565 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1566 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1567
1568 while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline) {
1569 lck_rw_lock_pause(istate);
1570 }
1571
1572 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1573 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
1574
1575 if (gotlock) {
1576 break;
1577 }
1578 /*
1579 * if we get here, the deadline has expired w/o us
1580 * being able to grab the lock for read
1581 * check to see if we're allowed to do a thread_block
1582 */
1583 if (lck->lck_rw_can_sleep) {
1584 istate = lck_interlock_lock(lck);
1585
1586 if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1587 ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1588 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1589 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1590
1591 lck->lck_r_waiting = TRUE;
1592
1593 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1594 res = assert_wait(RW_LOCK_READER_EVENT(lck),
1595 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1596 lck_interlock_unlock(lck, istate);
1597
1598 if (res == THREAD_WAITING) {
1599 res = thread_block(THREAD_CONTINUE_NULL);
1600 slept++;
1601 }
1602 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1603 trace_lck, res, slept, 0, 0);
1604 } else {
1605 lck->lck_rw_shared_count++;
1606 lck_interlock_unlock(lck, istate);
1607 break;
1608 }
1609 }
1610 }
1611
1612 #if CONFIG_DTRACE
1613 if (dtrace_ls_enabled == TRUE) {
1614 if (slept == 0) {
1615 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1616 } else {
1617 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1618 mach_absolute_time() - wait_interval, 0,
1619 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1620 }
1621 }
1622 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1623 #endif
1624 }
1625
1626
1627 /*
1628 * Routine: lck_rw_lock_exclusive
1629 */
1630
1631 void
1632 lck_rw_lock_exclusive(lck_rw_t *lock)
1633 {
1634 current_thread()->rwlock_count++;
1635 if (atomic_test_and_set32(&lock->data,
1636 (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK),
1637 LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) {
1638 #if CONFIG_DTRACE
1639 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1640 #endif /* CONFIG_DTRACE */
1641 } else {
1642 lck_rw_lock_exclusive_gen(lock);
1643 }
1644 }
1645
1646
1647 /*
1648 * Routine: lck_rw_lock_shared_to_exclusive
1649 *
1650 * FALSE is returned upon failure; in that case the shared hold has already been dropped.
1651 */
1652
1653 boolean_t
1654 lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
1655 {
1656 uint32_t data, prev;
1657
1658 for (;;) {
1659 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1660 if (data & LCK_RW_INTERLOCK) {
1661 atomic_exchange_abort();
1662 lck_rw_interlock_spin(lock);
1663 continue;
1664 }
1665 if (data & LCK_RW_WANT_UPGRADE) {
1666 data -= LCK_RW_SHARED_READER;
1667 if ((data & LCK_RW_SHARED_MASK) == 0) { /* we were the last reader */
1668 data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */
1669 }
1670 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1671 return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1672 }
1673 } else {
1674 data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */
1675 data -= LCK_RW_SHARED_READER; /* and shed our read count */
1676 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1677 break;
1678 }
1679 }
1680 cpu_pause();
1681 }
1682 /* we now own the WANT_UPGRADE */
1683 if (data & LCK_RW_SHARED_MASK) { /* check to see if all of the readers are drained */
1684 lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */
1685 }
1686 #if CONFIG_DTRACE
1687 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1688 #endif
1689 return TRUE;
1690 }
1691
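/*
 * A minimal sketch of the upgrade pattern (hypothetical caller). Note that
 * on failure the shared hold has already been dropped, so the caller must
 * re-acquire the lock exclusively and re-validate anything it read.
 * Illustration only.
 */
#if 0
static void
example_upgrade(lck_rw_t *lock)
{
	lck_rw_lock_shared(lock);
	/* ... decide that an update is needed ... */
	if (!lck_rw_lock_shared_to_exclusive(lock)) {
		/* shared hold was dropped; start over with an exclusive hold */
		lck_rw_lock_exclusive(lock);
		/* ... re-validate the state read under the shared hold ... */
	}
	/* ... perform the update while held exclusively ... */
	lck_rw_unlock_exclusive(lock);
}
#endif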
1692
1693 /*
1694 * Routine: lck_rw_lock_shared_to_exclusive_failure
1695 * Function:
1696 * the fast path code has already dropped our read
1697 * count and determined that someone else owns 'lck_rw_want_upgrade'
1698 * if 'lck_rw_shared_count' == 0, it has also already dropped 'lck_w_waiting'
1699 * all we need to do here is determine if a wakeup is needed
1700 */
1701 static boolean_t
1702 lck_rw_lock_shared_to_exclusive_failure(
1703 lck_rw_t *lck,
1704 uint32_t prior_lock_state)
1705 {
1706 lck_rw_t *fake_lck;
1707 thread_t thread = current_thread();
1708 uint32_t rwlock_count;
1709
1710 /* Check if dropping the lock means that we need to unpromote */
1711 rwlock_count = thread->rwlock_count--;
1712 #if MACH_LDEBUG
1713 if (rwlock_count == 0) {
1714 panic("rw lock count underflow for thread %p", thread);
1715 }
1716 #endif
1717 fake_lck = (lck_rw_t *)&prior_lock_state;
1718
1719 if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
1720 /*
1721 * Someone else has requested upgrade.
1722 * Since we've released the read lock, wake
1723 * him up if he's blocked waiting
1724 */
1725 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1726 }
1727
1728 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1729 /* sched_flags checked without lock, but will be rechecked while clearing */
1730 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1731 }
1732
1733 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1734 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1735
1736 return FALSE;
1737 }
1738
1739
1740 /*
1741 * Routine: lck_rw_lock_shared_to_exclusive_success
1742 * Function:
1743 * the fast path code has already dropped our read
1744 * count and successfully acquired 'lck_rw_want_upgrade'
1745 * we just need to wait for the rest of the readers to drain
1746 * and then we can return as the exclusive holder of this lock
1747 */
1748 static boolean_t
1749 lck_rw_lock_shared_to_exclusive_success(
1750 lck_rw_t *lck)
1751 {
1752 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1753 uint64_t deadline = 0;
1754 int slept = 0;
1755 int still_shared = 0;
1756 wait_result_t res;
1757 boolean_t istate = -1;
1758
1759 #if CONFIG_DTRACE
1760 uint64_t wait_interval = 0;
1761 int readers_at_sleep = 0;
1762 boolean_t dtrace_ls_initialized = FALSE;
1763 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1764 #endif
1765
1766 while (lck->lck_rw_shared_count != 0) {
1767 #if CONFIG_DTRACE
1768 if (dtrace_ls_initialized == FALSE) {
1769 dtrace_ls_initialized = TRUE;
1770 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1771 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1772 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1773 if (dtrace_ls_enabled) {
1774 /*
1775 * Either sleeping or spinning is happening,
1776 * start a timing of our delay interval now.
1777 */
1778 readers_at_sleep = lck->lck_rw_shared_count;
1779 wait_interval = mach_absolute_time();
1780 }
1781 }
1782 #endif
1783 if (istate == -1) {
1784 istate = ml_get_interrupts_enabled();
1785 }
1786
1787 deadline = lck_rw_deadline_for_spin(lck);
1788
1789 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1790 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1791
1792 while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline) {
1793 lck_rw_lock_pause(istate);
1794 }
1795
1796 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1797 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1798
1799 if (!still_shared) {
1800 break;
1801 }
1802 /*
1803 * if we get here, the deadline has expired w/o
1804 * the rw_shared_count having drained to 0
1805 * check to see if we're allowed to do a thread_block
1806 */
1807 if (lck->lck_rw_can_sleep) {
1808 istate = lck_interlock_lock(lck);
1809
1810 if (lck->lck_rw_shared_count != 0) {
1811 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1812 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1813
1814 lck->lck_w_waiting = TRUE;
1815
1816 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1817 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1818 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1819 lck_interlock_unlock(lck, istate);
1820
1821 if (res == THREAD_WAITING) {
1822 res = thread_block(THREAD_CONTINUE_NULL);
1823 slept++;
1824 }
1825 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1826 trace_lck, res, slept, 0, 0);
1827 } else {
1828 lck_interlock_unlock(lck, istate);
1829 break;
1830 }
1831 }
1832 }
1833 #if CONFIG_DTRACE
1834 /*
1835 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1836 */
1837 if (dtrace_ls_enabled == TRUE) {
1838 if (slept == 0) {
1839 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1840 } else {
1841 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
1842 mach_absolute_time() - wait_interval, 1,
1843 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1844 }
1845 }
1846 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1847 #endif
1848 return TRUE;
1849 }
1850
1851 /*
1852 * Routine: lck_rw_lock_exclusive_to_shared
1853 */
1854
1855 void
1856 lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
1857 {
1858 uint32_t data, prev;
1859
1860 for (;;) {
1861 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1862 if (data & LCK_RW_INTERLOCK) {
1863 atomic_exchange_abort();
1864 lck_rw_interlock_spin(lock); /* wait for interlock to clear */
1865 continue;
1866 }
1867 data += LCK_RW_SHARED_READER;
1868 if (data & LCK_RW_WANT_UPGRADE) {
1869 data &= ~(LCK_RW_WANT_UPGRADE);
1870 } else {
1871 data &= ~(LCK_RW_WANT_EXCL);
1872 }
1873 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
1874 data &= ~(LCK_RW_W_WAITING);
1875 }
1876 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
1877 break;
1878 }
1879 cpu_pause();
1880 }
1881 lck_rw_lock_exclusive_to_shared_gen(lock, prev);
1882 }
1883
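/*
 * A minimal sketch of the downgrade pattern (hypothetical caller): publish
 * an update under the exclusive hold, then keep reading under a shared hold
 * without blocking other readers. Illustration only.
 */
#if 0
static void
example_downgrade(lck_rw_t *lock)
{
	lck_rw_lock_exclusive(lock);
	/* ... modify shared data ... */
	lck_rw_lock_exclusive_to_shared(lock);  /* other readers may now enter */
	/* ... continue reading the data just published ... */
	lck_rw_unlock_shared(lock);
}
#endif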
1884
1885 /*
1886 * Routine: lck_rw_lock_exclusive_to_shared_gen
1887 * Function:
1888 * the fast path has already dropped
1889 * our exclusive state and bumped lck_rw_shared_count
1890 * all we need to do here is determine if anyone
1891 * needs to be awakened.
1892 */
1893 static void
1894 lck_rw_lock_exclusive_to_shared_gen(
1895 lck_rw_t *lck,
1896 uint32_t prior_lock_state)
1897 {
1898 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1899 lck_rw_t *fake_lck;
1900
1901 fake_lck = (lck_rw_t *)&prior_lock_state;
1902
1903 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1904 trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
1905
1906 /*
1907 * don't wake up anyone waiting to take the lock exclusively
1908 * since we hold a read count... when the read count drops to 0,
1909 * the writers will be woken.
1910 *
1911 * wake up any waiting readers if we don't have any writers waiting,
1912 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1913 */
1914 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
1915 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1916 }
1917
1918 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1919 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1920
1921 #if CONFIG_DTRACE
1922 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1923 #endif
1924 }
1925
1926
1927 /*
1928 * Routine: lck_rw_try_lock
1929 */
1930 boolean_t
1931 lck_rw_try_lock(
1932 lck_rw_t *lck,
1933 lck_rw_type_t lck_rw_type)
1934 {
1935 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1936 return lck_rw_try_lock_shared(lck);
1937 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1938 return lck_rw_try_lock_exclusive(lck);
1939 } else {
1940 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
1941 }
1942 return FALSE;
1943 }
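/*
 * Editorial sketch (not part of the original source): lck_rw_try_lock() is the
 * non-blocking entry point, so callers must handle a FALSE return. A common
 * pattern is to skip optional work when the lock is busy; update_stats_locked()
 * is a hypothetical helper, shown only for illustration.
 *
 *      if (lck_rw_try_lock(lck, LCK_RW_TYPE_SHARED)) {
 *              update_stats_locked(obj);
 *              lck_rw_unlock(lck, LCK_RW_TYPE_SHARED);
 *      } else {
 *              // lock is busy: fall back to the blocking path or skip the work
 *      }
 */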
1944
1945 /*
1946 * Routine: lck_rw_try_lock_shared
1947 */
1948
1949 boolean_t
1950 lck_rw_try_lock_shared(lck_rw_t *lock)
1951 {
1952 uint32_t data, prev;
1953
1954 for (;;) {
1955 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1956 if (data & LCK_RW_INTERLOCK) {
1957 atomic_exchange_abort();
1958 lck_rw_interlock_spin(lock);
1959 continue;
1960 }
1961 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1962 atomic_exchange_abort();
1963 return FALSE; /* lock is busy */
1964 }
1965 data += LCK_RW_SHARED_READER; /* Increment reader refcount */
1966 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1967 break;
1968 }
1969 cpu_pause();
1970 }
1971 current_thread()->rwlock_count++;
1972 /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */
1973 #if CONFIG_DTRACE
1974 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1975 #endif /* CONFIG_DTRACE */
1976 return TRUE;
1977 }
1978
1979
1980 /*
1981 * Routine: lck_rw_try_lock_exclusive
1982 */
1983
1984 boolean_t
1985 lck_rw_try_lock_exclusive(lck_rw_t *lock)
1986 {
1987 uint32_t data, prev;
1988
1989 for (;;) {
1990 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1991 if (data & LCK_RW_INTERLOCK) {
1992 atomic_exchange_abort();
1993 lck_rw_interlock_spin(lock);
1994 continue;
1995 }
1996 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1997 atomic_exchange_abort();
1998 return FALSE; /* can't get it */
1999 }
2000 data |= LCK_RW_WANT_EXCL;
2001 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
2002 break;
2003 }
2004 cpu_pause();
2005 }
2006
2007 current_thread()->rwlock_count++;
2008 #if CONFIG_DTRACE
2009 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
2010 #endif /* CONFIG_DTRACE */
2011 return TRUE;
2012 }
2013
2014
2015 void
2016 lck_rw_assert(
2017 lck_rw_t *lck,
2018 unsigned int type)
2019 {
2020 switch (type) {
2021 case LCK_RW_ASSERT_SHARED:
2022 if (lck->lck_rw_shared_count != 0) {
2023 return;
2024 }
2025 break;
2026 case LCK_RW_ASSERT_EXCLUSIVE:
2027 if ((lck->lck_rw_want_write ||
2028 lck->lck_rw_want_upgrade) &&
2029 lck->lck_rw_shared_count == 0) {
2030 return;
2031 }
2032 break;
2033 case LCK_RW_ASSERT_HELD:
2034 if (lck->lck_rw_want_write ||
2035 lck->lck_rw_want_upgrade ||
2036 lck->lck_rw_shared_count != 0) {
2037 return;
2038 }
2039 break;
2040 case LCK_RW_ASSERT_NOTHELD:
2041 if (!(lck->lck_rw_want_write ||
2042 lck->lck_rw_want_upgrade ||
2043 lck->lck_rw_shared_count != 0)) {
2044 return;
2045 }
2046 break;
2047 default:
2048 break;
2049 }
2050
2051 panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
2052 }
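/*
 * Editorial sketch (not part of the original source): lck_rw_assert() is meant
 * to document and enforce locking preconditions in helpers that do not take the
 * lock themselves. table_t, entry_t and the routine below are hypothetical.
 *
 *      static void
 *      table_remove_entry_locked(table_t *t, entry_t *e)
 *      {
 *              // caller must already hold t->lock exclusively
 *              lck_rw_assert(&t->lock, LCK_RW_ASSERT_EXCLUSIVE);
 *              ...
 *      }
 */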
2053
2054 /* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
2055 #if MACH_LDEBUG
2056 __dead2
2057 #endif
2058 void
2059 lck_rw_clear_promotions_x86(thread_t thread)
2060 {
2061 #if MACH_LDEBUG
2062 /* It's fatal to leave a RW lock locked and return to userspace */
2063 panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
2064 #else
2065 /* Paper over the issue */
2066 thread->rwlock_count = 0;
2067 lck_rw_clear_promotion(thread, 0);
2068 #endif
2069 }
2070
2071 boolean_t
2072 lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
2073 {
2074 lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2075
2076 if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || force_yield) {
2077 lck_rw_unlock_shared(lck);
2078 mutex_pause(2);
2079 lck_rw_lock_shared(lck);
2080 return TRUE;
2081 }
2082
2083 return FALSE;
2084 }
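/*
 * Editorial sketch (not part of the original source): the yield helper above is
 * intended for long scans performed under a shared hold. A TRUE return means the
 * lock was dropped and re-acquired, so any state cached under the previous hold
 * must be re-validated. scan_next() and process_entry() are hypothetical helpers.
 *
 *      entry_t *e = NULL;
 *
 *      lck_rw_lock_shared(lck);
 *      while ((e = scan_next(table, e)) != NULL) {
 *              process_entry(e);
 *              if (lck_rw_lock_yield_shared(lck, FALSE)) {
 *                      // the lock was dropped and re-acquired: cached state is
 *                      // stale, so restart the scan from the beginning
 *                      e = NULL;
 *              }
 *      }
 *      lck_rw_unlock_shared(lck);
 */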
2085
2086 /*
2087 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
2088 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2089 */
2090 boolean_t
2091 kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck)
2092 {
2093 if (not_in_kdp) {
2094 panic("panic: rw lock exclusive check done outside of kernel debugger");
2095 }
2096 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2097 }
2098
2099 /*
2100 * Slow path routines for lck_mtx locking and unlocking functions.
2101 *
2102 * These functions were previously implemented in x86 assembly,
2103 * and some optimizations are in place in this C code so that the compiled code
2104 * is as performant and compact as the assembly version.
2105 *
2106 * To avoid inlining these functions into the fast path, all functions directly called by
2107 * the fast paths are marked __attribute__((noinline)). They are also implemented
2108 * in such a way that the fast path can tail call into them. In this way the return address
2109 * does not need to be pushed on the caller's stack and stack optimization can happen in the caller.
2110 *
2111 * Slow path code is structured in such a way that there are no calls to functions that will return
2112 * in the context of the caller, i.e. every function called is either a tail call
2113 * or an inline function. The tail call functions take fewer than six arguments,
2114 * so that they can be passed in registers and do not need to be pushed on the stack.
2115 * This allows the compiler to avoid creating a stack frame for these functions.
2116 *
2117 * __improbable and __probable are used to compile the slow path code in such a way that
2118 * the fast path case ends up as a sequence of instructions with as few jumps as possible,
2119 * to keep this case the most optimized even when falling through to the slow path.
2120 */
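/*
 * Editorial sketch (not part of the original source): the general shape of the
 * fast-path/slow-path split described above, using hypothetical names
 * (example_lock_t, EXAMPLE_LOCKED). The fast path stays branch-light and
 * frame-less, and ends in a tail call so the slow path returns directly to the
 * original caller.
 *
 *      __attribute__((noinline)) static void example_lock_slow(example_lock_t *l);
 *
 *      static inline void
 *      example_lock(example_lock_t *l)
 *      {
 *              if (__probable(os_atomic_cmpxchg(&l->state, 0, EXAMPLE_LOCKED, acquire))) {
 *                      return;                      // uncontended: no stack frame needed
 *              }
 *              return example_lock_slow(l);         // tail call into the noinline slow path
 *      }
 */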
2121
2122 /*
2123 * Intel lock invariants:
2124 *
2125 * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
2126 *
2127 * The lock owner is promoted to the max priority of all its waiters only if it
2128 * was at a lower priority when it acquired the lock, or was already the owner when a waiter began waiting.
2129 * Max priority is capped at MAXPRI_PROMOTE.
2130 *
2131 * The last waiter will not be promoted as it is woken up, but the last
2132 * lock owner may not have been the last thread to have been woken up depending on the
2133 * luck of the draw. Therefore a last-owner may still have the promoted-on-wakeup
2134 * flag set.
2135 *
2136 * TODO: Figure out an algorithm for stopping a lock holder which is already at the right
2137 * priority from dropping priority in the future without having to take thread lock
2138 * on acquire.
2139 */
2140
2141 #ifdef MUTEX_ZONE
2142 extern zone_t lck_mtx_zone;
2143 #endif
2144
2145 /*
2146 * Routine: lck_mtx_alloc_init
2147 */
2148 lck_mtx_t *
2149 lck_mtx_alloc_init(
2150 lck_grp_t *grp,
2151 lck_attr_t *attr)
2152 {
2153 lck_mtx_t *lck;
2154 #ifdef MUTEX_ZONE
2155 if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0) {
2156 lck_mtx_init(lck, grp, attr);
2157 }
2158 #else
2159 if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0) {
2160 lck_mtx_init(lck, grp, attr);
2161 }
2162 #endif
2163 return lck;
2164 }
2165
2166 /*
2167 * Routine: lck_mtx_free
2168 */
2169 void
2170 lck_mtx_free(
2171 lck_mtx_t *lck,
2172 lck_grp_t *grp)
2173 {
2174 lck_mtx_destroy(lck, grp);
2175 #ifdef MUTEX_ZONE
2176 zfree(lck_mtx_zone, lck);
2177 #else
2178 kfree(lck, sizeof(lck_mtx_t));
2179 #endif
2180 }
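/*
 * Editorial sketch (not part of the original source): the usual lifecycle for a
 * dynamically allocated mutex pairs the two routines above with a lock group.
 * The group name is arbitrary and the surrounding code is illustrative only.
 *
 *      lck_grp_t *grp = lck_grp_alloc_init("com.example.driver", LCK_GRP_ATTR_NULL);
 *      lck_mtx_t *mtx = lck_mtx_alloc_init(grp, LCK_ATTR_NULL);
 *
 *      lck_mtx_lock(mtx);
 *      // ... critical section ...
 *      lck_mtx_unlock(mtx);
 *
 *      lck_mtx_free(mtx, grp);                 // destroys the lock and frees its storage
 *      lck_grp_free(grp);
 */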
2181
2182 /*
2183 * Routine: lck_mtx_ext_init
2184 */
2185 static void
2186 lck_mtx_ext_init(
2187 lck_mtx_ext_t *lck,
2188 lck_grp_t *grp,
2189 lck_attr_t *attr)
2190 {
2191 bzero((void *)lck, sizeof(lck_mtx_ext_t));
2192
2193 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2194 lck->lck_mtx_deb.type = MUTEX_TAG;
2195 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2196 }
2197
2198 lck->lck_mtx_grp = grp;
2199
2200 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) {
2201 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
2202 }
2203
2204 lck->lck_mtx.lck_mtx_is_ext = 1;
2205 lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
2206 }
2207
2208 /*
2209 * Routine: lck_mtx_init
2210 */
2211 void
2212 lck_mtx_init(
2213 lck_mtx_t *lck,
2214 lck_grp_t *grp,
2215 lck_attr_t *attr)
2216 {
2217 lck_mtx_ext_t *lck_ext;
2218 lck_attr_t *lck_attr;
2219
2220 if (attr != LCK_ATTR_NULL) {
2221 lck_attr = attr;
2222 } else {
2223 lck_attr = &LockDefaultLckAttr;
2224 }
2225
2226 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2227 if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
2228 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2229 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2230 lck->lck_mtx_ptr = lck_ext;
2231 }
2232 } else {
2233 lck->lck_mtx_owner = 0;
2234 lck->lck_mtx_state = 0;
2235 }
2236 lck->lck_mtx_pad32 = 0xFFFFFFFF;
2237 lck_grp_reference(grp);
2238 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2239 }
2240
2241 /*
2242 * Routine: lck_mtx_init_ext
2243 */
2244 void
2245 lck_mtx_init_ext(
2246 lck_mtx_t *lck,
2247 lck_mtx_ext_t *lck_ext,
2248 lck_grp_t *grp,
2249 lck_attr_t *attr)
2250 {
2251 lck_attr_t *lck_attr;
2252
2253 if (attr != LCK_ATTR_NULL) {
2254 lck_attr = attr;
2255 } else {
2256 lck_attr = &LockDefaultLckAttr;
2257 }
2258
2259 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2260 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2261 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2262 lck->lck_mtx_ptr = lck_ext;
2263 } else {
2264 lck->lck_mtx_owner = 0;
2265 lck->lck_mtx_state = 0;
2266 }
2267 lck->lck_mtx_pad32 = 0xFFFFFFFF;
2268
2269 lck_grp_reference(grp);
2270 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2271 }
2272
2273 static void
2274 lck_mtx_lock_mark_destroyed(
2275 lck_mtx_t *mutex,
2276 boolean_t indirect)
2277 {
2278 uint32_t state;
2279
2280 if (indirect) {
2281 /* convert to destroyed state */
2282 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2283 return;
2284 }
2285
2286 state = ordered_load_mtx_state(mutex);
2287 lck_mtx_interlock_lock(mutex, &state);
2288
2289 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2290
2291 enable_preemption();
2292 }
2293
2294 /*
2295 * Routine: lck_mtx_destroy
2296 */
2297 void
2298 lck_mtx_destroy(
2299 lck_mtx_t *lck,
2300 lck_grp_t *grp)
2301 {
2302 boolean_t indirect;
2303
2304 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
2305 return;
2306 }
2307 #if MACH_LDEBUG
2308 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2309 #endif
2310 indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
2311
2312 lck_mtx_lock_mark_destroyed(lck, indirect);
2313
2314 if (indirect) {
2315 kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
2316 }
2317 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2318 lck_grp_deallocate(grp);
2319 return;
2320 }
2321
2322
2323 #if DEVELOPMENT | DEBUG
2324 __attribute__((noinline))
2325 void
2326 lck_mtx_owner_check_panic(
2327 lck_mtx_t *lock)
2328 {
2329 thread_t owner = (thread_t)lock->lck_mtx_owner;
2330 panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p", owner, lock);
2331 }
2332 #endif
2333
2334 __attribute__((always_inline))
2335 static boolean_t
2336 get_indirect_mutex(
2337 lck_mtx_t **lock,
2338 uint32_t *state)
2339 {
2340 *lock = &((*lock)->lck_mtx_ptr->lck_mtx);
2341 *state = ordered_load_mtx_state(*lock);
2342 return TRUE;
2343 }
2344
2345 /*
2346 * Routine: lck_mtx_unlock_slow
2347 *
2348 * Unlocks a mutex held by current thread.
2349 *
2350 * It will wake up waiters if necessary.
2351 *
2352 * Interlock can be held.
2353 */
2354 __attribute__((noinline))
2355 void
2356 lck_mtx_unlock_slow(
2357 lck_mtx_t *lock)
2358 {
2359 thread_t thread;
2360 uint32_t state, prev;
2361 boolean_t indirect = FALSE;
2362
2363 state = ordered_load_mtx_state(lock);
2364
2365 /* Is this an indirect mutex? */
2366 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2367 indirect = get_indirect_mutex(&lock, &state);
2368 }
2369
2370 thread = current_thread();
2371
2372 #if DEVELOPMENT | DEBUG
2373 thread_t owner = (thread_t)lock->lck_mtx_owner;
2374 if (__improbable(owner != thread)) {
2375 lck_mtx_owner_check_panic(lock);
2376 }
2377 #endif
2378
2379 /* check if it is held as a spinlock */
2380 if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) {
2381 goto unlock;
2382 }
2383
2384 lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);
2385
2386 unlock:
2387 /* preemption disabled, interlock held and mutex not held */
2388
2389 /* clear owner */
2390 ordered_store_mtx_owner(lock, 0);
2391 /* keep original state in prev for later evaluation */
2392 prev = state;
2393
2394 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
2395 #if MACH_LDEBUG
2396 if (thread) {
2397 thread->mutex_count--;
2398 }
2399 #endif
2400 return lck_mtx_unlock_wakeup_tail(lock, state, indirect);
2401 }
2402
2403 /* release interlock, promotion and clear spin flag */
2404 state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK));
2405 ordered_store_mtx_state_release(lock, state); /* since I own the interlock, I don't need an atomic update */
2406
2407 #if MACH_LDEBUG
2408 /* perform lock statistics after drop to prevent delay */
2409 if (thread) {
2410 thread->mutex_count--; /* lock statistic */
2411 }
2412 #endif /* MACH_LDEBUG */
2413
2414 /* re-enable preemption */
2415 lck_mtx_unlock_finish_inline(lock, FALSE);
2416
2417 return;
2418 }
2419
2420 #define LCK_MTX_LCK_WAIT_CODE 0x20
2421 #define LCK_MTX_LCK_WAKEUP_CODE 0x21
2422 #define LCK_MTX_LCK_SPIN_CODE 0x22
2423 #define LCK_MTX_LCK_ACQUIRE_CODE 0x23
2424 #define LCK_MTX_LCK_DEMOTE_CODE 0x24
2425
2426 /*
2427 * Routine: lck_mtx_unlock_wakeup_tail
2428 *
2429 * Invoked on unlock when there is
2430 * contention, i.e. the assembly routine sees
2431 * that mutex->lck_mtx_waiters != 0
2432 *
2433 * neither the mutex nor the interlock is held
2434 *
2435 * Note that this routine might not be called if there are pending
2436 * waiters which have previously been woken up, and they didn't
2437 * end up boosting the old owner.
2438 *
2439 * assembly routine previously did the following to mutex:
2440 * (after saving the state in prior_lock_state)
2441 * decremented lck_mtx_waiters if nonzero
2442 *
2443 * This function needs to be called as a tail call
2444 * to optimize the compiled code.
2445 */
2446 __attribute__((noinline))
2447 static void
2448 lck_mtx_unlock_wakeup_tail(
2449 lck_mtx_t *mutex,
2450 uint32_t state,
2451 boolean_t indirect)
2452 {
2453 struct turnstile *ts;
2454
2455 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2456 kern_return_t did_wake;
2457
2458 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
2459 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2460
2461 ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
2462
2463 if (mutex->lck_mtx_waiters > 1) {
2464 /* WAITQ_PROMOTE_ON_WAKE will call turnstile_update_inheritor on the woken-up thread */
2465 did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_PROMOTE_ON_WAKE);
2466 } else {
2467 did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
2468 turnstile_update_inheritor(ts, NULL, TURNSTILE_IMMEDIATE_UPDATE);
2469 }
2470 assert(did_wake == KERN_SUCCESS);
2471
2472 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
2473 turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2474
2475 state -= LCK_MTX_WAITER;
2476 state &= (~(LCK_MTX_SPIN_MSK | LCK_MTX_ILOCKED_MSK));
2477 ordered_store_mtx_state_release(mutex, state);
2478
2479 assert(current_thread()->turnstile != NULL);
2480
2481 turnstile_cleanup();
2482
2483 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
2484 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2485
2486 lck_mtx_unlock_finish_inline(mutex, indirect);
2487 }
2488
2489 /*
2490 * Routine: lck_mtx_lock_acquire_x86
2491 *
2492 * Invoked on acquiring the mutex when there is
2493 * contention (i.e. the assembly routine sees
2494 * that mutex->lck_mtx_waiters != 0)
2495 *
2496 * mutex is owned... interlock is held... preemption is disabled
2497 */
2498 __attribute__((always_inline))
2499 static void
2500 lck_mtx_lock_acquire_inline(
2501 lck_mtx_t *mutex,
2502 struct turnstile *ts)
2503 {
2504 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2505
2506 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
2507 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2508
2509 thread_t thread = (thread_t)mutex->lck_mtx_owner; /* faster than current_thread() */
2510 assert(thread->waiting_for_mutex == NULL);
2511
2512 if (mutex->lck_mtx_waiters > 0) {
2513 if (ts == NULL) {
2514 ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
2515 }
2516
2517 turnstile_update_inheritor(ts, thread, (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
2518 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
2519 }
2520
2521 if (ts != NULL) {
2522 turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2523 }
2524
2525 assert(current_thread()->turnstile != NULL);
2526
2527 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
2528 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2529 }
2530
2531 void
2532 lck_mtx_lock_acquire_x86(
2533 lck_mtx_t *mutex)
2534 {
2535 return lck_mtx_lock_acquire_inline(mutex, NULL);
2536 }
2537
2538 /*
2539 * Tail call helpers for lock functions that perform
2540 * lck_mtx_lock_acquire followed by the caller's finish routine, to optimize
2541 * the caller's compiled code.
2542 */
2543
2544 __attribute__((noinline))
2545 static void
2546 lck_mtx_lock_acquire_tail(
2547 lck_mtx_t *mutex,
2548 boolean_t indirect,
2549 struct turnstile *ts)
2550 {
2551 lck_mtx_lock_acquire_inline(mutex, ts);
2552 lck_mtx_lock_finish_inline_with_cleanup(mutex, ordered_load_mtx_state(mutex), indirect);
2553 }
2554
2555 __attribute__((noinline))
2556 static boolean_t
2557 lck_mtx_try_lock_acquire_tail(
2558 lck_mtx_t *mutex)
2559 {
2560 lck_mtx_lock_acquire_inline(mutex, NULL);
2561 lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));
2562
2563 return TRUE;
2564 }
2565
2566 __attribute__((noinline))
2567 static void
2568 lck_mtx_convert_spin_acquire_tail(
2569 lck_mtx_t *mutex)
2570 {
2571 lck_mtx_lock_acquire_inline(mutex, NULL);
2572 lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));
2573 }
2574
2575 boolean_t
2576 lck_mtx_ilk_unlock(
2577 lck_mtx_t *mutex)
2578 {
2579 lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex));
2580 return TRUE;
2581 }
2582
2583 static inline void
2584 lck_mtx_interlock_lock_set_and_clear_flags(
2585 lck_mtx_t *mutex,
2586 uint32_t xor_flags,
2587 uint32_t and_flags,
2588 uint32_t *new_state)
2589 {
2590 uint32_t state, prev;
2591 state = *new_state;
2592
2593 for (;;) {
2594 /* have to wait for interlock to clear */
2595 while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) {
2596 cpu_pause();
2597 state = ordered_load_mtx_state(mutex);
2598 }
2599 prev = state; /* prev contains snapshot for exchange */
2600 state |= LCK_MTX_ILOCKED_MSK | xor_flags; /* pick up interlock */
2601 state &= ~and_flags; /* clear flags */
2602
2603 disable_preemption();
2604 if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
2605 break;
2606 }
2607 enable_preemption();
2608 cpu_pause();
2609 state = ordered_load_mtx_state(mutex);
2610 }
2611 *new_state = state;
2612 return;
2613 }
2614
2615 static inline void
2616 lck_mtx_interlock_lock_clear_flags(
2617 lck_mtx_t *mutex,
2618 uint32_t and_flags,
2619 uint32_t *new_state)
2620 {
2621 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state);
2622 }
2623
2624 static inline void
2625 lck_mtx_interlock_lock(
2626 lck_mtx_t *mutex,
2627 uint32_t *new_state)
2628 {
2629 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state);
2630 }
2631
2632 static inline int
2633 lck_mtx_interlock_try_lock_set_flags(
2634 lck_mtx_t *mutex,
2635 uint32_t or_flags,
2636 uint32_t *new_state)
2637 {
2638 uint32_t state, prev;
2639 state = *new_state;
2640
2641 /* fail if the interlock or any of the or_flags bits is already set */
2642 if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) {
2643 return 0;
2644 }
2645 prev = state; /* prev contains snapshot for exchange */
2646 state |= LCK_MTX_ILOCKED_MSK | or_flags; /* pick up interlock */
2647 disable_preemption();
2648 if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
2649 *new_state = state;
2650 return 1;
2651 }
2652
2653 enable_preemption();
2654 return 0;
2655 }
2656
2657 __attribute__((noinline))
2658 static void
2659 lck_mtx_lock_contended(
2660 lck_mtx_t *lock,
2661 boolean_t indirect,
2662 boolean_t *first_miss)
2663 {
2664 lck_mtx_spinwait_ret_type_t ret;
2665 uint32_t state;
2666 thread_t thread;
2667 struct turnstile *ts = NULL;
2668
2669 try_again:
2670
2671 if (indirect) {
2672 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, first_miss);
2673 }
2674
2675 ret = lck_mtx_lock_spinwait_x86(lock);
2676 state = ordered_load_mtx_state(lock);
2677 switch (ret) {
2678 case LCK_MTX_SPINWAIT_NO_SPIN:
2679 /*
2680 * owner not on core, lck_mtx_lock_spinwait_x86 didn't even
2681 * try to spin.
2682 */
2683 if (indirect) {
2684 lck_grp_mtx_update_direct_wait((struct _lck_mtx_ext_*)lock);
2685 }
2686
2687 /* just fall through to the LCK_MTX_SPINWAIT_SPUN_* cases */
2688 case LCK_MTX_SPINWAIT_SPUN_HIGH_THR:
2689 case LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE:
2690 case LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION:
2691 case LCK_MTX_SPINWAIT_SPUN_SLIDING_THR:
2692 /*
2693 * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
2694 * interlock not held
2695 */
2696 lck_mtx_interlock_lock(lock, &state);
2697 assert(state & LCK_MTX_ILOCKED_MSK);
2698
2699 if (state & LCK_MTX_MLOCKED_MSK) {
2700 if (indirect) {
2701 lck_grp_mtx_update_wait((struct _lck_mtx_ext_*)lock, first_miss);
2702 }
2703 lck_mtx_lock_wait_x86(lock, &ts);
2704 /*
2705 * interlock is not held here.
2706 */
2707 goto try_again;
2708 } else {
2709 /* grab the mutex */
2710 state |= LCK_MTX_MLOCKED_MSK;
2711 ordered_store_mtx_state_release(lock, state);
2712 thread = current_thread();
2713 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2714 #if MACH_LDEBUG
2715 if (thread) {
2716 thread->mutex_count++;
2717 }
2718 #endif /* MACH_LDEBUG */
2719 }
2720
2721 break;
2722 case LCK_MTX_SPINWAIT_ACQUIRED:
2723 /*
2724 * mutex has been acquired by lck_mtx_lock_spinwait_x86
2725 * interlock is held and preemption disabled
2726 * owner is set and mutex marked as locked
2727 * statistics updated too
2728 */
2729 break;
2730 default:
2731 panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p\n", ret, lock);
2732 }
2733
2734 /*
2735 * interlock is already acquired here
2736 */
2737
2738 /* mutex has been acquired */
2739 thread = (thread_t)lock->lck_mtx_owner;
2740 if (state & LCK_MTX_WAITERS_MSK) {
2741 /*
2742 * lck_mtx_lock_acquire_tail will call
2743 * turnstile_complete.
2744 */
2745 return lck_mtx_lock_acquire_tail(lock, indirect, ts);
2746 }
2747
2748 if (ts != NULL) {
2749 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2750 }
2751
2752 assert(current_thread()->turnstile != NULL);
2753
2754 /* release the interlock */
2755 lck_mtx_lock_finish_inline_with_cleanup(lock, ordered_load_mtx_state(lock), indirect);
2756 }
2757
2758 /*
2759 * Helper noinline functions that call panic,
2760 * kept out of line to optimize the compiled code.
2761 */
2762
2763 __attribute__((noinline)) __abortlike
2764 static void
2765 lck_mtx_destroyed(
2766 lck_mtx_t *lock)
2767 {
2768 panic("trying to interlock destroyed mutex (%p)", lock);
2769 }
2770
2771 __attribute__((noinline))
2772 static boolean_t
2773 lck_mtx_try_destroyed(
2774 lck_mtx_t *lock)
2775 {
2776 panic("trying to interlock destroyed mutex (%p)", lock);
2777 return FALSE;
2778 }
2779
2780 __attribute__((always_inline))
2781 static boolean_t
2782 lck_mtx_lock_wait_interlock_to_clear(
2783 lck_mtx_t *lock,
2784 uint32_t* new_state)
2785 {
2786 uint32_t state;
2787
2788 for (;;) {
2789 cpu_pause();
2790 state = ordered_load_mtx_state(lock);
2791 if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
2792 *new_state = state;
2793 return TRUE;
2794 }
2795 if (state & LCK_MTX_MLOCKED_MSK) {
2796 /* if it is held as mutex, just fail */
2797 return FALSE;
2798 }
2799 }
2800 }
2801
2802 __attribute__((always_inline))
2803 static boolean_t
2804 lck_mtx_try_lock_wait_interlock_to_clear(
2805 lck_mtx_t *lock,
2806 uint32_t* new_state)
2807 {
2808 uint32_t state;
2809
2810 for (;;) {
2811 cpu_pause();
2812 state = ordered_load_mtx_state(lock);
2813 if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) {
2814 /* if it is held as mutex or spin, just fail */
2815 return FALSE;
2816 }
2817 if (!(state & LCK_MTX_ILOCKED_MSK)) {
2818 *new_state = state;
2819 return TRUE;
2820 }
2821 }
2822 }
2823
2824 /*
2825 * Routine: lck_mtx_lock_slow
2826 *
2827 * Locks a mutex for current thread.
2828 * If the lock is contended this function might
2829 * sleep.
2830 *
2831 * Called with interlock not held.
2832 */
2833 __attribute__((noinline))
2834 void
2835 lck_mtx_lock_slow(
2836 lck_mtx_t *lock)
2837 {
2838 boolean_t indirect = FALSE;
2839 uint32_t state;
2840 int first_miss = 0;
2841
2842 state = ordered_load_mtx_state(lock);
2843
2844 /* is the interlock or mutex held */
2845 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2846 /*
2847 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
2848 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
2849 * set in state (state == lck_mtx_tag)
2850 */
2851
2852
2853 /* is the mutex already held and not indirect */
2854 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
2855 /* no, must have been the mutex */
2856 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2857 }
2858
2859 /* check to see if it is marked destroyed */
2860 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
2861 lck_mtx_destroyed(lock);
2862 }
2863
2864 /* Is this an indirect mutex? */
2865 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2866 indirect = get_indirect_mutex(&lock, &state);
2867
2868 first_miss = 0;
2869 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
2870
2871 if (state & LCK_MTX_SPIN_MSK) {
2872 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
2873 assert(state & LCK_MTX_ILOCKED_MSK);
2874 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
2875 }
2876 }
2877
2878 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2879 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2880 }
2881 }
2882
2883 /* no - can't be INDIRECT, DESTROYED or locked */
2884 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
2885 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2886 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2887 }
2888 }
2889
2890 /* lock and interlock acquired */
2891
2892 thread_t thread = current_thread();
2893 /* record owner of mutex */
2894 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2895
2896 #if MACH_LDEBUG
2897 if (thread) {
2898 thread->mutex_count++; /* lock statistic */
2899 }
2900 #endif
2901 /*
2902 * Check if there are waiters whose
2903 * priority the owner needs to inherit.
2904 */
2905 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
2906 return lck_mtx_lock_acquire_tail(lock, indirect, NULL);
2907 }
2908
2909 /* release the interlock */
2910 lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
2911
2912 return;
2913 }
2914
2915 __attribute__((noinline))
2916 boolean_t
2917 lck_mtx_try_lock_slow(
2918 lck_mtx_t *lock)
2919 {
2920 boolean_t indirect = FALSE;
2921 uint32_t state;
2922 int first_miss = 0;
2923
2924 state = ordered_load_mtx_state(lock);
2925
2926 /* is the interlock or mutex held */
2927 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2928 /*
2929 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
2930 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
2931 * set in state (state == lck_mtx_tag)
2932 */
2933
2934 /* is the mutex already held and not indirect */
2935 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
2936 return FALSE;
2937 }
2938
2939 /* check to see if it is marked destroyed */
2940 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
2941 lck_mtx_try_destroyed(lock);
2942 }
2943
2944 /* Is this an indirect mutex? */
2945 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2946 indirect = get_indirect_mutex(&lock, &state);
2947
2948 first_miss = 0;
2949 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
2950 }
2951
2952 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
2953 if (indirect) {
2954 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
2955 }
2956 return FALSE;
2957 }
2958 }
2959
2960 /* no - can't be INDIRECT, DESTROYED or locked */
2961 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
2962 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
2963 if (indirect) {
2964 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
2965 }
2966 return FALSE;
2967 }
2968 }
2969
2970 /* lock and interlock acquired */
2971
2972 thread_t thread = current_thread();
2973 /* record owner of mutex */
2974 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2975
2976 #if MACH_LDEBUG
2977 if (thread) {
2978 thread->mutex_count++; /* lock statistic */
2979 }
2980 #endif
2981 /*
2982 * Check if there are waiters whose
2983 * priority the owner needs to inherit.
2984 */
2985 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
2986 return lck_mtx_try_lock_acquire_tail(lock);
2987 }
2988
2989 /* release the interlock */
2990 lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));
2991
2992 return TRUE;
2993 }
2994
2995 __attribute__((noinline))
2996 void
2997 lck_mtx_lock_spin_slow(
2998 lck_mtx_t *lock)
2999 {
3000 boolean_t indirect = FALSE;
3001 uint32_t state;
3002 int first_miss = 0;
3003
3004 state = ordered_load_mtx_state(lock);
3005
3006 /* is the interlock or mutex held */
3007 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3008 /*
3009 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3010 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3011 * set in state (state == lck_mtx_tag)
3012 */
3013
3014
3015 /* is the mutex already held and not indirect */
3016 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
3017 /* no, must have been the mutex */
3018 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3019 }
3020
3021 /* check to see if it is marked destroyed */
3022 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3023 lck_mtx_destroyed(lock);
3024 }
3025
3026 /* Is this an indirect mutex? */
3027 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3028 indirect = get_indirect_mutex(&lock, &state);
3029
3030 first_miss = 0;
3031 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
3032
3033 if (state & LCK_MTX_SPIN_MSK) {
3034 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
3035 assert(state & LCK_MTX_ILOCKED_MSK);
3036 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3037 }
3038 }
3039
3040 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3041 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3042 }
3043 }
3044
3045 /* no - can't be INDIRECT, DESTROYED or locked */
3046 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
3047 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3048 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3049 }
3050 }
3051
3052 /* lock as spinlock and interlock acquired */
3053
3054 thread_t thread = current_thread();
3055 /* record owner of mutex */
3056 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3057
3058 #if MACH_LDEBUG
3059 if (thread) {
3060 thread->mutex_count++; /* lock statistic */
3061 }
3062 #endif
3063
3064 #if CONFIG_DTRACE
3065 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
3066 #endif
3067 /* return with the interlock held and preemption disabled */
3068 return;
3069 }
3070
3071 __attribute__((noinline))
3072 boolean_t
3073 lck_mtx_try_lock_spin_slow(
3074 lck_mtx_t *lock)
3075 {
3076 boolean_t indirect = FALSE;
3077 uint32_t state;
3078 int first_miss = 0;
3079
3080 state = ordered_load_mtx_state(lock);
3081
3082 /* is the interlock or mutex held */
3083 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3084 /*
3085 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3086 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3087 * set in state (state == lck_mtx_tag)
3088 */
3089
3090 /* is the mutex already held and not indirect */
3091 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
3092 return FALSE;
3093 }
3094
3095 /* check to see if it is marked destroyed */
3096 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3097 lck_mtx_try_destroyed(lock);
3098 }
3099
3100 /* Is this an indirect mutex? */
3101 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3102 indirect = get_indirect_mutex(&lock, &state);
3103
3104 first_miss = 0;
3105 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
3106 }
3107
3108 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3109 if (indirect) {
3110 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3111 }
3112 return FALSE;
3113 }
3114 }
3115
3116 /* no - can't be INDIRECT, DESTROYED or locked */
3117 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
3118 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3119 if (indirect) {
3120 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3121 }
3122 return FALSE;
3123 }
3124 }
3125
3126 /* lock and interlock acquired */
3127
3128 thread_t thread = current_thread();
3129 /* record owner of mutex */
3130 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3131
3132 #if MACH_LDEBUG
3133 if (thread) {
3134 thread->mutex_count++; /* lock statistic */
3135 }
3136 #endif
3137
3138 #if CONFIG_DTRACE
3139 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
3140 #endif
3141 return TRUE;
3142 }
3143
3144 __attribute__((noinline))
3145 void
3146 lck_mtx_convert_spin(
3147 lck_mtx_t *lock)
3148 {
3149 uint32_t state;
3150
3151 state = ordered_load_mtx_state(lock);
3152
3153 /* Is this an indirect mutex? */
3154 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3155 /* If so, take indirection */
3156 get_indirect_mutex(&lock, &state);
3157 }
3158
3159 assertf((thread_t)lock->lck_mtx_owner == current_thread(), "lock %p not owned by thread %p (current owner %p)", lock, current_thread(), (thread_t)lock->lck_mtx_owner );
3160
3161 if (__improbable(state & LCK_MTX_MLOCKED_MSK)) {
3162 /* already owned as a mutex, just return */
3163 return;
3164 }
3165
3166 assert(get_preemption_level() > 0);
3167 assert(state & LCK_MTX_ILOCKED_MSK);
3168 assert(state & LCK_MTX_SPIN_MSK);
3169
3170 /*
3171 * Check if there are waiters whose
3172 * priority the owner needs to inherit.
3173 */
3174 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3175 return lck_mtx_convert_spin_acquire_tail(lock);
3176 }
3177
3178 lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock));
3179
3180 return;
3181 }
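/*
 * Editorial sketch (not part of the original source): a caller that takes the
 * spin variant for a short critical section can convert in place once it
 * discovers it must do something that may block. may_block() and
 * do_blocking_work() are hypothetical helpers.
 *
 *      lck_mtx_lock_spin(mtx);                 // interlock held, preemption disabled
 *      if (may_block(obj)) {
 *              lck_mtx_convert_spin(mtx);      // become a full mutex hold; blocking is now legal
 *              do_blocking_work(obj);
 *      }
 *      lck_mtx_unlock(mtx);                    // correct for either form of the hold
 */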
3182
3183 static inline boolean_t
3184 lck_mtx_lock_grab_mutex(
3185 lck_mtx_t *lock)
3186 {
3187 uint32_t state;
3188
3189 state = ordered_load_mtx_state(lock);
3190
3191 if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) {
3192 return FALSE;
3193 }
3194
3195 /* lock and interlock acquired */
3196
3197 thread_t thread = current_thread();
3198 /* record owner of mutex */
3199 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3200
3201 #if MACH_LDEBUG
3202 if (thread) {
3203 thread->mutex_count++; /* lock statistic */
3204 }
3205 #endif
3206 return TRUE;
3207 }
3208
3209 __attribute__((noinline))
3210 void
3211 lck_mtx_assert(
3212 lck_mtx_t *lock,
3213 unsigned int type)
3214 {
3215 thread_t thread, owner;
3216 uint32_t state;
3217
3218 thread = current_thread();
3219 state = ordered_load_mtx_state(lock);
3220
3221 if (state == LCK_MTX_TAG_INDIRECT) {
3222 get_indirect_mutex(&lock, &state);
3223 }
3224
3225 owner = (thread_t)lock->lck_mtx_owner;
3226
3227 if (type == LCK_MTX_ASSERT_OWNED) {
3228 if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
3229 panic("mutex (%p) not owned\n", lock);
3230 }
3231 } else {
3232 assert(type == LCK_MTX_ASSERT_NOTOWNED);
3233 if (owner == thread) {
3234 panic("mutex (%p) owned\n", lock);
3235 }
3236 }
3237 }
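/*
 * Editorial sketch (not part of the original source): like the rw-lock variant,
 * lck_mtx_assert() is typically used to document a helper's locking contract.
 * my_queue_t, my_elem_t and the routine below are hypothetical.
 *
 *      static void
 *      queue_push_locked(my_queue_t *q, my_elem_t *e)
 *      {
 *              lck_mtx_assert(&q->mtx, LCK_MTX_ASSERT_OWNED);  // caller holds q->mtx
 *              enqueue_tail(&q->head, &e->link);
 *      }
 */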
3238
3239 /*
3240 * Routine: lck_mtx_lock_spinwait_x86
3241 *
3242 * Invoked trying to acquire a mutex when there is contention but
3243 * the holder is running on another processor. We spin for up to a maximum
3244 * time waiting for the lock to be released.
3245 *
3246 * Called with the interlock unlocked.
3247 * returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired
3248 * returns LCK_MTX_SPINWAIT_SPUN_* if we spun
3249 * returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running
3250 */
3251 __attribute__((noinline))
3252 lck_mtx_spinwait_ret_type_t
3253 lck_mtx_lock_spinwait_x86(
3254 lck_mtx_t *mutex)
3255 {
3256 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3257 thread_t owner, prev_owner;
3258 uint64_t window_deadline, sliding_deadline, high_deadline;
3259 uint64_t start_time, cur_time, avg_hold_time, bias, delta;
3260 lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
3261 int loopcount = 0;
3262 int total_hold_time_samples, window_hold_time_samples, unfairness;
3263 uint i, prev_owner_cpu;
3264 bool owner_on_core, adjust;
3265
3266 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
3267 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
3268
3269 start_time = mach_absolute_time();
3270 /*
3271 * window_deadline represents the "learning" phase.
3272 * The thread collects statistics about the lock until
3273 * window_deadline and then decides whether to keep spinning
3274 * or to block, according to the concurrency behavior
3275 * observed.
3276 *
3277 * Every thread can spin at least low_MutexSpin.
3278 */
3279 window_deadline = start_time + low_MutexSpin;
3280 /*
3281 * Sliding_deadline is the adjusted spin deadline
3282 * computed after the "learning" phase.
3283 */
3284 sliding_deadline = window_deadline;
3285 /*
3286 * High_deadline is a hard deadline. No thread
3287 * can spin past this deadline.
3288 */
3289 if (high_MutexSpin >= 0) {
3290 high_deadline = start_time + high_MutexSpin;
3291 } else {
3292 high_deadline = start_time + low_MutexSpin * real_ncpus;
3293 }
3294
3295 /*
3296 * We do not know yet which cpu the owner is running on.
3297 * Initialize prev_owner_cpu with the next cpu.
3298 */
3299 prev_owner_cpu = (cpu_number() + 1) % real_ncpus;
3300 total_hold_time_samples = 0;
3301 window_hold_time_samples = 0;
3302 avg_hold_time = 0;
3303 adjust = TRUE;
3304 bias = (os_hash_kernel_pointer(mutex) + cpu_number()) % real_ncpus;
3305
3306 prev_owner = (thread_t) mutex->lck_mtx_owner;
3307 /*
3308 * Spin while:
3309 * - mutex is locked, and
3310 * - it's locked as a spin lock, and
3311 * - owner is running on another processor, and
3312 * - we haven't spun for long enough.
3313 */
3314 do {
3315 /*
3316 * Try to acquire the lock.
3317 */
3318 if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
3319 retval = LCK_MTX_SPINWAIT_ACQUIRED;
3320 break;
3321 }
3322
3323 cur_time = mach_absolute_time();
3324
3325 /*
3326 * Never spin past high_deadline.
3327 */
3328 if (cur_time >= high_deadline) {
3329 retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
3330 break;
3331 }
3332
3333 /*
3334 * Check if the owner is on core. If not, block.
3335 */
3336 owner = (thread_t) mutex->lck_mtx_owner;
3337 if (owner) {
3338 i = prev_owner_cpu;
3339 owner_on_core = FALSE;
3340
3341 disable_preemption();
3342 owner = (thread_t) mutex->lck_mtx_owner;
3343
3344 /*
3345 * For scalability we want to check if the owner is on core
3346 * without locking the mutex interlock.
3347 * If we do not lock the mutex interlock, the owner that we see might be
3348 * invalid, so we cannot dereference it. Therefore we cannot check
3349 * any field of the thread to tell us if it is on core.
3350 * Instead, check whether the thread running on any other cpu matches the owner.
3351 */
3352 if (owner) {
3353 do {
3354 if ((cpu_data_ptr[i] != NULL) && (cpu_data_ptr[i]->cpu_active_thread == owner)) {
3355 owner_on_core = TRUE;
3356 break;
3357 }
3358 if (++i >= real_ncpus) {
3359 i = 0;
3360 }
3361 } while (i != prev_owner_cpu);
3362 enable_preemption();
3363
3364 if (owner_on_core) {
3365 prev_owner_cpu = i;
3366 } else {
3367 prev_owner = owner;
3368 owner = (thread_t) mutex->lck_mtx_owner;
3369 if (owner == prev_owner) {
3370 /*
3371 * Owner is not on core.
3372 * Stop spinning.
3373 */
3374 if (loopcount == 0) {
3375 retval = LCK_MTX_SPINWAIT_NO_SPIN;
3376 } else {
3377 retval = LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE;
3378 }
3379 break;
3380 }
3381 /*
3382 * Fall through if the owner changed while we were scanning.
3383 * The new owner could potentially be on core, so loop
3384 * again.
3385 */
3386 }
3387 } else {
3388 enable_preemption();
3389 }
3390 }
3391
3392 /*
3393 * Save how many times we see the owner changing.
3394 * We can roughly estimate the mutex hold
3395 * time and the fairness with that.
3396 */
3397 if (owner != prev_owner) {
3398 prev_owner = owner;
3399 total_hold_time_samples++;
3400 window_hold_time_samples++;
3401 }
3402
3403 /*
3404 * Learning window expired.
3405 * Try to adjust the sliding_deadline.
3406 */
3407 if (cur_time >= window_deadline) {
3408 /*
3409 * If there was no contention during the window,
3410 * stop spinning.
3411 */
3412 if (window_hold_time_samples < 1) {
3413 retval = LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION;
3414 break;
3415 }
3416
3417 if (adjust) {
3418 /*
3419 * For a fair lock, we'd wait for at most (NCPU-1) periods,
3420 * but the lock is unfair, so let's try to estimate by how much.
3421 */
3422 unfairness = total_hold_time_samples / real_ncpus;
3423
3424 if (unfairness == 0) {
3425 /*
3426 * We observed the owner changing `total_hold_time_samples` times which
3427 * let us estimate the average hold time of this mutex for the duration
3428 * of the spin time.
3429 * avg_hold_time = (cur_time - start_time) / total_hold_time_samples;
3430 *
3431 * In this case spin at max avg_hold_time * (real_ncpus - 1)
3432 */
3433 delta = cur_time - start_time;
3434 sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples;
3435 } else {
3436 /*
3437 * In this case at least one of the other cpus was able to get the lock twice
3438 * while I was spinning.
3439 * We could spin longer but it won't necessarily help if the system is unfair.
3440 * Try to randomize the wait to reduce contention.
3441 *
3442 * We compute how much time we could potentially spin
3443 * and distribute it over the cpus.
3444 *
3445 * bias is an integer between 0 and real_ncpus.
3446 * distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias
3447 */
3448 delta = high_deadline - cur_time;
3449 sliding_deadline = cur_time + ((delta * bias) / real_ncpus);
3450 adjust = FALSE;
3451 }
3452 }
3453
3454 window_deadline += low_MutexSpin;
3455 window_hold_time_samples = 0;
3456 }
3457
3458 /*
3459 * Stop spinning if we are past
3460 * the adjusted deadline.
3461 */
3462 if (cur_time >= sliding_deadline) {
3463 retval = LCK_MTX_SPINWAIT_SPUN_SLIDING_THR;
3464 break;
3465 }
3466
3467 if ((thread_t) mutex->lck_mtx_owner != NULL) {
3468 cpu_pause();
3469 }
3470
3471 loopcount++;
3472 } while (TRUE);
3473
3474 #if CONFIG_DTRACE
3475 /*
3476 * Note that we record a different probe id depending on whether
3477 * this is a direct or indirect mutex. This allows us to
3478 * penalize only lock groups that have debug/stats enabled
3479 * with dtrace processing if desired.
3480 */
3481 if (__probable(mutex->lck_mtx_is_ext == 0)) {
3482 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
3483 mach_absolute_time() - start_time);
3484 } else {
3485 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
3486 mach_absolute_time() - start_time);
3487 }
3488 /* The lockstat acquire event is recorded by the assembly code beneath us. */
3489 #endif
3490
3491 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
3492 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
3493
3494 return retval;
3495 }
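/*
 * Editorial note (not part of the original source): a worked example of the
 * sliding_deadline adjustment above, with made-up numbers. Suppose real_ncpus
 * is 8 and, when the learning window expires, delta = cur_time - start_time is
 * 12000 ticks with total_hold_time_samples = 3 owner changes. Then
 * unfairness = 3 / 8 = 0 (integer division), the average hold time is roughly
 * 12000 / 3 = 4000 ticks, and
 * sliding_deadline = start_time + (12000 * 7) / 3 = start_time + 28000 ticks,
 * i.e. the thread allows itself to spin for about (real_ncpus - 1) = 7 average
 * hold times before giving up and blocking.
 */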
3496
3497
3498
3499 /*
3500 * Routine: lck_mtx_lock_wait_x86
3501 *
3502 * Invoked in order to wait on contention.
3503 *
3504 * Called with the interlock locked and
3505 * preemption disabled...
3506 * returns it unlocked and with preemption enabled
3507 *
3508 * lck_mtx_waiters is 1:1 with a wakeup needing to occur.
3509 * A runnable waiter can exist between wait and acquire
3510 * without a waiters count being set.
3511 * This allows us to never make a spurious wakeup call.
3512 *
3513 * Priority:
3514 * This avoids taking the thread lock if the owning thread is the same priority.
3515 * This optimizes the case of same-priority threads contending on a lock.
3516 * However, that allows the owning thread to drop in priority while holding the lock,
3517 * because there is no state that the priority change can notice that
3518 * says that the targeted thread holds a contended mutex.
3519 *
3520 * One possible solution: priority changes could look for some atomic tag
3521 * on the thread saying 'holding contended lock', and then set up a promotion.
3522 * Needs a story for dropping that promotion - the last contended unlock
3523 * has to notice that this has happened.
3524 */
3525 __attribute__((noinline))
3526 void
3527 lck_mtx_lock_wait_x86(
3528 lck_mtx_t *mutex,
3529 struct turnstile **ts)
3530 {
3531 thread_t self = current_thread();
3532
3533 #if CONFIG_DTRACE
3534 uint64_t sleep_start = 0;
3535
3536 if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
3537 sleep_start = mach_absolute_time();
3538 }
3539 #endif
3540 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3541
3542 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
3543 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3544 mutex->lck_mtx_waiters, 0, 0);
3545
3546 assert(self->waiting_for_mutex == NULL);
3547 self->waiting_for_mutex = mutex;
3548 mutex->lck_mtx_waiters++;
3549
3550 thread_t holder = (thread_t)mutex->lck_mtx_owner;
3551 assert(holder != NULL);
3552
3553 /*
3554 * lck_mtx_lock_wait_x86 might be called in a loop. Call turnstile_prepare just once and reuse
3555 * the same turnstile while looping; the matching turnstile_complete will be called
3556 * by lck_mtx_lock_contended when the lock is finally acquired.
3557 */
3558 if (*ts == NULL) {
3559 *ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
3560 }
3561
3562 struct turnstile *turnstile = *ts;
3563 thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
3564 turnstile_update_inheritor(turnstile, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
3565
3566 waitq_assert_wait64(&turnstile->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
3567
3568 lck_mtx_ilk_unlock(mutex);
3569
3570 turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_NOT_HELD);
3571
3572 thread_block(THREAD_CONTINUE_NULL);
3573
3574 self->waiting_for_mutex = NULL;
3575
3576 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
3577 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3578 mutex->lck_mtx_waiters, 0, 0);
3579
3580 #if CONFIG_DTRACE
3581 /*
3582 * Record the DTrace lockstat probe for blocking; block time is
3583 * measured from when we were entered.
3584 */
3585 if (sleep_start) {
3586 if (mutex->lck_mtx_is_ext == 0) {
3587 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
3588 mach_absolute_time() - sleep_start);
3589 } else {
3590 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
3591 mach_absolute_time() - sleep_start);
3592 }
3593 }
3594 #endif
3595 }
3596
3597 /*
3598 * Routine: kdp_lck_mtx_lock_spin_is_acquired
3599 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
3600 * Returns: TRUE if lock is acquired.
3601 */
3602 boolean_t
3603 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
3604 {
3605 if (not_in_kdp) {
3606 panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
3607 }
3608
3609 if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
3610 return TRUE;
3611 }
3612
3613 return FALSE;
3614 }
3615
3616 void
3617 kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3618 {
3619 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
3620 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
3621 thread_t holder = (thread_t)mutex->lck_mtx_owner;
3622 waitinfo->owner = thread_tid(holder);
3623 }
3624
3625 void
3626 kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3627 {
3628 lck_rw_t *rwlck = NULL;
3629 switch (waitinfo->wait_type) {
3630 case kThreadWaitKernelRWLockRead:
3631 rwlck = READ_EVENT_TO_RWLOCK(event);
3632 break;
3633 case kThreadWaitKernelRWLockWrite:
3634 case kThreadWaitKernelRWLockUpgrade:
3635 rwlck = WRITE_EVENT_TO_RWLOCK(event);
3636 break;
3637 default:
3638 panic("%s was called with an invalid blocking type", __FUNCTION__);
3639 break;
3640 }
3641 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
3642 waitinfo->owner = 0;
3643 }