apple/xnu.git (tag xnu-6153.61.1): osfmk/i386/locks_i386.c
1 /*
2 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 * File: kern/lock.c
58 * Author: Avadis Tevanian, Jr., Michael Wayne Young
59 * Date: 1985
60 *
61 * Locking primitives implementation
62 */
63
64 #define LOCK_PRIVATE 1
65
66 #include <mach_ldebug.h>
67
68 #include <kern/lock_stat.h>
69 #include <kern/locks.h>
70 #include <kern/kalloc.h>
71 #include <kern/misc_protos.h>
72 #include <kern/thread.h>
73 #include <kern/processor.h>
74 #include <kern/cpu_data.h>
75 #include <kern/cpu_number.h>
76 #include <kern/sched_prim.h>
77 #include <kern/debug.h>
78 #include <string.h>
79
80 #include <i386/machine_routines.h> /* machine_timeout_suspended() */
81 #include <machine/atomic.h>
82 #include <machine/machine_cpu.h>
83 #include <i386/mp.h>
85 #include <sys/kdebug.h>
86 #include <i386/locks_i386_inlines.h>
87
88 #if CONFIG_DTRACE
89 #define DTRACE_RW_SHARED 0x0 //reader
90 #define DTRACE_RW_EXCL 0x1 //writer
91 #define DTRACE_NO_FLAG 0x0 //not applicable
92 #endif /* CONFIG_DTRACE */
93
94 #define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
95 #define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
96 #define LCK_RW_LCK_SHARED_CODE 0x102
97 #define LCK_RW_LCK_SH_TO_EX_CODE 0x103
98 #define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
99 #define LCK_RW_LCK_EX_TO_SH_CODE 0x105
100
101 #define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106
102 #define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107
103 #define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108
104 #define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109
105 #define LCK_RW_LCK_SHARED_SPIN_CODE 0x110
106 #define LCK_RW_LCK_SHARED_WAIT_CODE 0x111
107 #define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112
108 #define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113
109
110
111 #define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
112
113 unsigned int LcksOpts=0;
114
115 #if DEVELOPMENT || DEBUG
116 unsigned int LckDisablePreemptCheck = 0;
117 #endif
118
119 /* Forwards */
120
121 #if USLOCK_DEBUG
122 /*
123 * Perform simple lock checks.
124 */
125 int uslock_check = 1;
126 int max_lock_loops = 100000000;
127 decl_simple_lock_data(extern , printf_lock);
128 decl_simple_lock_data(extern , panic_lock);
129 #endif /* USLOCK_DEBUG */
130
131 extern unsigned int not_in_kdp;
132
133 /*
134 * We often want to know the addresses of the callers
135 * of the various lock routines. However, this information
136 * is only used for debugging and statistics.
137 */
138 typedef void *pc_t;
139 #define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
140 #define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
141 #if ANY_LOCK_DEBUG
142 #define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
143 #define DECL_PC(pc) pc_t pc;
144 #else /* ANY_LOCK_DEBUG */
145 #define DECL_PC(pc)
146 #ifdef lint
147 /*
148 * Eliminate lint complaints about unused local pc variables.
149 */
150 #define OBTAIN_PC(pc) ++pc
151 #else /* lint */
152 #define OBTAIN_PC(pc)
153 #endif /* lint */
154 #endif /* ANY_LOCK_DEBUG */
155
156 /*
157 * atomic exchange API is a low level abstraction of the operations
158 * to atomically read, modify, and write a pointer. This abstraction works
159 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
160 * well as the ARM exclusive instructions.
161 *
162 * atomic_exchange_begin() - begin exchange and retrieve current value
163 * atomic_exchange_complete() - conclude an exchange
164 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
165 */
166 static uint32_t
167 atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
168 {
169 uint32_t val;
170
171 (void)ord; // Memory order not used
172 val = os_atomic_load(target, relaxed);
173 *previous = val;
174 return val;
175 }
176
177 static boolean_t
178 atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
179 {
180 return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
181 }
182
183 static void
184 atomic_exchange_abort(void) { }
185
186 static boolean_t
187 atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
188 {
189 uint32_t value, prev;
190
191 for ( ; ; ) {
192 value = atomic_exchange_begin32(target, &prev, ord);
193 if (value & test_mask) {
194 if (wait)
195 cpu_pause();
196 else
197 atomic_exchange_abort();
198 return FALSE;
199 }
200 value |= set_mask;
201 if (atomic_exchange_complete32(target, prev, value, ord))
202 return TRUE;
203 }
204 }
205
206 inline boolean_t
207 hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
208 {
209 return atomic_test_and_set32(target, test_mask, set_mask, ord, wait);
210 }
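
/*
 * A minimal usage sketch of the exchange API above (the object and flag
 * names are hypothetical, not part of this file): atomically set a BUSY
 * bit only while a DEAD bit is clear, retrying if the compare-exchange
 * loses a race.
 *
 *	uint32_t data, prev;
 *
 *	for ( ; ; ) {
 *		data = atomic_exchange_begin32(&obj->flags, &prev, memory_order_acquire_smp);
 *		if (data & OBJ_DEAD) {
 *			atomic_exchange_abort();
 *			return FALSE;
 *		}
 *		data |= OBJ_BUSY;
 *		if (atomic_exchange_complete32(&obj->flags, prev, data, memory_order_acquire_smp))
 *			return TRUE;
 *		cpu_pause();
 *	}
 *
 * Equivalently, atomic_test_and_set32(&obj->flags, OBJ_DEAD, OBJ_BUSY,
 * memory_order_acquire_smp, FALSE) expresses the same operation.
 */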
211
212 /*
213 * Portable lock package implementation of usimple_locks.
214 */
215
216 #if USLOCK_DEBUG
217 #define USLDBG(stmt) stmt
218 void usld_lock_init(usimple_lock_t, unsigned short);
219 void usld_lock_pre(usimple_lock_t, pc_t);
220 void usld_lock_post(usimple_lock_t, pc_t);
221 void usld_unlock(usimple_lock_t, pc_t);
222 void usld_lock_try_pre(usimple_lock_t, pc_t);
223 void usld_lock_try_post(usimple_lock_t, pc_t);
224 int usld_lock_common_checks(usimple_lock_t, char *);
225 #else /* USLOCK_DEBUG */
226 #define USLDBG(stmt)
227 #endif /* USLOCK_DEBUG */
228
229 /*
230 * Forward definitions
231 */
232
233 static void lck_rw_lock_shared_gen(lck_rw_t *lck);
234 static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
235 static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
236 static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
237 static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
238 static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
239 void lck_rw_clear_promotions_x86(thread_t thread);
240 static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
241 static boolean_t lck_rw_grab_want(lck_rw_t *lock);
242 static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
243 static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, uint32_t state, boolean_t indirect);
244 static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state);
245 static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state);
246 static int lck_mtx_interlock_try_lock(lck_mtx_t *mutex, uint32_t *new_state);
247 static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state);
248 static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
249 static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
250
251
252 /*
253 * Routine: lck_spin_alloc_init
254 */
255 lck_spin_t *
256 lck_spin_alloc_init(
257 lck_grp_t *grp,
258 lck_attr_t *attr)
259 {
260 lck_spin_t *lck;
261
262 if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
263 lck_spin_init(lck, grp, attr);
264
265 return(lck);
266 }
267
268 /*
269 * Routine: lck_spin_free
270 */
271 void
272 lck_spin_free(
273 lck_spin_t *lck,
274 lck_grp_t *grp)
275 {
276 lck_spin_destroy(lck, grp);
277 kfree(lck, sizeof(lck_spin_t));
278 }
279
280 /*
281 * Routine: lck_spin_init
282 */
283 void
284 lck_spin_init(
285 lck_spin_t *lck,
286 lck_grp_t *grp,
287 __unused lck_attr_t *attr)
288 {
289 usimple_lock_init((usimple_lock_t) lck, 0);
290 if (grp) {
291 lck_grp_reference(grp);
292 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
293 }
294 }
295
296 /*
297 * Routine: lck_spin_destroy
298 */
299 void
300 lck_spin_destroy(
301 lck_spin_t *lck,
302 lck_grp_t *grp)
303 {
304 if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
305 return;
306 lck->interlock = LCK_SPIN_TAG_DESTROYED;
307 if (grp) {
308 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
309 lck_grp_deallocate(grp);
310 }
311 return;
312 }
313
314 /*
315 * Routine: lck_spin_lock
316 */
317 void
318 lck_spin_lock_grp(
319 lck_spin_t *lck,
320 lck_grp_t *grp)
321 {
322 #pragma unused(grp)
323 usimple_lock((usimple_lock_t) lck, grp);
324 }
325
326 void
327 lck_spin_lock(
328 lck_spin_t *lck)
329 {
330 usimple_lock((usimple_lock_t) lck, NULL);
331 }
332
333 /*
334 * Routine: lck_spin_unlock
335 */
336 void
337 lck_spin_unlock(
338 lck_spin_t *lck)
339 {
340 usimple_unlock((usimple_lock_t) lck);
341 }
342
343 boolean_t
344 lck_spin_try_lock_grp(
345 lck_spin_t *lck,
346 lck_grp_t *grp)
347 {
348 #pragma unused(grp)
349 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, grp);
350 #if DEVELOPMENT || DEBUG
351 if (lrval) {
352 pltrace(FALSE);
353 }
354 #endif
355 return(lrval);
356 }
357
358
359 /*
360 * Routine: lck_spin_try_lock
361 */
362 boolean_t
363 lck_spin_try_lock(
364 lck_spin_t *lck)
365 {
366 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, LCK_GRP_NULL);
367 #if DEVELOPMENT || DEBUG
368 if (lrval) {
369 pltrace(FALSE);
370 }
371 #endif
372 return(lrval);
373 }
374
375 /*
376 * Routine: lck_spin_assert
377 */
378 void
379 lck_spin_assert(lck_spin_t *lock, unsigned int type)
380 {
381 thread_t thread, holder;
382 uintptr_t state;
383
384 if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
385 panic("lck_spin_assert(): invalid arg (%u)", type);
386 }
387
388 state = lock->interlock;
389 holder = (thread_t)state;
390 thread = current_thread();
391 if (type == LCK_ASSERT_OWNED) {
392 if (__improbable(holder == THREAD_NULL)) {
393 panic("Lock not owned %p = %lx", lock, state);
394 }
395 if (__improbable(holder != thread)) {
396 panic("Lock not owned by current thread %p = %lx", lock, state);
397 }
398 } else if (type == LCK_ASSERT_NOTOWNED) {
399 if (__improbable(holder != THREAD_NULL)) {
400 if (holder == thread) {
401 panic("Lock owned by current thread %p = %lx", lock, state);
402 }
403 }
404 }
405 }
406
407 /*
408 * Routine: kdp_lck_spin_is_acquired
409 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
410 * Returns: TRUE if lock is acquired.
411 */
412 boolean_t
413 kdp_lck_spin_is_acquired(lck_spin_t *lck) {
414 if (not_in_kdp) {
415 panic("spinlock acquired check done outside of kernel debugger");
416 }
417 return (lck->interlock != 0)? TRUE : FALSE;
418 }
419
420 /*
421 * Initialize a usimple_lock.
422 *
423 * No change in preemption state.
424 */
425 void
426 usimple_lock_init(
427 usimple_lock_t l,
428 __unused unsigned short tag)
429 {
430 #ifndef MACHINE_SIMPLE_LOCK
431 USLDBG(usld_lock_init(l, tag));
432 hw_lock_init(&l->interlock);
433 #else
434 simple_lock_init((simple_lock_t)l,tag);
435 #endif
436 }
437
438 volatile uint32_t spinlock_owner_cpu = ~0;
439 volatile usimple_lock_t spinlock_timed_out;
440
441 uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
442 uint32_t i;
443
444 for (i = 0; i < real_ncpus; i++) {
445 if ((cpu_data_ptr[i] != NULL) && ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr)) {
446 spinlock_owner_cpu = i;
447 if ((uint32_t) cpu_number() != i) {
448 /* Cause NMI and panic on the owner's cpu */
449 NMIPI_panic(cpu_to_cpumask(i), SPINLOCK_TIMEOUT);
450 }
451 break;
452 }
453 }
454
455 return spinlock_owner_cpu;
456 }
457
458 /*
459 * Acquire a usimple_lock.
460 *
461 * Returns with preemption disabled. Note
462 * that the hw_lock routines are responsible for
463 * maintaining preemption state.
464 */
465 void
466 (usimple_lock)(
467 usimple_lock_t l
468 LCK_GRP_ARG(lck_grp_t *grp))
469 {
470 #ifndef MACHINE_SIMPLE_LOCK
471 DECL_PC(pc);
472
473 OBTAIN_PC(pc);
474 USLDBG(usld_lock_pre(l, pc));
475
476 if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) {
477 boolean_t uslock_acquired = FALSE;
478 while (machine_timeout_suspended()) {
479 enable_preemption();
480 if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC, grp)))
481 break;
482 }
483
484 if (uslock_acquired == FALSE) {
485 uint32_t lock_cpu;
486 uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
487 spinlock_timed_out = l;
488 lock_cpu = spinlock_timeout_NMI(lowner);
489 panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
490 l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time());
491 }
492 }
493 #if DEVELOPMENT || DEBUG
494 pltrace(FALSE);
495 #endif
496
497 USLDBG(usld_lock_post(l, pc));
498 #else
499 simple_lock((simple_lock_t)l, grp);
500 #endif
501 #if CONFIG_DTRACE
502 LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
503 #endif
504 }
505
506
507 /*
508 * Release a usimple_lock.
509 *
510 * Returns with preemption enabled. Note
511 * that the hw_lock routines are responsible for
512 * maintaining preemption state.
513 */
514 void
515 usimple_unlock(
516 usimple_lock_t l)
517 {
518 #ifndef MACHINE_SIMPLE_LOCK
519 DECL_PC(pc);
520
521 OBTAIN_PC(pc);
522 USLDBG(usld_unlock(l, pc));
523 #if DEVELOPMENT || DEBUG
524 pltrace(TRUE);
525 #endif
526 hw_lock_unlock(&l->interlock);
527 #else
528 simple_unlock_rwmb((simple_lock_t)l);
529 #endif
530 }
531
532
533 /*
534 * Conditionally acquire a usimple_lock.
535 *
536 * On success, returns with preemption disabled.
537 * On failure, returns with preemption in the same state
538 * as when first invoked. Note that the hw_lock routines
539 * are responsible for maintaining preemption state.
540 *
541 * XXX No stats are gathered on a miss; I preserved this
542 * behavior from the original assembly-language code, but
543 * doesn't it make sense to log misses? XXX
544 */
545 unsigned int
546 usimple_lock_try(
547 usimple_lock_t l,
548 lck_grp_t *grp)
549 {
550 #ifndef MACHINE_SIMPLE_LOCK
551 unsigned int success;
552 DECL_PC(pc);
553
554 OBTAIN_PC(pc);
555 USLDBG(usld_lock_try_pre(l, pc));
556 if ((success = hw_lock_try(&l->interlock, grp))) {
557 #if DEVELOPMENT || DEBUG
558 pltrace(FALSE);
559 #endif
560 USLDBG(usld_lock_try_post(l, pc));
561 }
562 return success;
563 #else
564 return(simple_lock_try((simple_lock_t)l, grp));
565 #endif
566 }
567
568 /*
569 * Acquire a usimple_lock while polling for pending cpu signals
570 * and spinning on a lock.
571 *
572 */
573 unsigned int
574 (usimple_lock_try_lock_mp_signal_safe_loop_deadline)(usimple_lock_t l,
575 uint64_t deadline
576 LCK_GRP_ARG(lck_grp_t *grp))
577 {
578 boolean_t istate = ml_get_interrupts_enabled();
579
580 if (deadline < mach_absolute_time()) {
581 return 0;
582 }
583
584 while (!simple_lock_try(l, grp)) {
585 if (!istate)
586 cpu_signal_handler(NULL);
587
588 if (deadline < mach_absolute_time()) {
589 return 0;
590 }
591
592 cpu_pause();
593 }
594
595 return 1;
596 }
597
598 void
599 (usimple_lock_try_lock_loop)(usimple_lock_t l
600 LCK_GRP_ARG(lck_grp_t *grp))
601 {
602 usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ULLONG_MAX, grp);
603 }
604
605 unsigned int
606 (usimple_lock_try_lock_mp_signal_safe_loop_duration)(usimple_lock_t l,
607 uint64_t duration
608 LCK_GRP_ARG(lck_grp_t *grp))
609 {
610 uint64_t deadline;
611 uint64_t base_at = mach_absolute_time();
612 uint64_t duration_at;
613
614 nanoseconds_to_absolutetime(duration, &duration_at);
615 deadline = base_at + duration_at;
616 if (deadline < base_at) {
617 /* deadline has overflowed, make it saturate */
618 deadline = ULLONG_MAX;
619 }
620
621 return usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, deadline, grp);
622 }
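
/*
 * A minimal caller sketch for the duration variant above ("my_lock",
 * "my_grp" and the 10ms budget are hypothetical): spin for at most 10ms,
 * servicing pending cross-CPU signals, then fall back if the lock was
 * not obtained.
 *
 *	if (!usimple_lock_try_lock_mp_signal_safe_loop_duration(&my_lock,
 *	        10 * NSEC_PER_MSEC, my_grp)) {
 *		... recovery path: the budget expired without taking the lock ...
 *	}
 *
 * As with the other entry points here, the lock-group argument is only
 * consumed when lock statistics are configured in.
 */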
623
624 #if USLOCK_DEBUG
625 /*
626 * States of a usimple_lock. The default when initializing
627 * a usimple_lock is setting it up for debug checking.
628 */
629 #define USLOCK_CHECKED 0x0001 /* lock is being checked */
630 #define USLOCK_TAKEN 0x0002 /* lock has been taken */
631 #define USLOCK_INIT 0xBAA0 /* lock has been initialized */
632 #define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
633 #define USLOCK_CHECKING(l) (uslock_check && \
634 ((l)->debug.state & USLOCK_CHECKED))
635
636 /*
637 * Initialize the debugging information contained
638 * in a usimple_lock.
639 */
640 void
641 usld_lock_init(
642 usimple_lock_t l,
643 __unused unsigned short tag)
644 {
645 if (l == USIMPLE_LOCK_NULL)
646 panic("lock initialization: null lock pointer");
647 l->lock_type = USLOCK_TAG;
648 l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
649 l->debug.lock_cpu = l->debug.unlock_cpu = 0;
650 l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
651 l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
652 l->debug.duration[0] = l->debug.duration[1] = 0;
653 l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
654 l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
655 l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
656 }
657
658
659 /*
660 * These checks apply to all usimple_locks, not just
661 * those with USLOCK_CHECKED turned on.
662 */
663 int
664 usld_lock_common_checks(
665 usimple_lock_t l,
666 char *caller)
667 {
668 if (l == USIMPLE_LOCK_NULL)
669 panic("%s: null lock pointer", caller);
670 if (l->lock_type != USLOCK_TAG)
671 panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
672 if (!(l->debug.state & USLOCK_INIT))
673 panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
674 return USLOCK_CHECKING(l);
675 }
676
677
678 /*
679 * Debug checks on a usimple_lock just before attempting
680 * to acquire it.
681 */
682 /* ARGSUSED */
683 void
684 usld_lock_pre(
685 usimple_lock_t l,
686 pc_t pc)
687 {
688 char caller[] = "usimple_lock";
689
690
691 if (!usld_lock_common_checks(l, caller))
692 return;
693
694 /*
695 * Note that we have a weird case where we are getting a lock when we are
696 * in the process of putting the system to sleep. We are running with no
697 * current threads, therefore we can't tell if we are trying to retake a lock
698 * we already hold or one that another processor holds. Therefore we just
699 * ignore this test if the locking thread is 0.
700 */
701
702 if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
703 l->debug.lock_thread == (void *) current_thread()) {
704 printf("%s: lock %p already locked (at %p) by",
705 caller, l, l->debug.lock_pc);
706 printf(" current thread %p (new attempt at pc %p)\n",
707 l->debug.lock_thread, pc);
708 panic("%s", caller);
709 }
710 mp_disable_preemption();
711 mp_enable_preemption();
712 }
713
714
715 /*
716 * Debug checks on a usimple_lock just after acquiring it.
717 *
718 * Pre-emption has been disabled at this point,
719 * so we are safe in using cpu_number.
720 */
721 void
722 usld_lock_post(
723 usimple_lock_t l,
724 pc_t pc)
725 {
726 int mycpu;
727 char caller[] = "successful usimple_lock";
728
729
730 if (!usld_lock_common_checks(l, caller))
731 return;
732
733 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
734 panic("%s: lock %p became uninitialized",
735 caller, l);
736 if ((l->debug.state & USLOCK_TAKEN))
737 panic("%s: lock 0x%p became TAKEN by someone else",
738 caller, l);
739
740 mycpu = cpu_number();
741 l->debug.lock_thread = (void *)current_thread();
742 l->debug.state |= USLOCK_TAKEN;
743 l->debug.lock_pc = pc;
744 l->debug.lock_cpu = mycpu;
745 }
746
747
748 /*
749 * Debug checks on a usimple_lock just before
750 * releasing it. Note that the caller has not
751 * yet released the hardware lock.
752 *
753 * Preemption is still disabled, so there's
754 * no problem using cpu_number.
755 */
756 void
757 usld_unlock(
758 usimple_lock_t l,
759 pc_t pc)
760 {
761 int mycpu;
762 char caller[] = "usimple_unlock";
763
764
765 if (!usld_lock_common_checks(l, caller))
766 return;
767
768 mycpu = cpu_number();
769
770 if (!(l->debug.state & USLOCK_TAKEN))
771 panic("%s: lock 0x%p hasn't been taken",
772 caller, l);
773 if (l->debug.lock_thread != (void *) current_thread())
774 panic("%s: unlocking lock 0x%p, owned by thread %p",
775 caller, l, l->debug.lock_thread);
776 if (l->debug.lock_cpu != mycpu) {
777 printf("%s: unlocking lock 0x%p on cpu 0x%x",
778 caller, l, mycpu);
779 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
780 panic("%s", caller);
781 }
782
783 l->debug.unlock_thread = l->debug.lock_thread;
784 l->debug.lock_thread = INVALID_THREAD;
785 l->debug.state &= ~USLOCK_TAKEN;
786 l->debug.unlock_pc = pc;
787 l->debug.unlock_cpu = mycpu;
788 }
789
790
791 /*
792 * Debug checks on a usimple_lock just before
793 * attempting to acquire it.
794 *
795 * Preemption isn't guaranteed to be disabled.
796 */
797 void
798 usld_lock_try_pre(
799 usimple_lock_t l,
800 __unused pc_t pc)
801 {
802 char caller[] = "usimple_lock_try";
803
804 if (!usld_lock_common_checks(l, caller))
805 return;
806 }
807
808
809 /*
810 * Debug checks on a usimple_lock just after
811 * successfully attempting to acquire it.
812 *
813 * Preemption has been disabled by the
814 * lock acquisition attempt, so it's safe
815 * to use cpu_number.
816 */
817 void
818 usld_lock_try_post(
819 usimple_lock_t l,
820 pc_t pc)
821 {
822 int mycpu;
823 char caller[] = "successful usimple_lock_try";
824
825 if (!usld_lock_common_checks(l, caller))
826 return;
827
828 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
829 panic("%s: lock 0x%p became uninitialized",
830 caller, l);
831 if ((l->debug.state & USLOCK_TAKEN))
832 panic("%s: lock 0x%p became TAKEN by someone else",
833 caller, l);
834
835 mycpu = cpu_number();
836 l->debug.lock_thread = (void *) current_thread();
837 l->debug.state |= USLOCK_TAKEN;
838 l->debug.lock_pc = pc;
839 l->debug.lock_cpu = mycpu;
840 }
841 #endif /* USLOCK_DEBUG */
842
843 /*
844 * Routine: lck_rw_alloc_init
845 */
846 lck_rw_t *
847 lck_rw_alloc_init(
848 lck_grp_t *grp,
849 lck_attr_t *attr) {
850 lck_rw_t *lck;
851
852 if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
853 bzero(lck, sizeof(lck_rw_t));
854 lck_rw_init(lck, grp, attr);
855 }
856
857 return(lck);
858 }
859
860 /*
861 * Routine: lck_rw_free
862 */
863 void
864 lck_rw_free(
865 lck_rw_t *lck,
866 lck_grp_t *grp) {
867 lck_rw_destroy(lck, grp);
868 kfree(lck, sizeof(lck_rw_t));
869 }
870
871 /*
872 * Routine: lck_rw_init
873 */
874 void
875 lck_rw_init(
876 lck_rw_t *lck,
877 lck_grp_t *grp,
878 lck_attr_t *attr)
879 {
880 lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
881 attr : &LockDefaultLckAttr;
882
883 hw_lock_byte_init(&lck->lck_rw_interlock);
884 lck->lck_rw_want_write = FALSE;
885 lck->lck_rw_want_upgrade = FALSE;
886 lck->lck_rw_shared_count = 0;
887 lck->lck_rw_can_sleep = TRUE;
888 lck->lck_r_waiting = lck->lck_w_waiting = 0;
889 lck->lck_rw_tag = 0;
890 lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
891 LCK_ATTR_RW_SHARED_PRIORITY) == 0);
892
893 lck_grp_reference(grp);
894 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
895 }
896
897 /*
898 * Routine: lck_rw_destroy
899 */
900 void
901 lck_rw_destroy(
902 lck_rw_t *lck,
903 lck_grp_t *grp)
904 {
905 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
906 return;
907 #if MACH_LDEBUG
908 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
909 #endif
910 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
911 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
912 lck_grp_deallocate(grp);
913 return;
914 }
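
/*
 * A minimal lifecycle sketch for the routines above (the group and lock
 * names are hypothetical): allocate a lock group once for a subsystem,
 * create rw locks against it, and tear both down in the reverse order.
 *
 *	lck_grp_t *my_grp  = lck_grp_alloc_init("my-subsystem", LCK_GRP_ATTR_NULL);
 *	lck_rw_t  *my_lock = lck_rw_alloc_init(my_grp, LCK_ATTR_NULL);
 *
 *	lck_rw_lock_exclusive(my_lock);
 *	... exclusive critical section ...
 *	lck_rw_unlock_exclusive(my_lock);
 *
 *	lck_rw_free(my_lock, my_grp);		(calls lck_rw_destroy internally)
 *	lck_grp_free(my_grp);
 */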
915
916 /*
917 * Sleep locks. These use the same data structure and algorithm
918 * as the spin locks, but the process sleeps while it is waiting
919 * for the lock. These work on uniprocessor systems.
920 */
921
922 #define DECREMENTER_TIMEOUT 1000000
923
924 /*
925 * We disable interrupts while holding the RW interlock to prevent an
926 * interrupt from exacerbating hold time.
927 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
928 */
929 static inline boolean_t
930 lck_interlock_lock(lck_rw_t *lck)
931 {
932 boolean_t istate;
933
934 istate = ml_set_interrupts_enabled(FALSE);
935 hw_lock_byte_lock(&lck->lck_rw_interlock);
936 return istate;
937 }
938
939 static inline void
940 lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
941 {
942 hw_lock_byte_unlock(&lck->lck_rw_interlock);
943 ml_set_interrupts_enabled(istate);
944 }
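
/*
 * The helpers above are used in the paired pattern below throughout the
 * slow paths in this file: the interrupt state returned by
 * lck_interlock_lock() must be handed back to lck_interlock_unlock() so
 * the caller's interrupt enablement is restored exactly as it was found.
 *
 *	boolean_t istate;
 *
 *	istate = lck_interlock_lock(lck);
 *	... examine or update the lck_rw_t state fields ...
 *	lck_interlock_unlock(lck, istate);
 */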
945
946 /*
947 * This inline is used when busy-waiting for an rw lock.
948 * If interrupts were disabled when the lock primitive was called,
949 * we poll the IPI handler for pending tlb flushes.
950 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
951 */
952 static inline void
953 lck_rw_lock_pause(boolean_t interrupts_enabled)
954 {
955 if (!interrupts_enabled)
956 handle_pending_TLB_flushes();
957 cpu_pause();
958 }
959
960 static inline boolean_t
961 lck_rw_held_read_or_upgrade(lck_rw_t *lock)
962 {
963 if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE))
964 return TRUE;
965 return FALSE;
966 }
967
968 /*
969 * compute the deadline to spin against when
970 * waiting for a change of state on a lck_rw_t
971 */
972 static inline uint64_t
973 lck_rw_deadline_for_spin(lck_rw_t *lck)
974 {
975 if (lck->lck_rw_can_sleep) {
976 if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
977 /*
978 * there are already threads waiting on this lock... this
979 * implies that they have spun beyond their deadlines waiting for
980 * the desired state to show up so we will not bother spinning at this time...
981 * or
982 * the current number of threads sharing this lock exceeds our capacity to run them
983 * concurrently and since all states we're going to spin for require the rw_shared_count
984 * to be at 0, we'll not bother spinning since the latency for this to happen is
985 * unpredictable...
986 */
987 return (mach_absolute_time());
988 }
989 return (mach_absolute_time() + MutexSpin);
990 } else
991 return (mach_absolute_time() + (100000LL * 1000000000LL));
992 }
993
994
995 /*
996 * Spin while interlock is held.
997 */
998
999 static inline void
1000 lck_rw_interlock_spin(lck_rw_t *lock)
1001 {
1002 while (ordered_load(&lock->data) & LCK_RW_INTERLOCK) {
1003 cpu_pause();
1004 }
1005 }
1006
1007 static boolean_t
1008 lck_rw_grab_want(lck_rw_t *lock)
1009 {
1010 uint32_t data, prev;
1011
1012 for ( ; ; ) {
1013 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed);
1014 if ((data & LCK_RW_INTERLOCK) == 0)
1015 break;
1016 atomic_exchange_abort();
1017 lck_rw_interlock_spin(lock);
1018 }
1019 if (data & LCK_RW_WANT_WRITE) {
1020 atomic_exchange_abort();
1021 return FALSE;
1022 }
1023 data |= LCK_RW_WANT_WRITE;
1024 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_relaxed);
1025 }
1026
1027 static boolean_t
1028 lck_rw_grab_shared(lck_rw_t *lock)
1029 {
1030 uint32_t data, prev;
1031
1032 for ( ; ; ) {
1033 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1034 if ((data & LCK_RW_INTERLOCK) == 0)
1035 break;
1036 atomic_exchange_abort();
1037 lck_rw_interlock_spin(lock);
1038 }
1039 if (data & (LCK_RW_WANT_WRITE | LCK_RW_WANT_UPGRADE)) {
1040 if (((data & LCK_RW_SHARED_MASK) == 0) || (data & LCK_RW_PRIV_EXCL)) {
1041 atomic_exchange_abort();
1042 return FALSE;
1043 }
1044 }
1045 data += LCK_RW_SHARED_READER;
1046 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp);
1047 }
1048
1049 /*
1050 * Routine: lck_rw_lock_exclusive
1051 */
1052 static void
1053 lck_rw_lock_exclusive_gen(
1054 lck_rw_t *lck)
1055 {
1056 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1057 uint64_t deadline = 0;
1058 int slept = 0;
1059 int gotlock = 0;
1060 int lockheld = 0;
1061 wait_result_t res = 0;
1062 boolean_t istate = -1;
1063
1064 #if CONFIG_DTRACE
1065 boolean_t dtrace_ls_initialized = FALSE;
1066 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE;
1067 uint64_t wait_interval = 0;
1068 int readers_at_sleep = 0;
1069 #endif
1070
1071 /*
1072 * Try to acquire the lck_rw_want_write bit.
1073 */
1074 while ( !lck_rw_grab_want(lck)) {
1075
1076 #if CONFIG_DTRACE
1077 if (dtrace_ls_initialized == FALSE) {
1078 dtrace_ls_initialized = TRUE;
1079 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1080 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1081 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1082 if (dtrace_ls_enabled) {
1083 /*
1084 * Either sleeping or spinning is happening,
1085 * start a timing of our delay interval now.
1086 */
1087 readers_at_sleep = lck->lck_rw_shared_count;
1088 wait_interval = mach_absolute_time();
1089 }
1090 }
1091 #endif
1092 if (istate == -1)
1093 istate = ml_get_interrupts_enabled();
1094
1095 deadline = lck_rw_deadline_for_spin(lck);
1096
1097 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1098
1099 while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
1100 lck_rw_lock_pause(istate);
1101
1102 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
1103
1104 if (gotlock)
1105 break;
1106 /*
1107 * if we get here, the deadline has expired w/o us
1108 * being able to grab the lock exclusively
1109 * check to see if we're allowed to do a thread_block
1110 */
1111 if (lck->lck_rw_can_sleep) {
1112
1113 istate = lck_interlock_lock(lck);
1114
1115 if (lck->lck_rw_want_write) {
1116
1117 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1118
1119 lck->lck_w_waiting = TRUE;
1120
1121 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1122 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1123 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1124 lck_interlock_unlock(lck, istate);
1125
1126 if (res == THREAD_WAITING) {
1127 res = thread_block(THREAD_CONTINUE_NULL);
1128 slept++;
1129 }
1130 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1131 } else {
1132 lck->lck_rw_want_write = TRUE;
1133 lck_interlock_unlock(lck, istate);
1134 break;
1135 }
1136 }
1137 }
1138 /*
1139 * Wait for readers (and upgrades) to finish...
1140 * the test for these conditions must be done simultaneously with
1141 * a check of the interlock not being held since
1142 * the rw_shared_count will drop to 0 first and then want_upgrade
1143 * will be set to 1 in the shared_to_exclusive scenario... those
1144 * adjustments are done behind the interlock and represent an
1145 * atomic change in state and must be considered as such
1146 * however, once we see the read count at 0, the want_upgrade not set
1147 * and the interlock not held, we are safe to proceed
1148 */
1149 while (lck_rw_held_read_or_upgrade(lck)) {
1150
1151 #if CONFIG_DTRACE
1152 /*
1153 * Either sleeping or spinning is happening, start
1154 * a timing of our delay interval now. If we set it
1155 * to -1 we don't have accurate data so we cannot later
1156 * decide to record a dtrace spin or sleep event.
1157 */
1158 if (dtrace_ls_initialized == FALSE) {
1159 dtrace_ls_initialized = TRUE;
1160 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1161 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1162 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1163 if (dtrace_ls_enabled) {
1164 /*
1165 * Either sleeping or spinning is happening,
1166 * start a timing of our delay interval now.
1167 */
1168 readers_at_sleep = lck->lck_rw_shared_count;
1169 wait_interval = mach_absolute_time();
1170 }
1171 }
1172 #endif
1173 if (istate == -1)
1174 istate = ml_get_interrupts_enabled();
1175
1176 deadline = lck_rw_deadline_for_spin(lck);
1177
1178 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1179
1180 while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
1181 lck_rw_lock_pause(istate);
1182
1183 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);
1184
1185 if ( !lockheld)
1186 break;
1187 /*
1188 * if we get here, the deadline has expired w/o us
1189 * being able to grab the lock exclusively
1190 * check to see if we're allowed to do a thread_block
1191 */
1192 if (lck->lck_rw_can_sleep) {
1193
1194 istate = lck_interlock_lock(lck);
1195
1196 if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
1197 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1198
1199 lck->lck_w_waiting = TRUE;
1200
1201 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1202 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1203 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1204 lck_interlock_unlock(lck, istate);
1205
1206 if (res == THREAD_WAITING) {
1207 res = thread_block(THREAD_CONTINUE_NULL);
1208 slept++;
1209 }
1210 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1211 } else {
1212 lck_interlock_unlock(lck, istate);
1213 /*
1214 * must own the lock now, since we checked for
1215 * readers or upgrade owner behind the interlock
1216 * no need for a call to 'lck_rw_held_read_or_upgrade'
1217 */
1218 break;
1219 }
1220 }
1221 }
1222
1223 #if CONFIG_DTRACE
1224 /*
1225 * Decide what latencies we suffered that are Dtrace events.
1226 * If we have set wait_interval, then we either spun or slept.
1227 * At least we get out from under the interlock before we record
1228 * which is the best we can do here to minimize the impact
1229 * of the tracing.
1230 * If we have set wait_interval to -1, then dtrace was not enabled when we
1231 * started sleeping/spinning so we don't record this event.
1232 */
1233 if (dtrace_ls_enabled == TRUE) {
1234 if (slept == 0) {
1235 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1236 mach_absolute_time() - wait_interval, 1);
1237 } else {
1238 /*
1239 * For the blocking case, we also record if when we blocked
1240 * it was held for read or write, and how many readers.
1241 * Notice that above we recorded this before we dropped
1242 * the interlock so the count is accurate.
1243 */
1244 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1245 mach_absolute_time() - wait_interval, 1,
1246 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1247 }
1248 }
1249 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1250 #endif
1251 }
1252
1253 /*
1254 * Routine: lck_rw_done
1255 */
1256
1257 lck_rw_type_t lck_rw_done(lck_rw_t *lock)
1258 {
1259 uint32_t data, prev;
1260
1261 for ( ; ; ) {
1262 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1263 if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */
1264 atomic_exchange_abort();
1265 lck_rw_interlock_spin(lock);
1266 continue;
1267 }
1268 if (data & LCK_RW_SHARED_MASK) {
1269 data -= LCK_RW_SHARED_READER;
1270 if ((data & LCK_RW_SHARED_MASK) == 0) /* if reader count has now gone to 0, check for waiters */
1271 goto check_waiters;
1272 } else { /* if reader count == 0, must be exclusive lock */
1273 if (data & LCK_RW_WANT_UPGRADE) {
1274 data &= ~(LCK_RW_WANT_UPGRADE);
1275 } else {
1276 if (data & LCK_RW_WANT_WRITE)
1277 data &= ~(LCK_RW_WANT_EXCL);
1278 else /* lock is not 'owned', panic */
1279 panic("Releasing non-exclusive RW lock without a reader refcount!");
1280 }
1281 check_waiters:
1282 if (prev & LCK_RW_W_WAITING) {
1283 data &= ~(LCK_RW_W_WAITING);
1284 if ((prev & LCK_RW_PRIV_EXCL) == 0)
1285 data &= ~(LCK_RW_R_WAITING);
1286 } else
1287 data &= ~(LCK_RW_R_WAITING);
1288 }
1289 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp))
1290 break;
1291 cpu_pause();
1292 }
1293 return lck_rw_done_gen(lock, prev);
1294 }
1295
1296 /*
1297 * Routine: lck_rw_done_gen
1298 *
1299 * called from lck_rw_done()
1300 * prior_lock_state is the value in the 1st
1301 * word of the lock at the time of a successful
1302 * atomic compare and exchange with the new value...
1303 * it represents the state of the lock before we
1304 * decremented the rw_shared_count or cleared either
1305 * rw_want_upgrade or rw_want_write and
1306 * the lck_x_waiting bits... since the wrapper
1307 * routine has already changed the state atomically,
1308 * we just need to decide if we should
1309 * wake up anyone and what value to return... we do
1310 * this by examining the state of the lock before
1311 * we changed it
1312 */
1313 static lck_rw_type_t
1314 lck_rw_done_gen(
1315 lck_rw_t *lck,
1316 uint32_t prior_lock_state)
1317 {
1318 lck_rw_t *fake_lck;
1319 lck_rw_type_t lock_type;
1320 thread_t thread;
1321 uint32_t rwlock_count;
1322
1323 thread = current_thread();
1324 rwlock_count = thread->rwlock_count--;
1325 fake_lck = (lck_rw_t *)&prior_lock_state;
1326
1327 if (lck->lck_rw_can_sleep) {
1328 /*
1329 * prior_lock state is a snapshot of the 1st word of the
1330 * lock in question... we'll fake up a pointer to it
1331 * and carefully not access anything beyond what's defined
1332 * in the first word of a lck_rw_t
1333 */
1334
1335 if (fake_lck->lck_rw_shared_count <= 1) {
1336 if (fake_lck->lck_w_waiting) {
1337 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1338 }
1339
1340 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
1341 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1342 }
1343 }
1344 #if MACH_LDEBUG
1345 if (rwlock_count == 0) {
1346 panic("rw lock count underflow for thread %p", thread);
1347 }
1348 #endif
1349 /* Check if dropping the lock means that we need to unpromote */
1350
1351 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1352 /* sched_flags checked without lock, but will be rechecked while clearing */
1353 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1354 }
1355 }
1356 if (fake_lck->lck_rw_shared_count) {
1357 lock_type = LCK_RW_TYPE_SHARED;
1358 } else {
1359 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1360 }
1361
1362 #if CONFIG_DTRACE
1363 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1364 #endif
1365
1366 return lock_type;
1367 }
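
/*
 * The fake_lck technique above depends on all of the fields consulted
 * here (lck_rw_shared_count, the waiter bits and lck_rw_priv_excl) living
 * in the first 32-bit word of lck_rw_t, so a saved copy of that word can
 * be viewed through a lck_rw_t pointer (a sketch of the idiom):
 *
 *	uint32_t snapshot = prior_lock_state;
 *	lck_rw_t *fake = (lck_rw_t *)&snapshot;
 *	... fake->lck_rw_shared_count etc. reflect the pre-release state ...
 */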
1368
1369
1370 /*
1371 * Routine: lck_rw_unlock
1372 */
1373 void
1374 lck_rw_unlock(
1375 lck_rw_t *lck,
1376 lck_rw_type_t lck_rw_type)
1377 {
1378 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1379 lck_rw_unlock_shared(lck);
1380 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1381 lck_rw_unlock_exclusive(lck);
1382 else
1383 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1384 }
1385
1386
1387 /*
1388 * Routine: lck_rw_unlock_shared
1389 */
1390 void
1391 lck_rw_unlock_shared(
1392 lck_rw_t *lck)
1393 {
1394 lck_rw_type_t ret;
1395
1396 assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count);
1397 ret = lck_rw_done(lck);
1398
1399 if (ret != LCK_RW_TYPE_SHARED)
1400 panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
1401 }
1402
1403
1404 /*
1405 * Routine: lck_rw_unlock_exclusive
1406 */
1407 void
1408 lck_rw_unlock_exclusive(
1409 lck_rw_t *lck)
1410 {
1411 lck_rw_type_t ret;
1412
1413 ret = lck_rw_done(lck);
1414
1415 if (ret != LCK_RW_TYPE_EXCLUSIVE)
1416 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1417 }
1418
1419
1420 /*
1421 * Routine: lck_rw_lock
1422 */
1423 void
1424 lck_rw_lock(
1425 lck_rw_t *lck,
1426 lck_rw_type_t lck_rw_type)
1427 {
1428 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1429 lck_rw_lock_shared(lck);
1430 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1431 lck_rw_lock_exclusive(lck);
1432 else
1433 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1434 }
1435
1436 /*
1437 * Routine: lck_rw_lock_shared
1438 */
1439 void
1440 lck_rw_lock_shared(lck_rw_t *lock)
1441 {
1442 uint32_t data, prev;
1443
1444 current_thread()->rwlock_count++;
1445 for ( ; ; ) {
1446 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1447 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1448 atomic_exchange_abort();
1449 if (lock->lck_rw_can_sleep) {
1450 lck_rw_lock_shared_gen(lock);
1451 } else {
1452 cpu_pause();
1453 continue;
1454 }
1455 break;
1456 }
1457 data += LCK_RW_SHARED_READER;
1458 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1459 break;
1460 cpu_pause();
1461 }
1462 #if CONFIG_DTRACE
1463 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1464 #endif /* CONFIG_DTRACE */
1465 return;
1466 }
1467
1468 /*
1469 * Routine: lck_rw_lock_shared_gen
1470 * Function:
1471 * assembly fast path code has determined that this lock
1472 * is held exclusively... this is where we spin/block
1473 * until we can acquire the lock in the shared mode
1474 */
1475 static void
1476 lck_rw_lock_shared_gen(
1477 lck_rw_t *lck)
1478 {
1479 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1480 uint64_t deadline = 0;
1481 int gotlock = 0;
1482 int slept = 0;
1483 wait_result_t res = 0;
1484 boolean_t istate = -1;
1485
1486 #if CONFIG_DTRACE
1487 uint64_t wait_interval = 0;
1488 int readers_at_sleep = 0;
1489 boolean_t dtrace_ls_initialized = FALSE;
1490 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1491 #endif
1492
1493 while ( !lck_rw_grab_shared(lck)) {
1494
1495 #if CONFIG_DTRACE
1496 if (dtrace_ls_initialized == FALSE) {
1497 dtrace_ls_initialized = TRUE;
1498 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1499 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1500 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1501 if (dtrace_ls_enabled) {
1502 /*
1503 * Either sleeping or spinning is happening,
1504 * start a timing of our delay interval now.
1505 */
1506 readers_at_sleep = lck->lck_rw_shared_count;
1507 wait_interval = mach_absolute_time();
1508 }
1509 }
1510 #endif
1511 if (istate == -1)
1512 istate = ml_get_interrupts_enabled();
1513
1514 deadline = lck_rw_deadline_for_spin(lck);
1515
1516 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1517 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1518
1519 while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
1520 lck_rw_lock_pause(istate);
1521
1522 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1523 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
1524
1525 if (gotlock)
1526 break;
1527 /*
1528 * if we get here, the deadline has expired w/o us
1529 * being able to grab the lock for read
1530 * check to see if we're allowed to do a thread_block
1531 */
1532 if (lck->lck_rw_can_sleep) {
1533
1534 istate = lck_interlock_lock(lck);
1535
1536 if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1537 ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1538
1539 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1540 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1541
1542 lck->lck_r_waiting = TRUE;
1543
1544 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1545 res = assert_wait(RW_LOCK_READER_EVENT(lck),
1546 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1547 lck_interlock_unlock(lck, istate);
1548
1549 if (res == THREAD_WAITING) {
1550 res = thread_block(THREAD_CONTINUE_NULL);
1551 slept++;
1552 }
1553 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1554 trace_lck, res, slept, 0, 0);
1555 } else {
1556 lck->lck_rw_shared_count++;
1557 lck_interlock_unlock(lck, istate);
1558 break;
1559 }
1560 }
1561 }
1562
1563 #if CONFIG_DTRACE
1564 if (dtrace_ls_enabled == TRUE) {
1565 if (slept == 0) {
1566 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1567 } else {
1568 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1569 mach_absolute_time() - wait_interval, 0,
1570 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1571 }
1572 }
1573 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1574 #endif
1575 }
1576
1577
1578 /*
1579 * Routine: lck_rw_lock_exclusive
1580 */
1581
1582 void
1583 lck_rw_lock_exclusive(lck_rw_t *lock)
1584 {
1585 current_thread()->rwlock_count++;
1586 if (atomic_test_and_set32(&lock->data,
1587 (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK),
1588 LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) {
1589 #if CONFIG_DTRACE
1590 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1591 #endif /* CONFIG_DTRACE */
1592 } else
1593 lck_rw_lock_exclusive_gen(lock);
1594 }
1595
1596
1597 /*
1598 * Routine: lck_rw_lock_shared_to_exclusive
1599 *
1600 * False returned upon failure, in this case the shared lock is dropped.
1601 */
1602
1603 boolean_t
1604 lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
1605 {
1606 uint32_t data, prev;
1607
1608 for ( ; ; ) {
1609 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1610 if (data & LCK_RW_INTERLOCK) {
1611 atomic_exchange_abort();
1612 lck_rw_interlock_spin(lock);
1613 continue;
1614 }
1615 if (data & LCK_RW_WANT_UPGRADE) {
1616 data -= LCK_RW_SHARED_READER;
1617 if ((data & LCK_RW_SHARED_MASK) == 0) /* we were the last reader */
1618 data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */
1619 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1620 return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1621 } else {
1622 data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */
1623 data -= LCK_RW_SHARED_READER; /* and shed our read count */
1624 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1625 break;
1626 }
1627 cpu_pause();
1628 }
1629 /* we now own the WANT_UPGRADE */
1630 if (data & LCK_RW_SHARED_MASK) /* check to see if all of the readers are drained */
1631 lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */
1632 #if CONFIG_DTRACE
1633 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1634 #endif
1635 return TRUE;
1636 }
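
/*
 * A minimal caller sketch for the upgrade path above ("my_lock" is
 * hypothetical).  Note the documented failure behavior: if the upgrade
 * loses the race, the shared hold has already been dropped, so the
 * caller must reacquire and revalidate.
 *
 *	lck_rw_lock_shared(my_lock);
 *	... read-side work ...
 *	if (!lck_rw_lock_shared_to_exclusive(my_lock)) {
 *		lck_rw_lock_exclusive(my_lock);
 *		... revalidate state that may have changed while unlocked ...
 *	}
 *	... write-side work ...
 *	lck_rw_unlock_exclusive(my_lock);
 */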
1637
1638
1639 /*
1640 * Routine: lck_rw_lock_shared_to_exclusive_failure
1641 * Function:
1642 * assembly fast path code has already dropped our read
1643 * count and determined that someone else owns 'lck_rw_want_upgrade'
1644 * if 'lck_rw_shared_count' == 0, it's also already dropped 'lck_w_waiting'
1645 * all we need to do here is determine if a wakeup is needed
1646 */
1647 static boolean_t
1648 lck_rw_lock_shared_to_exclusive_failure(
1649 lck_rw_t *lck,
1650 uint32_t prior_lock_state)
1651 {
1652 lck_rw_t *fake_lck;
1653 thread_t thread = current_thread();
1654 uint32_t rwlock_count;
1655
1656 /* Check if dropping the lock means that we need to unpromote */
1657 rwlock_count = thread->rwlock_count--;
1658 #if MACH_LDEBUG
1659 if (rwlock_count == 0) {
1660 panic("rw lock count underflow for thread %p", thread);
1661 }
1662 #endif
1663 fake_lck = (lck_rw_t *)&prior_lock_state;
1664
1665 if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
1666 /*
1667 * Someone else has requested upgrade.
1668 * Since we've released the read lock, wake
1669 * him up if he's blocked waiting
1670 */
1671 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1672 }
1673
1674 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1675 /* sched_flags checked without lock, but will be rechecked while clearing */
1676 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1677 }
1678
1679 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1680 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1681
1682 return (FALSE);
1683 }
1684
1685
1686 /*
1687 * Routine: lck_rw_lock_shared_to_exclusive_success
1688 * Function:
1689 * assembly fast path code has already dropped our read
1690 * count and successfully acquired 'lck_rw_want_upgrade'
1691 * we just need to wait for the rest of the readers to drain
1692 * and then we can return as the exclusive holder of this lock
1693 */
1694 static boolean_t
1695 lck_rw_lock_shared_to_exclusive_success(
1696 lck_rw_t *lck)
1697 {
1698 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1699 uint64_t deadline = 0;
1700 int slept = 0;
1701 int still_shared = 0;
1702 wait_result_t res;
1703 boolean_t istate = -1;
1704
1705 #if CONFIG_DTRACE
1706 uint64_t wait_interval = 0;
1707 int readers_at_sleep = 0;
1708 boolean_t dtrace_ls_initialized = FALSE;
1709 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1710 #endif
1711
1712 while (lck->lck_rw_shared_count != 0) {
1713
1714 #if CONFIG_DTRACE
1715 if (dtrace_ls_initialized == FALSE) {
1716 dtrace_ls_initialized = TRUE;
1717 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1718 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1719 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1720 if (dtrace_ls_enabled) {
1721 /*
1722 * Either sleeping or spinning is happening,
1723 * start a timing of our delay interval now.
1724 */
1725 readers_at_sleep = lck->lck_rw_shared_count;
1726 wait_interval = mach_absolute_time();
1727 }
1728 }
1729 #endif
1730 if (istate == -1)
1731 istate = ml_get_interrupts_enabled();
1732
1733 deadline = lck_rw_deadline_for_spin(lck);
1734
1735 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1736 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1737
1738 while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
1739 lck_rw_lock_pause(istate);
1740
1741 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1742 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1743
1744 if ( !still_shared)
1745 break;
1746 /*
1747 * if we get here, the deadline has expired w/o
1748 * the rw_shared_count having drained to 0
1749 * check to see if we're allowed to do a thread_block
1750 */
1751 if (lck->lck_rw_can_sleep) {
1752
1753 istate = lck_interlock_lock(lck);
1754
1755 if (lck->lck_rw_shared_count != 0) {
1756 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1757 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1758
1759 lck->lck_w_waiting = TRUE;
1760
1761 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1762 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1763 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1764 lck_interlock_unlock(lck, istate);
1765
1766 if (res == THREAD_WAITING) {
1767 res = thread_block(THREAD_CONTINUE_NULL);
1768 slept++;
1769 }
1770 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1771 trace_lck, res, slept, 0, 0);
1772 } else {
1773 lck_interlock_unlock(lck, istate);
1774 break;
1775 }
1776 }
1777 }
1778 #if CONFIG_DTRACE
1779 /*
1780 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1781 */
1782 if (dtrace_ls_enabled == TRUE) {
1783 if (slept == 0) {
1784 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1785 } else {
1786 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
1787 mach_absolute_time() - wait_interval, 1,
1788 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1789 }
1790 }
1791 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1792 #endif
1793 return (TRUE);
1794 }
1795
1796 /*
1797 * Routine: lck_rw_lock_exclusive_to_shared
1798 */
1799
1800 void lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
1801 {
1802 uint32_t data, prev;
1803
1804 for ( ; ; ) {
1805 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1806 if (data & LCK_RW_INTERLOCK) {
1807 atomic_exchange_abort();
1808 lck_rw_interlock_spin(lock); /* wait for interlock to clear */
1809 continue;
1810 }
1811 data += LCK_RW_SHARED_READER;
1812 if (data & LCK_RW_WANT_UPGRADE)
1813 data &= ~(LCK_RW_WANT_UPGRADE);
1814 else
1815 data &= ~(LCK_RW_WANT_EXCL);
1816 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL)))
1817 data &= ~(LCK_RW_W_WAITING);
1818 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp))
1819 break;
1820 cpu_pause();
1821 }
1822 lck_rw_lock_exclusive_to_shared_gen(lock, prev);
1823 }
1824
1825
1826 /*
1827 * Routine: lck_rw_lock_exclusive_to_shared_gen
1828 * Function:
1829 * assembly fast path has already dropped
1830 * our exclusive state and bumped lck_rw_shared_count
1831 * all we need to do here is determine if anyone
1832 * needs to be awakened.
1833 */
1834 static void
1835 lck_rw_lock_exclusive_to_shared_gen(
1836 lck_rw_t *lck,
1837 uint32_t prior_lock_state)
1838 {
1839 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1840 lck_rw_t *fake_lck;
1841
1842 fake_lck = (lck_rw_t *)&prior_lock_state;
1843
1844 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1845 trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
1846
1847 /*
1848 * don't wake up anyone waiting to take the lock exclusively
1849 * since we hold a read count... when the read count drops to 0,
1850 * the writers will be woken.
1851 *
1852 * wake up any waiting readers if we don't have any writers waiting,
1853 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1854 */
1855 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1856 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1857
1858 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1859 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1860
1861 #if CONFIG_DTRACE
1862 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1863 #endif
1864 }
1865
1866
1867 /*
1868 * Routine: lck_rw_try_lock
1869 */
1870 boolean_t
1871 lck_rw_try_lock(
1872 lck_rw_t *lck,
1873 lck_rw_type_t lck_rw_type)
1874 {
1875 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1876 return(lck_rw_try_lock_shared(lck));
1877 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1878 return(lck_rw_try_lock_exclusive(lck));
1879 else
1880 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
1881 return(FALSE);
1882 }
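/*
 * Illustrative usage sketch (not part of this file): a caller that opportunistically
 * tries for the lock and falls back to the blocking path if the try fails. The
 * function and variable names below are hypothetical.
 */
#if 0
static void
example_read_cache(lck_rw_t *cache_lock)
{
	if (lck_rw_try_lock(cache_lock, LCK_RW_TYPE_SHARED)) {
		/* acquired without any possibility of blocking */
		/* ... read the protected data ... */
		lck_rw_done(cache_lock);
	} else {
		/* lock was busy; take the blocking path instead */
		lck_rw_lock(cache_lock, LCK_RW_TYPE_SHARED);
		/* ... read the protected data ... */
		lck_rw_done(cache_lock);
	}
}
#endif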
1883
1884 /*
1885 * Routine: lck_rw_try_lock_shared
1886 */
1887
1888 boolean_t lck_rw_try_lock_shared(lck_rw_t *lock)
1889 {
1890 uint32_t data, prev;
1891
1892 for ( ; ; ) {
1893 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1894 if (data & LCK_RW_INTERLOCK) {
1895 atomic_exchange_abort();
1896 lck_rw_interlock_spin(lock);
1897 continue;
1898 }
1899 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1900 atomic_exchange_abort();
1901 return FALSE; /* lock is busy */
1902 }
1903 data += LCK_RW_SHARED_READER; /* Increment reader refcount */
1904 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1905 break;
1906 cpu_pause();
1907 }
1908 current_thread()->rwlock_count++;
1909 /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */
1910 #if CONFIG_DTRACE
1911 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1912 #endif /* CONFIG_DTRACE */
1913 return TRUE;
1914 }
1915
1916
1917 /*
1918 * Routine: lck_rw_try_lock_exclusive
1919 */
1920
1921 boolean_t lck_rw_try_lock_exclusive(lck_rw_t *lock)
1922 {
1923 uint32_t data, prev;
1924
1925 for ( ; ; ) {
1926 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1927 if (data & LCK_RW_INTERLOCK) {
1928 atomic_exchange_abort();
1929 lck_rw_interlock_spin(lock);
1930 continue;
1931 }
1932 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1933 atomic_exchange_abort();
1934 return FALSE; /* can't get it */
1935 }
1936 data |= LCK_RW_WANT_EXCL;
1937 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1938 break;
1939 cpu_pause();
1940 }
1941
1942 current_thread()->rwlock_count++;
1943 #if CONFIG_DTRACE
1944 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1945 #endif /* CONFIG_DTRACE */
1946 return TRUE;
1947 }
1948
1949
1950 void
1951 lck_rw_assert(
1952 lck_rw_t *lck,
1953 unsigned int type)
1954 {
1955 switch (type) {
1956 case LCK_RW_ASSERT_SHARED:
1957 if (lck->lck_rw_shared_count != 0) {
1958 return;
1959 }
1960 break;
1961 case LCK_RW_ASSERT_EXCLUSIVE:
1962 if ((lck->lck_rw_want_write ||
1963 lck->lck_rw_want_upgrade) &&
1964 lck->lck_rw_shared_count == 0) {
1965 return;
1966 }
1967 break;
1968 case LCK_RW_ASSERT_HELD:
1969 if (lck->lck_rw_want_write ||
1970 lck->lck_rw_want_upgrade ||
1971 lck->lck_rw_shared_count != 0) {
1972 return;
1973 }
1974 break;
1975 case LCK_RW_ASSERT_NOTHELD:
1976 if (!(lck->lck_rw_want_write ||
1977 lck->lck_rw_want_upgrade ||
1978 lck->lck_rw_shared_count != 0)) {
1979 return;
1980 }
1981 break;
1982 default:
1983 break;
1984 }
1985
1986 panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
1987 }
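/*
 * Illustrative usage sketch (not part of this file): a routine whose contract
 * requires the caller to hold the lock exclusively can enforce that precondition
 * with lck_rw_assert. The names below are hypothetical.
 */
#if 0
static void
example_modify_table(lck_rw_t *table_lock)
{
	lck_rw_assert(table_lock, LCK_RW_ASSERT_EXCLUSIVE);
	/* ... safe to mutate state protected by table_lock ... */
}
#endif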
1988
1989 /* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
1990 #if MACH_LDEBUG
1991 __dead2
1992 #endif
1993 void
1994 lck_rw_clear_promotions_x86(thread_t thread)
1995 {
1996 #if MACH_LDEBUG
1997 /* It's fatal to leave a RW lock locked and return to userspace */
1998 panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
1999 #else
2000 /* Paper over the issue */
2001 thread->rwlock_count = 0;
2002 lck_rw_clear_promotion(thread, 0);
2003 #endif
2004 }
2005
2006 boolean_t
2007 lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
2008 {
2009 lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2010
2011 if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || force_yield) {
2012 lck_rw_unlock_shared(lck);
2013 mutex_pause(2);
2014 lck_rw_lock_shared(lck);
2015 return TRUE;
2016 }
2017
2018 return FALSE;
2019 }
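/*
 * Illustrative usage sketch (not part of this file): a long scan that holds the
 * lock shared can periodically offer to yield to queued writers. When the yield
 * happens the lock is dropped and reacquired, so any cached view of the protected
 * data must be revalidated. Names below are hypothetical.
 */
#if 0
static void
example_scan_list(lck_rw_t *list_lock, int nelem)
{
	int i;

	lck_rw_lock_shared(list_lock);
	for (i = 0; i < nelem; i++) {
		/* ... examine element i ... */
		if (lck_rw_lock_yield_shared(list_lock, FALSE)) {
			/* lock was dropped and reacquired; revalidate cached state */
		}
	}
	lck_rw_done(list_lock);
}
#endif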
2020
2021 /*
2022 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
2023 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2024 */
2025 boolean_t
2026 kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) {
2027 if (not_in_kdp) {
2028 panic("panic: rw lock exclusive check done outside of kernel debugger");
2029 }
2030 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2031 }
2032
2033 /*
2034 * Slow path routines for lck_mtx locking and unlocking functions.
2035 *
2036 * These functions were previously implemented in x86 assembly,
2037 * and some optimizations are in place in this C code to obtain compiled code
2038 * as performant and compact as the assembly version.
2039 *
2040 * To avoid inlining these functions on the fast path, every function directly called by
2041 * the fast paths is marked __attribute__((noinline)). They are also all implemented
2042 * in such a way that the fast path can tail call into them. In this way the return address
2043 * does not need to be pushed on the caller's stack and stack optimization can happen in the caller.
2044 *
2045 * Slow path code is structured so that there are no calls to functions that will return
2046 * in the context of the caller function, i.e. all functions called are either tail call functions
2047 * or inline functions. The tail call functions take fewer than six arguments,
2048 * so that they can be passed in registers and do not need to be pushed on the stack.
2049 * This allows the compiler to avoid creating a stack frame for these functions.
2050 *
2051 * __improbable and __probable are used to compile the slow path code in such a way
2052 * that the fast path case runs on a sequence of instructions with as few jumps as possible,
2053 * making this case the most optimized even when falling through the slow path.
2054 */
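/*
 * Illustrative sketch (not part of this file's build): the noinline + tail-call
 * structure described above, written with hypothetical names and guarded out so
 * it is never compiled. It assumes a compiler that performs sibling-call
 * optimization for the tail call.
 */
#if 0
typedef struct { volatile uint32_t state; } example_lock_t;

static inline boolean_t
example_try_fast_path(example_lock_t *lock)
{
	/* single cmpxchg attempt; succeeds only if the lock is free */
	return os_atomic_cmpxchg(&lock->state, 0, 1, acquire);
}

__attribute__((noinline))
static void
example_lock_contended(example_lock_t *lock)
{
	/* slow path: takes fewer than six arguments so they stay in registers */
	while (!example_try_fast_path(lock)) {
		cpu_pause();
	}
}

static inline void
example_lock(example_lock_t *lock)
{
	if (__probable(example_try_fast_path(lock))) {
		return;		/* fast path: straight-line code, no extra jumps */
	}
	/* tail call into the noinline slow path: no return address is pushed here */
	return example_lock_contended(lock);
}
#endif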
2055
2056 /*
2057 * Intel lock invariants:
2058 *
2059 * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
2060 *
2061 * The lock owner is promoted to the max priority of all its waiters only if it
2062 * was at a lower priority when it acquired the lock or was already the owner when a waiter waited.
2063 * Max priority is capped at MAXPRI_PROMOTE.
2064 *
2065 * The last waiter will not be promoted as it is woken up, but the last
2066 * lock owner may not have been the last thread to have been woken up depending on the
2067 * luck of the draw. Therefore a last-owner may still have the promoted-on-wakeup
2068 * flag set.
2069 *
2070 * TODO: Figure out an algorithm for stopping a lock holder which is already at the right
2071 * priority from dropping priority in the future without having to take thread lock
2072 * on acquire.
2073 */
2074
2075 #ifdef MUTEX_ZONE
2076 extern zone_t lck_mtx_zone;
2077 #endif
2078
2079 /*
2080 * Routine: lck_mtx_alloc_init
2081 */
2082 lck_mtx_t *
2083 lck_mtx_alloc_init(
2084 lck_grp_t *grp,
2085 lck_attr_t *attr)
2086 {
2087 lck_mtx_t *lck;
2088 #ifdef MUTEX_ZONE
2089 if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
2090 lck_mtx_init(lck, grp, attr);
2091 #else
2092 if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
2093 lck_mtx_init(lck, grp, attr);
2094 #endif
2095 return(lck);
2096 }
2097
2098 /*
2099 * Routine: lck_mtx_free
2100 */
2101 void
2102 lck_mtx_free(
2103 lck_mtx_t *lck,
2104 lck_grp_t *grp)
2105 {
2106 lck_mtx_destroy(lck, grp);
2107 #ifdef MUTEX_ZONE
2108 zfree(lck_mtx_zone, lck);
2109 #else
2110 kfree(lck, sizeof(lck_mtx_t));
2111 #endif
2112 }
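/*
 * Illustrative lifecycle sketch (not part of this file): allocating, using and
 * freeing a mutex via the routines above, together with the standard lock group
 * routines from kern/locks.h. The group name and function below are hypothetical.
 */
#if 0
static void
example_mutex_lifecycle(void)
{
	lck_grp_t *grp = lck_grp_alloc_init("example.group", LCK_GRP_ATTR_NULL);
	lck_mtx_t *mtx = lck_mtx_alloc_init(grp, LCK_ATTR_NULL);

	lck_mtx_lock(mtx);
	/* ... critical section ... */
	lck_mtx_unlock(mtx);

	lck_mtx_free(mtx, grp);		/* destroys and frees the mutex */
	lck_grp_free(grp);
}
#endif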
2113
2114 /*
2115 * Routine: lck_mtx_ext_init
2116 */
2117 static void
2118 lck_mtx_ext_init(
2119 lck_mtx_ext_t *lck,
2120 lck_grp_t *grp,
2121 lck_attr_t *attr)
2122 {
2123 bzero((void *)lck, sizeof(lck_mtx_ext_t));
2124
2125 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2126 lck->lck_mtx_deb.type = MUTEX_TAG;
2127 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2128 }
2129
2130 lck->lck_mtx_grp = grp;
2131
2132 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
2133 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
2134
2135 lck->lck_mtx.lck_mtx_is_ext = 1;
2136 lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
2137 }
2138
2139 /*
2140 * Routine: lck_mtx_init
2141 */
2142 void
2143 lck_mtx_init(
2144 lck_mtx_t *lck,
2145 lck_grp_t *grp,
2146 lck_attr_t *attr)
2147 {
2148 lck_mtx_ext_t *lck_ext;
2149 lck_attr_t *lck_attr;
2150
2151 if (attr != LCK_ATTR_NULL)
2152 lck_attr = attr;
2153 else
2154 lck_attr = &LockDefaultLckAttr;
2155
2156 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2157 if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
2158 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2159 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2160 lck->lck_mtx_ptr = lck_ext;
2161 }
2162 } else {
2163 lck->lck_mtx_owner = 0;
2164 lck->lck_mtx_state = 0;
2165 }
2166 lck->lck_mtx_pad32 = 0xFFFFFFFF;
2167 lck_grp_reference(grp);
2168 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2169 }
2170
2171 /*
2172 * Routine: lck_mtx_init_ext
2173 */
2174 void
2175 lck_mtx_init_ext(
2176 lck_mtx_t *lck,
2177 lck_mtx_ext_t *lck_ext,
2178 lck_grp_t *grp,
2179 lck_attr_t *attr)
2180 {
2181 lck_attr_t *lck_attr;
2182
2183 if (attr != LCK_ATTR_NULL)
2184 lck_attr = attr;
2185 else
2186 lck_attr = &LockDefaultLckAttr;
2187
2188 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2189 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2190 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2191 lck->lck_mtx_ptr = lck_ext;
2192 } else {
2193 lck->lck_mtx_owner = 0;
2194 lck->lck_mtx_state = 0;
2195 }
2196 lck->lck_mtx_pad32 = 0xFFFFFFFF;
2197
2198 lck_grp_reference(grp);
2199 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2200 }
2201
2202 static void
2203 lck_mtx_lock_mark_destroyed(
2204 lck_mtx_t *mutex,
2205 boolean_t indirect)
2206 {
2207 uint32_t state;
2208
2209 if (indirect) {
2210 /* convert to destroyed state */
2211 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2212 return;
2213 }
2214
2215 state = ordered_load_mtx_state(mutex);
2216 lck_mtx_interlock_lock(mutex, &state);
2217
2218 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2219
2220 enable_preemption();
2221 }
2222
2223 /*
2224 * Routine: lck_mtx_destroy
2225 */
2226 void
2227 lck_mtx_destroy(
2228 lck_mtx_t *lck,
2229 lck_grp_t *grp)
2230 {
2231 boolean_t indirect;
2232
2233 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
2234 return;
2235 #if MACH_LDEBUG
2236 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2237 #endif
2238 indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
2239
2240 lck_mtx_lock_mark_destroyed(lck, indirect);
2241
2242 if (indirect)
2243 kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
2244 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2245 lck_grp_deallocate(grp);
2246 return;
2247 }
2248
2249
2250 #if DEVELOPMENT | DEBUG
2251 __attribute__((noinline))
2252 void
2253 lck_mtx_owner_check_panic(
2254 lck_mtx_t *lock)
2255 {
2256 thread_t owner = (thread_t)lock->lck_mtx_owner;
2257 panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p", owner, lock);
2258 }
2259 #endif
2260
2261 __attribute__((always_inline))
2262 static boolean_t
2263 get_indirect_mutex(
2264 lck_mtx_t **lock,
2265 uint32_t *state)
2266 {
2267 *lock = &((*lock)->lck_mtx_ptr->lck_mtx);
2268 *state = ordered_load_mtx_state(*lock);
2269 return TRUE;
2270 }
2271
2272 /*
2273 * Routine: lck_mtx_unlock_slow
2274 *
2275 * Unlocks a mutex held by current thread.
2276 *
2277 * It will wake up waiters if necessary.
2278 *
2279 * Interlock can be held.
2280 */
2281 __attribute__((noinline))
2282 void
2283 lck_mtx_unlock_slow(
2284 lck_mtx_t *lock)
2285 {
2286 thread_t thread;
2287 uint32_t state, prev;
2288 boolean_t indirect = FALSE;
2289
2290 state = ordered_load_mtx_state(lock);
2291
2292 /* Is this an indirect mutex? */
2293 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2294 indirect = get_indirect_mutex(&lock, &state);
2295 }
2296
2297 thread = current_thread();
2298
2299 #if DEVELOPMENT | DEBUG
2300 thread_t owner = (thread_t)lock->lck_mtx_owner;
2301 if(__improbable(owner != thread))
2302 lck_mtx_owner_check_panic(lock);
2303 #endif
2304
2305 /* check if it is held as a spinlock */
2306 if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0))
2307 goto unlock;
2308
2309 lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);
2310
2311 unlock:
2312 /* preemption disabled, interlock held and mutex not held */
2313
2314 /* clear owner */
2315 ordered_store_mtx_owner(lock, 0);
2316 /* keep original state in prev for later evaluation */
2317 prev = state;
2318
2319 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
2320 #if MACH_LDEBUG
2321 if (thread)
2322 thread->mutex_count--;
2323 #endif
2324 return lck_mtx_unlock_wakeup_tail(lock, state, indirect);
2325 }
2326
2327 /* release interlock, promotion and clear spin flag */
2328 state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK));
2329 ordered_store_mtx_state_release(lock, state); /* since I own the interlock, I don't need an atomic update */
2330
2331 #if MACH_LDEBUG
2332 /* perform lock statistics after drop to prevent delay */
2333 if (thread)
2334 thread->mutex_count--; /* lock statistic */
2335 #endif /* MACH_LDEBUG */
2336
2337 /* re-enable preemption */
2338 lck_mtx_unlock_finish_inline(lock, FALSE);
2339
2340 return;
2341 }
2342
2343 #define LCK_MTX_LCK_WAIT_CODE 0x20
2344 #define LCK_MTX_LCK_WAKEUP_CODE 0x21
2345 #define LCK_MTX_LCK_SPIN_CODE 0x22
2346 #define LCK_MTX_LCK_ACQUIRE_CODE 0x23
2347 #define LCK_MTX_LCK_DEMOTE_CODE 0x24
2348
2349 /*
2350 * Routine: lck_mtx_unlock_wakeup_tail
2351 *
2352 * Invoked on unlock when there is
2353 * contention, i.e. the assembly routine sees
2354 * that mutex->lck_mtx_waiters != 0
2355 *
2356 * neither the mutex nor the interlock is held
2357 *
2358 * Note that this routine might not be called if there are pending
2359 * waiters which have previously been woken up, and they didn't
2360 * end up boosting the old owner.
2361 *
2362 * assembly routine previously did the following to mutex:
2363 * (after saving the state in prior_lock_state)
2364 * decremented lck_mtx_waiters if nonzero
2365 *
2366 * This function needs to be called as a tail call
2367 * to optimize the compiled code.
2368 */
2369 __attribute__((noinline))
2370 static void
2371 lck_mtx_unlock_wakeup_tail (
2372 lck_mtx_t *mutex,
2373 uint32_t state,
2374 boolean_t indirect)
2375 {
2376 struct turnstile *ts;
2377
2378 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2379 kern_return_t did_wake;
2380
2381 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
2382 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2383
2384 ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
2385
2386 if (mutex->lck_mtx_waiters > 1) {
2387 /* WAITQ_PROMOTE_ON_WAKE will call turnstile_update_inheritor on the woken-up thread */
2388 did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_PROMOTE_ON_WAKE);
2389 } else {
2390 did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
2391 turnstile_update_inheritor(ts, NULL, TURNSTILE_IMMEDIATE_UPDATE);
2392 }
2393 assert(did_wake == KERN_SUCCESS);
2394
2395 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
2396 turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2397
2398 state -= LCK_MTX_WAITER;
2399 state &= (~(LCK_MTX_SPIN_MSK | LCK_MTX_ILOCKED_MSK));
2400 ordered_store_mtx_state_release(mutex, state);
2401
2402 assert(current_thread()->turnstile != NULL);
2403
2404 turnstile_cleanup();
2405
2406 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
2407 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2408
2409 lck_mtx_unlock_finish_inline(mutex, indirect);
2410 }
2411
2412 /*
2413 * Routine: lck_mtx_lock_acquire_x86
2414 *
2415 * Invoked on acquiring the mutex when there is
2416 * contention (i.e. the assembly routine sees that
2417 * mutex->lck_mtx_waiters != 0)
2418 *
2419 * mutex is owned... interlock is held... preemption is disabled
2420 */
2421 __attribute__((always_inline))
2422 static void
2423 lck_mtx_lock_acquire_inline(
2424 lck_mtx_t *mutex,
2425 struct turnstile *ts)
2426 {
2427 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2428
2429 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
2430 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2431
2432 thread_t thread = (thread_t)mutex->lck_mtx_owner; /* faster than current_thread() */
2433 assert(thread->waiting_for_mutex == NULL);
2434
2435 if (mutex->lck_mtx_waiters > 0) {
2436 if (ts == NULL) {
2437 ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
2438 }
2439
2440 turnstile_update_inheritor(ts, thread, (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
2441 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
2442 }
2443
2444 if (ts != NULL) {
2445 turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2446 }
2447
2448 assert(current_thread()->turnstile != NULL);
2449
2450 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
2451 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2452 }
2453
2454 void
2455 lck_mtx_lock_acquire_x86(
2456 lck_mtx_t *mutex)
2457 {
2458 return lck_mtx_lock_acquire_inline(mutex, NULL);
2459 }
2460
2461 /*
2462 * Tail call helpers for lock functions that perform
2463 * lck_mtx_lock_acquire followed by the caller's finish routine, to optimize
2464 * the caller's compiled code.
2465 */
2466
2467 __attribute__((noinline))
2468 static void
2469 lck_mtx_lock_acquire_tail(
2470 lck_mtx_t *mutex,
2471 boolean_t indirect,
2472 struct turnstile *ts)
2473 {
2474 lck_mtx_lock_acquire_inline(mutex, ts);
2475 lck_mtx_lock_finish_inline_with_cleanup(mutex, ordered_load_mtx_state(mutex), indirect);
2476 }
2477
2478 __attribute__((noinline))
2479 static boolean_t
2480 lck_mtx_try_lock_acquire_tail(
2481 lck_mtx_t *mutex)
2482 {
2483 lck_mtx_lock_acquire_inline(mutex, NULL);
2484 lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));
2485
2486 return TRUE;
2487 }
2488
2489 __attribute__((noinline))
2490 static void
2491 lck_mtx_convert_spin_acquire_tail(
2492 lck_mtx_t *mutex)
2493 {
2494 lck_mtx_lock_acquire_inline(mutex, NULL);
2495 lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));
2496 }
2497
2498 boolean_t
2499 lck_mtx_ilk_unlock(
2500 lck_mtx_t *mutex)
2501 {
2502 lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex));
2503 return TRUE;
2504 }
2505
2506 static inline void
2507 lck_mtx_interlock_lock_set_and_clear_flags(
2508 lck_mtx_t *mutex,
2509 uint32_t xor_flags,
2510 uint32_t and_flags,
2511 uint32_t *new_state)
2512 {
2513 uint32_t state, prev;
2514 state = *new_state;
2515
2516 for ( ; ; ) {
2517 /* have to wait for interlock to clear */
2518 while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) {
2519 cpu_pause();
2520 state = ordered_load_mtx_state(mutex);
2521 }
2522 prev = state; /* prev contains snapshot for exchange */
2523 state |= LCK_MTX_ILOCKED_MSK | xor_flags; /* pick up interlock */
2524 state &= ~and_flags; /* clear flags */
2525
2526 disable_preemption();
2527 if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire))
2528 break;
2529 enable_preemption();
2530 cpu_pause();
2531 state = ordered_load_mtx_state(mutex);
2532 }
2533 *new_state = state;
2534 return;
2535 }
2536
2537 static inline void
2538 lck_mtx_interlock_lock_clear_flags(
2539 lck_mtx_t *mutex,
2540 uint32_t and_flags,
2541 uint32_t *new_state)
2542 {
2543 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state);
2544 }
2545
2546 static inline void
2547 lck_mtx_interlock_lock(
2548 lck_mtx_t *mutex,
2549 uint32_t *new_state)
2550 {
2551 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state);
2552 }
2553
2554 static inline int
2555 lck_mtx_interlock_try_lock_set_flags(
2556 lck_mtx_t *mutex,
2557 uint32_t or_flags,
2558 uint32_t *new_state)
2559 {
2560 uint32_t state, prev;
2561 state = *new_state;
2562
2563 /* the interlock or one of the requested flags is already set: fail immediately */
2564 if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) {
2565 return 0;
2566 }
2567 prev = state; /* prev contains snapshot for exchange */
2568 state |= LCK_MTX_ILOCKED_MSK | or_flags; /* pick up interlock */
2569 disable_preemption();
2570 if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
2571 *new_state = state;
2572 return 1;
2573 }
2574
2575 enable_preemption();
2576 return 0;
2577 }
2578
2579 static inline int
2580 lck_mtx_interlock_try_lock(
2581 lck_mtx_t *mutex,
2582 uint32_t *new_state)
2583 {
2584 return lck_mtx_interlock_try_lock_set_flags(mutex, 0, new_state);
2585 }
2586
2587 static inline int
2588 lck_mtx_interlock_try_lock_disable_interrupts(
2589 lck_mtx_t *mutex,
2590 boolean_t *istate)
2591 {
2592 uint32_t state;
2593
2594 *istate = ml_set_interrupts_enabled(FALSE);
2595 state = ordered_load_mtx_state(mutex);
2596
2597 if (lck_mtx_interlock_try_lock(mutex, &state)) {
2598 return 1;
2599 } else {
2600 ml_set_interrupts_enabled(*istate);
2601 return 0;
2602 }
2603 }
2604
2605 static inline void
2606 lck_mtx_interlock_unlock_enable_interrupts(
2607 lck_mtx_t *mutex,
2608 boolean_t istate)
2609 {
2610 lck_mtx_ilk_unlock(mutex);
2611 ml_set_interrupts_enabled(istate);
2612 }
2613
2614 __attribute__((noinline))
2615 static void
2616 lck_mtx_lock_contended(
2617 lck_mtx_t *lock,
2618 boolean_t indirect,
2619 boolean_t *first_miss)
2620 {
2621 lck_mtx_spinwait_ret_type_t ret;
2622 uint32_t state;
2623 thread_t thread;
2624 struct turnstile *ts = NULL;
2625
2626 try_again:
2627
2628 if (indirect) {
2629 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, first_miss);
2630 }
2631
2632 ret = lck_mtx_lock_spinwait_x86(lock);
2633 state = ordered_load_mtx_state(lock);
2634 switch (ret) {
2635 case LCK_MTX_SPINWAIT_NO_SPIN:
2636 /*
2637 * owner not on core, lck_mtx_lock_spinwait_x86 didn't even
2638 * try to spin.
2639 */
2640 if (indirect) {
2641 lck_grp_mtx_update_direct_wait((struct _lck_mtx_ext_*)lock);
2642 }
2643
2644 /* just fall through to case LCK_MTX_SPINWAIT_SPUN */
2645 case LCK_MTX_SPINWAIT_SPUN:
2646 /*
2647 * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
2648 * interlock not held
2649 */
2650 lck_mtx_interlock_lock(lock, &state);
2651 assert(state & LCK_MTX_ILOCKED_MSK);
2652
2653 if (state & LCK_MTX_MLOCKED_MSK) {
2654 if (indirect) {
2655 lck_grp_mtx_update_wait((struct _lck_mtx_ext_*)lock, first_miss);
2656 }
2657 lck_mtx_lock_wait_x86(lock, &ts);
2658 /*
2659 * interlock is not held here.
2660 */
2661 goto try_again;
2662 } else {
2663
2664 /* grab the mutex */
2665 state |= LCK_MTX_MLOCKED_MSK;
2666 ordered_store_mtx_state_release(lock, state);
2667 thread = current_thread();
2668 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2669 #if MACH_LDEBUG
2670 if (thread) {
2671 thread->mutex_count++;
2672 }
2673 #endif /* MACH_LDEBUG */
2674 }
2675
2676 break;
2677 case LCK_MTX_SPINWAIT_ACQUIRED:
2678 /*
2679 * mutex has been acquired by lck_mtx_lock_spinwait_x86
2680 * interlock is held and preemption disabled
2681 * owner is set and mutex marked as locked
2682 * statistics updated too
2683 */
2684 break;
2685 default:
2686 panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p\n", ret, lock);
2687 }
2688
2689 /*
2690 * interlock is already acquired here
2691 */
2692
2693 /* mutex has been acquired */
2694 thread = (thread_t)lock->lck_mtx_owner;
2695 if (state & LCK_MTX_WAITERS_MSK) {
2696 /*
2697 * lck_mtx_lock_acquire_tail will call
2698 * turnstile_complete.
2699 */
2700 return lck_mtx_lock_acquire_tail(lock, indirect, ts);
2701 }
2702
2703 if (ts != NULL) {
2704 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2705 }
2706
2707 assert(current_thread()->turnstile != NULL);
2708
2709 /* release the interlock */
2710 lck_mtx_lock_finish_inline_with_cleanup(lock, ordered_load_mtx_state(lock), indirect);
2711 }
2712
2713 /*
2714 * Helper noinline functions for calling
2715 * panic to optimize compiled code.
2716 */
2717
2718 __attribute__((noinline)) __abortlike
2719 static void
2720 lck_mtx_destroyed(
2721 lck_mtx_t *lock)
2722 {
2723 panic("trying to interlock destroyed mutex (%p)", lock);
2724 }
2725
2726 __attribute__((noinline))
2727 static boolean_t
2728 lck_mtx_try_destroyed(
2729 lck_mtx_t *lock)
2730 {
2731 panic("trying to interlock destroyed mutex (%p)", lock);
2732 return FALSE;
2733 }
2734
2735 __attribute__((always_inline))
2736 static boolean_t
2737 lck_mtx_lock_wait_interlock_to_clear(
2738 lck_mtx_t *lock,
2739 uint32_t* new_state)
2740 {
2741 uint32_t state;
2742
2743 for ( ; ; ) {
2744 cpu_pause();
2745 state = ordered_load_mtx_state(lock);
2746 if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
2747 *new_state = state;
2748 return TRUE;
2749 }
2750 if (state & LCK_MTX_MLOCKED_MSK) {
2751 /* if it is held as mutex, just fail */
2752 return FALSE;
2753 }
2754 }
2755 }
2756
2757 __attribute__((always_inline))
2758 static boolean_t
2759 lck_mtx_try_lock_wait_interlock_to_clear(
2760 lck_mtx_t *lock,
2761 uint32_t* new_state)
2762 {
2763 uint32_t state;
2764
2765 for ( ; ; ) {
2766 cpu_pause();
2767 state = ordered_load_mtx_state(lock);
2768 if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) {
2769 /* if it is held as mutex or spin, just fail */
2770 return FALSE;
2771 }
2772 if (!(state & LCK_MTX_ILOCKED_MSK)) {
2773 *new_state = state;
2774 return TRUE;
2775 }
2776 }
2777 }
2778
2779 /*
2780 * Routine: lck_mtx_lock_slow
2781 *
2782 * Locks a mutex for current thread.
2783 * If the lock is contended this function might
2784 * sleep.
2785 *
2786 * Called with interlock not held.
2787 */
2788 __attribute__((noinline))
2789 void
2790 lck_mtx_lock_slow(
2791 lck_mtx_t *lock)
2792 {
2793 boolean_t indirect = FALSE;
2794 uint32_t state;
2795 int first_miss = 0;
2796
2797 state = ordered_load_mtx_state(lock);
2798
2799 /* is the interlock or mutex held */
2800 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2801 /*
2802 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
2803 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
2804 * set in state (state == lck_mtx_tag)
2805 */
2806
2807
2808 /* is the mutex already held and not indirect */
2809 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
2810 /* no, must have been the mutex */
2811 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2812 }
2813
2814 /* check to see if it is marked destroyed */
2815 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
2816 lck_mtx_destroyed(lock);
2817 }
2818
2819 /* Is this an indirect mutex? */
2820 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2821 indirect = get_indirect_mutex(&lock, &state);
2822
2823 first_miss = 0;
2824 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
2825
2826 if (state & LCK_MTX_SPIN_MSK) {
2827 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
2828 assert(state & LCK_MTX_ILOCKED_MSK);
2829 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
2830 }
2831 }
2832
2833 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2834 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2835 }
2836 }
2837
2838 /* no - can't be INDIRECT, DESTROYED or locked */
2839 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
2840 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2841 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2842 }
2843 }
2844
2845 /* lock and interlock acquired */
2846
2847 thread_t thread = current_thread();
2848 /* record owner of mutex */
2849 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2850
2851 #if MACH_LDEBUG
2852 if (thread) {
2853 thread->mutex_count++; /* lock statistic */
2854 }
2855 #endif
2856 /*
2857 * Check if there are waiters to
2858 * inherit their priority.
2859 */
2860 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
2861 return lck_mtx_lock_acquire_tail(lock, indirect, NULL);
2862 }
2863
2864 /* release the interlock */
2865 lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
2866
2867 return;
2868 }
2869
2870 __attribute__((noinline))
2871 boolean_t
2872 lck_mtx_try_lock_slow(
2873 lck_mtx_t *lock)
2874 {
2875 boolean_t indirect = FALSE;
2876 uint32_t state;
2877 int first_miss = 0;
2878
2879 state = ordered_load_mtx_state(lock);
2880
2881 /* is the interlock or mutex held */
2882 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2883 /*
2884 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
2885 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
2886 * set in state (state == lck_mtx_tag)
2887 */
2888
2889 /* is the mutex already held and not indirect */
2890 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
2891 return FALSE;
2892 }
2893
2894 /* check to see if it is marked destroyed */
2895 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
2896 lck_mtx_try_destroyed(lock);
2897 }
2898
2899 /* Is this an indirect mutex? */
2900 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2901 indirect = get_indirect_mutex(&lock, &state);
2902
2903 first_miss = 0;
2904 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
2905 }
2906
2907 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
2908 if (indirect)
2909 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
2910 return FALSE;
2911 }
2912 }
2913
2914 /* no - can't be INDIRECT, DESTROYED or locked */
2915 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
2916 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
2917 if (indirect)
2918 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
2919 return FALSE;
2920 }
2921 }
2922
2923 /* lock and interlock acquired */
2924
2925 thread_t thread = current_thread();
2926 /* record owner of mutex */
2927 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2928
2929 #if MACH_LDEBUG
2930 if (thread) {
2931 thread->mutex_count++; /* lock statistic */
2932 }
2933 #endif
2934 /*
2935 * Check if there are waiters to
2936 * inherit their priority.
2937 */
2938 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
2939 return lck_mtx_try_lock_acquire_tail(lock);
2940 }
2941
2942 /* release the interlock */
2943 lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));
2944
2945 return TRUE;
2946
2947 }
2948
2949 __attribute__((noinline))
2950 void
2951 lck_mtx_lock_spin_slow(
2952 lck_mtx_t *lock)
2953 {
2954 boolean_t indirect = FALSE;
2955 uint32_t state;
2956 int first_miss = 0;
2957
2958 state = ordered_load_mtx_state(lock);
2959
2960 /* is the interlock or mutex held */
2961 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2962 /*
2963 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
2964 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
2965 * set in state (state == lck_mtx_tag)
2966 */
2967
2968
2969 /* is the mutex already held and not indirect */
2970 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
2971 /* no, must have been the mutex */
2972 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2973 }
2974
2975 /* check to see if it is marked destroyed */
2976 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
2977 lck_mtx_destroyed(lock);
2978 }
2979
2980 /* Is this an indirect mutex? */
2981 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2982 indirect = get_indirect_mutex(&lock, &state);
2983
2984 first_miss = 0;
2985 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
2986
2987 if (state & LCK_MTX_SPIN_MSK) {
2988 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
2989 assert(state & LCK_MTX_ILOCKED_MSK);
2990 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
2991 }
2992 }
2993
2994 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2995 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2996 }
2997 }
2998
2999 /* no - can't be INDIRECT, DESTROYED or locked */
3000 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state) )) {
3001 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3002 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3003 }
3004 }
3005
3006 /* lock as spinlock and interlock acquired */
3007
3008 thread_t thread = current_thread();
3009 /* record owner of mutex */
3010 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3011
3012 #if MACH_LDEBUG
3013 if (thread) {
3014 thread->mutex_count++; /* lock statistic */
3015 }
3016 #endif
3017
3018 #if CONFIG_DTRACE
3019 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
3020 #endif
3021 /* return with the interlock held and preemption disabled */
3022 return;
3023 }
3024
3025 __attribute__((noinline))
3026 boolean_t
3027 lck_mtx_try_lock_spin_slow(
3028 lck_mtx_t *lock)
3029 {
3030 boolean_t indirect = FALSE;
3031 uint32_t state;
3032 int first_miss = 0;
3033
3034 state = ordered_load_mtx_state(lock);
3035
3036 /* is the interlock or mutex held */
3037 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3038 /*
3039 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3040 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3041 * set in state (state == lck_mtx_tag)
3042 */
3043
3044 /* is the mutex already held and not indirect */
3045 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
3046 return FALSE;
3047 }
3048
3049 /* check to see if it is marked destroyed */
3050 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3051 lck_mtx_try_destroyed(lock);
3052 }
3053
3054 /* Is this an indirect mutex? */
3055 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3056 indirect = get_indirect_mutex(&lock, &state);
3057
3058 first_miss = 0;
3059 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
3060 }
3061
3062 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3063 if (indirect)
3064 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3065 return FALSE;
3066 }
3067 }
3068
3069 /* no - can't be INDIRECT, DESTROYED or locked */
3070 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
3071 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3072 if (indirect)
3073 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3074 return FALSE;
3075 }
3076 }
3077
3078 /* lock and interlock acquired */
3079
3080 thread_t thread = current_thread();
3081 /* record owner of mutex */
3082 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3083
3084 #if MACH_LDEBUG
3085 if (thread) {
3086 thread->mutex_count++; /* lock statistic */
3087 }
3088 #endif
3089
3090 #if CONFIG_DTRACE
3091 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
3092 #endif
3093 return TRUE;
3094
3095 }
3096
3097 __attribute__((noinline))
3098 void
3099 lck_mtx_convert_spin(
3100 lck_mtx_t *lock)
3101 {
3102 uint32_t state;
3103
3104 state = ordered_load_mtx_state(lock);
3105
3106 /* Is this an indirect mutex? */
3107 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3108 /* If so, take indirection */
3109 get_indirect_mutex(&lock, &state);
3110 }
3111
3112 assertf((thread_t)lock->lck_mtx_owner == current_thread(), "lock %p not owned by thread %p (current owner %p)", lock, current_thread(), (thread_t)lock->lck_mtx_owner );
3113
3114 if (__improbable(state & LCK_MTX_MLOCKED_MSK)) {
3115 /* already owned as a mutex, just return */
3116 return;
3117 }
3118
3119 assert(get_preemption_level() > 0);
3120 assert(state & LCK_MTX_ILOCKED_MSK);
3121 assert(state & LCK_MTX_SPIN_MSK);
3122
3123 /*
3124 * Check if there are waiters to
3125 * inherit their priority.
3126 */
3127 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3128 return lck_mtx_convert_spin_acquire_tail(lock);
3129 }
3130
3131 lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock));
3132
3133 return;
3134 }
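/*
 * Illustrative usage sketch (not part of this file): take the mutex in spin mode
 * for a short, non-blocking peek, then convert to a full mutex hold if longer,
 * possibly blocking work turns out to be needed. Names below are hypothetical.
 */
#if 0
static void
example_spin_then_block(lck_mtx_t *mtx, boolean_t need_long_work)
{
	lck_mtx_lock_spin(mtx);			/* held as spinlock: preemption disabled */
	if (need_long_work) {
		lck_mtx_convert_spin(mtx);	/* now held as a normal mutex; blocking is allowed */
		/* ... potentially blocking work ... */
	}
	lck_mtx_unlock(mtx);			/* handles both the spin and mutex cases */
}
#endif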
3135
3136 static inline boolean_t
3137 lck_mtx_lock_grab_mutex(
3138 lck_mtx_t *lock)
3139 {
3140 uint32_t state;
3141
3142 state = ordered_load_mtx_state(lock);
3143
3144 if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) {
3145 return FALSE;
3146 }
3147
3148 /* lock and interlock acquired */
3149
3150 thread_t thread = current_thread();
3151 /* record owner of mutex */
3152 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3153
3154 #if MACH_LDEBUG
3155 if (thread) {
3156 thread->mutex_count++; /* lock statistic */
3157 }
3158 #endif
3159 return TRUE;
3160 }
3161
3162 __attribute__((noinline))
3163 void
3164 lck_mtx_assert(
3165 lck_mtx_t *lock,
3166 unsigned int type)
3167 {
3168 thread_t thread, owner;
3169 uint32_t state;
3170
3171 thread = current_thread();
3172 state = ordered_load_mtx_state(lock);
3173
3174 if (state == LCK_MTX_TAG_INDIRECT) {
3175 get_indirect_mutex(&lock, &state);
3176 }
3177
3178 owner = (thread_t)lock->lck_mtx_owner;
3179
3180 if (type == LCK_MTX_ASSERT_OWNED) {
3181 if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))
3182 panic("mutex (%p) not owned\n", lock);
3183 } else {
3184 assert (type == LCK_MTX_ASSERT_NOTOWNED);
3185 if (owner == thread)
3186 panic("mutex (%p) owned\n", lock);
3187 }
3188 }
3189
3190 /*
3191 * Routine: lck_mtx_lock_spinwait_x86
3192 *
3193 * Invoked trying to acquire a mutex when there is contention but
3194 * the holder is running on another processor. We spin for up to a maximum
3195 * time waiting for the lock to be released.
3196 *
3197 * Called with the interlock unlocked.
3198 * returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired
3199 * returns LCK_MTX_SPINWAIT_SPUN if we spun
3200 * returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running
3201 */
3202 __attribute__((noinline))
3203 lck_mtx_spinwait_ret_type_t
3204 lck_mtx_lock_spinwait_x86(
3205 lck_mtx_t *mutex)
3206 {
3207 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3208 thread_t holder;
3209 uint64_t overall_deadline;
3210 uint64_t check_owner_deadline;
3211 uint64_t cur_time;
3212 lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN;
3213 int loopcount = 0;
3214
3215 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
3216 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
3217
3218 cur_time = mach_absolute_time();
3219 overall_deadline = cur_time + MutexSpin;
3220 check_owner_deadline = cur_time;
3221
3222 /*
3223 * Spin while:
3224 * - mutex is locked, and
3225 * - it's locked as a spin lock, and
3226 * - owner is running on another processor, and
3227 * - owner (processor) is not idling, and
3228 * - we haven't spun for long enough.
3229 */
3230 do {
3231 if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
3232 retval = LCK_MTX_SPINWAIT_ACQUIRED;
3233 break;
3234 }
3235 cur_time = mach_absolute_time();
3236
3237 if (cur_time >= overall_deadline)
3238 break;
3239
3240 if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) {
3241 boolean_t istate;
3242
3243 /*
3244 * We will repeatedly peek at the state of the lock while spinning,
3245 * and we will acquire the interlock to do so.
3246 * The thread that will unlock the mutex will also need to acquire
3247 * the interlock, and we want to avoid slowing it down.
3248 * To avoid taking an interrupt while holding the interlock,
3249 * which would increase the time we hold it, we
3250 * try to acquire the interlock with interrupts disabled.
3251 * This is safe because it is a "try_lock": if we can't acquire
3252 * the interlock we re-enable interrupts and fail, so it is
3253 * OK to call it even if the interlock was already held.
3254 */
3255 if (lck_mtx_interlock_try_lock_disable_interrupts(mutex, &istate)) {
3256
3257 if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
3258
3259 if ( !(holder->machine.specFlags & OnProc) ||
3260 (holder->state & TH_IDLE)) {
3261
3262 lck_mtx_interlock_unlock_enable_interrupts(mutex, istate);
3263
3264 if (loopcount == 0)
3265 retval = LCK_MTX_SPINWAIT_NO_SPIN;
3266 break;
3267 }
3268 }
3269 lck_mtx_interlock_unlock_enable_interrupts(mutex, istate);
3270
3271 check_owner_deadline = cur_time + (MutexSpin / 4);
3272 }
3273 }
3274 cpu_pause();
3275
3276 loopcount++;
3277
3278 } while (TRUE);
3279
3280 #if CONFIG_DTRACE
3281 /*
3282 * overall_deadline was set to (spin start time + MutexSpin), so if dtrace
3283 * is active we can compute backwards from it to determine how
3284 * long we actually spun.
3285 *
3286 * Note that we record a different probe id depending on whether
3287 * this is a direct or indirect mutex. This allows us to
3288 * penalize only lock groups that have debug/stats enabled
3289 * with dtrace processing if desired.
3290 */
3291 if (__probable(mutex->lck_mtx_is_ext == 0)) {
3292 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
3293 mach_absolute_time() - (overall_deadline - MutexSpin));
3294 } else {
3295 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
3296 mach_absolute_time() - (overall_deadline - MutexSpin));
3297 }
3298 /* The lockstat acquire event is recorded by the assembly code beneath us. */
3299 #endif
3300
3301 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
3302 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
3303
3304 return retval;
3305 }
3306
3307
3308
3309 /*
3310 * Routine: lck_mtx_lock_wait_x86
3311 *
3312 * Invoked in order to wait on contention.
3313 *
3314 * Called with the interlock locked and
3315 * preemption disabled...
3316 * returns it unlocked and with preemption enabled
3317 *
3318 * lck_mtx_waiters is 1:1 with a wakeup needing to occur.
3319 * A runnable waiter can exist between wait and acquire
3320 * without a waiters count being set.
3321 * This allows us to never make a spurious wakeup call.
3322 *
3323 * Priority:
3324 * This avoids taking the thread lock if the owning thread is the same priority.
3325 * This optimizes the case of same-priority threads contending on a lock.
3326 * However, that allows the owning thread to drop in priority while holding the lock,
3327 * because there is no state that the priority-change path can examine
3328 * which says that the targeted thread holds a contended mutex.
3329 *
3330 * One possible solution: priority changes could look for some atomic tag
3331 * on the thread saying 'holding contended lock', and then set up a promotion.
3332 * Needs a story for dropping that promotion - the last contended unlock
3333 * has to notice that this has happened.
3334 */
3335 __attribute__((noinline))
3336 void
3337 lck_mtx_lock_wait_x86 (
3338 lck_mtx_t *mutex,
3339 struct turnstile **ts)
3340 {
3341 thread_t self = current_thread();
3342
3343 #if CONFIG_DTRACE
3344 uint64_t sleep_start = 0;
3345
3346 if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
3347 sleep_start = mach_absolute_time();
3348 }
3349 #endif
3350 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3351
3352 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
3353 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3354 mutex->lck_mtx_waiters, 0, 0);
3355
3356 assert(self->waiting_for_mutex == NULL);
3357 self->waiting_for_mutex = mutex;
3358 mutex->lck_mtx_waiters++;
3359
3360 thread_t holder = (thread_t)mutex->lck_mtx_owner;
3361 assert(holder != NULL);
3362
3363 /*
3364 * lck_mtx_lock_wait_x86 might be called in a loop. Call turnstile_prepare just once and
3365 * reuse the same turnstile while looping; the matching turnstile_complete will be called
3366 * by lck_mtx_lock_contended when finally acquiring the lock.
3367 */
3368 if (*ts == NULL) {
3369 *ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
3370 }
3371
3372 struct turnstile *turnstile = *ts;
3373 thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
3374 turnstile_update_inheritor(turnstile, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
3375
3376 waitq_assert_wait64(&turnstile->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
3377
3378 lck_mtx_ilk_unlock(mutex);
3379
3380 turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_NOT_HELD);
3381
3382 thread_block(THREAD_CONTINUE_NULL);
3383
3384 self->waiting_for_mutex = NULL;
3385
3386 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
3387 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3388 mutex->lck_mtx_waiters, 0, 0);
3389
3390 #if CONFIG_DTRACE
3391 /*
3392 * Record the Dtrace lockstat probe for blocking, block time
3393 * measured from when we were entered.
3394 */
3395 if (sleep_start) {
3396 if (mutex->lck_mtx_is_ext == 0) {
3397 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
3398 mach_absolute_time() - sleep_start);
3399 } else {
3400 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
3401 mach_absolute_time() - sleep_start);
3402 }
3403 }
3404 #endif
3405 }
3406
3407 /*
3408 * Routine: kdp_lck_mtx_lock_spin_is_acquired
3409 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
3410 * Returns: TRUE if lock is acquired.
3411 */
3412 boolean_t
3413 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
3414 {
3415 if (not_in_kdp) {
3416 panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
3417 }
3418
3419 if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
3420 return TRUE;
3421 }
3422
3423 return FALSE;
3424 }
3425
3426 void
3427 kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3428 {
3429 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
3430 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
3431 thread_t holder = (thread_t)mutex->lck_mtx_owner;
3432 waitinfo->owner = thread_tid(holder);
3433 }
3434
3435 void
3436 kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3437 {
3438 lck_rw_t *rwlck = NULL;
3439 switch(waitinfo->wait_type) {
3440 case kThreadWaitKernelRWLockRead:
3441 rwlck = READ_EVENT_TO_RWLOCK(event);
3442 break;
3443 case kThreadWaitKernelRWLockWrite:
3444 case kThreadWaitKernelRWLockUpgrade:
3445 rwlck = WRITE_EVENT_TO_RWLOCK(event);
3446 break;
3447 default:
3448 panic("%s was called with an invalid blocking type", __FUNCTION__);
3449 break;
3450 }
3451 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
3452 waitinfo->owner = 0;
3453 }